def pgs_add_and_del(self, upgrade_server, type):
    util.print_frame()
    util.log('[start] add and del pgs%d. type:%s' % (upgrade_server['id'], type))
    util.log_server_state(self.cluster)

    # start load generator
    load_gen_list = {}
    for i in range(len(self.cluster['servers'])):
        server = self.cluster['servers'][i]
        load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
        load_gen.start()
        load_gen_list[i] = load_gen

    # detach pgs from cluster
    cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

    # set new values
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway('0')
    gw.connect(ip, port)
    for i in range(0, 50):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2]))

    # attach pgs to cluster
    cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    time.sleep(3)

    # check new values
    redis = redis_mgmt.Redis(upgrade_server['id'])
    ret = redis.connect(upgrade_server['ip'], upgrade_server['redis_port'])
    self.assertEquals(ret, 0, 'failed : connect to smr%d(%s:%d)' % (upgrade_server['id'], upgrade_server['ip'], upgrade_server['redis_port']))
    for i in range(0, 50):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis.write(cmd)
        redis.read_until('\r\n')
        res = redis.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis%d. %s != %d' % (upgrade_server['id'], res, i))
    util.log('succeeded : check values with get operations on pgs%d.' % (upgrade_server['id']))

    # shutdown load generators
    for i in range(len(load_gen_list)):
        load_gen_list[i].quit()
        load_gen_list[i].join()

    util.log_server_state(self.cluster)
    return 0
def request_to_start_gateway(cluster_name, server, leader_cm, check_state=True):
    # add gateway configuration to confmaster
    cmd = 'gw_add %s %d %s %s %d' % (cluster_name, server['id'], server['pm_name'], server['ip'], server['gateway_port'])
    result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    jobj = json.loads(result)
    if jobj['state'] != 'success':
        util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
        return -1

    # start gateway process
    ret = util.start_gateway(server['id'], server['ip'], leader_cm['cm_port'], cluster_name, server['gateway_port'])
    if ret != 0:
        util.log('failed to start_gateway. server:%s, id:%d' % (server['ip'], server['id']))
        return -1

    # check gateway state
    if check_state:
        ok = False
        try_cnt = 0
        while try_cnt < 5:
            try_cnt += 1
            if util.check_gateway_state(cluster_name, leader_cm, server):
                ok = True
                break
            time.sleep(0.5)
        if not ok:
            util.log('failed to start_gateway, invalid state of gateway.')
            return -1

    # check inactive redis connections of gateway
    if check_state:
        ok = False
        try_cnt = 0
        inactive_conns = -1
        while try_cnt < 10:
            try_cnt += 1
            inactive_conns = util.gw_info_redis_disccons(server['ip'], server['gateway_port'])
            if inactive_conns == 0:
                ok = True
                break
            else:
                time.sleep(0.5)
        if not ok:
            util.log('failed to start_gateway, invalid number of inactive redis connections. %d' % inactive_conns)
            return -1

    util.log('succeeded to start_gateway. server:%s, id:%d' % (server['ip'], server['id']))
    return 0
def request_to_shutdown_gateway(cluster_name, server, leader_cm, check=False):
    ip = server['ip']
    port = server['gateway_port']
    id = server['id']

    # delete gateway configuration from confmaster
    cmd = 'gw_del %s %d' % (cluster_name, id)
    result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    jobj = json.loads(result)
    if jobj['state'] != 'success':
        util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
        return -1

    # check client connection
    ok = False
    for i in range(10):
        client_conn_num = util.gw_info_client_cnt(ip, port)
        if client_conn_num == 1:
            ok = True
            break
        else:
            time.sleep(1)
    if ok == False:
        util.log('failed to shutdown_gateway, invalid number of client connections. %d' % (client_conn_num - 1))
        return -1

    # shutdown gateway process
    if util.shutdown_gateway(id, port) != 0:
        util.log('failed to shutdown_gateway %d' % (id))
        return -1

    util.log('succeeded to shutdown_gateway. %d' % (id))
    return 0
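# Hedged usage sketch (not part of the original suite): it illustrates how the
# two gateway helpers above are meant to pair up in a test. The helper name is
# hypothetical; the cluster/server dict layout and the "0 on success, -1 on
# failure" convention are assumptions taken from the surrounding code.
def restart_gateway_example(cluster_name, server, leader_cm):
    # tear the gateway down first (gw_del + process shutdown)
    if request_to_shutdown_gateway(cluster_name, server, leader_cm) != 0:
        return -1
    # bring it back up and wait until confmaster reports a healthy state
    return request_to_start_gateway(cluster_name, server, leader_cm, check_state=True)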
def add_physical_machine_to_mgmt(mgmt_ip, mgmt_port, pm_name, pm_ip):
    # register a physical machine with confmaster
    cmd = 'pm_add %s %s' % (pm_name, pm_ip)
    result = util.cm_command(mgmt_ip, mgmt_port, cmd)
    jobj = json.loads(result)
    if jobj['state'] != 'success':
        util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
        return -1
    return 0
def fi_count(fi, ip, port):
    cmd = ('fi_count %s %s' % (fi[0], fi[1])).lower()
    reply = util.cm_command(ip, port, cmd)
    try:
        return int(reply)
    except ValueError:
        util.log("fi_count fail. cmd: \"%s\", reply: \"%s\"" % (cmd, reply))
        return -1
def finalize_info_of_cm_about_pgs(cluster, server, leader_cm):
    # detach the pgs from its cluster
    cmd = 'pgs_leave %s %d forced' % (cluster['cluster_name'], server['id'])
    result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    jobj = json.loads(result)
    if jobj['state'] != 'success':
        util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
        return -1
    time.sleep(3)

    # delete the pgs configuration from confmaster
    cmd = 'pgs_del %s %d' % (cluster['cluster_name'], server['id'])
    result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    jobj = json.loads(result)
    if jobj['state'] != 'success':
        util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
        return -1
    time.sleep(3)
    return 0
def fi_add(fi, count, ip, port):
    cmd = ('fi_add %s %s %s %d' % (fi[0], fi[1], fi[2], count)).lower()
    reply = util.cm_command(ip, port, cmd)
    ret = json.loads(reply)
    state = ret['state']
    if 'success' == state:
        return True
    else:
        util.log("fi_add fail. cmd: \"%s\", reply: \"%s\"" % (cmd, reply))
        return False
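# Hedged usage sketch (illustration only): fi_add registers a fault-injection
# entry with confmaster and fi_count reads back its counter. The 3-element 'fi'
# tuple layout is inferred from the format strings above; the helper name and
# the concrete tuple values below are hypothetical, not part of the suite.
def check_fault_injection_example(mgmt_ip, mgmt_port):
    fi = ('example_fi_name', 'example_fi_point', 'true')  # hypothetical entry
    if not fi_add(fi, 1, mgmt_ip, mgmt_port):
        return False
    cnt = fi_count(fi, mgmt_ip, mgmt_port)
    util.log('fault injection count: %d' % cnt)
    return cnt >= 0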
def classify_cm(servers):
    # probe each confmaster with 'cluster_ls'; the leader answers 'success',
    # followers answer 'redirect'
    cm_leaders = []
    cm_followers = []
    for s in servers:
        ret = json.loads(util.cm_command('0.0.0.0', s['cm_port'], 'cluster_ls'), encoding='ascii')['state']
        if ret == 'success':
            cm_leaders.append({'ip': '0.0.0.0', 'port': s['cm_port']})
        elif ret == 'redirect':
            cm_followers.append({'ip': '0.0.0.0', 'port': s['cm_port']})
    return {'leader': cm_leaders, 'follower': cm_followers}
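# Hedged usage sketch (illustration only): how classify_cm can be used to pick
# the single leader confmaster before sending it management commands. The
# helper name is hypothetical; 'servers' is assumed to be the same list of
# dicts (with a 'cm_port' key) used above.
def find_leader_cm_example(servers):
    cms = classify_cm(servers)
    if len(cms['leader']) != 1:
        util.log('unexpected number of leader confmasters: %d' % len(cms['leader']))
        return None
    return cms['leader'][0]  # {'ip': ..., 'port': ...}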
def initialize_cluster(cluster, leader_cm=None):
    if leader_cm == None:
        leader_cm = cluster['servers'][0]
    servers = cluster['servers']

    initialize_physical_machine_znodes(leader_cm['ip'], leader_cm['cm_port'], config.machines)

    # create the cluster
    cmd = 'cluster_add %s %s' % (cluster['cluster_name'], cluster['quorum_policy'])
    result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    jobj = json.loads(result)
    if jobj['state'] != 'success':
        util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
        return -1

    # create partition groups and assign slot ranges
    slot_no = 0
    for pg_id in cluster['pg_id_list']:
        cmd = 'pg_add %s %d' % (cluster['cluster_name'], pg_id)
        result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
        jobj = json.loads(result)
        if jobj['state'] != 'success':
            util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
            return -1

        if cluster['slots'][slot_no] != -1:
            cmd = 'slot_set_pg %s %d:%d %d' % (cluster['cluster_name'], cluster['slots'][slot_no], cluster['slots'][slot_no + 1], pg_id)
            result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
            jobj = json.loads(result)
            if jobj['state'] != 'success':
                util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
                return -1
        slot_no = slot_no + 2

    # register and join each pgs
    for server in servers:
        initialize_info_of_cm_about_pgs(cluster, server, leader_cm)
    return 0
def initialize_info_of_cm_about_pgs(cluster, server, leader_cm, pg_id=None):
    if pg_id == None:
        pg_id = server['pg_id']

    # add the pgs configuration to confmaster
    cmd = 'pgs_add %s %d %d %s %s %d %d' % (cluster['cluster_name'], server['id'], pg_id, server['pm_name'], server['ip'], server['smr_base_port'], server['redis_port'])
    result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    jobj = json.loads(result)
    if jobj['state'] != 'success':
        util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
        return -1

    # join the pgs to its cluster
    cmd = 'pgs_join %s %d' % (cluster['cluster_name'], server['id'])
    result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    try:
        jobj = json.loads(result)
        if jobj['state'] != 'success':
            util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
            return -1
    except ValueError:
        util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
        return -1
    return 0
def initialize_physical_machine_znodes(ip, port, pm_list):
    for pm in pm_list:
        if pm['type'] == 'virtual':
            pm_ip = pm['virtual_ip']
        else:
            pm_ip = pm['ip']
        cmd = 'pm_add %s %s' % (pm['name'], pm_ip)
        util.log('ip:%s, port:%d, cmd:%s' % (ip, port, cmd))
        result = util.cm_command(ip, port, cmd)
        jobj = json.loads(result)
        if jobj['state'] != 'success':
            util.log('failed to execute. cmd:%s, result:%s' % (cmd, result))
            return -1
    return 0
def finalize_cluster(cluster, leader_cm=None):
    if leader_cm == None:
        leader_cm = cluster['servers'][0]
    servers = cluster['servers']

    # PGS
    for server in servers:
        if finalize_info_of_cm_about_pgs(cluster, server, leader_cm) != 0:
            return -1

    # PG
    for pg_id in cluster['pg_id_list']:
        if util.cm_success(util.cm_command(leader_cm['ip'], leader_cm['cm_port'], 'pg_del %s %d' % (cluster['cluster_name'], pg_id)))[0] == False:
            return -1

    # Cluster
    if util.cluster_del(leader_cm['ip'], leader_cm['cm_port'], cluster['cluster_name']) == False:
        return -1

    return 0
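# Hedged lifecycle sketch (illustration only): how the setup/teardown helpers
# above are meant to compose. initialize_cluster registers PMs, the cluster,
# PGs, slots and PGSes with confmaster; finalize_cluster removes the same
# objects in reverse order. The helper name is hypothetical; the 'cluster'
# dict layout (cluster_name, quorum_policy, pg_id_list, slots, servers) is the
# one assumed throughout this file.
def recreate_cluster_example(cluster, leader_cm=None):
    if finalize_cluster(cluster, leader_cm) != 0:
        return -1
    return initialize_cluster(cluster, leader_cm)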
def test_gateway_add_del(self): util.print_frame() api = ARC_API(ZK_ADDR, CLUSTER_NAME, logFilePrefix = self.arcci_log, so_path = self.so_path) # Add gateway gw_port = 10000 gw_id = 10 cmd = 'gw_add %s %d %s %s %d' % (CLUSTER_NAME, gw_id, HOST_NAME, HOST_IP, gw_port) ret = util.cm_command(MGMT_IP, MGMT_PORT, cmd) if ret != None and len(ret) >= 2: ret = ret[:-2] util.log('cmd:"%s", ret:"%s"' % (cmd, ret)) if not ret.startswith('{"state":"success","msg":"+OK"}'): self.fail('failed to add gateway') # Deploy gateway server = self.cluster['servers'][0] ret = util.deploy_gateway(gw_id) self.assertTrue(ret, 'failed to deploy_gateway') # Start gateway ret = util.start_gateway( gw_id, server['ip'], MGMT_PORT, server['cluster_name'], gw_port) self.assertEqual( ret, 0, 'failed : start gateawy%d' % gw_id ) time.sleep(5) # Check if gateway is added added_gw = {"ip":HOST_IP,"port":gw_port} log_reader = LogReader(api.conf.log_file_prefix) found = False while True: line = log_reader.readline() if line == None: break if line.find(MSG_GATEWAY_ADD_ZK) == -1: continue gw = line.split('data:')[1] gw = ast.literal_eval(gw) if gw['ip'] == added_gw['ip'] and gw['port'] == added_gw['port']: found = True if not found: self.fail('FAIL, load gateway information, gw:%s' % util.json_to_str(added_gw)) else: util.log('SUCCESS, load gateway information.') # Delete gateway cmd = 'gw_del %s %d' % (CLUSTER_NAME, gw_id) ret = util.cm_command(MGMT_IP, MGMT_PORT, cmd) if ret != None and len(ret) >= 2: ret = ret[:-2] util.log('cmd:"%s", ret:"%s"' % (cmd, ret)) if not ret.startswith('{"state":"success","msg":"+OK"}'): self.fail('failed to delete gateway') # Check if gateway is deleted deleted_gw = {"ip":HOST_IP,"port":gw_port} found = check_gateway_deleted(deleted_gw, api) if not found: self.fail('FAIL, delete gateway information, gw:%s' % util.json_to_str(deleted_gw)) else: util.log('SUCCESS, delete gateway information.') # Stop gateway ret = util.shutdown_gateway(gw_id, gw_port) self.assertEqual(ret, 0, 'failed : shutdown gateawy%d' % gw_id) api.destroy()
def test_3_heartbeat_target_connection_count( self ): util.print_frame() util.log( 'wait until all connections are established' ) for i in range(1, 8): time.sleep(1) util.log( '%d sec' % i ) # check pgs for server in self.cluster['servers']: before_cnt_redis = util.get_clients_count_of_redis(server['ip'], server['redis_port']) before_cnt_smr = util.get_clients_count_of_smr(server['smr_mgmt_port']) cmd = 'pgs_leave %s %d forced' % (self.cluster['cluster_name'], server['id']) ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) jobj = json.loads(ret) self.assertEqual( jobj['state'], 'success', 'failed : cmd="%s", reply="%s"' % (cmd, ret[:-2]) ) util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd, ret[:-2]) ) # check redis success = False for i in range(5): after_cnt = util.get_clients_count_of_redis(server['ip'], server['redis_port']) if after_cnt <= 2: success = True break time.sleep(1) self.assertEquals( success, True, 'failed : the number of connections to redis%d(%s:%d) is %d, exptected:n<=2, before=%d' % (server['id'], server['ip'], server['redis_port'], after_cnt, before_cnt_redis) ) util.log( 'succeeded : the number of connections to redis%d(%s:%d) is %d, exptected=n<=2, before=%d' % (server['id'], server['ip'], server['redis_port'], after_cnt, before_cnt_redis) ) # check smr success = False expected = 1 for i in range(5): after_cnt = util.get_clients_count_of_smr(server['smr_mgmt_port']) if after_cnt == expected: success = True break time.sleep(1) self.assertEquals( success, True, 'failed : the number of connections to smr%d(%s:%d) is %d, exptected=%d, before=%d' % (server['id'], server['ip'], server['smr_mgmt_port'], after_cnt, expected, before_cnt_smr) ) util.log( 'succeeded : the number of connections to smr%d(%s:%d) is %d, exptected=%d, before=%d' % (server['id'], server['ip'], server['smr_mgmt_port'], after_cnt, expected, before_cnt_smr) ) # check gateway for server in self.cluster['servers']: before_cnt = util.get_clients_count_of_gw(server['ip'], server['gateway_port']) cmd = 'gw_del %s %d' % (self.cluster['cluster_name'], server['id']) ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) jobj = json.loads(ret) self.assertEqual( jobj['state'], 'success', 'failed : cmd="%s", reply="%s"' % (cmd, ret[:-2]) ) util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd, ret[:-2]) ) success = False expected = 1 for i in range(5): after_cnt = util.get_clients_count_of_gw(server['ip'], server['gateway_port']) if after_cnt == expected: success = True break time.sleep(1) self.assertEquals( success, True, 'failed : the number of connections to gateway%d(%s:%d) is %d, exptected=%d.' % (server['id'], server['ip'], server['gateway_port'], after_cnt, expected) ) util.log( 'succeeded : the number of connections to gateway%d(%s:%d) is %d, exptected=%d.' % (server['id'], server['ip'], server['gateway_port'], after_cnt, expected) )
def test_random_migrate(self): util.print_frame() # start load generator load_gen_thrd_list = {} util.log("start load_generator") for i in range(self.max_load_generator): ip, port = util.get_rand_gateway(self.cluster) load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port) load_gen_thrd_list[i].start() ret = util.migration(self.cluster, 0, 1, 4096, 8191, 40000) self.assertEqual(True, ret, 'Migration Fail') leader_cm = self.cluster['servers'][0] cluster_name = self.cluster['cluster_name'] mapping = [-1] * 8192 count = 50 while count > 0: # get PN -> PG map cmd = 'cluster_info %s' % cluster_name result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd) ret = json.loads(result) rle = ret['data']['cluster_info']['PN_PG_Map'] print "PN_PG_MAP = %s" % rle sp = rle.split() index = 0 for i in range(len(sp) / 2): for j in range(int(sp[i * 2 + 1])): mapping[index] = int(sp[i * 2]) index += 1 slot = random.randint(0, 8191) src_pgid = mapping[slot] dst_pgid = (src_pgid + 1) % 2 slot_end = slot while random.randint(0, 5) <= 4: if slot_end < 8191 and mapping[slot_end + 1] == src_pgid: slot_end += 1 else: break print "SLOT=%d, SRC_PGID=%d, DST_PGID=%d" % (slot, src_pgid, dst_pgid) ret = util.migration(self.cluster, src_pgid, dst_pgid, slot, slot_end, 40000) self.assertEqual(True, ret, 'Migration Fail') ok = True for j in range(len(load_gen_thrd_list)): if load_gen_thrd_list[j].isConsistent() == False: ok = False break if not ok: break count -= 1 # check consistency of load_generator for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].quit() for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].join() self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration') # Go back to initial configuration cinfo = util.cluster_info(leader_cm['ip'], leader_cm['cm_port'], cluster_name) for slot in util.get_slots(cinfo['cluster_info']['PN_PG_Map'], 1): self.assertTrue( util.migration(self.cluster, 1, 0, slot['begin'], slot['end'], 40000), 'failed to rollback migration')
def test_7_remaining_hbc_connection(self): util.print_frame() # check pgs for server in self.cluster['servers']: before_cnt_redis = util.get_clients_count_of_redis( server['ip'], server['redis_port']) before_cnt_smr = util.get_clients_count_of_smr( server['smr_mgmt_port']) cmd = 'pgs_leave %s %d forced\r\npgs_del %s %d' % ( self.cluster['cluster_name'], server['id'], self.cluster['cluster_name'], server['id']) util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) for server in self.cluster['servers']: # check redis success = False for i in range(5): after_cnt = util.get_clients_count_of_redis( server['ip'], server['redis_port']) if after_cnt <= 2: success = True break time.sleep(1) self.assertEquals( success, True, 'failed : the number of connections to redis%d(%s:%d) is %d, exptected=n<=2, before=%d' % (server['id'], server['ip'], server['redis_port'], after_cnt, before_cnt_redis)) util.log( 'succeeded : the number of connections to redis%d(%s:%d) is %d, exptected=n<=2, before=%d' % (server['id'], server['ip'], server['redis_port'], after_cnt, before_cnt_redis)) # check smr success = False expected = 0 for i in range(5): after_cnt = util.get_clients_count_of_smr( server['smr_mgmt_port']) if after_cnt == expected: success = True break time.sleep(1) self.assertEquals( success, True, 'failed : the number of connections to smr%d(%s:%d) is %d, exptected=%d, before=%d' % (server['id'], server['ip'], server['smr_mgmt_port'], after_cnt, expected, before_cnt_smr)) util.log( 'succeeded : the number of connections to smr%d(%s:%d) is %d, exptected=%d, before=%d' % (server['id'], server['ip'], server['smr_mgmt_port'], after_cnt, expected, before_cnt_smr)) # check gateway for server in self.cluster['servers']: before_cnt = util.get_clients_count_of_gw(server['ip'], server['gateway_port']) cmd = 'gw_del %s %d' % (self.cluster['cluster_name'], server['id']) util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) for server in self.cluster['servers']: success = False expected = 1 for i in range(5): after_cnt = util.get_clients_count_of_gw( server['ip'], server['gateway_port']) if after_cnt == expected: success = True break time.sleep(1) self.assertEquals( success, True, 'failed : the number of connections to gateway%d(%s:%d) is %d, exptected=%d.' % (server['id'], server['ip'], server['gateway_port'], after_cnt, expected)) util.log( 'succeeded : the number of connections to gateway%d(%s:%d) is %d, exptected=%d.' % (server['id'], server['ip'], server['gateway_port'], after_cnt, expected)) # Go back to initial configuration # Cleanup PG self.assertTrue( util.cm_success( util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], 'pg_del %s %d' % (self.cluster['cluster_name'], self.cluster['servers'][0]['pg_id'])))[0]) # Cleanup processes of PGS and GW for s in self.cluster['servers']: self.assertEqual(0, util.shutdown_redis(s['id'], s['redis_port']), 'failed to kill redis %d process' % s['id']) self.assertEqual( 0, util.shutdown_smr(s['id'], s['ip'], s['smr_base_port']), 'failed to kill smr %d process' % s['id']) self.assertEqual(0, util.shutdown_gateway(s['id'], s['gateway_port']), 'failed to kill gw %d process' % s['id']) # Recover PG self.assertTrue( util.install_pg(self.cluster, self.cluster['servers'], self.cluster['servers'][0], start_gw=True), 'failed to recover PGS and GW in a PM')
def __del_server(self, server_to_del): # backup data redis = redis_mgmt.Redis(server_to_del['id']) ret = redis.connect(server_to_del['ip'], server_to_del['redis_port']) self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port'])) # bgsave ret = util.bgsave(server_to_del) self.assertTrue(ret, 'failed to bgsave. pgs%d' % server_to_del['id']) # detach pgs from cluster cmd = 'pgs_leave %s %d\r\n' % (server_to_del['cluster_name'], server_to_del['id']) ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) jobj = json.loads(ret) self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2])) util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2])) r = util.get_role_of_server(server_to_del) # If quorum of left master is larger than 1, info command will be blocked. if r != c.ROLE_MASTER: # check if pgs is removed success = False for try_cnt in range(10): redis = redis_mgmt.Redis(server_to_del['id']) ret = redis.connect(server_to_del['ip'], server_to_del['redis_port']) self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port'])) util.log('succeeded : connect to smr%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port'])) redis.write('info stats\r\n') for i in range(6): redis.read_until('\r\n') res = redis.read_until('\r\n') self.assertNotEqual( res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port'])) util.log( 'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port'], res[:-2])) no = int(res.split(':')[1]) if no <= 100: success = True break time.sleep(1) self.assertEquals(success, True, 'failed : pgs does not removed.') util.log('pgs is removed') # change state of pgs to lconn cmd = 'pgs_lconn %s %d\r\n' % (server_to_del['cluster_name'], server_to_del['id']) ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) jobj = json.loads(ret) self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2])) util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2])) # shutdown ret = testbase.request_to_shutdown_smr(server_to_del) self.assertEqual(ret, 0, 'failed : shutdown smr. id:%d' % server_to_del['id']) ret = testbase.request_to_shutdown_redis(server_to_del) self.assertEquals( ret, 0, 'failed : shutdown redis. id:%d' % server_to_del['id']) util.log('succeeded : shutdown pgs%d.' % server_to_del['id']) # delete pgs from cluster cmd = 'pgs_del %s %d\r\n' % (server_to_del['cluster_name'], server_to_del['id']) ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) jobj = json.loads(ret) self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2])) util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
def test_migration_with_expire_command(self): util.print_frame() util.log("start load_generator") load_gen_thrd_list = {} for i in range(1): ip, port = util.get_rand_gateway(self.cluster) load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port) load_gen_thrd_list[i].start() time.sleep(5) # generate load for 5 sec tps = 20000 src_pg_id = 0 dst_pg_id = 1 leader_cm = self.cluster['servers'][0] src_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', src_pg_id) dst_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', dst_pg_id) smr = smr_mgmt.SMR(src_master['id']) ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port']) if ret != 0: util.log('failed to connect to smr(source master)') return False src_redis = redis_mgmt.Redis(src_master['id']) ret = src_redis.connect(src_master['ip'], src_master['redis_port']) self.assertEquals(ret, 0, 'failed to connect to redis') dst_redis = redis_mgmt.Redis(dst_master['id']) ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port']) self.assertEquals(ret, 0, 'failed to connect to redis') ts = time.time() self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired', 10) self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired', 10) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key( src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key( src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired') self.assertEquals(res, ":0\r\n") util.log(">>> migrate test with expire command start(%s), ts:%d" % (time.asctime(), ts)) ts = time.time() self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired', 10) self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired', 10) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist', 20) # notify dst_redis of migration start util.log(">>> notify dst_redis of migration start (%s)" % time.asctime()) cmd = 'migconf migstart %d-%d\r\n' % (0, 8191) dst_redis.write(cmd) res = dst_redis.read_until('\r\n') self.assertEquals(res, '+OK\r\n') # remote partial checkpoint util.log(">>> start remote checkpoint and load (%s)" % time.asctime()) cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % ( src_master['ip'], src_master['redis_port'], dst_master['ip'], dst_master['redis_port'], 0, 8191, tps) p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None) ret = p.wait() for line in p.stdout: if line.find("Checkpoint Sequence Number:") != -1: util.log("seqnumber : " + line[line.rfind(":") + 1:]) seq = int(line[line.rfind(":") + 1:]) util.log(">>>" + str(line.rstrip())) self.assertEqual(0, ret) util.log(">>> end remote checkpoint and load (%s)" % time.asctime()) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = 
self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") # bgsave for testing later about recovery during migration util.log( ">>> bgsave for testing later about recovery during migration (%s)" % time.asctime()) cmd = 'bgsave\r\n' dst_redis.write(cmd) res = dst_redis.read_until('\r\n') self.assertEquals(res, '+Background saving started\r\n') ts = time.time() self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired', 10) self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired', 10) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist', 20) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired', 10) self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist', 100) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired', 10) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist', 100) # remote catchup (smr log migration) util.log(">>> start remote catchup (%s)" % time.asctime()) dst_host = dst_master['ip'] dst_smr_port = dst_master['smr_base_port'] rle = '1 8192' num_part = 8192 smr.write('migrate start %s %d %d %d %d %s\r\n' % (dst_host, dst_smr_port, seq, tps, num_part, rle)) response = smr.read_until('\r\n') if response[:3] != '+OK': util.log('failed to execute migrate start command, response:%s' % response) return False while True: smr.write('migrate info\r\n') response = smr.read_until('\r\n') seqs = response.split() logseq = int(seqs[1].split(':')[1]) mig = int(seqs[2].split(':')[1]) util.log('migrate info: %s' % response) if (logseq - mig < 500000): util.log('Remote catchup almost done. 
try mig2pc') break time.sleep(1) util.log(">>> sleep until 90 sec pass") self.assertFalse(time.time() - ts >= 90) time.sleep(90 - (time.time() - ts)) res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10) self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20) self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired', 10) self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist', 20) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10) self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20) self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired', 10) self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist', 20) util.log(">>> remote catchup phase almost done (%s)" % time.asctime()) # mig2pc util.log(">>> start mig2pc (%s)" % time.asctime()) cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'], src_pg_id, dst_pg_id, 0, 8191) result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd) util.log('mig2pc result : ' + result) if not result.startswith('{"state":"success","msg":"+OK"}\r\n'): util.log('failed to execute mig2pc command, result:%s' % result) return False util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10) self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20) self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10) self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20) # finish migration smr.write('migrate interrupt\r\n') response = smr.read_until('\r\n') util.log('migrate interrupt: %s' % response) smr.disconnect() # notify dst_redis of migration end util.log(">>> notify dst_redis of migration end (%s)" % time.asctime()) cmd = 'migconf migend\r\n' dst_redis.write(cmd) res = dst_redis.read_until('\r\n') self.assertEquals(res, '+OK\r\n') cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191) src_redis.write(cmd) res = src_redis.read_until('\r\n') 
self.assertEquals(res, '+OK\r\n') util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired') self.assertEquals(res, ":0\r\n") ts = time.time() util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) self.assertTrue( self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse( self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue( self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist')) self.assertFalse( self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired')) self.assertTrue( self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist')) self.assertFalse( self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired')) self.assertTrue( self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist')) self.assertFalse( self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')) self.assertTrue( self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist')) self.assertFalse( self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')) # remote partial checkpoint util.log(">>> start rangedel (%s)" % time.asctime()) cmd = "./cluster-util --rangedel %s %d %d-%d %d" % ( src_master['ip'], src_master['redis_port'], 0, 8191, tps) p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None) ret = p.wait() for line in p.stdout: util.log(">>>" + str(line.rstrip())) cmd = 'migconf clearend\r\n' src_redis.write(cmd) res = src_redis.read_until('\r\n') 
self.assertEqual(res, '+OK\r\n') time.sleep(5) # generate load for 5 sec # check consistency of load_generator for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].quit() for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].join() self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration') # kill dst_redis and recover from bgsave util.log(">>> kill dst_redis and recover from bgsave (%s)" % time.asctime()) dst_redis.disconnect() ret = testbase.request_to_shutdown_redis(dst_master) self.assertEquals(ret, 0, 'failed to shutdown redis') ret = testbase.request_to_shutdown_smr(dst_master) self.assertEquals(ret, 0, 'failed to shutdown smr') time.sleep(5) ret = testbase.request_to_start_smr(dst_master) self.assertEqual(ret, 0, 'failed to start smr, server:%d' % dst_master['id']) ret = testbase.request_to_start_redis(dst_master) self.assertEqual(ret, 0, 'failed to start redis, server:%d' % dst_master['id']) ret = testbase.wait_until_finished_to_set_up_role(dst_master) self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_master['id'])) dst_redis = redis_mgmt.Redis(dst_master['id']) ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port']) self.assertEquals(ret, 0, 'failed to connect to redis') self.assertTrue( self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse( self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue( self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist')) self.assertFalse( self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired')) self.assertTrue( self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist')) self.assertFalse( self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired')) self.assertTrue( self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist')) self.assertFalse( self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired')) self.assertTrue( self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')) self.assertTrue( self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist')) self.assertFalse( self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired')) self.assertTrue( self.isS3Exist(dst_redis, 
'S3:afterMig2pc~migrateEnd:persist')) self.assertFalse( self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')) self.getS3TTL(dst_redis, 'S3:PermanentKey') # kill dst_slave redis and recover without dump file util.log(">>> kill dst_redis and recover without dump file (%s)" % time.asctime()) dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'], 'slave', dst_pg_id) ret = testbase.request_to_shutdown_redis(dst_slave) self.assertEquals(ret, 0, 'failed to shutdown redis') ret = testbase.request_to_shutdown_smr(dst_slave) self.assertEquals(ret, 0, 'failed to shutdown smr') time.sleep(5) ret = testbase.request_to_start_smr(dst_slave) self.assertEqual(ret, 0, 'failed to start smr, server:%d' % dst_slave['id']) ret = testbase.request_to_start_redis(dst_slave) self.assertEqual(ret, 0, 'failed to start redis, server:%d' % dst_slave['id']) ret = testbase.wait_until_finished_to_set_up_role(dst_slave) self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_slave['id'])) dst_redis_slave = redis_mgmt.Redis(dst_slave['id']) ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port']) self.assertEquals(ret, 0, 'failed to connect to redis') self.assertTrue( self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse( self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue( self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse( self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue( self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:persist')) self.assertFalse( self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:expired')) self.assertTrue( self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:persist')) self.assertFalse( self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:expired')) self.assertTrue( self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:persist')) self.assertFalse( self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:expired')) self.assertTrue( self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:persist')) self.assertFalse( self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:expired')) self.assertTrue( self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:persist')) self.assertFalse( self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:expired')) self.assertTrue( self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist')) self.assertFalse( self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired')) self.assertTrue( self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:persist')) self.assertFalse( self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:expired')) self.assertTrue( self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist')) self.assertFalse( self.isExist(dst_redis_slave, 
'afterMig2pc~migrateEnd:expired')) self.assertTrue( self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:persist')) self.assertFalse( self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:expired')) self.getS3TTL(dst_redis_slave, 'S3:PermanentKey') # Go back to initial configuration self.assertTrue( util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000), 'failed to rollback migration')
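# Note: the migration scenario above repeatedly uses the same idiom to wait out a TTL
# window: record ts = time.time() right after setting expire times, assert that fewer
# than N seconds have already passed, then sleep for the remainder of the window.
# The helper below is only an illustrative sketch of how that idiom could be factored
# out; sleep_until_elapsed() is not part of the existing test utilities and relies on
# the same time module the tests already import.
def sleep_until_elapsed(self, ts, seconds):
    # Fail fast if the work since ts already exceeded the TTL window, because the
    # expire/persist assertions that follow would no longer be meaningful.
    elapsed = time.time() - ts
    self.assertFalse(elapsed >= seconds,
                     'TTL window of %d sec already passed (elapsed=%f)' % (seconds, elapsed))
    # Otherwise sleep exactly until the window has passed.
    time.sleep(seconds - elapsed)
# Usage (hypothetical): ts = time.time(); ...set expire keys...; self.sleep_until_elapsed(ts, 15)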
def pgs_add_and_del(self, upgrade_server, type): util.print_frame() util.log('[start] add and del pgs%d. type:%s' % (upgrade_server['id'], type)) util.log_server_state(self.cluster) # start load generator load_gen_list = {} for i in range(len(self.cluster['servers'])): server = self.cluster['servers'][i] load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port']) load_gen.start() load_gen_list[i] = load_gen # detach pgs from cluster cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id']) ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) jobj = json.loads(ret) self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2])) util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2])) # set new values ip, port = util.get_rand_gateway(self.cluster) gw = gateway_mgmt.Gateway('0') gw.connect(ip, port) for i in range(0, 50): cmd = 'set %s%d %d\r\n' % (self.key_base, i, i) gw.write(cmd) res = gw.read_until('\r\n') self.assertEqual( res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2])) # attach pgs from cluster cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id']) ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) jobj = json.loads(ret) self.assertEqual(jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret)) util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2])) time.sleep(3) # check new values redis = redis_mgmt.Redis(upgrade_server['id']) ret = redis.connect(upgrade_server['ip'], upgrade_server['redis_port']) self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (upgrade_server['id'], upgrade_server['ip'], upgrade_server['redis_port'])) for i in range(0, 50): cmd = 'get %s%d\r\n' % (self.key_base, i) redis.write(cmd) redis.read_until('\r\n') res = redis.read_until('\r\n') self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis%d. %s != %d' % (upgrade_server['id'], res, i)) util.log('succeeded : check values with get operations on pgs%d.' % (upgrade_server['id'])) # shutdown load generators for i in range(len(load_gen_list)): load_gen_list[i].quit() load_gen_list[i].join() util.log_server_state(self.cluster) return 0
def test_quorum_with_left_pgs( self ): util.print_frame() # start load generators load_gen_list = {} for i in range( len(self.cluster['servers']) ): server = self.cluster['servers'][i] load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port']) load_gen.start() load_gen_list[i] = load_gen # get master, slave1, slave2 m, s1, s2 = util.get_mss( self.cluster ) self.assertNotEqual( m, None, 'master is None.' ) self.assertNotEqual( s1, None, 'slave1 is None.' ) self.assertNotEqual( s2, None, 'slave2 is None.' ) # detach pgs from cluster cmd = 'pgs_leave %s %d\r\n' % (m['cluster_name'], m['id']) ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd ) jobj = json.loads(ret) self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) ) util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) ) # check if pgs is removed success = False for try_cnt in range( 10 ): redis = redis_mgmt.Redis( m['id'] ) ret = redis.connect( m['ip'], m['redis_port'] ) self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) ) util.log( 'succeeded : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) ) redis.write( 'info stats\r\n' ) for i in range( 6 ): redis.read_until( '\r\n' ) res = redis.read_until( '\r\n' ) self.assertNotEqual( res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) ) util.log( 'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"' % (m['id'], m['ip'], m['redis_port'], res[:-2]) ) no = int( res.split(':')[1] ) if no <= 100: success = True break time.sleep( 1 ) self.assertEquals( success, True, 'failed : pgs is not removed.' ) util.log( 'succeeded : pgs is removed' ) # check states of all pgs in pg for s in self.cluster['servers']: real_role = util.get_role_of_server( s ) real_role = util.roleNumberToChar( real_role ) smr_info = util.get_smr_info( s, self.leader_cm ) cc_role = smr_info['smr_Role'] cc_hb = smr_info['hb'] if cc_hb == 'N': continue self.assertEqual( real_role, cc_role, 'failed : each role is different, real=%s, cc=%s' % (real_role, cc_role) ) util.log( 'succeeded : a role of real pgs is the same as a role in cc, real=%s, cc=%s' % (real_role, cc_role) ) # check quorum policy quorum_of_hanging_master = util.get_quorum( m ) self.assertEqual( self.quorum_policy[1], quorum_of_hanging_master, 'invalid quorum of left master, expected:%d, but:%d' %( self.quorum_policy[1], quorum_of_hanging_master) ) util.log( 'succeeded : quorum of left master=%d' % quorum_of_hanging_master ) # 'role lconn' to master cmd = 'role lconn\r\n' ret = util.cmd_to_smr( m, cmd ) self.assertEqual( ret, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) ) util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) ) # wait for master election success = False new_master = None for i in range( 10 ): role = util.get_role_of_server( s1 ) if role == c.ROLE_MASTER: success = True new_master = s1 break role = util.get_role_of_server( s2 ) if role == c.ROLE_MASTER: success = True new_master = s2 break time.sleep( 1 ) self.assertEqual( success, True, 'failed to elect new master' ) util.log( 'succeeded : elect new master, master_id=%d' % new_master['id'] ) time.sleep( 1 ) # check the numbers of master, slave, and lconn cnt_master = 0 cnt_slave = 0 cnt_lconn = 0 for s in self.cluster['servers']: role = util.get_role_of_server( s ) if role == c.ROLE_MASTER: cnt_master = cnt_master + 1 elif role 
== c.ROLE_SLAVE: cnt_slave = cnt_slave + 1 elif role == c.ROLE_LCONN: cnt_lconn = cnt_lconn + 1 self.assertEqual( cnt_master, 1, 'failed : the number of master is %s, expected 1' % cnt_master ) self.assertEqual( cnt_slave, 1, 'failed : the number of slave is %s, expected 1' % cnt_slave ) self.assertEqual( cnt_lconn, 1, 'failed : the number of lconn is %s, expected 1' % cnt_lconn ) # check states of all pgs in pg for s in self.cluster['servers']: real_role = util.get_role_of_server( s ) real_role = util.roleNumberToChar( real_role ) smr_info = util.get_smr_info( s, self.leader_cm ) cc_role = smr_info['smr_Role'] cc_hb = smr_info['hb'] if cc_hb == 'N': continue self.assertEqual( real_role, cc_role, 'failed : each role is different, real=%s, cc=%s' % (real_role, cc_role) ) util.log( 'succeeded : a role of real pgs is the same as a role in cc, real=%s, cc=%s' % (real_role, cc_role) ) # check quorum policy quorum_of_new_master = util.get_quorum( new_master ) self.assertNotEqual( None, quorum_of_new_master, 'failed : find new master' ) self.assertEqual( self.quorum_policy[1], quorum_of_new_master, 'invalid quorum of new master, expected:%d, but:%d' % (self.quorum_policy[1], quorum_of_new_master) ) util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master ) # shutdown load generators for i in range( len(load_gen_list) ): load_gen_list[i].quit() load_gen_list[i].join() return 0
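# The election wait above polls util.get_role_of_server() on both slaves once per
# second for up to 10 seconds. A compact sketch of that polling loop as a reusable
# helper follows; wait_for_new_master() is illustrative only and assumes the same
# util module and role constants (c.ROLE_MASTER) the tests already import.
def wait_for_new_master(self, candidates, max_try=10):
    # Return the first candidate server that reports ROLE_MASTER, or None on timeout.
    for i in range(max_try):
        for s in candidates:
            if util.get_role_of_server(s) == c.ROLE_MASTER:
                return s
        time.sleep(1)
    return None
# Usage (hypothetical): new_master = self.wait_for_new_master([s1, s2]); self.assertNotEqual(new_master, None, 'failed to elect new master')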
def test_migration_with_expire_command(self): util.print_frame() util.log("start load_generator") load_gen_thrd_list = {} for i in range(1): ip, port = util.get_rand_gateway(self.cluster) load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port) load_gen_thrd_list[i].start() time.sleep(5) # generate load for 5 sec tps = 20000 src_pg_id = 0 dst_pg_id = 1 leader_cm = self.cluster['servers'][0] src_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', src_pg_id) dst_master = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', dst_pg_id) smr = smr_mgmt.SMR(src_master['id']) ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port']) if ret != 0: util.log('failed to connect to smr(source master)') return False src_redis = redis_mgmt.Redis(src_master['id']) ret = src_redis.connect(src_master['ip'], src_master['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) dst_redis = redis_mgmt.Redis(dst_master['id']) ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) ts = time.time() self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired', 10) self.setExpireKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired', 10) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'beforeCheckpoint~beforeCheckpoint:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired') self.assertEquals(res, ":0\r\n") util.log(">>> migrate test with expire command start(%s), ts:%d" % (time.asctime(), ts)) ts = time.time() self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired', 10) self.setExpireKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired', 10) self.setExpireS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist', 20) # notify dst_redis of migration start util.log(">>> notify dst_redis of migration start (%s)" % time.asctime()) cmd = 'migconf migstart %d-%d\r\n' % (0, 8191) dst_redis.write(cmd) res = dst_redis.read_until('\r\n') self.assertEquals( res, '+OK\r\n' ) # remote partial checkpoint util.log(">>> start remote checkpoint and load (%s)" % time.asctime()) cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % ( src_master['ip'], src_master['redis_port'], dst_master['ip'], dst_master['redis_port'], 0, 8191, tps) p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None) ret = p.wait() for line in p.stdout: if line.find("Checkpoint Sequence Number:") != -1: util.log("seqnumber : " + line[line.rfind(":")+1:]) seq = int(line[line.rfind(":")+1:]) util.log(">>>" + str(line.rstrip())) self.assertEqual(0, ret) util.log(">>> end remote checkpoint and load (%s)" % time.asctime()) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = 
self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'beforeCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") # bgsave for testing later about recovery during migration util.log(">>> bgsave for testing later about recovery during migration (%s)" % time.asctime()) cmd = 'bgsave\r\n' dst_redis.write(cmd) res = dst_redis.read_until('\r\n') self.assertEquals( res, '+Background saving started\r\n' ) ts = time.time() self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired', 10) self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist', 20) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired', 10) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist', 20) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~afterCheckpoint:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired', 10) self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist', 100) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired', 10) self.setExpireS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist', 100) # remote catchup (smr log migration) util.log(">>> start remote catchup (%s)" % time.asctime()) dst_host = dst_master['ip'] dst_smr_port = dst_master['smr_base_port'] rle = '1 8192' num_part = 8192 smr.write('migrate start %s %d %d %d %d %s\r\n' % (dst_host, dst_smr_port, seq, tps, num_part, rle)) response = smr.read_until('\r\n') if response[:3] != '+OK': util.log('failed to execute migrate start command, response:%s' % response) return False while True: smr.write('migrate info\r\n') response = smr.read_until('\r\n') seqs = response.split() logseq = int(seqs[1].split(':')[1]) mig = int(seqs[2].split(':')[1]) util.log('migrate info: %s' % response) if (logseq-mig < 500000): util.log('Remote catchup almost done. 
try mig2pc') break time.sleep(1) util.log(">>> sleep until 90 sec pass") self.assertFalse(time.time() - ts >= 90) time.sleep(90 - (time.time() - ts)) res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'afterCheckpoint~duringCatchup:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:afterCheckpoint~duringCatchup:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10) self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20) self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired', 10) self.setExpireS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist', 20) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(src_redis, 'S3:duringCatchup~duringCatchup:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10) self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20) self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired', 10) self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist', 20) util.log(">>> remote catchup phase almost done (%s)" % time.asctime()) # mig2pc util.log(">>> start mig2pc (%s)" % time.asctime()) cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'], src_pg_id, dst_pg_id, 0, 8191) result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd) util.log('mig2pc result : ' + result) if not result.startswith('{"state":"success","msg":"+OK"}\r\n'): util.log('failed to execute mig2pc command, result:%s' % result) return False util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(dst_redis, 'S3:duringCatchup~afterMig2pc:expired') self.assertEquals(res, ":0\r\n") ts = time.time() self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10) self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20) self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10) self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20) # finish migration smr.write('migrate interrupt\r\n') response = smr.read_until('\r\n') util.log('migrate interrupt: %s' % response) smr.disconnect() # notify dst_redis of migration end util.log(">>> notify dst_redis of migration end (%s)" % time.asctime()) cmd = 'migconf migend\r\n' dst_redis.write(cmd) res = dst_redis.read_until('\r\n') self.assertEquals( res, '+OK\r\n' ) cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191) src_redis.write(cmd) res = src_redis.read_until('\r\n') 
self.assertEquals( res, '+OK\r\n' ) util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist') self.assertEquals(res, ":1\r\n") res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired') self.assertEquals(res, ":0\r\n") res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist') self.assertEquals(res, ":1\r\n") res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired') self.assertEquals(res, ":0\r\n") ts = time.time() util.log(">>> sleep until 15 sec pass") self.assertFalse(time.time() - ts >= 15) time.sleep(15 - (time.time() - ts)) self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')) # remote partial checkpoint util.log(">>> start rangedel (%s)" % time.asctime()) cmd = "./cluster-util --rangedel %s %d %d-%d %d" % ( src_master['ip'], src_master['redis_port'], 0, 8191, tps) p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd, True, None, subprocess.PIPE, None) ret = p.wait() for line in p.stdout: util.log(">>>" + str(line.rstrip())) cmd = 'migconf clearend\r\n' src_redis.write(cmd) res = src_redis.read_until('\r\n') self.assertEqual(res, '+OK\r\n') 
time.sleep(5) # generate load for 5 sec # check consistency of load_generator for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].quit() for i in range(len(load_gen_thrd_list)): load_gen_thrd_list[i].join() self.assertTrue(load_gen_thrd_list[i].isConsistent(), 'Inconsistent after migration') # kill dst_redis and recover from bgsave util.log(">>> kill dst_redis and recover from bgsave (%s)" % time.asctime()) dst_redis.disconnect() ret = testbase.request_to_shutdown_redis(dst_master) self.assertEquals( ret, 0, 'failed to shutdown redis' ) ret = testbase.request_to_shutdown_smr(dst_master) self.assertEquals(ret, 0, 'failed to shutdown smr') time.sleep(5) ret = testbase.request_to_start_smr(dst_master) self.assertEqual( ret, 0, 'failed to start smr, server:%d' % dst_master['id'] ) ret = testbase.request_to_start_redis(dst_master) self.assertEqual( ret, 0, 'failed to start redis, server:%d' % dst_master['id'] ) ret = testbase.wait_until_finished_to_set_up_role(dst_master) self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_master['id']) ) dst_redis = redis_mgmt.Redis(dst_master['id']) ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired')) self.assertTrue(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')) 
self.assertFalse(self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')) self.getS3TTL(dst_redis, 'S3:PermanentKey') # kill dst_slave redis and recover without dump file util.log(">>> kill dst_redis and recover without dump file (%s)" % time.asctime()) dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'], 'slave', dst_pg_id) ret = testbase.request_to_shutdown_redis(dst_slave) self.assertEquals( ret, 0, 'failed to shutdown redis' ) ret = testbase.request_to_shutdown_smr(dst_slave) self.assertEquals(ret, 0, 'failed to shutdown smr') time.sleep(5) ret = testbase.request_to_start_smr(dst_slave) self.assertEqual( ret, 0, 'failed to start smr, server:%d' % dst_slave['id'] ) ret = testbase.request_to_start_redis(dst_slave) self.assertEqual( ret, 0, 'failed to start redis, server:%d' % dst_slave['id'] ) ret = testbase.wait_until_finished_to_set_up_role(dst_slave) self.assertEquals( ret, 0, 'failed to role change. server:%d' % (dst_slave['id']) ) dst_redis_slave = redis_mgmt.Redis(dst_slave['id']) ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'] ) self.assertEquals( ret, 0, 'failed to connect to redis' ) self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~beforeCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:beforeCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~afterCheckpoint:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterCheckpoint~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~duringCatchup:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~duringCatchup:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:duringCatchup~afterMig2pc:expired')) self.assertTrue(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:expired')) 
self.assertTrue(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:persist')) self.assertFalse(self.isS3Exist(dst_redis_slave, 'S3:afterMig2pc~migrateEnd:expired')) self.getS3TTL(dst_redis_slave, 'S3:PermanentKey') # Go back to initial configuration self.assertTrue(util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000), 'failed to rollback migration')
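# persistKey()/persistS3Key() and isExist()/isS3Exist() are defined elsewhere in this
# test suite and are not shown in this section. Judging only from how their replies
# are used above (compared against ':1\r\n' / ':0\r\n', or treated as booleans), they
# appear to be thin wrappers that issue a command on an already-connected
# redis_mgmt.Redis object and interpret the raw RESP reply. The sketch below shows one
# plausible shape for the plain-key variants under clearly hypothetical names; the S3
# variants would follow the same pattern with the cluster's S3 commands, whose exact
# names are not assumed here.
def persistKeySketch(self, redis, key):
    # PERSIST replies :1 if a TTL was removed and :0 if the key had no TTL or is gone.
    redis.write('persist %s\r\n' % key)
    return redis.read_until('\r\n')

def isExistSketch(self, redis, key):
    # EXISTS replies :1 when the key is present.
    redis.write('exists %s\r\n' % key)
    return redis.read_until('\r\n') == ':1\r\n'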
def test_7_remaining_hbc_connection( self ): util.print_frame() # check pgs for server in self.cluster['servers']: before_cnt_redis = util.get_clients_count_of_redis(server['ip'], server['redis_port']) before_cnt_smr = util.get_clients_count_of_smr(server['smr_mgmt_port']) cmd = 'pgs_leave %s %d forced\r\npgs_del %s %d' % (self.cluster['cluster_name'], server['id'], self.cluster['cluster_name'], server['id']) util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) for server in self.cluster['servers']: # check redis success = False for i in range(5): after_cnt = util.get_clients_count_of_redis(server['ip'], server['redis_port']) if after_cnt <= 2: success = True break time.sleep(1) self.assertEquals( success, True, 'failed : the number of connections to redis%d(%s:%d) is %d, expected=n<=2, before=%d' % (server['id'], server['ip'], server['redis_port'], after_cnt, before_cnt_redis) ) util.log( 'succeeded : the number of connections to redis%d(%s:%d) is %d, expected=n<=2, before=%d' % (server['id'], server['ip'], server['redis_port'], after_cnt, before_cnt_redis) ) # check smr success = False expected = 0 for i in range(5): after_cnt = util.get_clients_count_of_smr(server['smr_mgmt_port']) if after_cnt == expected: success = True break time.sleep(1) self.assertEquals( success, True, 'failed : the number of connections to smr%d(%s:%d) is %d, expected=%d, before=%d' % (server['id'], server['ip'], server['smr_mgmt_port'], after_cnt, expected, before_cnt_smr) ) util.log( 'succeeded : the number of connections to smr%d(%s:%d) is %d, expected=%d, before=%d' % (server['id'], server['ip'], server['smr_mgmt_port'], after_cnt, expected, before_cnt_smr) ) # check gateway for server in self.cluster['servers']: before_cnt = util.get_clients_count_of_gw(server['ip'], server['gateway_port']) cmd = 'gw_del %s %d' % (self.cluster['cluster_name'], server['id']) util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd) for server in self.cluster['servers']: success = False expected = 1 for i in range(5): after_cnt = util.get_clients_count_of_gw(server['ip'], server['gateway_port']) if after_cnt == expected: success = True break time.sleep(1) self.assertEquals( success, True, 'failed : the number of connections to gateway%d(%s:%d) is %d, expected=%d.' % (server['id'], server['ip'], server['gateway_port'], after_cnt, expected) ) util.log( 'succeeded : the number of connections to gateway%d(%s:%d) is %d, expected=%d.' % (server['id'], server['ip'], server['gateway_port'], after_cnt, expected) ) # Go back to initial configuration # Cleanup PG self.assertTrue(util.cm_success(util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], 'pg_del %s %d' % (self.cluster['cluster_name'], self.cluster['servers'][0]['pg_id'])))[0]) # Cleanup processes of PGS and GW for s in self.cluster['servers']: self.assertEqual(0, util.shutdown_redis(s['id'], s['redis_port']), 'failed to kill redis %d process' % s['id']) self.assertEqual(0, util.shutdown_smr(s['id'], s['ip'], s['smr_base_port']), 'failed to kill smr %d process' % s['id']) self.assertEqual(0, util.shutdown_gateway(s['id'], s['gateway_port']), 'failed to kill gw %d process' % s['id']) # Recover PG self.assertTrue( util.install_pg(self.cluster, self.cluster['servers'], self.cluster['servers'][0], start_gw=True), 'failed to recover PGS and GW in a PM')
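# test_7_remaining_hbc_connection above repeats the same poll loop for redis, smr and
# gateway: query a client count up to five times, one second apart, until it reaches
# an acceptable value. A generic sketch of that loop follows; poll_client_count() is
# illustrative only and simply wraps the util.get_clients_count_of_* calls already
# used in the test.
def poll_client_count(self, count_fn, predicate, max_try=5):
    # count_fn returns the current client count; predicate decides whether it is
    # acceptable (e.g. lambda n: n <= 2). Returns (success, last_count).
    last = None
    for i in range(max_try):
        last = count_fn()
        if predicate(last):
            return True, last
        time.sleep(1)
    return False, last
# Usage (hypothetical): ok, cnt = self.poll_client_count(lambda: util.get_clients_count_of_gw(server['ip'], server['gateway_port']), lambda n: n == 1)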
def test_quorum_with_left_pgs( self ): util.print_frame() # start load generators load_gen_list = {} for i in range( len(self.cluster['servers']) ): server = self.cluster['servers'][i] load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port']) load_gen.start() load_gen_list[i] = load_gen # get master, slave1, slave2 m, s1, s2 = util.get_mss( self.cluster ) self.assertNotEqual( m, None, 'master is None.' ) self.assertNotEqual( s1, None, 'slave1 is None.' ) self.assertNotEqual( s2, None, 'slave2 is None.' ) # detach pgs from cluster cmd = 'pgs_leave %s %d forced\r\n' % (m['cluster_name'], m['id']) ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd ) jobj = json.loads(ret) self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) ) util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) ) # check quorum policy quorum_of_hanging_master = util.get_quorum( m ) self.assertEqual(2, quorum_of_hanging_master, 'invalid quorum of left master, expected:%d, but:%d' % (2, quorum_of_hanging_master) ) util.log( 'succeeded : quorum of left master=%d' % quorum_of_hanging_master ) # check if pgs is removed r = util.get_role_of_server(m) if r != c.ROLE_MASTER: success = False for try_cnt in range( 10 ): redis = redis_mgmt.Redis( m['id'] ) ret = redis.connect( m['ip'], m['redis_port'] ) self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) ) util.log( 'succeeded : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) ) redis.write( 'info stats\r\n' ) for i in range( 6 ): redis.read_until( '\r\n' ) res = redis.read_until( '\r\n' ) self.assertNotEqual( res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) ) util.log( 'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"' % (m['id'], m['ip'], m['redis_port'], res[:-2]) ) no = int( res.split(':')[1] ) if no <= 100: success = True break time.sleep( 1 ) self.assertEquals( success, True, 'failed : pgs is not removed.' 
) util.log( 'pgs is removed' ) # check states of all pgs in pg for i in xrange(10): for s in self.cluster['servers']: smr_info = util.get_smr_info( s, self.leader_cm ) cc_role = smr_info['smr_Role'] cc_hb = smr_info['hb'] if cc_hb == 'N': continue real_role = util.get_role_of_server( s ) real_role = util.roleNumberToChar( real_role ) if real_role != cc_role: time.sleep(0.5) continue for s in self.cluster['servers']: smr_info = util.get_smr_info( s, self.leader_cm ) cc_role = smr_info['smr_Role'] cc_hb = smr_info['hb'] if cc_hb == 'N': continue real_role = util.get_role_of_server( s ) real_role = util.roleNumberToChar( real_role ) self.assertEqual( real_role, cc_role, 'failed : each role is different, real=%s, cc=%s' % (real_role, cc_role) ) util.log( 'succeeded : a role of real pgs is the same as a role in cc, real=%s, cc=%s' % (real_role, cc_role) ) # check quorum policy quorum_of_hanging_master = util.get_quorum( m ) self.assertEqual(2, quorum_of_hanging_master, 'invalid quorum of left master, expected:%d, but:%d' % (2, quorum_of_hanging_master) ) util.log( 'succeeded : quorum of left master=%d' % quorum_of_hanging_master ) # 'role lconn' to master cmd = 'role lconn\r\n' ret = util.cmd_to_smr( m, cmd ) self.assertEqual( ret, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) ) util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) ) # wait for master election success = False new_master = None for i in range( 10 ): role = util.get_role_of_server( s1 ) if role == c.ROLE_MASTER: success = True new_master = s1 break role = util.get_role_of_server( s2 ) if role == c.ROLE_MASTER: success = True new_master = s2 break time.sleep( 1 ) self.assertEqual( success, True, 'failed to elect new master' ) util.log( 'succeeded : elect new master, master_id=%d' % new_master['id'] ) time.sleep( 1 ) # check the numbers of master, slave, and lconn cnt_master = 0 cnt_slave = 0 cnt_lconn = 0 for s in self.cluster['servers']: role = util.get_role_of_server( s ) if role == c.ROLE_MASTER: cnt_master = cnt_master + 1 elif role == c.ROLE_SLAVE: cnt_slave = cnt_slave + 1 elif role == c.ROLE_LCONN: cnt_lconn = cnt_lconn + 1 self.assertEqual( cnt_master, 1, 'failed : the number of master is %s, expected 1' % cnt_master ) self.assertEqual( cnt_slave, 1, 'failed : the number of slave is %s, expected 1' % cnt_slave ) self.assertEqual( cnt_lconn, 1, 'failed : the number of lconn is %s, expected 1' % cnt_lconn ) # check states of all pgs in pg for s in self.cluster['servers']: real_role = util.get_role_of_server( s ) real_role = util.roleNumberToChar( real_role ) smr_info = util.get_smr_info( s, self.leader_cm ) cc_role = smr_info['smr_Role'] cc_hb = smr_info['hb'] if cc_hb == 'N': continue self.assertEqual( real_role, cc_role, 'failed : each role is different, real=%s, cc=%s' % (real_role, cc_role) ) util.log( 'succeeded : a role of real pgs is the same as a role in cc, real=%s, cc=%s' % (real_role, cc_role) ) # check quorum policy quorum_of_new_master = util.get_quorum( new_master ) self.assertNotEqual( None, quorum_of_new_master, 'failed : find new master' ) self.assertEqual( 1, quorum_of_new_master, 'invalid quorum of new master, expected:%d, but:%d' % (1, quorum_of_new_master) ) util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master ) # shutdown load generators for i in range( len(load_gen_list) ): load_gen_list[i].quit() load_gen_list[i].join() # Go back to initial configuration self.assertTrue(util.pgs_join(self.leader_cm['ip'], self.leader_cm['cm_port'], 
m['cluster_name'], m['id']), 'failed to recover pgs, (pgs_join)') return 0