def test_quorum_policy_of_hanging_master( self ):
    """Hang the master, force an election, then verify quorum values
    of both the hanging master and the newly elected master while
    confmaster is down."""
    util.print_frame()

    # Locate the current master and both slaves.
    m, s1, s2 = util.get_mss( self.cluster )
    self.assertNotEqual( m, None, 'master is None.' )
    self.assertNotEqual( s1, None, 'slave1 is None.' )
    self.assertNotEqual( s2, None, 'slave2 is None.' )

    # Make the master hang via a fault-injection delay.
    smr = smr_mgmt.SMR( m['id'] )
    ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
    smr.write( 'fi delay sleep 1 15000\r\n' )
    time.sleep( 5 )

    # Wait until one of the slaves is promoted to master.
    new_master = None
    for attempt in range( 7 ):
        for candidate in (s1, s2):
            if util.get_role_of_server( candidate ) == c.ROLE_MASTER:
                new_master = candidate
                break
        if new_master is not None:
            break
        time.sleep( 1 )
    success = new_master is not None
    self.assertEqual( success, True, 'failed to forced master election' )

    # shutdown confmaster
    for server in self.cluster['servers']:
        util.shutdown_cm( server['id'] )

    # wait until hanging master wake up
    time.sleep( 5 )

    # check quorum policy of the hanging master
    quorum_of_haning_master = util.get_quorum( m )
    self.assertEqual( 2, quorum_of_haning_master,
                      'invalid quorum of haning master, expected:%d, but:%d' %(2, quorum_of_haning_master) )
    util.log( 'succeeded : quorum of haning master=%d' % quorum_of_haning_master )

    # check quorum policy of the new master
    quorum_of_new_master = util.get_quorum( new_master )
    self.assertNotEqual( None, quorum_of_new_master, 'failed : find new master' )
    self.assertEqual( 1, quorum_of_new_master,
                      'invalid quorum of new master, expected:%d, but:%d' % (1, quorum_of_new_master) )
    util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master )

    # Go back to initial configuration: recover confmaster.
    self.assertTrue( util.recover_confmaster( self.cluster, [0,1,2], 0 ),
                     'failed to recover confmaster' )
    return 0
def test_quorum_policy_of_hanging_master( self ):
    """Hang the master, force an election, then verify that both the
    hanging master and the new master carry the expected quorum value
    (self.quorum_policy[1]) while confmaster is down."""
    util.print_frame()

    # Locate the current master and both slaves.
    m, s1, s2 = util.get_mss( self.cluster )
    self.assertNotEqual( m, None, 'master is None.' )
    self.assertNotEqual( s1, None, 'slave1 is None.' )
    self.assertNotEqual( s2, None, 'slave2 is None.' )

    # Make the master hang via a fault-injection delay.
    smr = smr_mgmt.SMR( m['id'] )
    ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
    smr.write( 'fi delay sleep 1 15000\r\n' )
    time.sleep( 5 )

    # Wait until one of the slaves is promoted to master.
    new_master = None
    for attempt in range( 7 ):
        for candidate in (s1, s2):
            if util.get_role_of_server( candidate ) == c.ROLE_MASTER:
                new_master = candidate
                break
        if new_master is not None:
            break
        time.sleep( 1 )
    success = new_master is not None
    self.assertEqual( success, True, 'failed to forced master election' )

    # shutdown confmaster
    for server in self.cluster['servers']:
        util.shutdown_cm( server['id'] )

    # wait until hanging master wake up
    time.sleep( 5 )

    # check quorum policy of the hanging master
    quorum_of_haning_master = util.get_quorum( m )
    self.assertEqual( self.quorum_policy[1], quorum_of_haning_master,
                      'invalid quorum of haning master, expected:%d, but:%d' %( self.quorum_policy[1], quorum_of_haning_master) )
    util.log( 'succeeded : quorum of haning master=%d' % quorum_of_haning_master )

    # check quorum policy of the new master
    quorum_of_new_master = util.get_quorum( new_master )
    self.assertNotEqual( None, quorum_of_new_master, 'failed : find new master' )
    self.assertEqual( self.quorum_policy[1], quorum_of_new_master,
                      'invalid quorum of new master, expected:%d, but:%d' % (self.quorum_policy[1], quorum_of_new_master) )
    util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master )
    return 0
def failover(self, server):
    """Shut down a PGS (smr + redis), wait for state F, restart it,
    and wait until it is back in state N."""
    # shutdown
    ret = testbase.request_to_shutdown_smr(server)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server)
    self.assertEqual(ret, 0, 'failed to shutdown redis')

    # poll until the confmaster reports state F
    expected = 'F'
    state = None
    for attempt in range(20):
        state = util.get_smr_state(server, self.leader_cm)
        if state == expected:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s' % (server['id'], state, expected))

    # recovery
    ret = testbase.request_to_start_smr(server)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(server)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(server, 10)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))

    redis = redis_mgmt.Redis(server['id'])
    ret = redis.connect(server['ip'], server['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    # poll until the confmaster reports state N again
    expected = 'N'
    state = None
    for attempt in range(20):
        state = util.get_smr_state(server, self.leader_cm)
        if state == expected:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role))
def failover( self, server ):
    """Shut a PGS down, wait for state F, bring it back up, and wait
    for state N."""
    # shutdown smr and redis of the given server
    self.assertEqual( testbase.request_to_shutdown_smr( server ), 0,
                      'failed to shutdown smr' )
    self.assertEqual( testbase.request_to_shutdown_redis( server ), 0,
                      'failed to shutdown redis' )

    # wait for the confmaster to mark the server failed (state F)
    expected = 'F'
    state = None
    for attempt in range( 20 ):
        state = util.get_smr_state( server, self.leader_cm )
        if state == expected:
            break
        time.sleep( 1 )
    self.assertEqual( expected, state,
                      'server%d - state:%s, expected:%s' % (server['id'], state, expected) )

    # recovery
    self.assertEqual( testbase.request_to_start_smr( server ), 0,
                      'failed to start smr' )
    self.assertEqual( testbase.request_to_start_redis( server ), 0,
                      'failed to start redis' )
    self.assertEqual( testbase.wait_until_finished_to_set_up_role( server, 10 ), 0,
                      'failed to role change. smr_id:%d' % (server['id']) )

    redis = redis_mgmt.Redis( server['id'] )
    self.assertEqual( redis.connect( server['ip'], server['redis_port'] ), 0,
                      'failed to connect to redis' )

    # wait for the server to become a normal member again (state N)
    expected = 'N'
    state = None
    for attempt in range( 20 ):
        state = util.get_smr_state( server, self.leader_cm )
        if state == expected:
            break
        time.sleep( 1 )
    role = util.get_role_of_server( server )
    self.assertEqual( expected, state,
                      'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role) )
def test_upgrade_smr_repeatedly(self):
    """Upgrade randomly chosen PGSs several times, then make sure at
    least one master and one slave were upgraded.

    Bug fix: the counters were swapped — upgrading a slave incremented
    execution_count_master and upgrading a master incremented
    execution_count_slave — so the fallback upgrades at the end fired
    for the wrong case.
    """
    util.print_frame()

    execution_count_master = 0
    execution_count_slave = 0
    old_target = None
    for cnt in range(5):
        # pick a target different from the previous round's target
        target = random.choice(self.cluster['servers'])
        while target == old_target:
            target = random.choice(self.cluster['servers'])
        old_target = target

        role = util.get_role_of_server(target)
        if role == c.ROLE_SLAVE:
            ret = util.upgrade_pgs(target, self.leader_cm, self.cluster)
            self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % target['id'])
            execution_count_slave = execution_count_slave + 1
        elif role == c.ROLE_MASTER:
            ret = util.upgrade_pgs(target, self.leader_cm, self.cluster)
            self.assertTrue(ret, 'Failed to upgrade master pgs%d' % target['id'])
            execution_count_master = execution_count_master + 1
        else:
            self.fail('unexpected role:%s' % role)
        time.sleep(1)

    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    # guarantee that both a master and a slave upgrade were exercised
    if execution_count_master == 0:
        ret = util.upgrade_pgs(m, self.leader_cm, self.cluster)
        self.assertTrue(ret, 'Failed to upgrade master pgs%d' % m['id'])
    if execution_count_slave == 0:
        ret = util.upgrade_pgs(s2, self.leader_cm, self.cluster)
        self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % s2['id'])
def test_pgs_add_and_del_repeatedly(self):
    """Repeatedly delete and re-add randomly chosen PGSs.

    Bug fix: the execution counters were swapped — a slave operation
    incremented execution_count_master and vice versa.
    """
    util.print_frame()

    execution_count_master = 0
    execution_count_slave = 0
    old_target = None
    for cnt in range(50):
        # pick a target different from the previous round's target
        target = random.choice(self.cluster['servers'])
        while target == old_target:
            target = random.choice(self.cluster['servers'])
        old_target = target

        role = util.get_role_of_server(target)
        if role == c.ROLE_SLAVE:
            self.pgs_add_and_del(target, 'slave')
            execution_count_slave = execution_count_slave + 1
        elif role == c.ROLE_MASTER:
            self.pgs_add_and_del(target, 'master')
            execution_count_master = execution_count_master + 1
        else:
            self.fail('unexpected role:%s' % role)
def test_pgs_add_and_del_repeatedly( self ):
    """Repeatedly delete and re-add randomly chosen PGSs.

    Bug fix: the execution counters were swapped — a slave operation
    incremented execution_count_master and vice versa.
    """
    util.print_frame()

    execution_count_master = 0
    execution_count_slave = 0
    old_target = None
    for cnt in range( 50 ):
        # pick a target different from the previous round's target
        target = random.choice( self.cluster['servers'] )
        while target == old_target:
            target = random.choice( self.cluster['servers'] )
        old_target = target

        role = util.get_role_of_server( target )
        if role == c.ROLE_SLAVE:
            self.pgs_add_and_del( target, 'slave' )
            execution_count_slave = execution_count_slave + 1
        elif role == c.ROLE_MASTER:
            self.pgs_add_and_del( target, 'master' )
            execution_count_master = execution_count_master + 1
        else:
            self.fail( 'unexpected role:%s' % role )
def test_upgrade_smr_repeatedly( self ):
    """Upgrade randomly chosen PGSs several times, then make sure at
    least one master and one slave were upgraded.

    Bug fix: the counters were swapped — upgrading a slave incremented
    execution_count_master and upgrading a master incremented
    execution_count_slave — so the fallback upgrades at the end fired
    for the wrong case.
    """
    util.print_frame()

    execution_count_master = 0
    execution_count_slave = 0
    old_target = None
    for cnt in range( 5 ):
        # pick a target different from the previous round's target
        target = random.choice( self.cluster['servers'] )
        while target == old_target:
            target = random.choice( self.cluster['servers'] )
        old_target = target

        role = util.get_role_of_server( target )
        if role == c.ROLE_SLAVE:
            ret = util.upgrade_pgs( target, self.leader_cm, self.cluster )
            self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % target['id'])
            execution_count_slave = execution_count_slave + 1
        elif role == c.ROLE_MASTER:
            ret = util.upgrade_pgs( target, self.leader_cm, self.cluster )
            self.assertTrue(ret, 'Failed to upgrade master pgs%d' % target['id'])
            execution_count_master = execution_count_master + 1
        else:
            self.fail( 'unexpected role:%s' % role )
        time.sleep( 1 )

    m, s1, s2 = util.get_mss( self.cluster )
    self.assertNotEqual( m, None, 'master is None.' )
    self.assertNotEqual( s1, None, 'slave1 is None.' )
    self.assertNotEqual( s2, None, 'slave2 is None.' )

    # guarantee that both a master and a slave upgrade were exercised
    if execution_count_master == 0:
        ret = util.upgrade_pgs( m, self.leader_cm, self.cluster )
        self.assertTrue(ret, 'Failed to upgrade master pgs%d' % m['id'])
    if execution_count_slave == 0:
        ret = util.upgrade_pgs( s2, self.leader_cm, self.cluster )
        self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % s2['id'])
def test_all_pgs_hang( self ): util.print_frame() self.setup_test_cluster( self.cluster_3copy ) # get gateway info ip, port = util.get_rand_gateway( self.cluster ) gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] ) ret = gw.connect( ip, port ) self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) ) # set values for i in range( 0, 10000 ): cmd = 'set %s%d %d\r\n' % (self.key_base, i, i) gw.write( cmd ) res = gw.read_until( '\r\n' ) self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) ) # get master, slave1, slave2 m, s1, s2 = util.get_mss( self.cluster ) self.assertNotEqual( m, None, 'master is None.' ) self.assertNotEqual( s1, None, 'slave1 is None.' ) self.assertNotEqual( s2, None, 'slave2 is None.' ) util.log( 'server state before hang' ) util.log_server_state( self.cluster ) # hang smr_master = smr_mgmt.SMR( m['id'] ) ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] ) self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) ) smr_slave1 = smr_mgmt.SMR( s1['id'] ) ret = smr_slave1.connect( s1['ip'], s1['smr_mgmt_port'] ) self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) ) smr_slave2 = smr_mgmt.SMR( s2['id'] ) ret = smr_slave2.connect( s2['ip'], s2['smr_mgmt_port'] ) self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) ) m_ts = util.get_timestamp_of_pgs( m ) s1_ts = util.get_timestamp_of_pgs( s1 ) s2_ts = util.get_timestamp_of_pgs( s2 ) smr_master.write( 'fi delay sleep 1 8000\r\n' ) reply = smr_master.read_until( '\r\n', 1 ) if reply != None and reply.find('-ERR not supported') != -1: self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' 
) smr_slave1.write( 'fi delay sleep 1 8000\r\n' ) smr_slave2.write( 'fi delay sleep 1 8000\r\n' ) time.sleep( 10 ) # wait for forced master election success = False master = None for i in range( 20 ): role = util.get_role_of_server( s1 ) ts = util.get_timestamp_of_pgs( s1 ) if role == c.ROLE_MASTER and ts == s1_ts: master = s1 success = True break role = util.get_role_of_server( s2 ) ts = util.get_timestamp_of_pgs( s2 ) if role == c.ROLE_MASTER and ts == s2_ts: master = s2 success = True break role = util.get_role_of_server( m ) ts = util.get_timestamp_of_pgs( m ) if role == c.ROLE_MASTER and ts == m_ts: master = m success = True break time.sleep( 1 ) m_ts = util.get_timestamp_of_pgs( m ) s1_ts = util.get_timestamp_of_pgs( s1 ) s2_ts = util.get_timestamp_of_pgs( s2 ) self.assertEqual( success, True, 'failed to forced master election' ) servers = [m, s1, s2] for s in servers: if s != master: for i in range( 20 ): role = util.get_role_of_server( s ) if role == c.ROLE_SLAVE: success = True break time.sleep( 1 ) self.assertEqual( success, True, 'failed to rejoin as a slave, %s:%d' % (s['ip'], s['smr_mgmt_port']) ) util.log( 'server state transition after hang' ) util.log_server_state( self.cluster ) redis0 = redis_mgmt.Redis( m['id'] ) ret = redis0.connect( m['ip'], m['redis_port'] ) self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) ) # set values for i in range( 10000, 20000 ): cmd = 'set %s%d %d\r\n' % (self.key_base, i, i) redis0 .write( cmd ) res = redis0.read_until( '\r\n' ) self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) ) redis1 = redis_mgmt.Redis( s1['id'] ) ret = redis1.connect( s1['ip'], s1['redis_port'] ) self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) ) redis2 = redis_mgmt.Redis( s2['id'] ) ret = redis2.connect( s2['ip'], s2['redis_port'] ) self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' 
% (s2['ip'], s2['redis_port']) ) # check new values (m) for i in range( 10000, 20000 ): cmd = 'get %s%d\r\n' % (self.key_base, i) redis0.write( cmd ) redis0.read_until( '\r\n' ) res = redis0.read_until( '\r\n' ) self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i) ) # check new values (s1) for i in range( 10000, 20000 ): cmd = 'get %s%d\r\n' % (self.key_base, i) redis1.write( cmd ) redis1.read_until( '\r\n' ) res = redis1.read_until( '\r\n' ) self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) ) # check new values (s2) for i in range( 10000, 20000 ): cmd = 'get %s%d\r\n' % (self.key_base, i) redis2.write( cmd ) redis2.read_until( '\r\n' ) res = redis2.read_until( '\r\n' ) self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s2['id'], res[:-2], i) ) # check consistency ok = False for try_cnt in range(0, 10): ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port) print ok if ok: break time.sleep(1) self.assertEqual(ok, True, 'role consistency fail') return 0
def state_transition( self ):
    """Take a slave through N -> F -> N and verify each state."""
    server = util.get_server_by_role( self.cluster['servers'], 'slave' )
    self.assertNotEqual( server, None, 'failed to get_server_by_role-slave' )

    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )

    # check initial state
    state = self.get_expected_smr_state( server, 'N' )
    role = util.get_role_of_server( server )
    self.assertEqual( 'N', state,
                      'server%d - state:%s, role:%s, expected:N' % (server['id'], state, role) )

    # shutdown
    self.assertEqual( testbase.request_to_shutdown_smr( server ), 0,
                      'failed to shutdown smr' )
    self.assertEqual( testbase.request_to_shutdown_redis( server ), 0,
                      'failed to shutdown redis' )
    time.sleep( 3 )

    # check state F
    expected = 'F'
    state = self.get_expected_smr_state( server, expected )
    self.assertEqual( expected, state,
                      'server%d - state:%s, but expected:%s' % (server['id'], state, expected) )

    # keep writing through the gateway while the server is down
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )
    timestamp = 0.0
    for i in range( 0, 100 ):
        timestamp = time.time()
        key = 'new_key_haha'
        cmd = 'set %s %f\r\n' % (key, timestamp)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n' )
    gw.disconnect()

    # recovery
    self.assertEqual( testbase.request_to_start_smr( server ), 0,
                      'failed to start smr' )
    self.assertEqual( testbase.request_to_start_redis( server ), 0,
                      'failed to start redis' )
    self.assertEqual( testbase.wait_until_finished_to_set_up_role( server, 10 ), 0,
                      'failed to role change. smr_id:%d' % (server['id']) )
    time.sleep( 5 )

    redis = redis_mgmt.Redis( server['id'] )
    ret = redis.connect( server['ip'], server['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis' )

    # check state N
    expected = 'N'
    max_try = 20
    for i in range( 0, max_try ):
        state = self.get_expected_smr_state( server, expected )
        if state == expected:
            break
        time.sleep( 1 )
    role = util.get_role_of_server( server )
    self.assertEqual( expected, state,
                      'server%d - state:%s, role:%s, but expected:%s' % (server['id'], state, role, expected) )
def elect_master_randomly( self ):
    """Force master elections repeatedly until every PGS in the PG has
    served as the master at least once."""
    # load some data through a random gateway
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway( '0' )
    gw.connect( ip, port )
    for i in range( 0, 1000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2]) )

    # ids of servers that have not yet been observed as master
    server_ids = []
    for server in self.cluster['servers']:
        server_ids.append( server['id'] )

    for try_cnt in range( 30 ):
        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )
        util.log( 'master id : %d' % m['id'] )

        if try_cnt != 0:
            if m['id'] in server_ids:
                server_ids.remove( m['id'] )

        # demote the current master so a new election happens
        smr = smr_mgmt.SMR( m['id'] )
        ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        cmd = 'role lconn\r\n'
        smr.write( cmd )
        reply = smr.read_until( '\r\n' )
        self.assertEqual( reply, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]) )
        util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]) )

        # wait until role-change is finished
        for role_change_try_cnt in range( 5 ):
            count_master = 0
            count_slave = 0
            for server in self.cluster['servers']:
                real_role = util.get_role_of_server( server )
                real_role = util.roleNumberToChar( real_role )
                if real_role == 'M':
                    count_master = count_master + 1
                elif real_role == 'S':
                    count_slave = count_slave + 1
            if count_master == 1 and count_slave == 2:
                break
            time.sleep( 1 )

        # check the number of master and slave
        self.assertEqual( count_master, 1, 'failed : the number of master is not 1, count_master=%d, count_slave=%d' % (count_master, count_slave) )
        self.assertEqual( count_slave, 2, 'failed : the number of slave is not 2, count_master=%d, count_slave=%d' % (count_master, count_slave) )
        util.log( 'succeeded : the number of master is 1 and the number of slave is 2' )

        # check states of all pgs in pg against the confmaster's view
        for try_cnt in range( 3 ):
            ok = True
            for s in self.cluster['servers']:
                real_role = util.get_role_of_server( s )
                real_role = util.roleNumberToChar( real_role )
                smr_info = util.get_smr_info( s, self.leader_cm )
                cc_role = smr_info['smr_Role']
                cc_hb = smr_info['hb']
                if cc_hb != 'Y':
                    ok = False
                if real_role != cc_role:
                    ok = False
                if ok:
                    util.log( 'succeeded : a role of real pgs is the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb) )
                else:
                    util.log( '\n\n**********************************************************\n\nretry: a role of real pgs is not the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb) )
            if ok == False:
                time.sleep( 0.5 )
            else:
                break
        self.assertTrue( ok, 'failed : role check' )

        # done once every server has been a master
        if len( server_ids ) == 0:
            util.log( 'succeeded : all smrs have been as a master' )
            return 0

    self.assertEqual( 0, len( server_ids ), 'failed : remains server ids=[%s]' % (','.join('%d' % id for id in server_ids)) )
    return 0
def consistent_after_failover( self ):
    """Shut down all PGSs at once, recover them, and verify the data
    is still consistent on every slave."""
    num_keys = 10000
    wait_count = 15
    key = 'caf'

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # fill initial data through a gateway
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( ip )
    gw.connect( ip, port )
    for i in range( 0, num_keys ):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n' )
    time.sleep( 5 )

    # shut down every PGS
    servers = [master, slave1, slave2]
    for server in servers:
        util.log('before shutdown pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
        ret = testbase.request_to_shutdown_smr( server )
        self.assertEqual( ret, 0, 'failed to shutdown smr, server:%d' % server['id'] )
        ret = testbase.request_to_shutdown_redis( server )
        self.assertEqual( ret, 0, 'failed to shutdown redis' )
    time.sleep( 5 )

    # every PGS must be in state F
    for server in servers:
        state = self.get_expected_smr_state( server, 'F' )
        self.assertEqual( 'F', state, 'server%d - state:%s' % (server['id'], state) )

    # restart all PGSs
    for server in servers:
        ret = testbase.request_to_start_smr( server )
        self.assertEqual( ret, 0, 'failed to start smr, server:%d' % server['id'] )
        ret = testbase.request_to_start_redis( server, False )
        self.assertEqual( ret, 0, 'failed to start redis, server:%d' % server['id'] )
        util.log('after restart pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
    time.sleep( 5 )

    # wait for master election
    for i in xrange(10):
        ret = util.check_cluster( self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'] )
        if ret:
            break
        time.sleep(1)

    # every PGS must come back to state N
    for server in servers:
        ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
        self.assertEqual( ret, 0, 'failed to role change. server:%d' % (server['id']) )
        state = self.get_expected_smr_state( server, 'N' )
        role = util.get_role_of_server( server )
        self.assertEqual( 'N', state, 'server%d - state:%s, role:%s' % (server['id'], state, role) )

    # exactly one master and two slaves are expected
    the_number_of_master = 0
    the_number_of_slave = 0
    for server in servers:
        role = util.get_role_of_server( server )
        if role == c.ROLE_MASTER:
            the_number_of_master = the_number_of_master + 1
        elif role == c.ROLE_SLAVE:
            the_number_of_slave = the_number_of_slave + 1
    self.assertTrue( 1 == the_number_of_master and 2 == the_number_of_slave,
                     'failed to set roles, the number of master:%d, the number of slave:%d' % (the_number_of_master, the_number_of_slave) )

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # write a second batch directly to the master's redis
    redis = redis_mgmt.Redis( master['id'] )
    ret = redis.connect( master['ip'], master['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis, server:%d' % master['id'] )
    for i in range( num_keys, num_keys*2 ):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        redis.write( cmd )
        res = redis.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n', 'failed to get response, server:%d' % master['id'] )
    redis.disconnect()

    # both slaves must hold all of the data
    slaves = [slave1, slave2]
    for slave in slaves:
        slave_redis = redis_mgmt.Redis( slave['id'] )
        ret = slave_redis.connect( slave['ip'], slave['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis, server:%d' % slave['id'] )
        for i in range( 0, num_keys*2 ):
            cmd = 'get %s%d\r\n' % (key, i)
            slave_redis.write( cmd )
            trash = slave_redis.read_until( '\r\n' )
            res = slave_redis.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'inconsistent, server:%d, expected %d but %s' % (slave['id'], i, res) )
        slave_redis.disconnect()
def test_4_PGS_mgen_is_less_than_PG_mgen(self):
    """Let a PGS rejoin after two master failovers (so its master
    generation is behind the PG's) and verify it catches up with all
    data."""
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    # shut down the current master; it rejoins later with an old mgen
    server_to_join = util.get_server_by_role(self.cluster['servers'], 'master')
    ret = testbase.request_to_shutdown_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server_to_join)
    self.assertEqual(ret, 0, 'failed to shutdown redis')

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server_to_join, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s' % (server_to_join['id'], state, expected))

    # set value
    key_base = 'mw'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

    # master failover 1 (master generation + 1)
    util.log('master failover 1')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # check quorum (copy:3, quorum:1, available:2)
    ok = False
    for i in xrange(10):
        ok = util.check_quorum(self.cluster['cluster_name'],
                               self.leader_cm['ip'], self.leader_cm['cm_port'])
        if ok:
            break
        else:
            time.sleep(1)
    self.assertTrue(ok, 'Check quorum fail.')

    # master failover 2 (master generation + 1)
    util.log('master failover 2')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # recovery of the long-dead PGS
    util.log('master recovery start.')
    ret = testbase.request_to_start_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(server_to_join)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(server_to_join, 10)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server_to_join['id']))
    util.log('master recovery end successfully.')

    # check state N on the server from failover 2
    max_try = 20
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role))
    time.sleep(5)

    # set a second batch of values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

    server = util.get_server_by_role(self.cluster['servers'], 'master')

    # the rejoined PGS must hold every value
    redis = redis_mgmt.Redis(server_to_join['id'])
    ret = redis.connect(server_to_join['ip'], server_to_join['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')
    for i in range(0, 20000):
        cmd = 'get %s%d\r\n' % (key_base, i)
        redis.write(cmd)
        redis.read_until('\r\n')
        response = redis.read_until('\r\n')
        self.assertEqual(response, '%d\r\n' % (i),
                         'inconsistent %s, %d' % (response[:-2], i))

    gw.disconnect()
    return 0
def test_two_slaves_hang(self):
    """Hang both slaves and verify they rejoin as slaves (without
    restarting) and the replicas stay consistent.

    Bug fixes (error messages only): the smr2 connect-failure message
    reported slave1's address instead of slave2's, and the slave1
    rejoin-failure message reported slave2's address instead of
    slave1's.
    """
    util.print_frame()
    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # timestamps before hang; a changed timestamp would mean a restart
    ts_before1 = util.get_timestamp_of_pgs(s1)
    self.assertNotEqual(
        ts_before1, -1,
        'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1))
    ts_before2 = util.get_timestamp_of_pgs(s2)
    self.assertNotEqual(
        ts_before2, -1,
        'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2))

    # hang both slaves via fault injection
    smr1 = smr_mgmt.SMR(s1['id'])
    ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))
    smr2 = smr_mgmt.SMR(s2['id'])
    ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
    # fixed: report slave2's address, not slave1's
    self.assertEqual(
        ret, 0,
        'failed to connect to master. %s:%d' % (s2['ip'], s2['smr_mgmt_port']))

    smr1.write('fi delay sleep 1 8000\r\n')
    reply = smr1.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual(
            0, 1, 'make sure that smr has compiled with gcov option.')
    smr2.write('fi delay sleep 1 8000\r\n')
    time.sleep(7)

    # wait for slave1 to rejoin as a slave with an unchanged timestamp
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(s1)
            if ts_after != -1 and ts_before1 == ts_after:
                success = True
                break
        time.sleep(1)
    # fixed: report slave1's address, not slave2's
    self.assertEqual(
        success, True,
        'failed to rejoin as a slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))

    # wait for slave2 to rejoin as a slave with an unchanged timestamp
    success = False
    for i in range(20):
        role = util.get_role_of_server(s2)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(s2)
            if ts_after != -1 and ts_before2 == ts_after:
                success = True
                break
        time.sleep(1)
    self.assertEqual(
        success, True,
        'failed to rejoin as a slave. %s:%d' % (s2['ip'], s2['smr_mgmt_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))
    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(
            res, '%d\r\n' % i,
            'failed to get values from redis2. %s != %d' % (res, i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'],
                           self.mgmt_ip, self.mgmt_port),
        True, 'role consistency fail')
    return 0
def test_all_pgs_hang(self): util.print_frame() self.setup_test_cluster(self.cluster_3copy) # get gateway info ip, port = util.get_rand_gateway(self.cluster) gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id']) ret = gw.connect(ip, port) self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port)) # set values for i in range(0, 10000): cmd = 'set %s%d %d\r\n' % (self.key_base, i, i) gw.write(cmd) res = gw.read_until('\r\n') self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res)) # get master, slave1, slave2 m, s1, s2 = util.get_mss(self.cluster) self.assertNotEqual(m, None, 'master is None.') self.assertNotEqual(s1, None, 'slave1 is None.') self.assertNotEqual(s2, None, 'slave2 is None.') util.log('server state before hang') util.log_server_state(self.cluster) # hang smr_master = smr_mgmt.SMR(m['id']) ret = smr_master.connect(m['ip'], m['smr_mgmt_port']) self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port'])) smr_slave1 = smr_mgmt.SMR(s1['id']) ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port']) self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port'])) smr_slave2 = smr_mgmt.SMR(s2['id']) ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port']) self.assertEqual( ret, 0, 'failed to connect to master. 
%s:%d' % (s2['ip'], s2['smr_mgmt_port'])) m_ts = util.get_timestamp_of_pgs(m) s1_ts = util.get_timestamp_of_pgs(s1) s2_ts = util.get_timestamp_of_pgs(s2) smr_master.write('fi delay sleep 1 8000\r\n') reply = smr_master.read_until('\r\n', 1) if reply != None and reply.find('-ERR not supported') != -1: self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.') smr_slave1.write('fi delay sleep 1 8000\r\n') smr_slave2.write('fi delay sleep 1 8000\r\n') time.sleep(10) # wait for forced master election success = False master = None for i in range(20): role = util.get_role_of_server(s1) ts = util.get_timestamp_of_pgs(s1) if role == c.ROLE_MASTER and ts == s1_ts: master = s1 success = True break role = util.get_role_of_server(s2) ts = util.get_timestamp_of_pgs(s2) if role == c.ROLE_MASTER and ts == s2_ts: master = s2 success = True break role = util.get_role_of_server(m) ts = util.get_timestamp_of_pgs(m) if role == c.ROLE_MASTER and ts == m_ts: master = m success = True break time.sleep(1) m_ts = util.get_timestamp_of_pgs(m) s1_ts = util.get_timestamp_of_pgs(s1) s2_ts = util.get_timestamp_of_pgs(s2) self.assertEqual(success, True, 'failed to forced master election') servers = [m, s1, s2] for s in servers: if s != master: for i in range(20): role = util.get_role_of_server(s) if role == c.ROLE_SLAVE: success = True break time.sleep(1) self.assertEqual( success, True, 'failed to rejoin as a slave, %s:%d' % (s['ip'], s['smr_mgmt_port'])) util.log('server state transition after hang') util.log_server_state(self.cluster) redis0 = redis_mgmt.Redis(m['id']) ret = redis0.connect(m['ip'], m['redis_port']) self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port'])) # set values for i in range(10000, 20000): cmd = 'set %s%d %d\r\n' % (self.key_base, i, i) redis0.write(cmd) res = redis0.read_until('\r\n') self.assertEqual( res, '+OK\r\n', 'failed to set values. 
cmd:%s, res:%s' % (cmd, res)) redis1 = redis_mgmt.Redis(s1['id']) ret = redis1.connect(s1['ip'], s1['redis_port']) self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port'])) redis2 = redis_mgmt.Redis(s2['id']) ret = redis2.connect(s2['ip'], s2['redis_port']) self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port'])) # check new values (m) for i in range(10000, 20000): cmd = 'get %s%d\r\n' % (self.key_base, i) redis0.write(cmd) redis0.read_until('\r\n') res = redis0.read_until('\r\n') self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i)) # check new values (s1) for i in range(10000, 20000): cmd = 'get %s%d\r\n' % (self.key_base, i) redis1.write(cmd) redis1.read_until('\r\n') res = redis1.read_until('\r\n') self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i)) # check new values (s2) for i in range(10000, 20000): cmd = 'get %s%d\r\n' % (self.key_base, i) redis2.write(cmd) redis2.read_until('\r\n') res = redis2.read_until('\r\n') self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s2['id'], res[:-2], i)) # check consistency ok = False for try_cnt in range(0, 10): ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port) print ok if ok: break time.sleep(1) self.assertEqual(ok, True, 'role consistency fail') return 0
def state_transition(self):
    """Verify a slave PGS transitions N -> F -> N across a shutdown/restart cycle.

    Steps: pick a slave, confirm it is in state 'N', kill its smr+redis,
    confirm state 'F', push writes through a gateway while it is down,
    restart it, and confirm it comes back to state 'N'.
    State letters come from self.get_expected_smr_state(); presumably
    'N' = normal and 'F' = failure -- TODO confirm against its definition.
    """
    # pick the server under test: any slave of the cluster
    server = util.get_server_by_role(self.cluster['servers'], 'slave')
    self.assertNotEquals(server, None, 'failed to get_server_by_role-slave')

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])

    # check initial state
    state = self.get_expected_smr_state(server, 'N')
    role = util.get_role_of_server(server)
    self.assertEquals('N', state,
                      'server%d - state:%s, role:%s, expected:N' % (server['id'], state, role))

    # shutdown both processes of the PGS (smr replicator and redis)
    ret = testbase.request_to_shutdown_smr(server)
    self.assertEquals(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server)
    self.assertEquals(ret, 0, 'failed to shutdown redis')
    time.sleep(3)  # give the heartbeat checker time to notice the failure

    # check state F
    expected = 'F'
    state = self.get_expected_smr_state(server, expected)
    self.assertEquals(expected, state,
                      'server%d - state:%s, but expected:%s' % (server['id'], state, expected))

    # set value through the gateway while the slave is down; the cluster
    # must keep serving writes with one PGS failed
    ret = gw.connect(ip, port)
    self.assertEquals(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))
    timestamp = 0.0
    for i in range(0, 100):
        timestamp = time.time()
        key = 'new_key_haha'
        cmd = 'set %s %f\r\n' % (key, timestamp)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    gw.disconnect()

    # recovery: restart smr and redis, then wait for the role to be set up
    ret = testbase.request_to_start_smr(server)
    self.assertEquals(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(server)
    self.assertEquals(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(server, 10)
    self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))
    time.sleep(5)
    redis = redis_mgmt.Redis(server['id'])
    ret = redis.connect(server['ip'], server['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    # check state N -- poll up to max_try seconds for the recovered PGS
    # to be reported normal again
    expected = 'N'
    max_try = 20
    for i in range(0, max_try):
        state = self.get_expected_smr_state(server, expected)
        if state == expected:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEquals(expected, state,
                      'server%d - state:%s, role:%s, but expected:%s'
                      % (server['id'], state, role, expected))
def consistent_after_failover(self):
    """Verify data consistency after a full-PG failover cycle.

    Writes `max` keys through a gateway, shuts down all three PGSes,
    restarts them, waits for a new master election, writes `max` more keys
    directly to the new master's redis, and then checks that both slaves
    hold all 2*max keys.
    """
    max = 10000        # number of keys written per phase (NOTE: shadows builtin max)
    wait_count = 15    # seconds to wait for role set-up per server
    key = 'caf'        # key prefix for this test

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # set value: seed the cluster through a random gateway
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(ip)
    gw.connect(ip, port)
    for i in range(0, max):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    time.sleep(5)

    # shutdown all three PGSes, logging replication sequences before each kill
    servers = [master, slave1, slave2]
    for server in servers:
        util.log('before shutdown pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0,
                         'failed to shutdown smr, server:%d' % server['id'])
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
    time.sleep(5)

    # check state F on every PGS
    for server in servers:
        state = self.get_expected_smr_state(server, 'F')
        self.assertEquals('F', state,
                          'server%d - state:%s' % (server['id'], state))

    # recovery: restart all PGSes (redis started without waiting -- second arg False)
    for server in servers:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % server['id'])

        ret = testbase.request_to_start_redis(server, False)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % server['id'])

        util.log('after restart pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
    time.sleep(5)

    # wait for master election: poll cluster health up to 10 seconds
    for i in xrange(10):
        ret = util.check_cluster(self.cluster['cluster_name'],
                                 self.leader_cm['ip'], self.leader_cm['cm_port'])
        if ret:
            break
        time.sleep(1)

    # check state: every PGS back to 'N' with a valid role
    for server in servers:
        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEquals(ret, 0,
                          'failed to role change. server:%d' % (server['id']))

        state = self.get_expected_smr_state(server, 'N')
        role = util.get_role_of_server(server)
        self.assertEquals('N', state,
                          'server%d - state:%s, role:%s' % (server['id'], state, role))

    # exactly one master and two slaves must exist after the election
    the_number_of_master = 0
    the_number_of_slave = 0
    for server in servers:
        role = util.get_role_of_server(server)
        if role == c.ROLE_MASTER:
            the_number_of_master = the_number_of_master + 1
        elif role == c.ROLE_SLAVE:
            the_number_of_slave = the_number_of_slave + 1
    self.assertTrue(1 == the_number_of_master and 2 == the_number_of_slave,
        'failed to set roles, the number of master:%d, the number of slave:%d'
        % (the_number_of_master, the_number_of_slave))

    # get master, slave1, and slave2 (roles may have rotated after failover)
    master, slave1, slave2 = self.get_mss()

    # connect to a master`s redis and set data
    redis = redis_mgmt.Redis(master['id'])
    ret = redis.connect(master['ip'], master['redis_port'])
    self.assertEquals(ret, 0,
                      'failed to connect to redis, server:%d' % master['id'])

    for i in range(max, max * 2):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n',
                          'failed to get response, server:%d' % master['id'])
    redis.disconnect()

    # check slaves`s data: every key from both phases must have replicated
    slaves = [slave1, slave2]
    for slave in slaves:
        slave_redis = redis_mgmt.Redis(slave['id'])
        ret = slave_redis.connect(slave['ip'], slave['redis_port'])
        self.assertEquals(ret, 0,
                          'failed to connect to redis, server:%d' % slave['id'])

        for i in range(0, max * 2):
            cmd = 'get %s%d\r\n' % (key, i)
            slave_redis.write(cmd)
            trash = slave_redis.read_until('\r\n')  # bulk-length line ($N) is discarded
            res = slave_redis.read_until('\r\n')
            self.assertEquals(res, '%d\r\n' % i,
                'inconsistent, server:%d, expected %d but %s' % (slave['id'], i, res))
        slave_redis.disconnect()
def test_quorum_with_left_pgs( self ):
    """Verify quorum policy when a master PGS leaves the cluster.

    Under client load, issue `pgs_leave` for the master, confirm traffic
    drains from it, check role agreement between the real processes and the
    confmaster (cc) view, verify quorum values against self.quorum_policy,
    then force a master election via 'role lconn' and re-verify roles and
    quorum on the new master.
    """
    util.print_frame()

    # start load generators -- one per server, keyed by index
    load_gen_list = {}
    for i in range( len(self.cluster['servers']) ):
        server = self.cluster['servers'][i]
        load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
        load_gen.start()
        load_gen_list[i] = load_gen

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss( self.cluster )
    self.assertNotEqual( m, None, 'master is None.' )
    self.assertNotEqual( s1, None, 'slave1 is None.' )
    self.assertNotEqual( s2, None, 'slave2 is None.' )

    # detach pgs from cluster
    cmd = 'pgs_leave %s %d\r\n' % (m['cluster_name'], m['id'])
    ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
    jobj = json.loads(ret)
    self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
    util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

    # check if pgs is removed: poll 'info stats' until the ops-per-sec style
    # counter (second field of the 7th line) drops to <= 100, i.e. the
    # gateways stopped routing to it -- TODO confirm which stats field this is
    success = False
    for try_cnt in range( 10 ):
        redis = redis_mgmt.Redis( m['id'] )
        ret = redis.connect( m['ip'], m['redis_port'] )
        self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )
        util.log( 'succeeded : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )

        redis.write( 'info stats\r\n' )
        for i in range( 6 ):
            redis.read_until( '\r\n' )  # skip header lines of the reply
        res = redis.read_until( '\r\n' )
        self.assertNotEqual( res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )
        util.log( 'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"' % (m['id'], m['ip'], m['redis_port'], res[:-2]) )
        no = int( res.split(':')[1] )
        if no <= 100:
            success = True
            break
        time.sleep( 1 )
    self.assertEquals( success, True, 'failed : pgs does not removed.' )
    util.log( 'succeeded : pgs is removed' )

    # check states of all pgs in pg: real role must match confmaster's view
    # (servers whose heartbeat is off are skipped)
    for s in self.cluster['servers']:
        real_role = util.get_role_of_server( s )
        real_role = util.roleNumberToChar( real_role )
        smr_info = util.get_smr_info( s, self.leader_cm )
        cc_role = smr_info['smr_Role']
        cc_hb = smr_info['hb']
        if cc_hb == 'N':
            continue
        self.assertEqual( real_role, cc_role, 'failed : each role is difference, real=%s, cc=%s' % (real_role, cc_role) )
        util.log( 'succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s' % (real_role, cc_role) )

    # check quorum policy on the left (detached) master
    quorum_of_haning_master = util.get_quorum( m )
    self.assertEqual( self.quorum_policy[1], quorum_of_haning_master,
                      'invalid quorum of left master, expected:%d, but:%d'
                      % (self.quorum_policy[1], quorum_of_haning_master) )
    util.log( 'succeeded : quorum of left master=%d' % quorum_of_haning_master )

    # 'role lconn' to master -- force it out of the master role
    cmd = 'role lconn\r\n'
    ret = util.cmd_to_smr( m, cmd )
    self.assertEqual( ret, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
    util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

    # wait for master election: one of the two slaves must take over
    success = False
    new_master = None
    for i in range( 10 ):
        role = util.get_role_of_server( s1 )
        if role == c.ROLE_MASTER:
            success = True
            new_master = s1
            break
        role = util.get_role_of_server( s2 )
        if role == c.ROLE_MASTER:
            success = True
            new_master = s2
            break
        time.sleep( 1 )
    self.assertEqual( success, True, 'failed to elect new master' )
    util.log( 'succeeded : elect new master, master_id=%d' % new_master['id'] )

    time.sleep( 1 )
    # check the numbers of master, slave, and lconn -- expect exactly 1 of each
    cnt_master = 0
    cnt_slave = 0
    cnt_lconn = 0
    for s in self.cluster['servers']:
        role = util.get_role_of_server( s )
        if role == c.ROLE_MASTER:
            cnt_master = cnt_master + 1
        elif role == c.ROLE_SLAVE:
            cnt_slave = cnt_slave + 1
        elif role == c.ROLE_LCONN:
            cnt_lconn = cnt_lconn + 1
    self.assertEqual( cnt_master, 1, 'failed : the number of master is %s, expected 1' % cnt_master )
    self.assertEqual( cnt_slave, 1, 'failed : the number of slave is %s, expected 1' % cnt_slave )
    self.assertEqual( cnt_lconn, 1, 'failed : the number of lconn is %s, expected 1' % cnt_lconn )

    # check states of all pgs in pg (again, after the election)
    for s in self.cluster['servers']:
        real_role = util.get_role_of_server( s )
        real_role = util.roleNumberToChar( real_role )
        smr_info = util.get_smr_info( s, self.leader_cm )
        cc_role = smr_info['smr_Role']
        cc_hb = smr_info['hb']
        if cc_hb == 'N':
            continue
        self.assertEqual( real_role, cc_role, 'failed : each role is difference, real=%s, cc=%s' % (real_role, cc_role) )
        util.log( 'succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s' % (real_role, cc_role) )

    # check quorum policy on the newly elected master
    quorum_of_new_master = util.get_quorum( new_master )
    self.assertNotEqual( None, quorum_of_new_master, 'failed : find new master' )
    self.assertEqual( self.quorum_policy[1], quorum_of_new_master,
                      'invalid quorum of new master, expected:%d, but:%d'
                      % (self.quorum_policy[1], quorum_of_new_master) )
    util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master )

    # shutdown load generators
    for i in range( len(load_gen_list) ):
        load_gen_list[i].quit()
        load_gen_list[i].join()

    return 0
def deprecated_test_5_PGS_commit_is_greater_than_PG_commit(self):
    """Verify an old master with a commit-seq ahead of the PG cannot rejoin.

    Scenario: shut down both slaves, write more data so the master's
    commit sequence advances past the PG's recorded commit, shut the
    master down, recover the slaves (one becomes the new master with the
    older commit-seq), then restart the old master and assert it is
    blocked from joining (its state never becomes 'N').
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    master, s1, s2 = util.get_mss(self.cluster)
    server_to_join = [s1, s2]

    # shutdown slaves so only the master keeps advancing its commit-seq
    for i in range(0, 2):
        ret = testbase.request_to_shutdown_smr(server_to_join[i])
        self.assertEqual(ret, 0,
                         'failed to shutdown smr%d' % server_to_join[i]['id'])
        util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

        ret = testbase.request_to_shutdown_redis(server_to_join[i])
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for j in range(0, max_try):
            state = util.get_smr_state(server_to_join[i], self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(expected, state,
            'server%d - state:%s, expected:%s'
            % (server_to_join[i]['id'], state, expected))

    # put more data -- this advances the lone master past the PG commit
    util.put_some_data(self.cluster, 10, 256)

    # bgsave so the master's data survives its shutdown
    ret = util.bgsave(master)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown master
    ret = testbase.request_to_shutdown_smr(master)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    util.log('succeeded to shutdown master smr, id=%d' % master['id'])
    ret = testbase.request_to_shutdown_redis(master)
    self.assertEquals(ret, 0, 'failed to shutdown redis')
    util.log('succeeded to shutdown master redis, id=%d' % master['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
        'server%d - state:%s, expected:%s' % (master['id'], state, expected))

    # recovery slaves -- one of them becomes the new (behind) master
    for i in range(0, 2):
        ret = testbase.request_to_start_smr(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server_to_join[i], 10)
        self.assertEquals(ret, 0,
                          'failed to role change. smr_id:%d' % (server_to_join[i]['id']))

        # check state N
        max_try = 20
        expected = 'N'
        for j in range(0, max_try):
            state = util.get_smr_state(server_to_join[i], self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(server_to_join[i])
        self.assertEquals(expected, state,
            'server%d - state:%s, expected:%s, role:%s'
            % (server_to_join[i]['id'], state, expected, role))

    # set value through one of the recovered servers
    s = random.choice(server_to_join)
    # BUGFIX: was redis_mgmt.Redis(['id']) -- a literal list instead of the
    # chosen server's id; every other call site passes <server>['id'].
    redis = redis_mgmt.Redis(s['id'])
    ret = redis.connect(s['ip'], s['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    key_base = 'key_test'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    redis.disconnect()

    # check both recovered servers hold the new values
    for i in range(0, 2):
        redis = redis_mgmt.Redis(server_to_join[i]['id'])
        ret = redis.connect(server_to_join[i]['ip'],
                            server_to_join[i]['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        # check value
        for j in range(0, 10000):
            cmd = 'get %s%d\r\n' % (key_base, j)
            redis.write(cmd)
            redis.read_until('\r\n')  # skip bulk-length line
            response = redis.read_until('\r\n')
            self.assertEqual(response, '%d\r\n' % (j),
                             'inconsistent %s, %d' % (response[:-2], j))

    # try to recover master, but failed -- its greater commit-seq must
    # prevent it from joining, so its state must never reach 'N'
    ret = testbase.request_to_start_smr(master)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(master, False)
    self.assertEqual(ret, 0, 'failed to start redis')

    max_try = 3
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(master)
    self.assertNotEqual(expected, state,
        'server%d - state:%s, expected:not %s, role:%s'
        % (master['id'], state, expected, role))
    util.log('success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.')

    gw.disconnect()
    return 0
def master_hang(self):
    """Verify failover and data consistency when the master hangs.

    Seeds data via a gateway, injects a 10s delay fault into the master's
    smr (fault-injection requires a gcov build), waits for a forced master
    election, writes new data through the new master, and finally checks
    that the recovered old master rejoined as a slave with all new data.
    Supports both 3-server and 2-server cluster configurations.
    """
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2 (s2 exists only in the 3-server layout)
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang: fault-injection command freezes the master's smr for 10 seconds
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0,
                     'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))
    smr.write('fi delay sleep 1 10000\r\n')
    reply = smr.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        # 'fi' is only compiled into gcov builds; fail loudly otherwise
        self.assertEqual(0, 1,
                         'make sure that smr has compiled with gcov option.')
    time.sleep(5)

    # wait for forced master election -- some slave must take over
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_MASTER:
            success = True
            break
        if len(self.cluster['servers']) == 3:
            role = util.get_role_of_server(s2)
            if role == c.ROLE_MASTER:
                success = True
                break
        time.sleep(1)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    self.assertEqual(success, True, 'failed to forced master election')

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0,
                     'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    # set new values -- writes go to s1 regardless of which slave won the
    # election; presumably non-masters proxy/accept these in this setup --
    # TODO confirm against redis_mgmt semantics
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0,
                         'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

        # check new values replicated to the other slave
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')  # skip bulk-length line
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i,
                             'failed to get values from redis2. %s != %d' % (res, i))

    # check if the haning server recovered and joined as a slave
    time.sleep(7)
    role = util.get_role_of_server(m)
    self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEquals(ret, 0,
                      'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    # check new values caught up on the recovered old master
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')  # skip bulk-length line
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis2. %s != %d' % (res[:-2], i))

    # check consistency of roles across the whole cluster
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'],
                                        self.mgmt_ip, self.mgmt_port),
                     True, 'role consistency fail')

    return 0
def failover_while_hang(self, server):
    """Hang a PGS, shut it down mid-hang, restart it, and verify it rejoins.

    The PGS timestamp is captured before the hang; a successful rejoin is
    detected by the server reaching ROLE_SLAVE with a *different* timestamp
    (i.e. a genuinely new incarnation, not a stale reading).

    server: cluster server dict (needs 'id', 'ip', 'smr_mgmt_port',
            'redis_port' keys).
    """
    # timestamp before hang -- used later to prove the PGS was restarted
    ts_before = util.get_timestamp_of_pgs(server)
    self.assertNotEqual(ts_before, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d'
                        % (server['id'], ts_before))

    # hang: inject a 10s delay fault into the smr (gcov builds only)
    util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.'
             % (server['id'], server['ip'], server['smr_mgmt_port']))
    smr = smr_mgmt.SMR(server['id'])
    ret = smr.connect(server['ip'], server['smr_mgmt_port'])
    self.assertEqual(ret, 0,
                     'failed to connect to master. %s:%d'
                     % (server['ip'], server['smr_mgmt_port']))
    smr.write('fi delay sleep 1 10000\r\n')
    reply = smr.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual(0, 1,
                         'make sure that smr has compiled with gcov option.')
    time.sleep(4)

    # check state F -- heartbeat checker must flag the hanging PGS
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s'
                      % (server['id'], state, expected))
    util.log('succeeded : pgs%d state changed to F.' % server['id'])

    # shutdown while still hanging
    util.log('shutdown pgs%d while hanging.' % server['id'])
    ret = testbase.request_to_shutdown_smr(server)
    self.assertEqual(ret, 0, 'failed to shutdown smr. id:%d' % server['id'])
    ret = testbase.request_to_shutdown_redis(server)
    self.assertEquals(ret, 0, 'failed to shutdown redis. id:%d' % server['id'])

    # check state F (still failed after the shutdown)
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s'
                      % (server['id'], state, expected))
    util.log('succeeded : pgs%d state changed to F.' % server['id'])

    # recovery: restart both processes and wait for role set-up
    util.log('restart pgs%d.' % server['id'])
    ret = testbase.request_to_start_smr(server)
    self.assertEqual(ret, 0, 'failed to start smr. id:%d' % server['id'])

    ret = testbase.request_to_start_redis(server)
    self.assertEqual(ret, 0, 'failed to start redis. id:%d' % server['id'])

    wait_count = 20
    ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
    self.assertEquals(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))

    redis = redis_mgmt.Redis(server['id'])
    ret = redis.connect(server['ip'], server['redis_port'])
    self.assertEquals(ret, 0, 'failed to connect to redis')

    # check state N
    max_try = 20
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEquals(expected, state,
                      'server%d - state:%s, expected:%s'
                      % (server['id'], state, expected))
    util.log('succeeded : pgs%d state changed to N.' % server['id'])

    # wait for rejoin as a slave; the timestamp must differ from the
    # pre-hang one to prove this is the restarted incarnation
    success = False
    for i in range(20):
        role = util.get_role_of_server(server)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(server)
            if ts_after != -1 and ts_before != ts_after:
                success = True
                break
        time.sleep(1)
    self.assertEqual(success, True, 'failed to rejoin as a slave')
    util.log('succeeded : pgs%d joined as a slave.' % server['id'])

    return 0
def __del_server(self, server_to_del):
    """Cleanly remove a PGS from the cluster.

    Sequence: bgsave its data, `pgs_leave` it via the leader confmaster,
    wait for client traffic to drain (unless it is a master, whose info
    command would block), `pgs_lconn` it, shut down its smr and redis, and
    finally `pgs_del` it from the cluster configuration.
    """
    # backup data
    redis = redis_mgmt.Redis( server_to_del['id'] )
    ret = redis.connect( server_to_del['ip'], server_to_del['redis_port'] )
    self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port']) )

    # bgsave
    ret = util.bgsave(server_to_del)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % server_to_del['id'])

    # detach pgs from cluster
    cmd = 'pgs_leave %s %d\r\n' % (server_to_del['cluster_name'], server_to_del['id'])
    ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
    jobj = json.loads(ret)
    self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
    util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

    r = util.get_role_of_server(server_to_del)
    # If quorum of left master is larger than 1, info command will be blocked.
    if r != c.ROLE_MASTER:
        # check if pgs is removed: poll 'info stats' until the counter on the
        # 7th reply line drops to <= 100, i.e. traffic drained -- TODO confirm
        # which stats field this parses
        success = False
        for try_cnt in range( 10 ):
            redis = redis_mgmt.Redis( server_to_del['id'] )
            ret = redis.connect( server_to_del['ip'], server_to_del['redis_port'] )
            self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port']) )
            util.log( 'succeeded : connect to smr%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port']) )

            redis.write( 'info stats\r\n' )
            for i in range( 6 ):
                redis.read_until( '\r\n' )  # skip header lines of the reply
            res = redis.read_until( '\r\n' )
            self.assertNotEqual( res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port']) )
            util.log( 'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port'], res[:-2]) )
            no = int( res.split(':')[1] )
            if no <= 100:
                success = True
                break
            time.sleep( 1 )
        self.assertEquals( success, True, 'failed : pgs does not removed.' )
        util.log( 'pgs is removed' )

    # change state of pgs to lconn
    cmd = 'pgs_lconn %s %d\r\n' % (server_to_del['cluster_name'], server_to_del['id'])
    ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
    jobj = json.loads(ret)
    self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
    util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

    # shutdown
    ret = testbase.request_to_shutdown_smr( server_to_del )
    self.assertEqual( ret, 0, 'failed : shutdown smr. id:%d' % server_to_del['id'] )

    ret = testbase.request_to_shutdown_redis( server_to_del )
    self.assertEquals( ret, 0, 'failed : shutdown redis. id:%d' % server_to_del['id'] )
    util.log('succeeded : shutdown pgs%d.' % server_to_del['id'] )

    # delete pgs from cluster
    cmd = 'pgs_del %s %d\r\n' % (server_to_del['cluster_name'], server_to_del['id'])
    ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
    jobj = json.loads(ret)
    self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
    util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
def elect_master_randomly(self):
    """Repeatedly demote the master until every smr has served as master.

    Tracks the set of server ids; each round records the current master,
    sends it 'role lconn', waits for a re-election to settle (1 master,
    2 slaves), and verifies real roles agree with the confmaster view.
    Succeeds once all ids have been removed from the pending set.
    """
    # set data: seed 1000 keys through a random gateway
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway('0')
    gw.connect(ip, port)
    for i in range(0, 1000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to gw(%s:%d). cmd:%s, res:%s'
                         % (ip, port, cmd[:-2], res[:-2]))

    # server ids that have not yet been observed as master
    server_ids = []
    for server in self.cluster['servers']:
        server_ids.append(server['id'])

    for try_cnt in range(30):
        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
        util.log('master id : %d' % m['id'])

        # skip the first round: the initial master was not elected by this test
        if try_cnt != 0:
            if m['id'] in server_ids:
                server_ids.remove(m['id'])

        # demote the current master so a new election happens
        smr = smr_mgmt.SMR(m['id'])
        ret = smr.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(ret, 0,
                         'failed to connect to master. %s:%d'
                         % (m['ip'], m['smr_mgmt_port']))
        cmd = 'role lconn\r\n'
        smr.write(cmd)
        reply = smr.read_until('\r\n')
        self.assertEqual(reply, '+OK\r\n',
                         'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]))

        # wait until role-change is finished (expect 1 master + 2 slaves)
        for role_change_try_cnt in range(5):
            count_master = 0
            count_slave = 0
            for server in self.cluster['servers']:
                real_role = util.get_role_of_server(server)
                real_role = util.roleNumberToChar(real_role)
                if real_role == 'M':
                    count_master = count_master + 1
                elif real_role == 'S':
                    count_slave = count_slave + 1
            if count_master == 1 and count_slave == 2:
                break
            time.sleep(1)

        # check the number of master and slave
        self.assertEqual(count_master, 1,
                         'failed : the number of master is not 1, count_master=%d, count_slave=%d'
                         % (count_master, count_slave))
        self.assertEqual(count_slave, 2,
                         'failed : the number of slave is not 2, count_master=%d, count_slave=%d'
                         % (count_master, count_slave))
        util.log('succeeded : the number of master is 1 and the number of slave is 2')

        # check states of all pgs in pg: real roles must match the confmaster
        # view with heartbeat on; retry up to 3 times.
        # NOTE: this inner loop variable shadows the outer `try_cnt`; harmless
        # in Python (the outer for rebinds each iteration) but confusing.
        for try_cnt in range(3):
            ok = True
            for s in self.cluster['servers']:
                real_role = util.get_role_of_server(s)
                real_role = util.roleNumberToChar(real_role)
                smr_info = util.get_smr_info(s, self.leader_cm)
                cc_role = smr_info['smr_Role']
                cc_hb = smr_info['hb']
                if cc_hb != 'Y':
                    ok = False
                if real_role != cc_role:
                    ok = False
                if ok:
                    util.log('succeeded : a role of real pgs is the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s'
                             % (s['id'], real_role, cc_role, cc_hb))
                else:
                    util.log('\n\n**********************************************************\n\nretry: a role of real pgs is not the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s'
                             % (s['id'], real_role, cc_role, cc_hb))
            if ok == False:
                time.sleep(0.5)
            else:
                break
        self.assertTrue(ok, 'failed : role check')

        # done when every server has been master at least once
        if len(server_ids) == 0:
            util.log('succeeded : all smrs have been as a master')
            return 0

    # out of retries: report which servers never became master
    self.assertEqual(0, len(server_ids),
                     'failed : remains server ids=[%s]'
                     % (','.join('%d' % id for id in server_ids)))
    return 0
def test_quorum_with_left_pgs( self ):
    """Verify quorum policy when the master is force-detached (`pgs_leave ... forced`).

    Variant of the quorum test that uses a forced leave, expects literal
    quorum values (2 for the left master, 1 for the new master), and
    rejoins the detached PGS at the end via util.pgs_join().
    """
    util.print_frame()

    # start load generators -- one per server, keyed by index
    load_gen_list = {}
    for i in range( len(self.cluster['servers']) ):
        server = self.cluster['servers'][i]
        load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
        load_gen.start()
        load_gen_list[i] = load_gen

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss( self.cluster )
    self.assertNotEqual( m, None, 'master is None.' )
    self.assertNotEqual( s1, None, 'slave1 is None.' )
    self.assertNotEqual( s2, None, 'slave2 is None.' )

    # detach pgs from cluster (forced)
    cmd = 'pgs_leave %s %d forced\r\n' % (m['cluster_name'], m['id'])
    ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
    jobj = json.loads(ret)
    self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
    util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

    # check quorum policy on the left master (expected literal 2 here)
    quorum_of_haning_master = util.get_quorum( m )
    self.assertEqual(2, quorum_of_haning_master,
                     'invalid quorum of left master, expected:%d, but:%d'
                     % (2, quorum_of_haning_master) )
    util.log( 'succeeded : quorum of left master=%d' % quorum_of_haning_master )

    # check if pgs is removed -- skipped for a master because its info
    # command would block while quorum > 1
    r = util.get_role_of_server(m)
    if r != c.ROLE_MASTER:
        success = False
        for try_cnt in range( 10 ):
            redis = redis_mgmt.Redis( m['id'] )
            ret = redis.connect( m['ip'], m['redis_port'] )
            self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )
            util.log( 'succeeded : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )

            redis.write( 'info stats\r\n' )
            for i in range( 6 ):
                redis.read_until( '\r\n' )  # skip header lines of the reply
            res = redis.read_until( '\r\n' )
            self.assertNotEqual( res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )
            util.log( 'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"' % (m['id'], m['ip'], m['redis_port'], res[:-2]) )
            no = int( res.split(':')[1] )
            if no <= 100:
                success = True
                break
            time.sleep( 1 )
        self.assertEquals( success, True, 'failed : pgs does not removed.' )
        util.log( 'pgs is removed' )

    # check states of all pgs in pg: retry loop allowing roles to settle
    # before the asserting pass below
    for i in xrange(10):
        for s in self.cluster['servers']:
            smr_info = util.get_smr_info( s, self.leader_cm )
            cc_role = smr_info['smr_Role']
            cc_hb = smr_info['hb']
            if cc_hb == 'N':
                continue
            real_role = util.get_role_of_server( s )
            real_role = util.roleNumberToChar( real_role )
            if real_role != cc_role:
                time.sleep(0.5)
                continue

    # asserting pass: real role must now match the confmaster view
    for s in self.cluster['servers']:
        smr_info = util.get_smr_info( s, self.leader_cm )
        cc_role = smr_info['smr_Role']
        cc_hb = smr_info['hb']
        if cc_hb == 'N':
            continue
        real_role = util.get_role_of_server( s )
        real_role = util.roleNumberToChar( real_role )
        self.assertEqual( real_role, cc_role, 'failed : each role is difference, real=%s, cc=%s' % (real_role, cc_role) )
        util.log( 'succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s' % (real_role, cc_role) )

    # check quorum policy again on the left master
    quorum_of_haning_master = util.get_quorum( m )
    self.assertEqual(2, quorum_of_haning_master,
                     'invalid quorum of left master, expected:%d, but:%d'
                     % (2, quorum_of_haning_master) )
    util.log( 'succeeded : quorum of left master=%d' % quorum_of_haning_master )

    # 'role lconn' to master -- force it out of the master role
    cmd = 'role lconn\r\n'
    ret = util.cmd_to_smr( m, cmd )
    self.assertEqual( ret, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
    util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

    # wait for master election: one of the two slaves must take over
    success = False
    new_master = None
    for i in range( 10 ):
        role = util.get_role_of_server( s1 )
        if role == c.ROLE_MASTER:
            success = True
            new_master = s1
            break
        role = util.get_role_of_server( s2 )
        if role == c.ROLE_MASTER:
            success = True
            new_master = s2
            break
        time.sleep( 1 )
    self.assertEqual( success, True, 'failed to elect new master' )
    util.log( 'succeeded : elect new master, master_id=%d' % new_master['id'] )

    time.sleep( 1 )
    # check the numbers of master, slave, and lconn -- expect exactly 1 of each
    cnt_master = 0
    cnt_slave = 0
    cnt_lconn = 0
    for s in self.cluster['servers']:
        role = util.get_role_of_server( s )
        if role == c.ROLE_MASTER:
            cnt_master = cnt_master + 1
        elif role == c.ROLE_SLAVE:
            cnt_slave = cnt_slave + 1
        elif role == c.ROLE_LCONN:
            cnt_lconn = cnt_lconn + 1
    self.assertEqual( cnt_master, 1, 'failed : the number of master is %s, expected 1' % cnt_master )
    self.assertEqual( cnt_slave, 1, 'failed : the number of slave is %s, expected 1' % cnt_slave )
    self.assertEqual( cnt_lconn, 1, 'failed : the number of lconn is %s, expected 1' % cnt_lconn )

    # check states of all pgs in pg (again, after the election)
    for s in self.cluster['servers']:
        real_role = util.get_role_of_server( s )
        real_role = util.roleNumberToChar( real_role )
        smr_info = util.get_smr_info( s, self.leader_cm )
        cc_role = smr_info['smr_Role']
        cc_hb = smr_info['hb']
        if cc_hb == 'N':
            continue
        self.assertEqual( real_role, cc_role, 'failed : each role is difference, real=%s, cc=%s' % (real_role, cc_role) )
        util.log( 'succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s' % (real_role, cc_role) )

    # check quorum policy on the new master (expected literal 1 here)
    quorum_of_new_master = util.get_quorum( new_master )
    self.assertNotEqual( None, quorum_of_new_master, 'failed : find new master' )
    self.assertEqual( 1, quorum_of_new_master,
                      'invalid quorum of new master, expected:%d, but:%d'
                      % (1, quorum_of_new_master) )
    util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master )

    # shutdown load generators
    for i in range( len(load_gen_list) ):
        load_gen_list[i].quit()
        load_gen_list[i].join()

    # Go back to initial configuration: rejoin the detached PGS
    self.assertTrue(util.pgs_join(self.leader_cm['ip'], self.leader_cm['cm_port'], m['cluster_name'], m['id']),
                    'failed to recover pgs, (pgs_join)')

    return 0
def __del_server(self, server_to_del):
    """Remove a PGS from the cluster.

    Flow: bgsave a backup, pgs_leave, wait until the PGS has drained (unless
    it is the master, whose 'info' command may block), pgs_lconn, shut down
    smr and redis processes, then pgs_del it from the cluster metadata.
    Fails the enclosing test on any step that does not succeed.
    """
    # backup data
    redis = redis_mgmt.Redis(server_to_del['id'])
    ret = redis.connect(server_to_del['ip'], server_to_del['redis_port'])
    self.assertEqual(ret, 0, 'failed : connect to smr%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port']))

    # bgsave
    ret = util.bgsave(server_to_del)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % server_to_del['id'])

    # detach pgs from cluster
    cmd = 'pgs_leave %s %d\r\n' % (server_to_del['cluster_name'], server_to_del['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

    r = util.get_role_of_server(server_to_del)
    # If quorum of left master is larger than 1, info command will be blocked.
    if r != c.ROLE_MASTER:
        # check if pgs is removed: poll 'info stats' until the connection
        # count drops low enough to show gateways have let go of this PGS.
        success = False
        for try_cnt in range(10):
            redis = redis_mgmt.Redis(server_to_del['id'])
            ret = redis.connect(server_to_del['ip'], server_to_del['redis_port'])
            self.assertEqual(ret, 0, 'failed : connect to smr%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port']))
            util.log('succeeded : connect to smr%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port']))

            redis.write('info stats\r\n')
            # skip the first 6 header lines of the reply; the 7th carries
            # the counter we inspect — TODO confirm against redis layout
            for i in range(6):
                redis.read_until('\r\n')
            res = redis.read_until('\r\n')
            self.assertNotEqual(res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port']))
            util.log('succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"' % (server_to_del['id'], server_to_del['ip'], server_to_del['redis_port'], res[:-2]))

            no = int(res.split(':')[1])
            if no <= 100:
                success = True
                break
            time.sleep(1)
        self.assertEqual(success, True, 'failed : pgs does not removed.')
        util.log('pgs is removed')

    # change state of pgs to lconn
    cmd = 'pgs_lconn %s %d\r\n' % (server_to_del['cluster_name'], server_to_del['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

    # shutdown
    ret = testbase.request_to_shutdown_smr(server_to_del)
    self.assertEqual(ret, 0, 'failed : shutdown smr. id:%d' % server_to_del['id'])
    ret = testbase.request_to_shutdown_redis(server_to_del)
    self.assertEqual(ret, 0, 'failed : shutdown redis. id:%d' % server_to_del['id'])
    util.log('succeeded : shutdown pgs%d.' % server_to_del['id'])

    # delete pgs from cluster
    cmd = 'pgs_del %s %d\r\n' % (server_to_del['cluster_name'], server_to_del['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
def failover_while_hang( self, server ):
    """Hang a PGS, shut it down while it hangs, restart it, and verify
    it fails over: state goes F, comes back N, and the server rejoins as
    a slave with a NEW timestamp (restart changed it).

    Returns 0 on success; any failed step aborts via an assertion.
    """
    # timestamp before hang — used later to prove the process restarted
    ts_before = util.get_timestamp_of_pgs( server )
    self.assertNotEqual( ts_before, -1,
                         'failed to get a timestamp of pgs(%d), ts_before:%d' % (server['id'], ts_before) )

    # hang: the 'fi delay sleep' fault-injection only exists in gcov builds
    util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.' % (server['id'], server['ip'], server['smr_mgmt_port']))
    smr = smr_mgmt.SMR( server['id'] )
    ret = smr.connect( server['ip'], server['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (server['ip'], server['smr_mgmt_port']) )
    smr.write( 'fi delay sleep 1 10000\r\n' )
    reply = smr.read_until( '\r\n', 1 )
    if reply is not None and reply.find('-ERR not supported') != -1:
        self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )
    time.sleep( 4 )

    # check state F (detected as failed while hanging)
    max_try = 20
    expected = 'F'
    for i in range( 0, max_try):
        state = util.get_smr_state( server, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    self.assertEqual( expected , state,
                      'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
    util.log( 'succeeded : pgs%d state changed to F.' % server['id'] )

    # shutdown
    util.log( 'shutdown pgs%d while hanging.' % server['id'] )
    ret = testbase.request_to_shutdown_smr( server )
    self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % server['id'] )
    ret = testbase.request_to_shutdown_redis( server )
    self.assertEqual( ret, 0, 'failed to shutdown redis. id:%d' % server['id'] )

    # check state F (must remain failed after shutdown)
    max_try = 20
    expected = 'F'
    for i in range( 0, max_try):
        state = util.get_smr_state( server, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    self.assertEqual( expected , state,
                      'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
    util.log( 'succeeded : pgs%d state changed to F.' % server['id'] )

    # recovery
    util.log( 'restart pgs%d.' % server['id'] )
    ret = testbase.request_to_start_smr( server )
    self.assertEqual( ret, 0, 'failed to start smr. id:%d' % server['id'] )
    ret = testbase.request_to_start_redis( server )
    self.assertEqual( ret, 0, 'failed to start redis. id:%d' % server['id'] )
    wait_count = 20
    ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
    self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

    redis = redis_mgmt.Redis( server['id'] )
    ret = redis.connect( server['ip'], server['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis' )

    # check state N (back to normal)
    max_try = 20
    expected = 'N'
    for i in range( 0, max_try):
        state = util.get_smr_state( server, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    self.assertEqual( expected , state,
                      'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
    util.log( 'succeeded : pgs%d state changed to N.' % server['id'] )

    # wait for rejoin as a slave; a fresh timestamp proves a real restart
    success = False
    for i in range( 20 ):
        role = util.get_role_of_server( server )
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs( server )
            if ts_after != -1 and ts_before != ts_after:
                success = True
                break
        time.sleep( 1 )
    self.assertEqual( success, True, 'failed to rejoin as a slave' )
    util.log( 'succeeded : pgs%d joined as a slave.' % server['id'] )
    return 0
def test_4_PGS_mgen_is_less_than_PG_mgen( self ):
    """A PGS that was down across two master failovers (so its master
    generation lags the PG's) must still recover and serve consistent data.

    Flow: shut down the master, write data, fail over twice, restart the
    old master, write more data, then verify all 20000 keys on the rejoined
    server. Returns 0 on success.
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

    # initial data
    util.put_some_data(self.cluster)

    # shutdown the current master; it will rejoin at the end of the test
    server_to_join = util.get_server_by_role( self.cluster['servers'], 'master' )
    ret = testbase.request_to_shutdown_smr( server_to_join )
    self.assertEqual( ret, 0, 'failed to shutdown smr' )
    ret = testbase.request_to_shutdown_redis( server_to_join )
    self.assertEqual( ret, 0, 'failed to shutdown redis' )

    # check state F
    max_try = 20
    expected = 'F'
    for i in range( 0, max_try):
        state = util.get_smr_state( server_to_join, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    self.assertEqual( expected , state,
                      'server%d - state:%s, expected:%s' % (server_to_join['id'], state, expected) )

    # set value while the old master is down
    key_base = 'mw'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n' )

    # master failover 1 (master generation + 1)
    util.log('master failover 1')
    server = util.get_server_by_role( self.cluster['servers'], 'master' )
    self.failover( server )

    # check quorum (copy:3, quorum:1, available:2)
    ok = False
    for i in xrange(10):
        ok = util.check_quorum(self.cluster['cluster_name'],
                               self.leader_cm['ip'], self.leader_cm['cm_port'])
        if ok:
            break
        else:
            time.sleep(1)
    self.assertTrue( ok, 'Check quorum fail.' )

    # master failover 2 (master generation + 1)
    util.log('master failover 2')
    server = util.get_server_by_role( self.cluster['servers'], 'master' )
    self.failover( server )

    # recovery of the long-dead server (its mgen now lags the PG's)
    util.log('master recovery start.')
    ret = testbase.request_to_start_smr( server_to_join )
    self.assertEqual( ret, 0, 'failed to start smr' )
    ret = testbase.request_to_start_redis( server_to_join )
    self.assertEqual( ret, 0, 'failed to start redis' )
    ret = testbase.wait_until_finished_to_set_up_role( server_to_join, 10 )
    self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server_to_join['id']) )
    util.log('master recovery end successfully.')

    # check state N
    max_try = 20
    expected = 'N'
    for i in range( 0, max_try):
        state = util.get_smr_state( server, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    role = util.get_role_of_server( server )
    self.assertEqual( expected , state,
                      'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role) )
    time.sleep( 5 )

    # set more values after recovery
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n' )

    server = util.get_server_by_role( self.cluster['servers'], 'master' )

    redis = redis_mgmt.Redis( server_to_join['id'] )
    ret = redis.connect( server_to_join['ip'], server_to_join['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis' )

    # check all values on the rejoined server
    for i in range(0, 20000):
        cmd = 'get %s%d\r\n' % (key_base, i)
        redis.write( cmd )
        redis.read_until( '\r\n' )
        response = redis.read_until( '\r\n' )
        self.assertEqual( response, '%d\r\n' % (i),
                          'inconsistent %s, %d' % (response[:-2], i) )

    gw.disconnect()
    return 0
def failure_recovery(self, role, wait_count=10, redis_only=False):
    """Kill and recover the PGS currently holding *role*, and verify that a
    value written while it was down is visible after it rejoins.

    role       -- 'master' or 'slave' (passed to util.get_server_by_role)
    wait_count -- retries to wait for role setup after restart
    redis_only -- when True, only the redis process is cycled; smr stays up
    """
    time.sleep(2)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set value
    key = 'new_key_haha'
    cmd = 'set %s 12345\r\n' % (key)
    gw.write(cmd)
    res = gw.read_until('\r\n')
    self.assertEqual(res, '+OK\r\n')

    # shutdown (smr only when not redis_only; redis always)
    server = util.get_server_by_role(self.cluster['servers'], role)
    if redis_only == False:
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server)
    self.assertEqual(ret, 0, 'failed to shutdown redis')

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s' % (server['id'], state, expected))

    # set value while the server is down; must be replicated to it later
    check_value = '54321'
    cmd = 'set %s %s\r\n' % (key, check_value)
    gw.write(cmd)
    res = gw.read_until('\r\n')
    self.assertEqual(res, '+OK\r\n')
    gw.disconnect()

    # recovery
    if redis_only == False:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(server)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))

    redis = redis_mgmt.Redis(server['id'])
    ret = redis.connect(server['ip'], server['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    # check state N
    max_try = 20
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role))

    # check value written during downtime reached the recovered server
    cmd = 'get %s\r\n' % (key)
    redis.write(cmd)
    redis.read_until('\r\n')
    response = redis.read_until('\r\n')
    self.assertEqual(response, '%s\r\n' % (check_value),
                     'inconsistent %s, %s' % (response, check_value))
def test_two_slaves_hang( self ):
    """Hang BOTH slaves of a 3-copy PG at once and verify they rejoin as
    slaves with unchanged timestamps (no restart), replication still works
    between them, and cluster roles stay consistent. Returns 0 on success.
    """
    util.print_frame()

    self.setup_test_cluster( self.cluster_3copy )

    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

    # set values
    for i in range( 0, 10000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss( self.cluster )
    self.assertNotEqual( m, None, 'master is None.' )
    self.assertNotEqual( s1, None, 'slave1 is None.' )
    self.assertNotEqual( s2, None, 'slave2 is None.' )

    util.log( 'server state before hang' )
    util.log_server_state( self.cluster )

    # timestamps before hang — unchanged afterwards proves no restart
    ts_before1 = util.get_timestamp_of_pgs( s1 )
    self.assertNotEqual( ts_before1, -1,
                         'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1) )
    ts_before2 = util.get_timestamp_of_pgs( s2 )
    self.assertNotEqual( ts_before2, -1,
                         'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2) )

    # hang both slaves ('fi delay sleep' requires a gcov build)
    smr1 = smr_mgmt.SMR( s1['id'] )
    ret = smr1.connect( s1['ip'], s1['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )
    smr2 = smr_mgmt.SMR( s2['id'] )
    ret = smr2.connect( s2['ip'], s2['smr_mgmt_port'] )
    # fixed: error message previously reported s1's address for an s2 failure
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )
    smr1.write( 'fi delay sleep 1 8000\r\n' )
    reply = smr1.read_until( '\r\n', 1 )
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )
    smr2.write( 'fi delay sleep 1 8000\r\n' )
    time.sleep( 7 )

    # wait for s1 to rejoin as a slave (same timestamp: hang, not restart)
    success = False
    for i in range( 20 ):
        role = util.get_role_of_server( s1 )
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs( s1 )
            if ts_after != -1 and ts_before1 == ts_after:
                success = True
                break
        time.sleep( 1 )
    # fixed: error message previously reported s2's address for an s1 failure
    self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

    # wait for s2 to rejoin as a slave
    success = False
    for i in range( 20 ):
        role = util.get_role_of_server( s2 )
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs( s2 )
            if ts_after != -1 and ts_before2 == ts_after:
                success = True
                break
        time.sleep( 1 )
    self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

    util.log( 'server state transition after hang' )
    util.log_server_state( self.cluster )

    redis1 = redis_mgmt.Redis( s1['id'] )
    ret = redis1.connect( s1['ip'], s1['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

    redis2 = redis_mgmt.Redis( s2['id'] )
    ret = redis2.connect( s2['ip'], s2['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

    # set new values through s1
    for i in range( 10000, 20000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write( cmd )
        res = redis1.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n',
                          'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

    # check new values replicated to s2
    for i in range( 10000, 20000 ):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write( cmd )
        redis2.read_until( '\r\n' )
        res = redis2.read_until( '\r\n' )
        self.assertEqual( res, '%d\r\n' % i,
                          'failed to get values from redis2. %s != %d' % (res, i) )

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True,
                     'role consistency fail')
    return 0
def deprecated_test_5_PGS_commit_is_greater_than_PG_commit( self ):
    """An old master whose commit-seq exceeds the PG's current commit must be
    blocked from rejoining as a slave.

    Flow: shut down both slaves, write more data (advancing only the master's
    commit), shut down the master, recover the slaves (forming a PG with a
    lower commit), verify data, then restart the old master and assert it
    never reaches state N. Returns 0 on success.
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

    # initial data
    util.put_some_data(self.cluster)

    master, s1, s2 = util.get_mss(self.cluster)
    server_to_join = [s1, s2]

    # shutdown slaves
    for i in range(0, 2):
        ret = testbase.request_to_shutdown_smr( server_to_join[i] )
        self.assertEqual( ret, 0, 'failed to shutdown smr%d' % server_to_join[i]['id'])
        util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

        ret = testbase.request_to_shutdown_redis( server_to_join[i] )
        self.assertEqual( ret, 0, 'failed to shutdown redis' )
        util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for j in range( 0, max_try):
            state = util.get_smr_state( server_to_join[i], self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEqual( expected , state,
                          'server%d - state:%s, expected:%s' % (server_to_join[i]['id'], state, expected) )

    # put more data — only the surviving master's commit-seq advances
    util.put_some_data(self.cluster, 10, 256)

    # bgsave
    ret = util.bgsave(master)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown master
    ret = testbase.request_to_shutdown_smr( master )
    self.assertEqual( ret, 0, 'failed to shutdown smr' )
    util.log('succeeded to shutdown master smr, id=%d' % master['id'])
    ret = testbase.request_to_shutdown_redis( master )
    self.assertEqual( ret, 0, 'failed to shutdown redis' )
    util.log('succeeded to shutdown master redis, id=%d' % master['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for i in range( 0, max_try):
        state = util.get_smr_state( master, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    self.assertEqual( expected , state,
                      'server%d - state:%s, expected:%s' % (master['id'], state, expected) )

    # recovery slaves
    for i in range(0, 2):
        ret = testbase.request_to_start_smr( server_to_join[i] )
        self.assertEqual( ret, 0, 'failed to start smr' )
        ret = testbase.request_to_start_redis( server_to_join[i] )
        self.assertEqual( ret, 0, 'failed to start redis' )
        ret = testbase.wait_until_finished_to_set_up_role( server_to_join[i], 10 )
        self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server_to_join[i]['id']) )

        # check state N
        max_try = 20
        expected = 'N'
        for j in range( 0, max_try):
            state = util.get_smr_state( server_to_join[i], self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        role = util.get_role_of_server( server_to_join[i] )
        self.assertEqual( expected , state,
                          'server%d - state:%s, expected:%s, role:%s' % (server_to_join[i]['id'], state, expected, role) )

    # set value through a randomly chosen recovered slave
    s = random.choice(server_to_join)
    # fixed: was redis_mgmt.Redis( ['id'] ) — passed a literal list, not the id
    redis = redis_mgmt.Redis( s['id'] )
    ret = redis.connect( s['ip'], s['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis' )

    key_base = 'key_test'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        redis.write( cmd )
        res = redis.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n' )
    redis.disconnect()

    for i in range(0, 2):
        redis = redis_mgmt.Redis( server_to_join[i]['id'] )
        ret = redis.connect( server_to_join[i]['ip'], server_to_join[i]['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )

        # check value
        for j in range(0, 10000):
            cmd = 'get %s%d\r\n' % (key_base, j)
            redis.write( cmd )
            redis.read_until( '\r\n' )
            response = redis.read_until( '\r\n' )
            self.assertEqual( response, '%d\r\n' % (j),
                              'inconsistent %s, %d' % (response[:-2], j) )

    # try to recover master, but failed — its commit-seq is too high to join
    ret = testbase.request_to_start_smr( master )
    self.assertEqual( ret, 0, 'failed to start smr' )
    ret = testbase.request_to_start_redis( master, False )
    self.assertEqual( ret, 0, 'failed to start redis' )

    max_try = 3
    expected = 'N'
    for i in range( 0, max_try):
        state = util.get_smr_state( master, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    role = util.get_role_of_server( master )
    self.assertNotEqual( expected, state,
                         'server%d - state:%s, expected:not %s, role:%s' % (master['id'], state, expected, role) )
    util.log('success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.')

    gw.disconnect()
    return 0
def master_hang( self ):
    """Hang the master and verify forced failover: a slave is elected master,
    writes through the new master replicate, and the hung server recovers and
    rejoins as a slave with the data written while it was out.

    Works for both 2-copy and 3-copy clusters. Returns 0 on success.
    """
    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

    # set values
    for i in range( 0, 10000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

    # get master, slave1, (slave2 when 3-copy)
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )
    else:
        m, s1 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )

    util.log( 'server state before hang' )
    util.log_server_state( self.cluster )

    # hang the master ('fi delay sleep' requires a gcov build)
    smr = smr_mgmt.SMR( m['id'] )
    ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
    smr.write( 'fi delay sleep 1 10000\r\n' )
    reply = smr.read_until( '\r\n', 1 )
    if reply is not None and reply.find('-ERR not supported') != -1:
        self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )
    time.sleep( 5 )

    # wait for forced master election among the surviving slaves
    success = False
    for i in range( 20 ):
        role = util.get_role_of_server( s1 )
        if role == c.ROLE_MASTER:
            success = True
            break
        if len(self.cluster['servers']) == 3:
            role = util.get_role_of_server( s2 )
            if role == c.ROLE_MASTER:
                success = True
                break
        time.sleep( 1 )
    util.log( 'server state transition after hang' )
    util.log_server_state( self.cluster )
    self.assertEqual( success, True, 'failed to forced master election' )

    redis1 = redis_mgmt.Redis( s1['id'] )
    ret = redis1.connect( s1['ip'], s1['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

    # set new values after the election
    for i in range( 10000, 20000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write( cmd )
        res = redis1.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n',
                          'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # check new values on the other slave
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i,
                              'failed to get values from redis2. %s != %d' % (res, i) )

    # check if the haning server recovered and joined as a slave
    time.sleep( 7 )
    role = util.get_role_of_server( m )
    self.assertEqual( role, c.ROLE_SLAVE, 'failed to join as a slave' )

    redis0 = redis_mgmt.Redis( m['id'] )
    ret = redis0.connect( m['ip'], m['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

    util.log( 'server state transition after hang' )
    util.log_server_state( self.cluster )

    # check new values reached the recovered (ex-master) server
    for i in range( 10000, 20000 ):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write( cmd )
        redis0.read_until( '\r\n' )
        res = redis0.read_until( '\r\n' )
        self.assertEqual( res, '%d\r\n' % i,
                          'failed to get values from redis2. %s != %d' % (res[:-2], i) )

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True,
                     'role consistency fail')
    return 0
def failure_recovery( self, role, wait_count=10, redis_only=False ):
    """Kill and recover the PGS currently holding *role*, and verify that a
    value written during the outage is visible after recovery.

    role       -- 'master' or 'slave' (passed to util.get_server_by_role)
    wait_count -- retries to wait for role setup after restart
    redis_only -- when True, only the redis process is cycled; smr stays up
    """
    time.sleep( 2 )

    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

    # set value
    key = 'new_key_haha'
    cmd = 'set %s 12345\r\n' % (key)
    gw.write( cmd )
    res = gw.read_until( '\r\n' )
    self.assertEqual( res, '+OK\r\n' )

    # shutdown (smr only when not redis_only; redis always)
    server = util.get_server_by_role( self.cluster['servers'], role )
    if redis_only == False:
        ret = testbase.request_to_shutdown_smr( server )
        self.assertEqual( ret, 0, 'failed to shutdown smr' )
    ret = testbase.request_to_shutdown_redis( server )
    self.assertEqual( ret, 0, 'failed to shutdown redis' )

    # check state F
    max_try = 20
    expected = 'F'
    for i in range( 0, max_try):
        state = util.get_smr_state( server, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    self.assertEqual( expected , state,
                      'server%d - state:%s, expected:%s' % (server['id'], state, expected) )

    # set value while the server is down; must be replicated to it later
    check_value = '54321'
    cmd = 'set %s %s\r\n' % (key, check_value)
    gw.write( cmd )
    res = gw.read_until( '\r\n' )
    self.assertEqual( res, '+OK\r\n' )
    gw.disconnect()

    # recovery
    if redis_only == False:
        ret = testbase.request_to_start_smr( server )
        self.assertEqual( ret, 0, 'failed to start smr' )
    ret = testbase.request_to_start_redis( server )
    self.assertEqual( ret, 0, 'failed to start redis' )
    ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
    self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

    redis = redis_mgmt.Redis( server['id'] )
    ret = redis.connect( server['ip'], server['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis' )

    # check state N
    max_try = 20
    expected = 'N'
    for i in range( 0, max_try):
        state = util.get_smr_state( server, self.leader_cm )
        if expected == state:
            break
        time.sleep( 1 )
    role = util.get_role_of_server( server )
    self.assertEqual( expected , state,
                      'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role) )

    # check value written during downtime reached the recovered server
    cmd = 'get %s\r\n' % (key)
    redis.write( cmd )
    redis.read_until( '\r\n' )
    response = redis.read_until( '\r\n' )
    self.assertEqual( response, '%s\r\n' % (check_value),
                      'inconsistent %s, %s' % (response, check_value) )