def test_2_role_change_with_hanging_pgs(self):
    util.print_frame()

    i = 0
    while i < 5:
        util.log('')
        util.log('Loop:%d' % i)

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        hang = random.choice(self.cluster['servers'])
        if hang == m:
            hanging_servers = [m]
            running_servers = [s1, s2]
            type = 'master'
        else:
            hanging_servers = [s1]
            running_servers = [m, s2]
            type = 'slave'
        s = random.choice([s1, s2])

        util.log('hanging pgs(id:%d, type:%s), expected_master_id:%d'
                 % (hang['id'], type, s['id']))
        self.role_change_with_hanging_pgs(hanging_servers, running_servers,
                                          s['id'], m)
        i += 1
def test_quorum_policy_of_hanging_master(self):
    util.print_frame()

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    # hang
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (m['ip'], m['smr_mgmt_port']))
    smr.write('fi delay sleep 1 15000\r\n')
    time.sleep(5)

    # wait for forced master election
    success = False
    new_master = None
    for i in range(7):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_MASTER:
            success = True
            new_master = s1
            break
        role = util.get_role_of_server(s2)
        if role == c.ROLE_MASTER:
            success = True
            new_master = s2
            break
        time.sleep(1)
    self.assertEqual(success, True, 'failed to force master election')

    # shutdown confmaster
    for server in self.cluster['servers']:
        util.shutdown_cm(server['id'])

    # wait until the hanging master wakes up
    time.sleep(5)

    # check quorum policy
    quorum_of_hanging_master = util.get_quorum(m)
    self.assertEqual(2, quorum_of_hanging_master,
                     'invalid quorum of hanging master, expected:%d, but:%d'
                     % (2, quorum_of_hanging_master))
    util.log('succeeded : quorum of hanging master=%d' % quorum_of_hanging_master)

    # check quorum policy
    quorum_of_new_master = util.get_quorum(new_master)
    self.assertNotEqual(None, quorum_of_new_master, 'failed : find new master')
    self.assertEqual(1, quorum_of_new_master,
                     'invalid quorum of new master, expected:%d, but:%d'
                     % (1, quorum_of_new_master))
    util.log('succeeded : quorum of new master=%d' % quorum_of_new_master)

    # Go back to initial configuration - recover confmaster
    self.assertTrue(util.recover_confmaster(self.cluster, [0, 1, 2], 0),
                    'failed to recover confmaster')

    return 0
def test_upgrade_slave_smr(self):
    util.print_frame()

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    ret = util.upgrade_pgs(s1, self.leader_cm, self.cluster)
    self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % s1['id'])
def test_3_role_change_while_all_pgs_hanging(self):
    util.print_frame()

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    hanging_servers = [m, s1, s2]
    running_servers = []
    s = random.choice([s1, s2])

    self.role_change_with_hanging_pgs(hanging_servers, running_servers,
                                      s['id'], m)
    return 0
def test_upgrade_smr_repeatedly(self):
    util.print_frame()

    execution_count_master = 0
    execution_count_slave = 0
    old_target = None
    for cnt in range(5):
        target = random.choice(self.cluster['servers'])
        while target == old_target:
            target = random.choice(self.cluster['servers'])
        old_target = target

        role = util.get_role_of_server(target)
        if role == c.ROLE_SLAVE:
            ret = util.upgrade_pgs(target, self.leader_cm, self.cluster)
            self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % target['id'])
            execution_count_slave += 1
        elif role == c.ROLE_MASTER:
            ret = util.upgrade_pgs(target, self.leader_cm, self.cluster)
            self.assertTrue(ret, 'Failed to upgrade master pgs%d' % target['id'])
            execution_count_master += 1
        else:
            self.fail('unexpected role:%s' % role)
        time.sleep(1)

    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    # make sure each role has been upgraded at least once
    if execution_count_master == 0:
        ret = util.upgrade_pgs(m, self.leader_cm, self.cluster)
        self.assertTrue(ret, 'Failed to upgrade master pgs%d' % m['id'])
    if execution_count_slave == 0:
        ret = util.upgrade_pgs(s2, self.leader_cm, self.cluster)
        self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % s2['id'])
def test_7_dirty_network_fi(self):
    util.print_frame()
    clnts = []

    try:
        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding rule
        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding rule to iptables fail. output:%s' % out)
        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding rule to iptables fail. output:%s' % out)

        cluster_name = 'network_isolation_cluster_1'
        cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        ret = default_cluster.initialize_starting_up_smr_before_redis(
            cluster, conf={'cm_context': 'applicationContext-fi.xml'})
        self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                               initial_state, check_quorum=True),
            'failed to check cluster state')

        # Start crc16 client
        for s in cluster['servers']:
            c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['redis_port'], 600, verbose=False)
            c.start()
            clnts.append(c)

        # Network isolation test
        cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'me', 'yj', 'bj', 'mg'],
                                            ['lconn', 'slave', 'master', 'setquorum'],
                                            [True, False], 1)
        for fi in cmfi:
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
            ret = block_network(cluster, mgmt_ip, mgmt_port)
            self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

            for i in xrange(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in xrange(10):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                   isolated_states, check_quorum=True)

                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue
                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False
                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Fault injection
            try:
                self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port),
                                "Confmaster command fail. fi: %s" % str(fi))
            except ValueError as e:
                self.fail("Confmaster command error. fi: \"%s\", error: \"%s\"" % (str(fi), str(e)))

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
            ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
            self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

            for i in xrange(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in xrange(10):
                isolated_states = []
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                        isolated_states, check_quorum=True)
                if ok:
                    break
                time.sleep(1)
            self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

            check_cluster = False

            # 'bj', 'slave'
            if fi[0] == 'bj' and fi[1] == 'slave':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(s1)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                check_cluster = True
            # 'me', 'lconn'
            elif fi[0] == 'me':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(m)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                check_cluster = True
            # 'setquorum'
            elif fi[1] == 'setquorum':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'],
                                           'fi delay sleep 1 8000\r\n', timeout=20)
                self.assertEqual("+OK\r\n", ret,
                                 '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                check_cluster = True

            if check_cluster:
                # Check cluster state
                ok = False
                for i in xrange(20):
                    isolated_states = []
                    ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                            isolated_states, check_quorum=True)
                    if ok:
                        break
                    time.sleep(1)
                self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

            # Check fault injection
            ok = False
            for i in xrange(10):
                count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                if count == 0:
                    ok = True
                    break
                time.sleep(0.5)
            self.assertTrue(ok, "[%s] fail. fault injection has not been triggered." % str(fi))

            for c in clnts:
                self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

        # Shutdown cluster
        ret = default_cluster.finalize(cluster)
        self.assertEqual(ret, 0, '[%s] failed to TestMaintenance.finalize' % str(fi))

        # Delete forwarding rule
        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding rule from iptables fail. output:%s' % out)
        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding rule from iptables fail. output:%s' % out)

        for c in clnts:
            self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))
    finally:
        for c in clnts:
            c.quit()
        for c in clnts:
            c.join()
def test_4_role_change_with_failover(self):
    util.print_frame()

    loop_cnt = 0
    while loop_cnt < 5:
        util.log('')
        util.log('Loop:%d' % loop_cnt)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        target = random.choice(self.cluster['servers'])

        # bgsave
        ret = util.bgsave(target)
        self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

        # shutdown
        util.log('shutdown pgs%d(%s:%d)'
                 % (target['id'], target['ip'], target['smr_base_port']))
        ret = testbase.request_to_shutdown_smr(target)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
        ret = testbase.request_to_shutdown_redis(target)
        self.assertEqual(ret, 0, 'failed to shutdown redis')

        running_servers = []
        for s in self.cluster['servers']:
            if s != target:
                running_servers.append(s)

        # Get old timestamps
        old_timestamps = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # Start load generator
        self.load_gen_list = {}
        util.log('start load generator')
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

        # Role change
        master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], s1['id'])
        self.assertNotEqual(master_id, -1, 'role_change failed')

        util.log("States (after role change)")
        util.log_server_state(self.cluster)

        # Check - get new timestamps
        new_timestamps = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs(s)
            new_timestamps[s['id']] = ts

        # Check - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps[s['id']]
            self.assertNotEqual(old_ts, new_ts,
                                'Timestamp of a running server has not changed. %d->%d'
                                % (old_ts, new_ts))

        # Check quorum
        m = self.cluster['servers'][master_id]
        expected = 1
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'unexpected quorum(after role change). expected:%s' % expected)

        # recovery
        util.log('recovery pgs%d(%s:%d)'
                 % (target['id'], target['ip'], target['smr_base_port']))
        ret = testbase.request_to_start_smr(target)
        self.assertEqual(ret, 0, 'failed to start smr')
        util.log('start smr-replicator done')

        ret = testbase.request_to_start_redis(target, 60)
        self.assertEqual(ret, 0, 'failed to start redis')
        util.log('start redis-arc done')

        ret = testbase.wait_until_finished_to_set_up_role(target, max_try=300)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % target['id'])

        util.log("States (after recovery)")
        util.log_server_state(self.cluster)

        # Check quorum
        expected = 1
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'unexpected quorum(after recovery). expected:%s' % expected)

        # Check consistency
        util.log('stop load generator')
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
            self.load_gen_list.pop(i, None)

        loop_cnt += 1

    return 0
def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
    """Hang the given PGSs, verify that role_change fails and rolls back,
    then verify data consistency after the hang is released."""
    util.log('hanging_servers:%s' % hanging_servers)
    util.log('running_servers:%s' % running_servers)
    util.log('target_id:%s' % target_id)

    # Initial data
    util.put_some_data(self.cluster, 3, 10)

    util.log("States (before role change)")
    util.log_server_state(self.cluster)

    # Get old timestamps
    old_timestamps = {}
    for s in self.cluster['servers']:
        ts = util.get_timestamp_of_pgs(s)
        old_timestamps[s['id']] = ts

    # hang
    for s in hanging_servers:
        smr = smr_mgmt.SMR(s['id'])
        ret = smr.connect(s['ip'], s['smr_mgmt_port'])
        self.assertEqual(ret, 0, 'failed to connect to pgs. %s:%d'
                         % (s['ip'], s['smr_mgmt_port']))
        util.log("PGS '%d' hang" % s['id'])

        smr.write('fi delay sleep 1 13000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr has been compiled with the gcov option.')
        smr.disconnect()

    # Role change
    master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
    self.assertEqual(master_id, -1, 'expected role_change to fail, but it succeeded')

    # Check rollback - check quorum
    if master not in hanging_servers:
        expected = 1
        ok = self.__check_quorum(master, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % expected)

    # Check rollback - get new timestamps
    new_timestamps_in_hang = {}
    for s in running_servers:
        ts = util.get_timestamp_of_pgs(s)
        new_timestamps_in_hang[s['id']] = ts

    # Check rollback - compare old timestamps and new timestamps
    for s in running_servers:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps_in_hang[s['id']]
        self.assertEqual(old_ts, new_ts,
                         'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

    time.sleep(16)
    util.log("States (after role change)")
    util.log_server_state(self.cluster)

    self.load_gen_list = {}

    # Start load generator
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen = load_generator.LoadGenerator(i, ip, port)
        load_gen.start()
        self.load_gen_list[i] = load_gen

    # Check quorum
    if master in hanging_servers:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        expected = 1
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % expected)

    # Get new timestamps
    new_timestamps = {}
    for s in self.cluster['servers']:
        ts = util.get_timestamp_of_pgs(s)
        new_timestamps[s['id']] = ts

    # Compare old timestamps and new timestamps
    for s in self.cluster['servers']:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps[s['id']]
        if master in hanging_servers and len(running_servers) != 0:
            self.assertNotEqual(old_ts, new_ts,
                                'Timestamp of a hanging server has not changed. %d->%d'
                                % (old_ts, new_ts))
        else:
            self.assertEqual(old_ts, new_ts,
                             'Timestamp of a running server has changed. %d->%d'
                             % (old_ts, new_ts))

    # Check consistency
    for i in range(self.max_load_generator):
        self.load_gen_list[i].quit()
    for i in range(self.max_load_generator):
        self.load_gen_list[i].join()
        self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
        self.load_gen_list.pop(i, None)
def test_quorum(self):
    util.print_frame()

    master, slave1, slave2 = util.get_mss(self.cluster)

    expected = 2
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    ret = testbase.request_to_shutdown_smr(slave1)
    self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % slave1['id'])
    time.sleep(1)

    expected = 1
    max_try = 20
    for i in range(0, max_try):
        master = util.get_server_by_role(self.cluster['servers'], 'master')
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    ret = testbase.request_to_shutdown_smr(slave2)
    self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % slave2['id'])
    time.sleep(1)

    expected = 0
    max_try = 20
    for i in range(0, max_try):
        master = util.get_server_by_role(self.cluster['servers'], 'master')
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    # recovery
    ret = testbase.request_to_start_smr(slave1)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(slave1)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(slave1)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % slave1['id'])
    time.sleep(1)

    expected = 1
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))

    # recovery
    ret = testbase.request_to_start_smr(slave2)
    self.assertEqual(ret, 0, 'failed to start smr')
    ret = testbase.request_to_start_redis(slave2)
    self.assertEqual(ret, 0, 'failed to start redis')
    ret = testbase.wait_until_finished_to_set_up_role(slave2)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % slave2['id'])
    time.sleep(1)

    expected = 2
    max_try = 20
    for i in range(0, max_try):
        quorum = util.get_quorum(master)
        if quorum == expected:
            break
        time.sleep(1)
    self.assertEqual(quorum, expected, 'quorum:%d, expected:%d' % (quorum, expected))
def deprecated_test_5_PGS_commit_is_greater_than_PG_commit(self):
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    master, s1, s2 = util.get_mss(self.cluster)
    server_to_join = [s1, s2]

    # shutdown slaves
    for i in range(0, 2):
        ret = testbase.request_to_shutdown_smr(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to shutdown smr%d' % server_to_join[i]['id'])
        util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

        ret = testbase.request_to_shutdown_redis(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to shutdown redis')
        util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for j in range(0, max_try):
            state = util.get_smr_state(server_to_join[i], self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEqual(expected, state,
                         'server%d - state:%s, expected:%s'
                         % (server_to_join[i]['id'], state, expected))

    # put more data
    util.put_some_data(self.cluster, 10, 256)

    # bgsave
    ret = util.bgsave(master)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown master
    ret = testbase.request_to_shutdown_smr(master)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    util.log('succeeded to shutdown master smr, id=%d' % master['id'])

    ret = testbase.request_to_shutdown_redis(master)
    self.assertEqual(ret, 0, 'failed to shutdown redis')
    util.log('succeeded to shutdown master redis, id=%d' % master['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(expected, state,
                     'server%d - state:%s, expected:%s' % (master['id'], state, expected))

    # recovery slaves
    for i in range(0, 2):
        ret = testbase.request_to_start_smr(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server_to_join[i], 10)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d'
                         % server_to_join[i]['id'])

        # check state N
        max_try = 20
        expected = 'N'
        for j in range(0, max_try):
            state = util.get_smr_state(server_to_join[i], self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(server_to_join[i])
        self.assertEqual(expected, state,
                         'server%d - state:%s, expected:%s, role:%s'
                         % (server_to_join[i]['id'], state, expected, role))

    # set values
    s = random.choice(server_to_join)
    redis = redis_mgmt.Redis(s['id'])
    ret = redis.connect(s['ip'], s['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    key_base = 'key_test'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')
    redis.disconnect()

    for i in range(0, 2):
        redis = redis_mgmt.Redis(server_to_join[i]['id'])
        ret = redis.connect(server_to_join[i]['ip'], server_to_join[i]['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')

        # check values
        for j in range(0, 10000):
            cmd = 'get %s%d\r\n' % (key_base, j)
            redis.write(cmd)
            redis.read_until('\r\n')
            response = redis.read_until('\r\n')
            self.assertEqual(response, '%d\r\n' % j,
                             'inconsistent %s, %d' % (response[:-2], j))

    # try to recover the master; it should be blocked
    ret = testbase.request_to_start_smr(master)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(master, False)
    self.assertEqual(ret, 0, 'failed to start redis')

    max_try = 3
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(master)
    self.assertNotEqual(expected, state,
                        'server%d - state:%s, expected:not %s, role:%s'
                        % (master['id'], state, expected, role))
    util.log('success : the old master that has a greater commit-seq than the '
             'current master tried to join as a slave, but it was blocked successfully.')

    gw.disconnect()
    return 0
def test_7_dirty_network_fi(self):
    util.print_frame()
    clnts = []

    try:
        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding rule
        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding rule to iptables fail. output:%s' % out)
        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding rule to iptables fail. output:%s' % out)

        cluster_name = 'network_isolation_cluster_1'
        cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        ret = default_cluster.initialize_starting_up_smr_before_redis(
            cluster, conf={'cm_context': 'applicationContext-fi.xml'})
        self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                               initial_state, check_quorum=True),
            'failed to check cluster state')

        # Start crc16 client
        for s in cluster['servers']:
            c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['gateway_port'], 3000, verbose=False)
            c.start()
            clnts.append(c)

        # Network isolation test
        cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'qa', 'me', 'yj', 'bj', 'mg'],
                                            ['lconn', 'slave', 'master', 'setquorum'],
                                            [True, False], 1)
        for fi in cmfi:
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
            ret = block_network(cluster, mgmt_ip, mgmt_port)
            self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

            for i in xrange(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in xrange(10):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                   isolated_states, check_quorum=True)

                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue
                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False
                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Fault injection
            try:
                self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port),
                                "Confmaster command fail. fi: %s" % str(fi))
            except ValueError as e:
                self.fail("Confmaster command error. fi: \"%s\", error: \"%s\"" % (str(fi), str(e)))

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
            ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
            self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

            for i in xrange(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in xrange(10):
                isolated_states = []
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                        isolated_states, check_quorum=True)
                if ok:
                    break
                time.sleep(1)
            self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

            check_cluster = False

            # 'bj', 'slave'
            if fi[0] == 'bj' and fi[1] == 'slave':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(s1)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                check_cluster = True
            # 'me', 'lconn'
            elif fi[0] == 'me' and fi[1] == 'lconn':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(m)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                check_cluster = True
            # 'qa', 'setquorum'
            elif fi[0] == 'qa' and fi[1] == 'setquorum':
                m, s1, s2 = util.get_mss(cluster)

                # shutdown
                ret = testbase.request_to_shutdown_smr(s1)
                self.assertEqual(0, ret, '[%s] failed to shutdown smr%d' % (str(fi), s1['id']))
                ret = testbase.request_to_shutdown_redis(s1)
                self.assertEqual(0, ret, '[%s] failed to shutdown redis%d' % (str(fi), s1['id']))

                # Check quorum
                q = -1
                for q_cnt in xrange(20):
                    q = util.get_quorum(m)
                    if q == 1:
                        break
                    time.sleep(1)
                self.assertEqual(1, q, "[%s] check quorum fail." % str(fi))

                # Modify quorum
                ret = util.cmd_to_smr_addr(m['ip'], m['smr_mgmt_port'], 'setquorum 0\r\n')
                self.assertEqual("+OK\r\n", ret, '[%s] "setquorum 0" fail.' % str(fi))

                # Check quorum (it is expected to be restored to 1)
                q = -1
                for q_cnt in xrange(20):
                    q = util.get_quorum(m)
                    if q == 1:
                        break
                    time.sleep(1)
                self.assertEqual(1, q, "[%s] check quorum fail." % str(fi))

                # recovery
                ret = testbase.request_to_start_smr(s1)
                self.assertEqual(0, ret, '[%s] failed to start smr' % str(fi))
                ret = testbase.request_to_start_redis(s1, max_try=120)
                self.assertEqual(0, ret, '[%s] failed to start redis' % str(fi))
                ret = testbase.wait_until_finished_to_set_up_role(s1, 11)
                self.assertEqual(0, ret, '[%s] failed to role change. smr_id:%d'
                                 % (str(fi), s1['id']))
                check_cluster = True
            # 'setquorum'
            elif fi[1] == 'setquorum':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'],
                                           'fi delay sleep 1 8000\r\n', timeout=20)
                self.assertEqual("+OK\r\n", ret,
                                 '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                check_cluster = True

            if check_cluster:
                # Check cluster state
                ok = False
                for i in xrange(20):
                    isolated_states = []
                    ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                            isolated_states, check_quorum=True)
                    if ok:
                        break
                    time.sleep(1)
                self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

            # Check fault injection
            ok = False
            for i in xrange(10):
                count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                if count == 0:
                    ok = True
                    break
                time.sleep(0.5)
            self.assertTrue(ok, "[%s] fail. fault injection has not been triggered." % str(fi))

        # Shutdown cluster
        ret = default_cluster.finalize(cluster)
        self.assertEqual(ret, 0, '[%s] failed to TestMaintenance.finalize' % str(fi))

        # Delete forwarding rule
        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding rule from iptables fail. output:%s' % out)
        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding rule from iptables fail. output:%s' % out)

        for c in clnts:
            self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))
    finally:
        for c in clnts:
            c.quit()
        for c in clnts:
            c.join()
def master_hang(self):
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (m['ip'], m['smr_mgmt_port']))
    smr.write('fi delay sleep 1 10000\r\n')
    reply = smr.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.fail('make sure that smr has been compiled with the gcov option.')

    time.sleep(5)

    # wait for forced master election
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_MASTER:
            success = True
            break
        if len(self.cluster['servers']) == 3:
            role = util.get_role_of_server(s2)
            if role == c.ROLE_MASTER:
                success = True
                break
        time.sleep(1)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)
    self.assertEqual(success, True, 'failed to force master election')

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s1['ip'], s1['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                         % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i,
                             'failed to get values from redis2. %s != %d' % (res, i))

    # check if the hanging server recovered and joined as a slave
    time.sleep(7)
    role = util.get_role_of_server(m)
    self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis0. %s != %d' % (res[:-2], i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port),
        True, 'role consistency fail')

    return 0
def master_failover_while_hang(self):
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    self.failover_while_hang(m)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis1 = redis_mgmt.Redis(m['id'])
    ret = redis1.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                         % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i,
                             'failed to get values from redis2. %s != %d' % (res, i))
        util.log('succeeded : check values with set/get operations with pgs%d and pgs%d.'
                 % (m['id'], s2['id']))

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis0. %s != %d' % (res[:-2], i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port),
        True, 'role consistency fail')

    return 0
def test_all_pgs_hang(self):
    util.print_frame()
    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (m['ip'], m['smr_mgmt_port']))
    smr_slave1 = smr_mgmt.SMR(s1['id'])
    ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave1. %s:%d'
                     % (s1['ip'], s1['smr_mgmt_port']))
    smr_slave2 = smr_mgmt.SMR(s2['id'])
    ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave2. %s:%d'
                     % (s2['ip'], s2['smr_mgmt_port']))

    m_ts = util.get_timestamp_of_pgs(m)
    s1_ts = util.get_timestamp_of_pgs(s1)
    s2_ts = util.get_timestamp_of_pgs(s2)

    smr_master.write('fi delay sleep 1 8000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.fail('make sure that smr has been compiled with the gcov option.')

    smr_slave1.write('fi delay sleep 1 8000\r\n')
    smr_slave2.write('fi delay sleep 1 8000\r\n')

    time.sleep(10)

    # wait for forced master election
    success = False
    master = None
    for i in range(20):
        role = util.get_role_of_server(s1)
        ts = util.get_timestamp_of_pgs(s1)
        if role == c.ROLE_MASTER and ts == s1_ts:
            master = s1
            success = True
            break
        role = util.get_role_of_server(s2)
        ts = util.get_timestamp_of_pgs(s2)
        if role == c.ROLE_MASTER and ts == s2_ts:
            master = s2
            success = True
            break
        role = util.get_role_of_server(m)
        ts = util.get_timestamp_of_pgs(m)
        if role == c.ROLE_MASTER and ts == m_ts:
            master = m
            success = True
            break
        time.sleep(1)
        m_ts = util.get_timestamp_of_pgs(m)
        s1_ts = util.get_timestamp_of_pgs(s1)
        s2_ts = util.get_timestamp_of_pgs(s2)
    self.assertEqual(success, True, 'failed to force master election')

    servers = [m, s1, s2]
    for s in servers:
        if s != master:
            success = False
            for i in range(20):
                role = util.get_role_of_server(s)
                if role == c.ROLE_SLAVE:
                    success = True
                    break
                time.sleep(1)
            self.assertEqual(success, True, 'failed to rejoin as a slave, %s:%d'
                             % (s['ip'], s['smr_mgmt_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    # set values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis0.write(cmd)
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s2['ip'], s2['redis_port']))

    # check new values (m)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d'
                         % (s1['id'], res[:-2], i))

    # check new values (s2)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d'
                         % (s2['id'], res[:-2], i))

    # check consistency
    ok = False
    for try_cnt in range(0, 10):
        ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
        print ok
        if ok:
            break
        time.sleep(1)
    self.assertEqual(ok, True, 'role consistency fail')

    return 0
def test_two_slaves_hang(self):
    util.print_frame()
    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # timestamps before hang
    ts_before1 = util.get_timestamp_of_pgs(s1)
    self.assertNotEqual(ts_before1, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d'
                        % (s1['id'], ts_before1))
    ts_before2 = util.get_timestamp_of_pgs(s2)
    self.assertNotEqual(ts_before2, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d'
                        % (s2['id'], ts_before2))

    # hang
    smr1 = smr_mgmt.SMR(s1['id'])
    ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave1. %s:%d'
                     % (s1['ip'], s1['smr_mgmt_port']))
    smr2 = smr_mgmt.SMR(s2['id'])
    ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave2. %s:%d'
                     % (s2['ip'], s2['smr_mgmt_port']))

    smr1.write('fi delay sleep 1 8000\r\n')
    reply = smr1.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.fail('make sure that smr has been compiled with the gcov option.')
    smr2.write('fi delay sleep 1 8000\r\n')
    time.sleep(7)

    # wait for rejoin as a slave
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(s1)
            if ts_after != -1 and ts_before1 == ts_after:
                success = True
                break
        time.sleep(1)
    self.assertEqual(success, True, 'failed to rejoin as a slave. %s:%d'
                     % (s1['ip'], s1['smr_mgmt_port']))

    success = False
    for i in range(20):
        role = util.get_role_of_server(s2)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(s2)
            if ts_after != -1 and ts_before2 == ts_after:
                success = True
                break
        time.sleep(1)
    self.assertEqual(success, True, 'failed to rejoin as a slave. %s:%d'
                     % (s2['ip'], s2['smr_mgmt_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s2['ip'], s2['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis2. %s != %d' % (res, i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port),
        True, 'role consistency fail')

    return 0
def master_and_slave_hang(self):
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (m['ip'], m['smr_mgmt_port']))
    smr_slave = smr_mgmt.SMR(s1['id'])
    ret = smr_slave.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave. %s:%d'
                     % (s1['ip'], s1['smr_mgmt_port']))

    smr_master.write('fi delay sleep 1 10000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        self.fail('make sure that smr has been compiled with the gcov option.')
    smr_slave.write('fi delay sleep 1 10000\r\n')

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    time.sleep(5)

    if len(self.cluster['servers']) == 3:
        # wait for forced master election
        success = True
        for i in range(15):
            state = []
            util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'],
                               self.leader_cm['cm_port'], state)
            s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
            role = s2_state['active_role']
            if role == 'M':
                success = False
                break
            time.sleep(1)

        util.log('')
        util.log('It expects %s:%d to stay a slave; it cannot become a master '
                 'because that would violate copy-quorum. (c:3, q:1, a:1)'
                 % (s2['ip'], s2['smr_mgmt_port']))
        util.log('')
        util.log_server_state(self.cluster)
        self.assertEqual(success, True, 'failed to check copy-quorum')

        ok = False
        for i in xrange(10):
            ok = util.check_cluster(self.cluster['cluster_name'],
                                    self.leader_cm['ip'], self.leader_cm['cm_port'])
            if ok:
                break
        self.assertTrue(ok, 'Cluster state is not normal!')

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                         % (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis2.write(cmd)
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n',
                             'failed to set values to redis2. cmd:%s, res:%s' % (cmd[:-2], res))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis0(%s:%d).'
                     % (m['ip'], m['redis_port']))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis1(%s:%d).'
                     % (s1['ip'], s1['redis_port']))

    if len(self.cluster['servers']) != 3:
        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n',
                             'failed to set values to redis0. cmd:%s, res:%s' % (cmd[:-2], res))

    # check new values (m)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d'
                         % (s1['id'], res[:-2], i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port),
        True, 'role consistency fail')

    return 0
def test_5_transfer_pgs_to_another_machine(self):
    util.print_frame()

    self.load_gen_list = {}

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # increase master generation number
    util.log('failover in order to increase master generation number.')
    max = 0
    for cnt in range(5):
        key_base = 'key'
        for i in range(max, max + 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n')
        max = max + 10000

        m = util.get_server_by_role(self.cluster['servers'], 'master')
        util.log('failover pgs%d' % m['id'])
        ret = util.failover(m, self.leader_cm)
        self.assertTrue(ret, 'failed to failover pgs%d' % m['id'])

    # start load generator
    util.log("start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        self.load_gen_list[i] = load_generator.LoadGenerator(i, ip, port)
        self.load_gen_list[i].start()

    time.sleep(5)  # generate load for 5 sec
    util.log("started load_generator")

    m, s1, s2 = util.get_mss(self.cluster)
    servers = [m, s1, s2]

    # bgsave
    for s in servers:
        ret = util.bgsave(s)
        self.assertTrue(ret, 'failed to bgsave. pgs%d' % s['id'])

    new_servers = [config.server4, config.server5]

    # add new slaves
    for s in new_servers:
        util.log('delete pgs%d`s checkpoint.' % s['id'])
        util.del_dumprdb(s['id'])

        ret = util.cluster_util_getdump(s['id'], m['ip'], m['redis_port'],
                                        'dump.rdb', 0, 8191)
        self.assertEqual(True, ret,
                         'failed : util.cluster_util_getdump returns false, '
                         'src=%s:%d dest_pgsid=%d' % (m['ip'], m['redis_port'], s['id']))

        ret = util.pgs_add(self.cluster, s, self.leader_cm, 0, rm_ckpt=False)
        self.assertEqual(True, ret, 'failed : util.pgs_add returns false, pgsid=%d' % s['id'])
        util.log('succeeded : add a new slave, pgsid=%d' % s['id'])

        # check consistency
        ok = True
        for j in range(self.max_load_generator):
            if self.load_gen_list[j].isConsistent() == False:
                ok = False
                break
        if not ok:
            break

    for server_to_del in servers:
        for s in servers:
            util.pingpong(s['ip'], s['smr_mgmt_port'])
        for s in new_servers:
            util.pingpong(s['ip'], s['smr_mgmt_port'])
        self.__del_server(server_to_del)
        util.log('succeeded : delete pgs%d' % server_to_del['id'])

    new_m = util.get_server_by_role(new_servers, 'master')
    new_s = util.get_server_by_role(new_servers, 'slave')
    self.assertNotEqual(new_m, None, 'master is None.')
    self.assertNotEqual(new_s, None, 'slave is None.')

    for s in new_servers:
        util.pingpong(s['ip'], s['smr_mgmt_port'])

    time.sleep(5)  # generate load for 5 sec

    # check consistency of load_generator
    for i in range(self.max_load_generator):
        self.load_gen_list[i].quit()
    for i in range(self.max_load_generator):
        self.load_gen_list[i].join()
        self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
        self.load_gen_list.pop(i, None)
def test_two_slaves_hang(self):
    util.print_frame()

    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # timestamps before hang
    ts_before1 = util.get_timestamp_of_pgs(s1)
    self.assertNotEqual(ts_before1, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1))
    ts_before2 = util.get_timestamp_of_pgs(s2)
    self.assertNotEqual(ts_before2, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2))

    # hang both slaves
    smr1 = smr_mgmt.SMR(s1['id'])
    ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))
    smr2 = smr_mgmt.SMR(s2['id'])
    ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']))

    smr1.write('fi delay sleep 1 8000\r\n')
    reply = smr1.read_until('\r\n', 1)
    if reply is not None and reply.find('-ERR not supported') != -1:
        self.fail('make sure that smr has been compiled with the gcov option.')
    smr2.write('fi delay sleep 1 8000\r\n')
    time.sleep(7)

    # wait for both slaves to rejoin
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(s1)
            if ts_after != -1 and ts_before1 == ts_after:
                success = True
                break
        time.sleep(1)
    self.assertEqual(success, True, 'failed to rejoin as a slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))

    success = False
    for i in range(20):
        role = util.get_role_of_server(s2)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(s2)
            if ts_after != -1 and ts_before2 == ts_after:
                success = True
                break
        time.sleep(1)
    self.assertEqual(success, True, 'failed to rejoin as a slave. %s:%d' % (s2['ip'], s2['smr_mgmt_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res[:-2], i))

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True,
                     'role consistency fail')

    return 0
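# --- Illustrative sketch (not part of the original suite) ---------------------
# Every hang test issues the same SMR fault-injection command. A hypothetical
# wrapper makes the precondition explicit: 'fi' commands are only available
# when smr is built with the gcov option. `smr_conn` is assumed to expose
# write()/read_until() like smr_mgmt.SMR.
def inject_delay(smr_conn, msec):
    # 'fi delay sleep 1 <msec>' makes the smr main loop sleep once for msec.
    smr_conn.write('fi delay sleep 1 %d\r\n' % msec)
    reply = smr_conn.read_until('\r\n', 1)
    if reply is not None and reply.find('-ERR not supported') != -1:
        raise AssertionError('make sure that smr has been compiled with the gcov option.')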
def test_7_dirty_network_fi(self):
    util.print_frame()

    clnts = []

    try:
        util.iptables_print_list()

        # add a forwarding rule (127.0.0.100 -> 127.0.0.1)
        self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'),
                        'add a forwarding rule to iptables fail.')

        cluster_name = 'network_isolation_cluster_1'
        cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis(
            cluster, conf={'cm_context': 'applicationContext-fi.xml'})
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        # print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                           initial_state, check_quorum=True),
                        'failed to check cluster state')

        # start crc16 clients
        for s in cluster['servers']:
            c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['redis_port'], 600, verbose=False)
            c.start()
            clnts.append(c)

        # network isolation test
        cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'me', 'yj', 'bj', 'mg'],
                                            ['lconn', 'slave', 'master', 'setquorum'],
                                            [True, False], 1)
        for fi in cmfi:
            # block network
            util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
            ret = block_network(cluster, mgmt_ip, mgmt_port)
            self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

            for i in xrange(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # check cluster state
            ok = False
            for i in xrange(10):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                   isolated_states, check_quorum=True)

                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue
                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # fault injection
            try:
                self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port),
                                "Confmaster command fail. fi: %s" % str(fi))
            except ValueError as e:
                self.fail("Confmaster command error. fi: \"%s\", error: \"%s\"" % (str(fi), str(e)))

            # unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
            ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
            self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

            for i in xrange(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # check cluster state
            ok = False
            for i in xrange(10):
                isolated_states = []
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                        isolated_states, check_quorum=True)
                if ok:
                    break
                time.sleep(1)
            self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

            check_cluster = False

            # 'bj', 'slave'
            if fi[0] == 'bj' and fi[1] == 'slave':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(s1)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                check_cluster = True
            # 'me', 'lconn'
            elif fi[0] == 'me':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(m)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                check_cluster = True
            # 'setquorum'
            elif fi[1] == 'setquorum':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'],
                                           'fi delay sleep 1 8000\r\n', timeout=20)
                self.assertEqual("+OK\r\n", ret,
                                 '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                check_cluster = True

            if check_cluster:
                # check cluster state
                ok = False
                for i in xrange(20):
                    isolated_states = []
                    ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                            isolated_states, check_quorum=True)
                    if ok:
                        break
                    time.sleep(1)
                self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

            # check fault injection
            ok = False
            for i in xrange(10):
                count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                if count == 0:
                    ok = True
                    break
                time.sleep(0.5)
            self.assertTrue(ok, "[%s] fail. fault injection was not triggered." % str(fi))

            for c in clnts:
                self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

        # go back to initial configuration
        cmfi.init()
        for fi in cmfi:
            try:
                self.assertTrue(fi_confmaster.fi_add(fi, 0, mgmt_ip, mgmt_port),
                                "Confmaster command fail. fi: %s" % str(fi))
            except ValueError as e:
                self.fail("Confmaster command error. fi: \"%s\", error: \"%s\"" % (str(fi), str(e)))

        # wait until workflows are done
        ret = util.await(60, True)(
            lambda cinfo: cinfo['wf'] == 0,
            lambda: util.cluster_info(mgmt_ip, mgmt_port, cluster['cluster_name']))
        self.assertTrue(ret, 'There are still some workflows.')

        self.assertTrue(conf_checker.final_check())

        # shutdown cluster
        default_cluster.finalize(cluster)
    finally:
        for c in clnts:
            c.quit()
        for c in clnts:
            c.join()

        # delete the forwarding rule
        self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'),
                        'delete a forwarding rule from iptables fail.')
def master_hang(self):
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))
    smr.write('fi delay sleep 1 10000\r\n')
    reply = smr.read_until('\r\n', 1)
    if reply is not None and reply.find('-ERR not supported') != -1:
        self.fail('make sure that smr has been compiled with the gcov option.')

    time.sleep(5)

    # wait for forced master election
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_MASTER:
            success = True
            break
        if len(self.cluster['servers']) == 3:
            role = util.get_role_of_server(s2)
            if role == c.ROLE_MASTER:
                success = True
                break
        time.sleep(1)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)
    self.assertEqual(success, True, 'failed to forced master election')

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res[:-2], i))

    # check if the hanging server recovered and joined as a slave
    time.sleep(7)
    role = util.get_role_of_server(m)
    self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis0. %s != %d' % (res[:-2], i))

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True,
                     'role consistency fail')

    return 0
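# --- Illustrative sketch (not part of the original suite) ---------------------
# The "check new values" loops read two CRLF-terminated lines per GET because
# redis answers with a bulk header ($<len>) before the payload. A hypothetical
# helper for that read-twice pattern; `redis_conn` is assumed to behave like
# redis_mgmt.Redis.
def get_int_value(redis_conn, key):
    redis_conn.write('get %s\r\n' % key)
    redis_conn.read_until('\r\n')        # bulk length header, e.g. '$5\r\n'
    res = redis_conn.read_until('\r\n')  # payload line, e.g. '12345\r\n'
    return int(res.strip())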
def test_quorum_with_left_pgs(self):
    util.print_frame()

    # start load generators
    load_gen_list = {}
    for i in range(len(self.cluster['servers'])):
        server = self.cluster['servers'][i]
        load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
        load_gen.start()
        load_gen_list[i] = load_gen

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    # detach pgs from cluster
    cmd = 'pgs_leave %s %d forced\r\n' % (m['cluster_name'], m['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

    # check quorum policy
    quorum_of_left_master = util.get_quorum(m)
    self.assertEqual(2, quorum_of_left_master,
                     'invalid quorum of left master, expected:%d, but:%d' % (2, quorum_of_left_master))
    util.log('succeeded : quorum of left master=%d' % quorum_of_left_master)

    # check if pgs is removed
    r = util.get_role_of_server(m)
    if r != c.ROLE_MASTER:
        success = False
        for try_cnt in range(10):
            redis = redis_mgmt.Redis(m['id'])
            ret = redis.connect(m['ip'], m['redis_port'])
            self.assertEqual(ret, 0, 'failed : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']))
            util.log('succeeded : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']))

            redis.write('info stats\r\n')
            for i in range(6):
                redis.read_until('\r\n')
            res = redis.read_until('\r\n')
            self.assertNotEqual(res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)'
                                % (m['id'], m['ip'], m['redis_port']))
            util.log('succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"'
                     % (m['id'], m['ip'], m['redis_port'], res[:-2]))

            no = int(res.split(':')[1])
            if no <= 100:
                success = True
                break
            time.sleep(1)

        self.assertEqual(success, True, 'failed : pgs was not removed.')
    util.log('pgs is removed')

    # check states of all pgs in pg
    for i in xrange(10):
        for s in self.cluster['servers']:
            smr_info = util.get_smr_info(s, self.leader_cm)
            cc_role = smr_info['smr_Role']
            cc_hb = smr_info['hb']
            if cc_hb == 'N':
                continue

            real_role = util.get_role_of_server(s)
            real_role = util.roleNumberToChar(real_role)
            if real_role != cc_role:
                time.sleep(0.5)
                continue

    for s in self.cluster['servers']:
        smr_info = util.get_smr_info(s, self.leader_cm)
        cc_role = smr_info['smr_Role']
        cc_hb = smr_info['hb']
        if cc_hb == 'N':
            continue

        real_role = util.get_role_of_server(s)
        real_role = util.roleNumberToChar(real_role)
        self.assertEqual(real_role, cc_role,
                         'failed : each role is different, real=%s, cc=%s' % (real_role, cc_role))
        util.log('succeeded : the role of the real pgs is the same as the role in cc, real=%s, cc=%s'
                 % (real_role, cc_role))

    # check quorum policy
    quorum_of_left_master = util.get_quorum(m)
    self.assertEqual(2, quorum_of_left_master,
                     'invalid quorum of left master, expected:%d, but:%d' % (2, quorum_of_left_master))
    util.log('succeeded : quorum of left master=%d' % quorum_of_left_master)

    # 'role lconn' to master
    cmd = 'role lconn\r\n'
    ret = util.cmd_to_smr(m, cmd)
    self.assertEqual(ret, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

    # wait for master election
    success = False
    new_master = None
    for i in range(10):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_MASTER:
            success = True
            new_master = s1
            break
        role = util.get_role_of_server(s2)
        if role == c.ROLE_MASTER:
            success = True
            new_master = s2
            break
        time.sleep(1)
    self.assertEqual(success, True, 'failed to elect new master')
    util.log('succeeded : elect new master, master_id=%d' % new_master['id'])

    time.sleep(1)

    # check the numbers of master, slave, and lconn
    cnt_master = 0
    cnt_slave = 0
    cnt_lconn = 0
    for s in self.cluster['servers']:
        role = util.get_role_of_server(s)
        if role == c.ROLE_MASTER:
            cnt_master = cnt_master + 1
        elif role == c.ROLE_SLAVE:
            cnt_slave = cnt_slave + 1
        elif role == c.ROLE_LCONN:
            cnt_lconn = cnt_lconn + 1
    self.assertEqual(cnt_master, 1, 'failed : the number of masters is %s, expected 1' % cnt_master)
    self.assertEqual(cnt_slave, 1, 'failed : the number of slaves is %s, expected 1' % cnt_slave)
    self.assertEqual(cnt_lconn, 1, 'failed : the number of lconns is %s, expected 1' % cnt_lconn)

    # check states of all pgs in pg
    for s in self.cluster['servers']:
        real_role = util.get_role_of_server(s)
        real_role = util.roleNumberToChar(real_role)

        smr_info = util.get_smr_info(s, self.leader_cm)
        cc_role = smr_info['smr_Role']
        cc_hb = smr_info['hb']
        if cc_hb == 'N':
            continue

        self.assertEqual(real_role, cc_role,
                         'failed : each role is different, real=%s, cc=%s' % (real_role, cc_role))
        util.log('succeeded : the role of the real pgs is the same as the role in cc, real=%s, cc=%s'
                 % (real_role, cc_role))

    # check quorum policy
    quorum_of_new_master = util.get_quorum(new_master)
    self.assertNotEqual(None, quorum_of_new_master, 'failed : find new master')
    self.assertEqual(1, quorum_of_new_master,
                     'invalid quorum of new master, expected:%d, but:%d' % (1, quorum_of_new_master))
    util.log('succeeded : quorum of new master=%d' % quorum_of_new_master)

    # shutdown load generators
    for i in range(len(load_gen_list)):
        load_gen_list[i].quit()
        load_gen_list[i].join()

    # go back to initial configuration
    self.assertTrue(util.pgs_join(self.leader_cm['ip'], self.leader_cm['cm_port'], m['cluster_name'], m['id']),
                    'failed to recover pgs, (pgs_join)')

    return 0
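# --- Illustrative note (not part of the original suite) -----------------------
# The quorum expectations above follow from COPY=3: while all members are
# attached, the master commits with quorum 2; once a PGS has left (or a new
# master is elected from the remaining two), the expected quorum drops to 1.
# A hypothetical sanity check of that arithmetic:
def expected_quorum(attached_copies):
    # quorum = number of slaves that must acknowledge before a commit
    return max(attached_copies - 1, 0)

assert expected_quorum(3) == 2  # full PG: master + 2 slaves
assert expected_quorum(2) == 1  # after one PGS left the PG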
def master_and_slave_hang(self):
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang master and slave1
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))
    smr_slave = smr_mgmt.SMR(s1['id'])
    ret = smr_slave.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))

    smr_master.write('fi delay sleep 1 10000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply is not None and reply.find('-ERR not supported') != -1:
        self.fail('make sure that smr has been compiled with the gcov option.')
    smr_slave.write('fi delay sleep 1 10000\r\n')

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    time.sleep(5)

    if len(self.cluster['servers']) == 3:
        # wait for forced master election
        success = True
        for i in range(15):
            state = []
            util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'],
                               self.leader_cm['cm_port'], state)
            s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
            role = s2_state['active_role']
            if role != 'M':
                success = False
                break
            time.sleep(1)

        util.log('')
        util.log('It expects that pgs2 is a master. PG.COPY: 3, PG.Q: 2')
        util.log('')
        util.log_server_state(self.cluster)
        self.assertEqual(success, True, 'failed to check copy-quorum')

        ok = False
        for i in xrange(10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'],
                                    self.leader_cm['cm_port'])
            if ok:
                break
        self.assertTrue(ok, 'Cluster state is not normal!')

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis2.write(cmd)
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n', 'failed to set values to redis2. cmd:%s, res:%s' % (cmd[:-2], res))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis1(%s:%d).' % (s1['ip'], s1['redis_port']))

    if len(self.cluster['servers']) != 3:
        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n', 'failed to set values to redis0. cmd:%s, res:%s' % (cmd[:-2], res))

    # check new values (m)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d' % (m['id'], res[:-2], i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i))

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True,
                     'role consistency fail')

    return 0
def deprecated_test_5_PGS_commit_is_greater_than_PG_commit(self):
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    master, s1, s2 = util.get_mss(self.cluster)
    server_to_join = [s1, s2]

    # shutdown slaves
    for i in range(0, 2):
        ret = testbase.request_to_shutdown_smr(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to shutdown smr%d' % server_to_join[i]['id'])
        util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

        ret = testbase.request_to_shutdown_redis(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to shutdown redis')
        util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for j in range(0, max_try):
            state = util.get_smr_state(server_to_join[i], self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEqual(expected, state,
                         'server%d - state:%s, expected:%s' % (server_to_join[i]['id'], state, expected))

    # put more data
    util.put_some_data(self.cluster, 10, 256)

    # bgsave
    ret = util.bgsave(master)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown master
    ret = testbase.request_to_shutdown_smr(master)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    util.log('succeeded to shutdown master smr, id=%d' % master['id'])

    ret = testbase.request_to_shutdown_redis(master)
    self.assertEqual(ret, 0, 'failed to shutdown redis')
    util.log('succeeded to shutdown master redis, id=%d' % master['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(expected, state, 'server%d - state:%s, expected:%s' % (master['id'], state, expected))

    # recover slaves
    for i in range(0, 2):
        ret = testbase.request_to_start_smr(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server_to_join[i])
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server_to_join[i], 10)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server_to_join[i]['id']))

        # check state N
        max_try = 20
        expected = 'N'
        for j in range(0, max_try):
            state = util.get_smr_state(server_to_join[i], self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(server_to_join[i])
        self.assertEqual(expected, state,
                         'server%d - state:%s, expected:%s, role:%s'
                         % (server_to_join[i]['id'], state, expected, role))

    # set values
    s = random.choice(server_to_join)
    redis = redis_mgmt.Redis(s['id'])
    ret = redis.connect(s['ip'], s['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    key_base = 'key_test'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')
    redis.disconnect()

    for i in range(0, 2):
        redis = redis_mgmt.Redis(server_to_join[i]['id'])
        ret = redis.connect(server_to_join[i]['ip'], server_to_join[i]['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')

        # check values
        for j in range(0, 10000):
            cmd = 'get %s%d\r\n' % (key_base, j)
            redis.write(cmd)
            redis.read_until('\r\n')
            response = redis.read_until('\r\n')
            self.assertEqual(response, '%d\r\n' % (j), 'inconsistent %s, %d' % (response[:-2], j))

    # try to recover the old master; it must be blocked from joining
    ret = testbase.request_to_start_smr(master)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(master, False)
    self.assertEqual(ret, 0, 'failed to start redis')

    max_try = 3
    expected = 'N'
    for i in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(master)
    self.assertNotEqual(expected, state,
                        'server%d - state:%s, expected:not %s, role:%s' % (master['id'], state, expected, role))
    util.log('success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it was blocked successfully.')

    gw.disconnect()
    return 0
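# --- Illustrative sketch (not part of the original suite) ---------------------
# The recovery steps above repeatedly poll confmaster for an expected SMR
# state ('F' after a shutdown, 'N' after a successful join). A hypothetical
# helper mirroring those loops; get_state is assumed to behave like
# lambda: util.get_smr_state(server, leader_cm).
import time

def wait_for_smr_state(get_state, expected, max_try=20, interval=1.0):
    # Return the last observed state; callers compare it against `expected`.
    state = None
    for _ in range(max_try):
        state = get_state()
        if state == expected:
            break
        time.sleep(interval)
    return state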
def test_all_pgs_hang(self):
    util.print_frame()

    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang all pgs
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))
    smr_slave1 = smr_mgmt.SMR(s1['id'])
    ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))
    smr_slave2 = smr_mgmt.SMR(s2['id'])
    ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']))

    m_ts = util.get_timestamp_of_pgs(m)
    s1_ts = util.get_timestamp_of_pgs(s1)
    s2_ts = util.get_timestamp_of_pgs(s2)

    smr_master.write('fi delay sleep 1 8000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply is not None and reply.find('-ERR not supported') != -1:
        self.fail('make sure that smr has been compiled with the gcov option.')

    smr_slave1.write('fi delay sleep 1 8000\r\n')
    smr_slave2.write('fi delay sleep 1 8000\r\n')

    time.sleep(10)

    # check consistency
    ok = False
    for try_cnt in xrange(20):
        ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
        if ok:
            break
        time.sleep(0.5)
    self.assertTrue(ok, 'Unstable cluster state')

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

    # set values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis0.write(cmd)
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

    # check new values (m)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d' % (m['id'], res[:-2], i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i))

    # check new values (s2)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis(id:%d). %s != %d' % (s2['id'], res[:-2], i))

    # check consistency
    ok = False
    for try_cnt in range(0, 10):
        ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
        util.log('check_cluster ok=%s' % ok)
        if ok:
            break
        time.sleep(1)
    self.assertEqual(ok, True, 'role consistency fail')

    return 0
def elect_master_randomly(self):
    # set data
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway('0')
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))
    for i in range(0, 1000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2]))

    server_ids = []
    for server in self.cluster['servers']:
        server_ids.append(server['id'])

    for try_cnt in range(30):
        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
        util.log('master id : %d' % m['id'])

        if try_cnt != 0:
            if m['id'] in server_ids:
                server_ids.remove(m['id'])

        smr = smr_mgmt.SMR(m['id'])
        ret = smr.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))
        cmd = 'role lconn\r\n'
        smr.write(cmd)
        reply = smr.read_until('\r\n')
        self.assertEqual(reply, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]))

        # wait until role change is finished
        for role_change_try_cnt in range(5):
            count_master = 0
            count_slave = 0
            for server in self.cluster['servers']:
                real_role = util.get_role_of_server(server)
                real_role = util.roleNumberToChar(real_role)
                if real_role == 'M':
                    count_master = count_master + 1
                elif real_role == 'S':
                    count_slave = count_slave + 1
            if count_master == 1 and count_slave == 2:
                break
            time.sleep(1)

        # check the number of masters and slaves
        self.assertEqual(count_master, 1,
                         'failed : the number of masters is not 1, count_master=%d, count_slave=%d'
                         % (count_master, count_slave))
        self.assertEqual(count_slave, 2,
                         'failed : the number of slaves is not 2, count_master=%d, count_slave=%d'
                         % (count_master, count_slave))
        util.log('succeeded : the number of masters is 1 and the number of slaves is 2')

        # check states of all pgs in pg
        for role_check_try_cnt in range(3):
            ok = True
            for s in self.cluster['servers']:
                real_role = util.get_role_of_server(s)
                real_role = util.roleNumberToChar(real_role)

                smr_info = util.get_smr_info(s, self.leader_cm)
                cc_role = smr_info['smr_Role']
                cc_hb = smr_info['hb']

                if cc_hb != 'Y':
                    ok = False
                if real_role != cc_role:
                    ok = False

                if ok:
                    util.log('succeeded : the role of the real pgs is the same as the role in cc, id=%d, real=%s, cc=%s, hb=%s'
                             % (s['id'], real_role, cc_role, cc_hb))
                else:
                    util.log('\n\n**********************************************************\n\nretry: the role of the real pgs is not the same as the role in cc, id=%d, real=%s, cc=%s, hb=%s'
                             % (s['id'], real_role, cc_role, cc_hb))
            if ok == False:
                time.sleep(0.5)
            else:
                break
        self.assertTrue(ok, 'failed : role check')

        if len(server_ids) == 0:
            util.log('succeeded : all smrs have been a master')
            return 0

    self.assertEqual(0, len(server_ids),
                     'failed : remaining server ids=[%s]' % (','.join('%d' % id for id in server_ids)))
    return 0
def slave_failover_while_hang(self):
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    self.failover_while_hang(s1)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res[:-2], i))
        util.log('succeeded : check values with set/get operations with pgs%d and pgs%d.' % (s1['id'], s2['id']))

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis0. %s != %d' % (res[:-2], i))

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True,
                     'role consistency fail')

    return 0
def test_5_mgmt_is_isolated_with_master_failover(self):
    util.print_frame()

    out = util.sudo('iptables -L')
    util.log('====================================================================')
    util.log('out : %s' % out)
    util.log('out.return_code : %d' % out.return_code)
    util.log('out.stderr : %s' % out.stderr)
    util.log('out.succeeded : %s' % out.succeeded)

    # add forwarding rules (127.0.0.100 -> 127.0.0.1)
    out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding rule to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding rule to iptables fail. output:%s' % out)

    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    self.leader_cm = cluster['servers'][0]

    # MGMT
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # create cluster
    ret = default_cluster.initialize_starting_up_smr_before_redis(cluster)
    self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

    # print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                       initial_state, check_quorum=True),
                    'failed to check cluster state')

    # network isolation test
    for loop_cnt in range(3):
        master, slave1, slave2 = util.get_mss(cluster)
        self.assertNotEqual(master, None, 'there is no master')
        self.assertNotEqual(slave1, None, 'there is no slave1')
        self.assertNotEqual(slave2, None, 'there is no slave2')

        # block network
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
        out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'add a blocking rule to iptables fail. output:%s' % out)

        for i in range(4):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # check cluster state
        ok = False
        for i in range(10):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                               isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['ip'] != '127.0.0.100':
                    continue
                if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                    state_transition_done = False

            if state_transition_done:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        # shutdown master
        util.log('shutdown pgs%d while hanging.' % master['id'])
        ret = testbase.request_to_shutdown_smr(master)
        self.assertEqual(ret, 0, 'failed to shutdown smr. id:%d' % master['id'])
        ret = testbase.request_to_shutdown_redis(master)
        self.assertEqual(ret, 0, 'failed to shutdown redis. id:%d' % master['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            util.log('MGMT_IP:%s, MGMT_PORT:%d' % (mgmt_ip, mgmt_port))
            state = util._get_smr_state(master['id'], cluster['cluster_name'], mgmt_ip, mgmt_port)
            if expected == state:
                break
            time.sleep(1)
        self.assertEqual(expected, state, 'master%d - state:%s, expected:%s' % (master['id'], state, expected))
        util.log('succeeded : pgs%d state changed to F.' % master['id'])

        # unblock network
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % loop_cnt)
        out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'delete a blocking rule from iptables fail. output:%s' % out)

        # check cluster state
        ok = False
        for i in range(7):
            final_state = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                               final_state, check_quorum=True)

            state_consistency = True
            for s in final_state:
                if s['pgs_id'] == master['id']:
                    continue
                if s['active_role'] != s['mgmt_role']:
                    state_consistency = False

            if state_consistency:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

        # recovery
        util.log('restart pgs%d.' % master['id'])
        ret = testbase.request_to_start_smr(master)
        self.assertEqual(ret, 0, 'failed to start smr. id:%d' % master['id'])

        ret = testbase.request_to_start_redis(master)
        self.assertEqual(ret, 0, 'failed to start redis. id:%d' % master['id'])

        wait_count = 20
        ret = testbase.wait_until_finished_to_set_up_role(master, wait_count)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (master['id']))

        redis = redis_mgmt.Redis(master['id'])
        ret = redis.connect(master['ip'], master['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')

        ok = False
        for i in xrange(5):
            ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
            if ok:
                break
            else:
                time.sleep(1)
        self.assertTrue(ok, 'failed to check cluster state')

        # check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(0, 3):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'],
                                           final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)
        for i in range(3, 6):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'],
                                           final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True),
                        'failed to check cluster state')

    # shutdown cluster
    ret = default_cluster.finalize(cluster)
    self.assertEqual(ret, 0, 'failed to TestMaintenance.finalize')

    # delete forwarding rules (127.0.0.100 -> 127.0.0.1)
    out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding rule from iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding rule from iptables fail. output:%s' % out)
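# --- Illustrative sketch (not part of the original suite) ---------------------
# The isolation tests above redirect 127.0.0.100 to 127.0.0.1 with DNAT rules
# and remove them afterwards (the suite itself goes through util.sudo). A
# hypothetical stand-alone wrapper with the same shape:
import subprocess

def iptables_redirect(op, src, dest):
    # op is 'A' (append) or 'D' (delete); mirrors the OUTPUT/PREROUTING pair.
    for chain in ('OUTPUT', 'PREROUTING'):
        cmd = ['sudo', 'iptables', '-t', 'nat', '-%s' % op, chain,
               '-d', src, '-p', 'tcp', '-j', 'DNAT', '--to-destination', dest]
        if subprocess.call(cmd) != 0:
            return False
    return True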