def unblock_network(cluster, mgmt_ip, mgmt_port, final_state):
    """Remove the iptables DROP rule for 127.0.0.100 and re-poll cluster state.

    Returns False when the iptables rule could not be deleted, True otherwise.
    check_cluster() results are accumulated into final_state for the caller to
    inspect; this function does not interpret them itself.
    """
    # Unblock
    if not util.iptables_drop('D', '127.0.0.100'):
        util.log('failed to delete a blocking rule from iptables.')
        return False

    # Check cluster state: poll a few times so the recovered state lands in final_state.
    for _ in range(3):
        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state)
        time.sleep(1)

    return True
def unblock_network(cluster, mgmt_ip, mgmt_port, final_state):
    """Delete the OUTPUT DROP rule for 127.0.0.100 via sudo, then re-poll the cluster."""
    # Unblock
    result = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
    if result.succeeded == False:
        util.log('delete a bloking role to iptables fail. output:%s' % result)
        return False

    # Check cluster state: three polls, one second apart, feeding final_state.
    attempt = 0
    while attempt < 3:
        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state)
        time.sleep(1)
        attempt += 1

    return True
def block_network(cluster, mgmt_ip, mgmt_port):
    """Install a DROP rule on 127.0.0.100, wait for it to bite, then poll cluster state."""
    # Block
    if util.iptables_drop('A', '127.0.0.100') == False:
        util.log('add a bloking role to iptables fail.')
        return False

    # Give the fault a few seconds to take effect before checking.
    waited = 0
    while waited < 4:
        util.log('waiting... %d' % (waited + 1))
        time.sleep(1)
        waited += 1

    # Check cluster state (results are logged by check_cluster, not inspected here).
    for _ in range(2):
        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port)
        time.sleep(1)

    return True
def block_network(cluster, mgmt_ip, mgmt_port):
    """Add an iptables DROP rule for 127.0.0.100 to simulate a network outage.

    Waits 4 seconds for the fault to take effect, then polls the cluster state
    twice (check_cluster output is informational here, not inspected).
    Returns False if the iptables rule could not be added, True otherwise.
    """
    # Block
    out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
    if not out.succeeded:
        util.log('failed to add a blocking rule to iptables. output:%s' % out)
        return False

    # Let the outage propagate before observing cluster state.
    for i in range(4):
        util.log('waiting... %d' % (i + 1))
        time.sleep(1)

    # Check cluster state
    for _ in range(2):
        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port)
        time.sleep(1)

    return True
def __check_cluster_state(self): leader_cm = classify_cm(self.__servers)['leader'][0] for cluster in self.__cmi['cluster']: ret = util. await (30)(lambda x: x, lambda: util.check_cluster( cluster['clusterName'].encode('ascii'), leader_cm['ip'], leader_cm['port'], None, True)) if ret == False: return False return True
def __check_cluster_state(self): leader_cm = classify_cm(self.__servers)['leader'][0] for cluster in self.__cmi['cluster']: ret = util.await(30)( lambda x: x, lambda : util.check_cluster(cluster['clusterName'].encode('ascii'), leader_cm['ip'], leader_cm['port'], None, True)) if ret == False: return False return True
def slave_failover_while_hang( self ):
    """Fail over a hanging slave and verify data written afterwards replicates.

    Flow: seed 10k keys through a gateway, hang slave1 via
    self.failover_while_hang(), write 10k more keys to the recovered slave1,
    then read them back from slave2 (3-copy only) and from the master, and
    finally check role consistency via the management CM.
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

    # set values
    for i in range( 0, 10000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

    # get master, slave1, slave2
    # Cluster may be configured with 2 or 3 copies; s2 exists only for 3.
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )
    else:
        m, s1 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )

    util.log( 'server state before hang' )
    util.log_server_state( self.cluster )

    # Hang slave1 and drive the failover (implemented by the sibling helper).
    self.failover_while_hang( s1 )

    util.log( 'server state transition after hang' )
    util.log_server_state( self.cluster )

    redis1 = redis_mgmt.Redis( s1['id'] )
    ret = redis1.connect( s1['ip'], s1['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

    # set new values
    # Writes go directly to the recovered pgs (s1), bypassing the gateway.
    for i in range( 10000, 20000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write( cmd )
        res = redis1.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # check new values
        # First read_until consumes the bulk-length line, second the payload.
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )
        util.log( 'succeeded : check values with set/get operations with pgs%d and pgs%d.' % (s1['id'], s2['id']) )

    redis0 = redis_mgmt.Redis( m['id'] )
    ret = redis0.connect( m['ip'], m['redis_port'] )
    self.assertEquals( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

    # check new values
    for i in range( 10000, 20000 ):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write( cmd )
        redis0.read_until( '\r\n' )
        res = redis0.read_until( '\r\n' )
        self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res[:-2], i) )

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')
    return 0
def master_and_slave_hang( self ):
    """Hang the master and one slave simultaneously, then verify recovery.

    Both SMRs are stalled for 10 seconds via the 'fi delay sleep' fault
    injection. In a 3-copy cluster the remaining pgs (s2) is expected to be
    elected master; new keys are then written and read back from each pgs and
    role consistency is checked at the end.
    """
    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

    # set values
    for i in range( 0, 10000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

    # get master, slave1, slave2
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )
    else:
        m, s1 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )

    util.log( 'server state before hang' )
    util.log_server_state( self.cluster )

    # hang
    smr_master = smr_mgmt.SMR( m['id'] )
    ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
    smr_slave = smr_mgmt.SMR( s1['id'] )
    ret = smr_slave.connect( s1['ip'], s1['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

    # 'fi delay sleep' stalls the SMR for 10000 ms; it is only available in
    # fault-injection (gcov) builds, hence the '-ERR not supported' probe.
    smr_master.write( 'fi delay sleep 1 10000\r\n' )
    reply = smr_master.read_until( '\r\n', 1 )
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )

    smr_slave.write( 'fi delay sleep 1 10000\r\n' )

    util.log( 'server state transition after hang' )
    util.log_server_state( self.cluster )

    time.sleep( 5 )

    if len(self.cluster['servers']) == 3:
        # wait for forced master election
        # NOTE(review): this loop asserts s2 is master on every poll for up to
        # 15 seconds (stability check after the 5s settle above) — confirm intent.
        success = True
        for i in range( 15 ):
            state = []
            util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], state)
            s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
            role = s2_state['active_role']
            if role != 'M':
                success = False
                break
            time.sleep( 1 )
        util.log( '' )
        util.log( 'It expects that pgs2 is a master. PG.COPY: 3, PG.Q: 2' )
        util.log( '' )
        util.log_server_state( self.cluster )
        self.assertEqual( success, True, 'failed to check copy-quorum' )

        ok = False
        for i in xrange(10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'])
            if ok:
                break
        self.assertTrue( ok, 'Cluster state is not normal!' )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis2.write( cmd )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

    util.log( 'server state transition after hang' )
    util.log_server_state( self.cluster )

    redis0 = redis_mgmt.Redis( m['id'] )
    ret = redis0.connect( m['ip'], m['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']) )

    redis1 = redis_mgmt.Redis( s1['id'] )
    ret = redis1.connect( s1['ip'], s1['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis1(%s:%d).' % (s1['ip'], s1['redis_port']) )

    if len(self.cluster['servers']) != 3:
        # set new values
        # 2-copy case: no s2 exists, so the new keys are written via the master.
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write( cmd )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis0. cmd:%s, res:%s' % (cmd[:-2], res) )

    # check new values (m)
    for i in range( 10000, 20000 ):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write( cmd )
        redis0.read_until( '\r\n' )
        res = redis0.read_until( '\r\n' )
        self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i) )

    # check new values (s1)
    for i in range( 10000, 20000 ):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write( cmd )
        redis1.read_until( '\r\n' )
        res = redis1.read_until( '\r\n' )
        self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) )

    # check consistency
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')
    return 0
def test_all_pgs_hang( self ):
    """Hang all three PGS simultaneously (3-copy) and verify the cluster recovers.

    All SMRs are stalled for 8 seconds via fault injection; after recovery,
    10k new keys are written through the (possibly re-elected) master and
    read back from every pgs, then role consistency is re-checked.
    """
    util.print_frame()
    self.setup_test_cluster( self.cluster_3copy )

    # get gateway info
    ip, port = util.get_rand_gateway( self.cluster )
    gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
    ret = gw.connect( ip, port )
    self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

    # set values
    for i in range( 0, 10000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss( self.cluster )
    self.assertNotEqual( m, None, 'master is None.' )
    self.assertNotEqual( s1, None, 'slave1 is None.' )
    self.assertNotEqual( s2, None, 'slave2 is None.' )

    util.log( 'server state before hang' )
    util.log_server_state( self.cluster )

    # hang
    smr_master = smr_mgmt.SMR( m['id'] )
    ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
    smr_slave1 = smr_mgmt.SMR( s1['id'] )
    ret = smr_slave1.connect( s1['ip'], s1['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )
    smr_slave2 = smr_mgmt.SMR( s2['id'] )
    ret = smr_slave2.connect( s2['ip'], s2['smr_mgmt_port'] )
    self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

    # Timestamps captured before the hang; not compared in this test body.
    m_ts = util.get_timestamp_of_pgs( m )
    s1_ts = util.get_timestamp_of_pgs( s1 )
    s2_ts = util.get_timestamp_of_pgs( s2 )

    # 'fi delay sleep' stalls the SMR for 8000 ms; requires a fault-injection build.
    smr_master.write( 'fi delay sleep 1 8000\r\n' )
    reply = smr_master.read_until( '\r\n', 1 )
    if reply != None and reply.find('-ERR not supported') != -1:
        self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )

    smr_slave1.write( 'fi delay sleep 1 8000\r\n' )
    smr_slave2.write( 'fi delay sleep 1 8000\r\n' )

    time.sleep( 10 )

    # check consistency
    ok = False
    for try_cnt in xrange(20):
        ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
        if ok:
            break
        time.sleep(0.5)
    self.assertTrue(ok, 'Unstable cluster state')

    util.log( 'server state transition after hang' )
    util.log_server_state( self.cluster )

    redis0 = redis_mgmt.Redis( m['id'] )
    ret = redis0.connect( m['ip'], m['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

    # set values
    for i in range( 10000, 20000 ):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis0.write( cmd )
        res = redis0.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

    redis1 = redis_mgmt.Redis( s1['id'] )
    ret = redis1.connect( s1['ip'], s1['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

    redis2 = redis_mgmt.Redis( s2['id'] )
    ret = redis2.connect( s2['ip'], s2['redis_port'] )
    self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

    # check new values (m)
    # First read_until consumes the bulk-length line, second the payload.
    for i in range( 10000, 20000 ):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write( cmd )
        redis0.read_until( '\r\n' )
        res = redis0.read_until( '\r\n' )
        self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i) )

    # check new values (s1)
    for i in range( 10000, 20000 ):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write( cmd )
        redis1.read_until( '\r\n' )
        res = redis1.read_until( '\r\n' )
        self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) )

    # check new values (s2)
    for i in range( 10000, 20000 ):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write( cmd )
        redis2.read_until( '\r\n' )
        res = redis2.read_until( '\r\n' )
        self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s2['id'], res[:-2], i) )

    # check consistency
    ok = False
    for try_cnt in range(0, 10):
        ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
        print ok
        if ok:
            break
        time.sleep(1)
    self.assertEqual(ok, True, 'role consistency fail')

    return 0
def test_1_mgmt_is_isolated(self):
    """Repeatedly isolate all PGS from the management network and verify recovery.

    Five block/unblock cycles: each block drops traffic to 127.0.0.100 per
    smr_mgmt_port and expects the CM to mark the isolated pgs '?'/'N'; each
    unblock expects every pgs to return to a normal state.
    """
    util.print_frame()

    util.iptables_print_list()

    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    # MGMT
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Set SMR option (slave_idle_timeout)
    util.log('\n\n\n ### Set SMR option ###')
    for s in cluster['servers']:
        t = telnet.Telnet('SMR%d' % s['id'])
        self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0, 'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
        cmd = 'confset slave_idle_timeout_msec 18000'
        util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
        t.write('confset slave_idle_timeout_msec 18000\r\n')
        reply = t.read_until('\r\n').strip()
        util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
        self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

    # Network isolation test
    for cnt in range(5):
        # Block network
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
        for s in cluster['servers']:
            """Loopback Address Range (Reference : RFC3330)
               127.0.0.0/8 - This block is assigned for use as the Internet host
               loopback address. A datagram sent by a higher level protocol to an
               address anywhere within this block should loop back inside the host.
               This is ordinarily implemented using only 127.0.0.1/32 for loopback,
               but no addresses within this block should ever appear on any network
               anywhere [RFC1700, page 5].
            """
            self.assertTrue(util.iptables_drop('A', '127.0.0.100', s['smr_mgmt_port']), 'add a bloking role to iptables fail.')

        for i in range(4):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Check cluster state
        # Expect every pgs on the isolated address to transition to '?'/'N'.
        ok = False
        for i in range(7):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['ip'] != '127.0.0.100':
                    continue
                if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                    state_transition_done = False

            if state_transition_done :
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        # Unblock network
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
        for s in cluster['servers']:
            self.assertTrue(util.iptables_drop('D', '127.0.0.100', s['smr_mgmt_port']), 'delete a bloking role to iptables fail.')

        # Check cluster state
        # After unblocking, wait until every pgs is normal again.
        ok = False
        for i in range(7):
            final_state = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

            all_green = True
            for s in final_state:
                if is_pgs_normal(s) == False:
                    all_green = False

            if all_green:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

    # Check state
    self.assertNotEqual(initial_state, None, 'initial_state is None')
    self.assertNotEqual(final_state, None, 'final_state is None')

    self.assertTrue(conf_checker.final_check())

    # Shutdown cluster
    default_cluster.finalize(cluster)
def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
    """Attempt a role change while some PGS hang; expect failure and clean rollback.

    hanging_servers are stalled for 13 seconds via fault injection, then
    util.role_change() is invoked and expected to fail (-1). Rollback is
    verified by quorum checks and by asserting the running servers'
    timestamps did not change. Afterwards load generators confirm the
    cluster is consistent.
    """
    util.log('hanging_servers:%s' % hanging_servers)
    util.log('running_servers:%s' % running_servers)
    util.log('target_id:%s' % target_id)

    # Initial data
    util.put_some_data(self.cluster, 3, 10)

    util.log("States (before role change)")
    util.log_server_state(self.cluster)

    # Get old timestamp
    old_timestamps = {}
    for s in self.cluster['servers']:
        ts = util.get_timestamp_of_pgs(s)
        old_timestamps[s['id']] = ts

    # hang
    for s in hanging_servers:
        smr = smr_mgmt.SMR(s['id'])
        ret = smr.connect(s['ip'], s['smr_mgmt_port'])
        self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (s['ip'], s['smr_mgmt_port']))
        util.log("PGS '%d' hang" % s['id'])

        # 'fi delay sleep' stalls the SMR for 13000 ms; fault-injection builds only.
        smr.write('fi delay sleep 1 13000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')

        smr.disconnect()

    # Role change
    master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
    self.assertEqual(master_id, -1, 'We expected that role_change failed, but success')

    # Check rollback - check quorum
    if master not in hanging_servers:
        expected = 2
        ok = self.__check_quorum(master, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check rollback - get new timestamp
    new_timestamps_in_hang = {}
    for s in running_servers:
        ts = util.get_timestamp_of_pgs(s)
        new_timestamps_in_hang[s['id']] = ts

    # Check rollback - compare old timestamps and new timestamps
    for s in running_servers:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps_in_hang[s['id']]
        self.assertEqual(old_ts, new_ts, 'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

    # Wait out the 13s fault-injection delay before the next phase.
    time.sleep(16)
    util.log("States (after role change)")
    util.log_server_state(self.cluster)

    self.load_gen_list = {}

    # Start load generator
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen = load_generator.LoadGenerator(i, ip, port)
        load_gen.start()
        self.load_gen_list[i] = load_gen

    # Check quorum
    if master in hanging_servers:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        expected = 2
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check cluster state
    normal_state = False
    for i in xrange(20):
        normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True)
        if normal_state:
            break
        time.sleep(0.5)
    self.assertTrue(normal_state, "Unstable cluster state")

    # Check consistency
    for i in range(self.max_load_generator):
        self.load_gen_list[i].quit()
    for i in range(self.max_load_generator):
        self.load_gen_list[i].join()
        self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
        self.load_gen_list.pop(i, None)
def test_5_mgmt_is_isolated_with_master_failover(self):
    """Isolate the cluster from the CM, kill the master during isolation, then recover.

    Three cycles: DNAT 127.0.0.100 to loopback, DROP traffic to it, shut the
    master down while isolated, verify the CM marks it 'F', unblock, restart
    the master, and check that isolated pgs got new active timestamps while
    the others kept theirs.
    """
    util.print_frame()

    out = util.sudo('iptables -L')
    util.log('====================================================================')
    util.log('out : %s' % out)
    util.log('out.return_code : %d' % out.return_code)
    util.log('out.stderr : %s' % out.stderr)
    util.log('out.succeeded : %s' % out.succeeded)

    # Add forwarding role (127.0.0.100 -> 127.0.0.1)
    out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    self.leader_cm = cluster['servers'][0]

    # MGMT
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Network isolation test
    for loop_cnt in range(3):
        master, slave1, slave2 = util.get_mss(cluster)
        self.assertNotEquals(master, None, 'there is no master')
        self.assertNotEquals(slave1, None, 'there is no slave1')
        self.assertNotEquals(slave2, None, 'there is no slave2')

        # Block network
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
        out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'add a bloking role to iptables fail. output:%s' % out)

        for i in range(4):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Check cluster state
        # Expect pgs on the isolated address to transition to '?'/'N'.
        ok = False
        for i in range(10):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['ip'] != '127.0.0.100':
                    continue
                if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                    state_transition_done = False

            if state_transition_done :
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        # Shutdown master
        util.log( 'shutdown pgs%d while hanging.' % master['id'] )
        ret = testbase.request_to_shutdown_smr( master )
        self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % master['id'] )
        ret = testbase.request_to_shutdown_redis( master )
        self.assertEqual( ret, 0, 'failed to shutdown redis. id:%d' % master['id'] )

        # Check state F
        max_try = 20
        expected = 'F'
        for i in range( 0, max_try):
            util.log('MGMT_IP:%s, MGMT_PORT:%d' % (mgmt_ip, mgmt_port))
            state = util._get_smr_state( master['id'], cluster['cluster_name'], mgmt_ip, mgmt_port )
            if expected == state:
                break;
            time.sleep( 1 )
        self.assertEqual( expected , state, 'master%d - state:%s, expected:%s' % (master['id'], state, expected) )
        util.log( 'succeeded : pgs%d state changed to F.' % master['id'] )

        # Unblock network
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % loop_cnt)
        out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'delete a bloking role to iptables fail. output:%s' % out)

        # Check cluster state
        # Skip the shut-down master; everyone else must have matching roles.
        ok = False
        for i in range(7):
            final_state = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

            state_consistency = True
            for s in final_state:
                if s['pgs_id'] == master['id']:
                    continue
                if s['active_role'] != s['mgmt_role']:
                    state_consistency = False

            if state_consistency:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

        # Recovery
        util.log( 'restart pgs%d.' % master['id'] )
        ret = testbase.request_to_start_smr( master )
        self.assertEqual( ret, 0, 'failed to start smr. id:%d' % master['id'] )

        ret = testbase.request_to_start_redis( master )
        self.assertEqual( ret, 0, 'failed to start redis. id:%d' % master['id'] )

        wait_count = 20
        ret = testbase.wait_until_finished_to_set_up_role( master, wait_count )
        self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (master['id']) )

        redis = redis_mgmt.Redis( master['id'] )
        ret = redis.connect( master['ip'], master['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )

        ok = False
        for i in xrange(5):
            ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
            if ok:
                break
            else:
                time.sleep(1)
        self.assertTrue(ok, 'failed to check cluster state')

    # Check state
    self.assertNotEqual(initial_state, None, 'initial_state is None')
    self.assertNotEqual(final_state, None, 'final_state is None')

    # NOTE(review): first three sorted entries are expected to have new
    # timestamps (isolated/restarted), the last three unchanged — this relies
    # on the cluster's pgs_id layout; verify against the cluster config.
    initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
    final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
    for i in range(0, 3):
        msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
        util.log(msg)
        self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)
    for i in range(3, 6):
        msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
        util.log(msg)
        self.assertEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state')

    # Shutdown cluster
    ret = default_cluster.finalize( cluster )
    self.assertEqual(ret, 0, 'failed to TestMaintenance.finalize')

    # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
    out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
def test_6_repeat_isolation_and_no_opinion_linepay(self):
    """Repeatedly isolate/unisolate three virtual IPs with staggered timing.

    20 iterations of staggered block/unblock of 127.0.0.100-102 (DNAT'd to
    loopback). After each cycle the cluster must return to a normal state and
    PGS0's last state must not remain 'F' (the no-opinion regression case).
    """
    util.print_frame()

    util.iptables_print_list()

    # Add forwarding role
    self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'add a forwarding role to iptables fail.')
    self.assertTrue(util.iptables_redirect('A', '127.0.0.101', '127.0.0.1'), 'add a forwarding role to iptables fail.')
    self.assertTrue(util.iptables_redirect('A', '127.0.0.102', '127.0.0.1'), 'add a forwarding role to iptables fail.')

    cluster_name = 'no_opinion'
    cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
    util.log(util.json_to_str(cluster))

    self.leader_cm = cluster['servers'][0]

    # MGMT
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Network isolation test
    loop_cnt = 0
    while (loop_cnt < 20):
        loop_cnt += 1
        # Block network
        # The three addresses are blocked with deliberately staggered delays.
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
        self.assertTrue(util.iptables_drop('A', '127.0.0.102'), 'add a bloking role to iptables fail.')
        for i in range(1):
            util.log('waiting... %d' % (i + 1))
            time.sleep(0.1)
        self.assertTrue(util.iptables_drop('A', '127.0.0.100'), 'add a bloking role to iptables fail.')
        for i in range(3):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1.2)
        self.assertTrue(util.iptables_drop('A', '127.0.0.101'), 'add a bloking role to iptables fail.')
        for i in range(1):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Unblock network
        # range(0) loops are intentional no-ops kept as timing placeholders.
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % loop_cnt)
        self.assertTrue(util.iptables_drop('D', '127.0.0.102'), 'delete a bloking role to iptables fail.')
        for i in range(0):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)
        self.assertTrue(util.iptables_drop('D', '127.0.0.100'), 'delete a bloking role to iptables fail.')
        for i in range(0):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)
        self.assertTrue(util.iptables_drop('D', '127.0.0.101'), 'delete a bloking role to iptables fail.')
        for i in range(3):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Print state of cluster
        util.log('\n ### STATE OF CLUSTER ### ')
        cluster_state = False
        for i in range(10):
            cluster_state = util.check_cluster(cluster_name, mgmt_ip, mgmt_port, initial_state, check_quorum=True)
            if cluster_state == True:
                break
            else:
                time.sleep(1)
        self.assertTrue(cluster_state, 'failed to check cluster state')

        # Regression check: no server's PGS0 may be stuck with last state 'F'.
        all_in_f = True
        for s in cluster['servers']:
            if checkLastState(mgmt_ip, s['cm_port'], cluster_name, 0, 'F') == False:
                all_in_f = False
                break
        self.assertFalse(all_in_f, 'PGS0`s last state remains in F')

    # Delete forwarding role
    self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'delete a forwarding role to iptables fail.')
    self.assertTrue(util.iptables_redirect('D', '127.0.0.101', '127.0.0.1'), 'delete a forwarding role to iptables fail.')
    self.assertTrue(util.iptables_redirect('D', '127.0.0.102', '127.0.0.1'), 'delete a forwarding role to iptables fail.')

    self.assertTrue(conf_checker.final_check())

    # Shutdown cluster
    default_cluster.finalize(cluster)
def test_2_some_pgs_is_isolated(self):
    """Isolate only the PGS on the virtual IP and verify failover and recovery.

    Three block/unblock cycles on 127.0.0.100 (DNAT'd to loopback). While
    blocked, isolated pgs (all except pgs_id 1) must become '?'/'N'; after
    unblocking they must return to normal, and every pgs is expected to have
    a new active timestamp at the end.
    """
    util.print_frame()

    util.iptables_print_list()

    # Add forwarding role (127.0.0.100 -> 127.0.0.1)
    self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'add a forwarding role to iptables fail.')

    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_2', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    # MGMT
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

    # Place master on virtual ip address in order to cause master election.
    pg_id = 0
    m = util.get_server_by_role_and_pg(cluster['servers'], 'master', pg_id)
    s = util.get_server_by_role_and_pg(cluster['servers'], 'slave', pg_id)
    # A server without 'real_ip' lives on the virtual address already; if the
    # master only has 'ip', promote the slave so the master sits on 127.0.0.100.
    if m.has_key('ip') == True and m.has_key('real_ip') == False:
        ret = util.role_change(cluster['servers'][0], cluster['cluster_name'], s['id'])
        self.assertNotEquals(ret, -1, 'change %d to a master fail' % s['id'])

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Network isolation test
    for cnt in range(3):
        # Block network
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
        self.assertTrue(util.iptables_drop('A', '127.0.0.100'), 'add a bloking role to iptables fail.')

        for i in range(4):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Check cluster state
        # pgs_id 1 is the non-isolated pgs and is skipped in the check.
        ok = False
        for i in range(7):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['pgs_id'] == 1:
                    continue
                if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                    state_transition_done = False

            if state_transition_done :
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        # Unblock network
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
        self.assertTrue(util.iptables_drop('D', '127.0.0.100'), 'delete a bloking role to iptables fail.')

        # Check cluster state
        ok = False
        for i in range(7):
            final_state = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

            state_consistency = True
            for s in final_state:
                if s['pgs_id'] == 1:
                    continue
                if is_pgs_normal(s) == False:
                    state_consistency = False

            if state_consistency:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

    # Check state
    self.assertNotEqual(initial_state, None, 'initial_state is None')
    self.assertNotEqual(final_state, None, 'final_state is None')

    # Every pgs should have failed over at least once, so all timestamps change.
    initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
    final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
    for i in range(len(final_state)):
        msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
        util.log(msg)
        self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

    # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
    self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'delete a forwarding role to iptables fail')

    self.assertTrue(conf_checker.final_check())

    # Shutdown cluster
    default_cluster.finalize(cluster)
def test_7_dirty_network_fi(self):
    """Network-isolation test combined with confmaster fault injection.

    For each workflow fault-injection point produced by ConfmasterWfFi,
    the test: blocks traffic to 127.0.0.100, waits for failure detection,
    arms the fault injection, unblocks the network, waits for the cluster
    to stabilize, performs a scenario-specific perturbation (role lconn,
    smr shutdown/recovery, or an smr delay), and finally verifies that the
    injected fault actually fired (fi_count drops to 0) and that the crc16
    load clients observed no data inconsistency.

    Fix over previous revision: the `except ValueError` handler referenced
    undefined names `cmd` and `reply`, raising NameError instead of
    reporting the failure; it now reports the fault-injection point and
    the caught exception.
    """
    util.print_frame()
    clnts = []

    try:
        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding role (127.0.0.100 -> 127.0.0.1)
        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)
        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

        cluster_name = 'network_isolation_cluster_1'
        cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT: confmaster endpoint.
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster with the fault-injection confmaster context.
        ret = default_cluster.initialize_starting_up_smr_before_redis( cluster,
                conf={'cm_context':'applicationContext-fi.xml'})
        self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Start crc16 client per gateway; consistency is asserted at the end.
        for s in cluster['servers']:
            c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['gateway_port'], 3000, verbose=False)
            c.start()
            clnts.append(c)

        # Network isolation test over every workflow/role fi combination.
        cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'qa', 'me', 'yj', 'bj', 'mg'],
                                            ['lconn', 'slave', 'master', 'setquorum'],
                                            [True, False], 1)

        for fi in cmfi:
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
            ret = block_network(cluster, mgmt_ip, mgmt_port)
            self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

            for i in xrange(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state: the PGS on 127.0.0.100 must be seen
            # as failed (active '?', mgmt 'N') by the confmaster.
            ok = False
            for i in xrange(10):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)

                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Fault injection: arm the workflow fault with count 1.
            try:
                self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port),
                        "Confmaster command fail. fi: %s" % str(fi))
            except ValueError as e:
                # Fixed: previously formatted with undefined names (cmd, reply),
                # which raised NameError and hid the real failure.
                self.fail("Confmaster command error. fi: \"%s\", error: \"%s\"" % (str(fi), e))

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
            ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
            self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

            for i in xrange(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state until it stabilizes again.
            ok = False
            for i in xrange(10):
                isolated_states = []
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                if ok:
                    break
                time.sleep(1)
            self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

            # Scenario-specific perturbation; set check_cluster when a
            # follow-up stability check is required.
            check_cluster = False

            # 'bj', 'slave': force a slave into lconn.
            if fi[0] == 'bj' and fi[1] == 'slave':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(s1)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                check_cluster = True
            # 'me', 'lconn': force the master into lconn.
            elif fi[0] == 'me' and fi[1] == 'lconn':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(m)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                check_cluster = True
            # 'qa', 'setquorum': kill a slave, tamper with quorum, recover.
            elif fi[0] == 'qa' and fi[1] == 'setquorum':
                m, s1, s2 = util.get_mss(cluster)

                # shutdown one slave
                ret = testbase.request_to_shutdown_smr(s1)
                self.assertEqual(0, ret, '[%s] failed to shutdown smr%d' % (str(fi), s1['id']))
                ret = testbase.request_to_shutdown_redis(s1)
                self.assertEqual(0, ret, '[%s] failed to shutdown redis%d' % (str(fi), s1['id']))

                # Check quorum drops to 1 after losing a slave.
                q = -1
                for q_cnt in xrange(20):
                    q = util.get_quorum(m)
                    if q == 1:
                        break
                    time.sleep(1)
                self.assertEquals(1, q, "[%s] check quorum fail." % str(fi))

                # Modify quorum behind the confmaster's back.
                ret = util.cmd_to_smr_addr(m['ip'], m['smr_mgmt_port'], 'setquorum 0\r\n')
                self.assertEqual("+OK\r\n", ret, '[%s] "setquorum 0" fail.' % str(fi))

                # Check quorum: expected back at 1 — presumably the
                # confmaster's quorum workflow restores it. TODO confirm.
                q = -1
                for q_cnt in xrange(20):
                    q = util.get_quorum(m)
                    if q == 1:
                        break
                    time.sleep(1)
                self.assertEquals(1, q, "[%s] check quorum fail." % str(fi))

                # recovery of the slave killed above
                ret = testbase.request_to_start_smr(s1)
                self.assertEqual(0, ret, '[%s] failed to start smr' % str(fi))

                ret = testbase.request_to_start_redis(s1, max_try=120)
                self.assertEqual(0, ret, '[%s] failed to start redis' % str(fi))

                ret = testbase.wait_until_finished_to_set_up_role(s1, 11)
                self.assertEqual(0, ret, '[%s] failed to role change. smr_id:%d' % (str(fi), s1['id']))
                check_cluster = True
            # any other 'setquorum': delay a slave's smr to stress quorum flow.
            elif fi[1] == 'setquorum':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'], 'fi delay sleep 1 8000\r\n', timeout=20)
                self.assertEqual("+OK\r\n", ret, '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                check_cluster = True

            if check_cluster:
                # Check cluster state after the perturbation.
                ok = False
                for i in xrange(20):
                    isolated_states = []
                    ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                    if ok:
                        break
                    time.sleep(1)
                self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

            # Check fault injection actually fired (remaining count hits 0).
            ok = False
            for i in xrange(10):
                count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                if count == 0:
                    ok = True
                    break
                time.sleep(0.5)
            self.assertTrue(ok, "[%s] fail. failt injection had not been triggered." % str(fi))

        # Shutdown cluster
        ret = default_cluster.finalize( cluster )
        self.assertEqual(ret, 0, '[%s] failed to TestMaintenance.finalize' % str(fi))

        # Delete forwarding role
        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)

        for c in clnts:
            self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))
    finally:
        # Always stop the load clients, even on failure.
        for c in clnts:
            c.quit()
        for c in clnts:
            c.join()
def test_2_some_pgs_is_isolated(self):
    """Network-isolation test (raw-iptables variant): isolate the PGs on
    virtual address 127.0.0.100, verify failure detection, unblock, verify
    the cluster converges (active role == mgmt role) and that master
    generation timestamps changed.
    """
    util.print_frame()

    out = util.sudo('iptables -L')
    util.log('====================================================================')
    util.log('out : %s' % out)
    util.log('out.return_code : %d' % out.return_code)
    util.log('out.stderr : %s' % out.stderr)
    util.log('out.succeeded : %s' % out.succeeded)

    # Add forwarding role (127.0.0.100 -> 127.0.0.1)
    out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_2', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    # MGMT: confmaster endpoint.
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

    # Place master on virtual ip address in order to cause master election.
    pg_id = 0
    m = util.get_server_by_role_and_pg(cluster['servers'], 'master', pg_id)
    s = util.get_server_by_role_and_pg(cluster['servers'], 'slave', pg_id)
    # A server with 'ip' but no 'real_ip' sits on the virtual address.
    if m.has_key('ip') == True and m.has_key('real_ip') == False:
        ret = util.role_change(cluster['servers'][0], cluster['cluster_name'], s['id'])
        self.assertNotEquals(ret, -1, 'change %d to a master fail' % s['id'])

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Network isolation test: repeat block/unblock three times.
    for cnt in range(3):
        # Block network
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
        out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'add a bloking role to iptables fail. output:%s' % out)

        for i in range(4):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Check cluster state: every isolated PGS (all but pgs 1) must be
        # reported failed by the confmaster.
        ok = False
        for i in range(7):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['pgs_id'] == 1:
                    continue

                if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                    state_transition_done = False

            if state_transition_done :
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        # Unblock network
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
        out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'delete a bloking role to iptables fail. output:%s' % out)

        # Check cluster state: active role must match mgmt role again.
        ok = False
        for i in range(7):
            final_state = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

            state_consistency = True
            for s in final_state:
                if s['pgs_id'] == 1:
                    continue

                if s['active_role'] != s['mgmt_role']:
                    state_consistency = False

            if state_consistency:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

        # Check state: master election must have bumped each active_ts.
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

    # Shutdown cluster
    ret = default_cluster.finalize( cluster )
    self.assertEqual(ret, 0, 'failed to TestMaintenance.finalize')

    # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
    out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
def test_4_role_change_with_failover(self):
    """Repeat (5x): shut a random PGS down, wait for failure detection,
    role-change to a slave under load, verify running servers got new
    generation timestamps and quorum dropped to 1, then recover the dead
    PGS and verify the cluster returns to a stable state with quorum 2
    and consistent load-generator data.
    """
    util.print_frame()

    loop_cnt = 0
    while loop_cnt < 5:
        util.log('')
        util.log('Loop:%d' % loop_cnt)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        target = random.choice(self.cluster['servers'])

        # bgsave so the victim can recover from a recent checkpoint.
        ret = util.bgsave(target)
        self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

        # shutdown the victim's smr and redis.
        util.log('shutdown pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
        ret = testbase.request_to_shutdown_smr( target )
        self.assertEqual( ret, 0, 'failed to shutdown smr' )
        ret = testbase.request_to_shutdown_redis( target )
        self.assertEquals( ret, 0, 'failed to shutdown redis' )

        # Wait until the confmaster marks the victim 'N' (failure detected).
        r = ''
        expected = 'N'
        for fc_cnt in xrange(20):
            r = util.get_smr_role_of_cm(target, self.leader_cm)
            if r == expected:
                break
            time.sleep(0.5)
        self.assertEquals(r, expected, 'failure detection error.')

        running_servers = []
        for s in self.cluster['servers']:
            if s != target:
                running_servers.append(s)

        # Get old timestamp of every still-running PGS.
        old_timestamps = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # Start load generator so the role change happens under traffic.
        self.load_gen_list = {}
        util.log('start load generator')
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

        # Role change: promote slave s1.
        master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], s1['id'])
        self.assertNotEqual(master_id, -1, 'role_change failed')

        util.log("States (after role change)")
        util.log_server_state(self.cluster)

        # Check - get new timestamp
        new_timestamps= {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamps[s['id']] = ts

        # Check - compare old timestamps and new timestamps; a role change
        # must issue a new generation timestamp on every running PGS.
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps[s['id']]
            self.assertNotEqual(old_ts, new_ts, 'Timestamp of a running server has not changed. %d->%d' % (old_ts, new_ts))

        # Check quorum: with one PGS dead, quorum is expected to be 1.
        # NOTE(review): indexing servers by master_id assumes pgs id equals
        # list index — verify against the cluster config.
        m = self.cluster['servers'][master_id]
        expected = 1
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'unexpected quorum(after role change). expected:%s' % (expected))

        # recovery of the victim.
        util.log('recovery pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
        ret = testbase.request_to_start_smr( target )
        self.assertEqual( ret, 0, 'failed to start smr' )
        util.log('start smr-replicator done')

        ret = testbase.request_to_start_redis( target, 60 )
        self.assertEqual( ret, 0, 'failed to start redis' )
        util.log('start redis-arc done')

        ret = testbase.wait_until_finished_to_set_up_role( target, max_try=300)
        self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (target['id']) )

        util.log("States (after recovery)")
        util.log_server_state(self.cluster)

        # Check cluster state
        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        # Check quorum: back to 2 once all three PGS are up.
        expected = 2
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'unexpected quorum(after recovery). expected:%s' % (expected))

        # Cheeck Consistency
        util.log('stop load generator')
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
            self.load_gen_list.pop(i, None)

        loop_cnt += 1

    return 0
def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
    """Verify role_change rolls back cleanly when some PGS hang.

    Hangs each server in hanging_servers via an smr 'fi delay sleep'
    fault, attempts a role change to target_id (expected to FAIL), then
    checks the rollback: quorum restored and the running servers'
    generation timestamps unchanged. After the hang expires, verifies the
    cluster stabilizes and data stays consistent under load.
    """
    util.log('hanging_servers:%s' % hanging_servers)
    util.log('running_servers:%s' % running_servers)
    util.log('target_id:%s' % target_id)

    # Initial data
    util.put_some_data(self.cluster, 3, 10)

    util.log("States (before role change)")
    util.log_server_state(self.cluster)

    # Get old timestamp of every PGS for the rollback comparison below.
    old_timestamps = {}
    for s in self.cluster['servers']:
        ts = util.get_timestamp_of_pgs(s)
        old_timestamps[s['id']] = ts

    # hang: inject a 13s delay into each hanging server's smr.
    for s in hanging_servers:
        smr = smr_mgmt.SMR(s['id'])
        ret = smr.connect(s['ip'], s['smr_mgmt_port'])
        self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (s['ip'], s['smr_mgmt_port']))
        util.log("PGS '%d' hang" % s['id'])

        smr.write('fi delay sleep 1 13000\r\n')
        reply = smr.read_until('\r\n', 1)
        # 'fi' is only available in instrumented builds.
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')
        smr.disconnect()

    # Role change: expected to fail (-1) because of the hanging PGS.
    master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
    self.assertEqual(master_id, -1, 'We expected that role_change failed, but success')

    # Check rollback - check quorum (only meaningful if master responds).
    if master not in hanging_servers:
        expected = 2
        ok = self.__check_quorum(master, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check rollback - get new timestamp
    new_timestamps_in_hang = {}
    for s in running_servers:
        ts = util.get_timestamp_of_pgs( s )
        new_timestamps_in_hang[s['id']] = ts

    # Check rollback - compare old timestamps and new timestamps; a failed
    # role change must NOT bump generation timestamps.
    for s in running_servers:
        old_ts = old_timestamps[s['id']]
        new_ts = new_timestamps_in_hang[s['id']]
        self.assertEqual(old_ts, new_ts, 'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

    # Wait out the 13s injected delay before checking recovery.
    time.sleep(16)
    util.log("States (after role change)")
    util.log_server_state( self.cluster )

    self.load_gen_list = {}

    # Start load generator
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen = load_generator.LoadGenerator(i, ip, port)
        load_gen.start()
        self.load_gen_list[i] = load_gen

    # Check quorum (deferred until now when the master itself was hung).
    if master in hanging_servers:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        expected = 2
        ok = self.__check_quorum(m, expected)
        self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

    # Check cluster state
    normal_state = False
    for i in xrange(20):
        normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True)
        if normal_state:
            break
        time.sleep(0.5)
    self.assertTrue(normal_state, "Unstable cluster state")

    # Cheeck Consistency
    for i in range(self.max_load_generator):
        self.load_gen_list[i].quit()
    for i in range(self.max_load_generator):
        self.load_gen_list[i].join()
        self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
        self.load_gen_list.pop(i, None)
def test_1_role_change(self):
    """Repeated role changes under load, first with 3 copies (30 loops),
    then with 2 copies after deleting one server (30 loops). After each
    role change, waits for redis to answer PING, asserts generation
    timestamps changed on every server, and checks load-generator data
    consistency.
    """
    util.print_frame()

    self.load_gen_list = {}

    # Start load generator
    util.log("Start load_generator")
    for i in range(self.max_load_generator):
        ip, port = util.get_rand_gateway(self.cluster)
        load_gen = load_generator.LoadGenerator(i, ip, port)
        load_gen.start()
        self.load_gen_list[i] = load_gen

    # Loop (smr: 3 copy)
    # NOTE(review): the inner timestamp-compare loops reuse 'i'; harmless in
    # CPython (outer for is iterator-based) but shadows the loop counter.
    for i in range(30):
        target_server = util.get_server_by_role(self.cluster['servers'], 'slave')
        self.assertNotEquals(target_server, None, 'Get slave fail.')
        target = target_server['id']

        print ''
        util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target))

        # Get old timestamp
        util.log_server_state( self.cluster )
        old_timestamp_list = []
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs( s )
            old_timestamp_list.append(ts)

        # Role change: promote the chosen slave.
        master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
        self.assertNotEqual(master, -1, 'role_change error.')
        # Pick a target id different from the new master for logging/next use.
        while target == master:
            target = (target + 1) % 3
        util.log('Change role success.')

        # Wait until role change finished: redis on every server must PONG.
        for s in self.cluster['servers']:
            max_try_cnt = 20
            ok = False
            for try_cnt in range(max_try_cnt):
                try:
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong != None and pong == '+PONG\r\n':
                        ok = True
                        break
                except:
                    pass
                time.sleep(0.2)
            self.assertTrue(ok, 'redis state error.')

        # Get new timestamp
        util.log_server_state( self.cluster )
        new_timestamp_list = []
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamp_list.append(ts)

        # Compare old timestamps and new timestamps
        for i in range(3):
            self.assertNotEqual(old_timestamp_list[i], new_timestamp_list[i], 'Timestamp is not changed. %d->%d' % (old_timestamp_list[i], new_timestamp_list[i]))

        # Cheeck Consistency
        for load_gen_id, load_gen in self.load_gen_list.items():
            self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')

    # Loop (smr: 2 copy): drop server 0 and repeat with the remaining pair.
    self.__del_server(self.cluster['servers'][0])
    servers = [self.cluster['servers'][1], self.cluster['servers'][2]]

    normal_state = False
    for i in xrange(20):
        normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True)
        if normal_state:
            break
        time.sleep(0.5)
    self.assertTrue(normal_state, "Unstable cluster state")

    for i in range(30):
        print ''
        util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target))

        s = util.get_server_by_role(servers, 'slave')
        target = s['id']

        # Get old timestamp
        util.log_server_state( self.cluster )
        old_timestamp_list = []
        for s in servers:
            ts = util.get_timestamp_of_pgs( s )
            old_timestamp_list.append(ts)

        # Role change
        master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
        self.assertNotEqual(master, -1, 'role_change error.')
        # Alternate between ids 1 and 2 (server 0 was deleted).
        while target == master:
            target = (target) % 2 + 1
        util.log('Change role success.')

        # Wait until role change finished
        for s in servers:
            max_try_cnt = 20
            ok = False
            for try_cnt in range(max_try_cnt):
                pong = util.pingpong(s['ip'], s['redis_port'])
                if pong != None and pong == '+PONG\r\n':
                    ok = True
                    break
                time.sleep(0.1)
            self.assertTrue(ok, 'redis state error.')

        # Get new timestamp
        util.log_server_state( self.cluster )
        new_timestamp_list = []
        for s in servers:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamp_list.append(ts)

        # Compare old timestamps and new timestamps
        for i in range(2):
            self.assertNotEqual(old_timestamp_list[i], new_timestamp_list[i], 'Timestamp is not changed. %d->%d' % (old_timestamp_list[i], new_timestamp_list[i]))

        # Cheeck Consistency
        for load_gen_id, load_gen in self.load_gen_list.items():
            self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')
def test_6_repeat_isolation_and_no_opinion_linepay(self):
    """Repeatedly (20x) block/unblock three virtual addresses
    (127.0.0.100-102) with staggered timing, then verify the cluster
    re-stabilizes each round and that PGS0's last state does not stay
    stuck in 'F' on every server (the 'no opinion' regression scenario).
    """
    util.print_frame()

    out = util.sudo('iptables -L')
    util.log('====================================================================')
    util.log('out : %s' % out)
    util.log('out.return_code : %d' % out.return_code)
    util.log('out.stderr : %s' % out.stderr)
    util.log('out.succeeded : %s' % out.succeeded)

    # Add forwarding role for each virtual address (… -> 127.0.0.1).
    out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

    out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.101 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.101 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

    out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.102 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.102 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

    cluster_name = 'no_opinion'
    cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
    util.log(util.json_to_str(cluster))

    self.leader_cm = cluster['servers'][0]

    # MGMT: confmaster endpoint.
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Network isolation test: stagger the three DROP rules to create
    # partial, overlapping isolation windows.
    loop_cnt = 0
    while (loop_cnt < 20):
        loop_cnt += 1
        # Block network
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
        out = util.sudo('iptables -A OUTPUT -d 127.0.0.102 -j DROP')
        self.assertTrue(out.succeeded, 'add a bloking role to iptables fail. output:%s' % out)

        for i in range(1):
            util.log('waiting... %d' % (i + 1))
            time.sleep(0.1)

        out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'add a bloking role to iptables fail. output:%s' % out)

        for i in range(3):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1.2)

        out = util.sudo('iptables -A OUTPUT -d 127.0.0.101 -j DROP')
        self.assertTrue(out.succeeded, 'add a bloking role to iptables fail. output:%s' % out)

        for i in range(1):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Unblock network
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % loop_cnt)
        out = util.sudo('iptables -D OUTPUT -d 127.0.0.102 -j DROP')
        self.assertTrue(out.succeeded, 'delete a bloking role to iptables fail. output:%s' % out)
        # NOTE(review): range(0) never iterates — these two wait loops are
        # intentionally(?) disabled; confirm before re-enabling.
        for i in range(0):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)
        out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
        self.assertTrue(out.succeeded, 'delete a bloking role to iptables fail. output:%s' % out)
        for i in range(0):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)
        out = util.sudo('iptables -D OUTPUT -d 127.0.0.101 -j DROP')
        self.assertTrue(out.succeeded, 'delete a bloking role to iptables fail. output:%s' % out)
        for i in range(3):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Print state of cluster
        util.log('\n ### STATE OF CLUSTER ### ')
        cluster_state = False
        for i in range(10):
            cluster_state = util.check_cluster(cluster_name, mgmt_ip, mgmt_port, initial_state, check_quorum=True)
            if cluster_state == True:
                break
            else:
                time.sleep(1)
        self.assertTrue(cluster_state, 'failed to check cluster state')

        # PGS0 must not be stuck in last-state 'F' on every server.
        all_in_f = True
        for s in cluster['servers']:
            if checkLastState(mgmt_ip, s['cm_port'], cluster_name, 0, 'F') == False:
                all_in_f = False
                break
        self.assertFalse(all_in_f, 'PGS0`s last state remains in F')

    # Shutdown cluster
    ret = default_cluster.finalize( cluster )
    self.assertEqual(ret, 0, 'failed to TestMaintenance.finalize')

    # Delete forwarding role
    out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.101 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.101 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.102 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
    out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.102 -p tcp -j DNAT --to-destination 127.0.0.1')
    self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)
def consistent_after_failover(self):
    """Full-cluster failover consistency check: write `max` keys through a
    gateway, shut down all three PGS, restart them, wait for master
    election and role setup, write another `max` keys directly to the new
    master, then read all 2*max keys back from both slaves and assert the
    values replicated correctly.
    """
    max = 10000
    wait_count = 15
    key = 'caf'

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # set value: seed keys caf0..caf{max-1} via a random gateway.
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(ip)
    gw.connect(ip, port)
    for i in range(0, max):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
    time.sleep(5)

    # shutdown every PGS (master first, then both slaves).
    servers = [master, slave1, slave2]
    for server in servers:
        util.log('before shutdown pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual( ret, 0, 'failed to shutdown smr, server:%d' % server['id'])

        ret = testbase.request_to_shutdown_redis(server)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
    time.sleep(5)

    # check state F: all must be reported failed.
    for server in servers:
        state = self.get_expected_smr_state(server, 'F')
        self.assertEquals('F', state, 'server%d - state:%s' % (server['id'], state))

    # recovery: restart smr then redis on each server.
    for server in servers:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr, server:%d' % server['id'])

        ret = testbase.request_to_start_redis(server, False)
        self.assertEqual(ret, 0, 'failed to start redis, server:%d' % server['id'])

        util.log('after restart pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
    time.sleep(5)

    # wait for master election
    for i in xrange(10):
        ret = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'])
        if ret:
            break
        time.sleep(1)

    # check state: every server back to 'N' with a role assigned.
    for server in servers:
        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEquals( ret, 0, 'failed to role change. server:%d' % (server['id']))

        state = self.get_expected_smr_state(server, 'N')
        role = util.get_role_of_server(server)
        self.assertEquals( 'N', state, 'server%d - state:%s, role:%s' % (server['id'], state, role))

    # Exactly one master and two slaves must have been elected.
    the_number_of_master = 0
    the_number_of_slave = 0
    for server in servers:
        role = util.get_role_of_server(server)
        if role == c.ROLE_MASTER:
            the_number_of_master = the_number_of_master + 1
        elif role == c.ROLE_SLAVE:
            the_number_of_slave = the_number_of_slave + 1
    self.assertTrue( 1 == the_number_of_master and 2 == the_number_of_slave,
            'failed to set roles, the number of master:%d, the number of slave:%d' % (the_number_of_master, the_number_of_slave))

    # get master, slave1, and slave2 (roles may have moved after election).
    master, slave1, slave2 = self.get_mss()

    # connect to a master`s redis and set data: keys caf{max}..caf{2max-1}.
    redis = redis_mgmt.Redis(master['id'])
    ret = redis.connect(master['ip'], master['redis_port'])
    self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % master['id'])

    for i in range(max, max * 2):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEquals( res, '+OK\r\n', 'failed to get response, server:%d' % master['id'])
    redis.disconnect()

    # check slaves`s data: every key (old and new) must have replicated.
    slaves = [slave1, slave2]
    for slave in slaves:
        slave_redis = redis_mgmt.Redis(slave['id'])
        ret = slave_redis.connect(slave['ip'], slave['redis_port'])
        self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % slave['id'])

        for i in range(0, max * 2):
            cmd = 'get %s%d\r\n' % (key, i)
            slave_redis.write(cmd)
            # First line is the bulk-length header; second is the value.
            trash = slave_redis.read_until('\r\n')
            res = slave_redis.read_until('\r\n')
            self.assertEquals( res, '%d\r\n' % i, 'inconsistent, server:%d, expected %d but %s' % (slave['id'], i, res))
        slave_redis.disconnect()
def pgs_add_and_del(self, upgrade_server, type):
    """Detach (pgs_leave) and re-attach (pgs_join) one PGS under load.

    While the PGS is detached, writes 50 keys through a gateway; after
    rejoin and cluster stabilization, reads them back directly from the
    rejoined PGS's redis to verify it caught up. Returns 0 on success
    (assertions abort on failure).
    """
    util.print_frame()

    util.log('[start] add and del pgs%d. type:%s' % (upgrade_server['id'], type))
    util.log_server_state(self.cluster)

    # start load generator (one per server's gateway).
    load_gen_list = {}
    for i in range(len(self.cluster['servers'])):
        server = self.cluster['servers'][i]
        load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
        load_gen.start()
        load_gen_list[i] = load_gen

    # detach pgs from cluster
    cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

    # set new values while the PGS is out of the cluster.
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway('0')
    gw.connect(ip, port)
    for i in range(0, 50):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual( res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2]))

    # attach pgs to cluster
    cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    time.sleep(3)

    # Wait for the cluster to stabilize after the join.
    stable = False
    for i in xrange(20):
        stable = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'])
        if stable:
            break
        time.sleep(0.5)
    self.assertTrue(stable, 'Unstable cluster')

    # check new values: read directly from the rejoined PGS's redis.
    redis = redis_mgmt.Redis(upgrade_server['id'])
    ret = redis.connect(upgrade_server['ip'], upgrade_server['redis_port'])
    self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (upgrade_server['id'], upgrade_server['ip'], upgrade_server['redis_port']))

    for i in range(0, 50):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis.write(cmd)
        # Skip the bulk-length header, then read the value line.
        redis.read_until('\r\n')
        res = redis.read_until('\r\n')
        self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis%d. %s != %d' % (upgrade_server['id'], res, i))
    util.log('succeeded : check values with get operations on pgs%d.' % (upgrade_server['id']))

    # shutdown load generators
    for i in range(len(load_gen_list)):
        load_gen_list[i].quit()
        load_gen_list[i].join()

    util.log_server_state(self.cluster)
    return 0
def test_3_some_pgs_is_isolated_2copy(self):
    """Network-isolation test on a 2-copy cluster.

    Repeatedly (3 rounds) drops packets to 127.0.0.100 via iptables to
    isolate the PGSes bound to that virtual address, verifies the surviving
    members converge to master with quorum 0, then unblocks and verifies the
    cluster returns to a consistent state.  Requires root for iptables.
    """
    util.print_frame()

    # Log current iptables rules for debugging
    out = util.sudo('iptables -L')
    util.log('====================================================================')
    util.log('out : %s' % out)
    util.log('out.return_code : %d' % out.return_code)
    util.log('out.stderr : %s' % out.stderr)
    util.log('out.succeeded : %s' % out.succeeded)

    # Add forwarding role (127.0.0.100 -> 127.0.0.1)
    self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'add a forwarding role to iptables fail.')

    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1_2copy', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    # MGMT - first server doubles as the management (confmaster) endpoint
    mgmt_ip = cluster['servers'][0]['ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

    # Place master on real ip address so the virtual-ip member is the slave
    for pg_id in [0, 1]:
        m = util.get_server_by_role_and_pg(cluster['servers'], 'master', pg_id)
        s = util.get_server_by_role_and_pg(cluster['servers'], 'slave', pg_id)
        if m.has_key('ip') and m.has_key('real_ip'):
            if m['ip'] != m['real_ip']:
                ret = util.role_change(cluster['servers'][0], cluster['cluster_name'], s['id'])
                self.assertNotEquals(ret, -1, 'change %d to a master fail' % s['id'])

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Network isolation test
    for cnt in range(3):
        # Block network
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
        self.assertTrue(util.iptables_drop('A', '127.0.0.100'), 'add a bloking role to iptables fail.')

        for i in range(4):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Check cluster state: surviving pgses (ids other than 0 and 1) must
        # become masters with quorum 0 once failover completes
        ok = False
        for i in range(7):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['pgs_id'] == 0 or s['pgs_id'] == 1:
                    continue
                if s['active_role'] != 'M' or s['mgmt_role'] != 'M':
                    state_transition_done = False
                if s['quorum'] != 0:
                    state_transition_done = False
            if state_transition_done:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        # Unblock network
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
        self.assertTrue(util.iptables_drop('D', '127.0.0.100'), 'delete a bloking role to iptables fail.')

        # Check cluster state: all pgses except pgs_id 1 must be normal again
        ok = False
        for i in range(7):
            final_state = []
            if util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True) == False:
                time.sleep(1)
                continue

            state_consistency = True
            for s in final_state:
                if s['pgs_id'] == 1:
                    continue
                if is_pgs_normal(s) == False:
                    state_consistency = False
            if state_consistency:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

    # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
    self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'delete a forwarding role to iptables fail.')

    self.assertTrue(conf_checker.final_check())

    # Shutdown cluster
    default_cluster.finalize(cluster)
def master_and_slave_hang(self):
    """Hang both the master and one slave via smr fault injection and verify
    failover plus data consistency.

    With 3 copies the remaining slave (s2) must be elected master and accept
    writes; with 2 copies the hung pair must recover and the original master
    accept writes.  Finally all replicas are read back and cluster role
    consistency is checked.  Returns 0 on success; asserts on failure.
    Requires smr built with gcov/fault-injection support ('fi' command).
    """
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values (baseline data before the hang)
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2 (slave2 exists only in the 3-copy setup)
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang: inject a 10s delay into master and slave1 via smr mgmt port
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))
    smr_slave = smr_mgmt.SMR(s1['id'])
    ret = smr_slave.connect(s1['ip'], s1['smr_mgmt_port'])
    # NOTE(review): message says 'master' but this connects to slave1 - copy-paste text
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))

    smr_master.write('fi delay sleep 1 10000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        # assertEqual(0, 1, ...) is an unconditional failure with a message
        self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')

    smr_slave.write('fi delay sleep 1 10000\r\n')

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    time.sleep(5)

    if len(self.cluster['servers']) == 3:
        # wait for forced master election: s2 must hold the master role
        success = True
        for i in range(15):
            state = []
            util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], state)
            s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
            role = s2_state['active_role']
            if role != 'M':
                success = False
                break
            time.sleep(1)
        util.log('')
        util.log('It expects that pgs2 is a master. PG.COPY: 3, PG.Q: 2')
        util.log('')
        util.log_server_state(self.cluster)
        self.assertEqual(success, True, 'failed to check copy-quorum')

        ok = False
        for i in xrange(10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'])
            if ok:
                break
        self.assertTrue(ok, 'Cluster state is not normal!')

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

        # set new values through the newly elected master (s2)
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis2.write(cmd)
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis1(%s:%d).' % (s1['ip'], s1['redis_port']))

    if len(self.cluster['servers']) != 3:
        # set new values (2-copy case: write through the recovered master)
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n', 'failed to set values to redis0. cmd:%s, res:%s' % (cmd[:-2], res))

    # check new values (m) - replication must have delivered 10000..19999
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i))

    # check consistency of roles across the cluster
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')
    return 0
def test_4_mgmt_is_isolated_with_red_failover(self):
    """Isolate PGSes from management, shut one RED pgs down, then recover.

    Per round (3 rounds): block each server's smr mgmt port for 127.0.0.100,
    wait for mgmt to mark the isolated pgses ('?'/'N'), pick a random RED
    pgs and shut down its smr+redis, verify it goes to state 'F', unblock,
    verify remaining pgses are normal, restart the downed pgs, and re-apply
    the SMR slave_idle_timeout option.  Finally verifies that pgs_id 1 got a
    new active timestamp (i.e. it actually failed over).
    """
    util.print_frame()

    util.iptables_print_list()

    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    self.leader_cm = cluster['servers'][0]

    # MGMT endpoint (confmaster on the first server's real address)
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

    util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port)

    # Master must be the first pgs, cluster['servers'][0].
    to_be_master = cluster['servers'][0]
    m = util.get_server_by_role_and_pg(cluster['servers'], 'master', to_be_master['pg_id'])
    master_id = -1
    if m['id'] != to_be_master['id']:
        # retry role_change up to 20 times until the desired pgs is master
        try_cnt = 0
        while master_id != to_be_master['id'] and try_cnt < 20:
            master_id = util.role_change(cluster['servers'][0], cluster['cluster_name'], to_be_master['id'])
            try_cnt += 1
            time.sleep(1)
        self.assertEquals(master_id, to_be_master['id'], 'change %d to a master fail' % to_be_master['id'])

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Set SMR option (slave_idle_timeout) on every server
    util.log('\n\n\n ### Set SMR option ###')
    for s in cluster['servers']:
        t = telnet.Telnet('SMR%d' % s['id'])
        self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0, 'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
        cmd = 'confset slave_idle_timeout_msec 18000'
        util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
        t.write('confset slave_idle_timeout_msec 18000\r\n')
        reply = t.read_until('\r\n').strip()
        util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
        self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

    # Network isolation test
    for loop_cnt in range(3):
        # Block network: drop only the smr mgmt port per server
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
        for s in cluster['servers']:
            self.assertTrue(util.iptables_drop('A', '127.0.0.100', s['smr_mgmt_port']), 'add a bloking role to iptables fail.')

        for i in range(4):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Check cluster state: isolated (virtual-ip) pgses must show
        # active_role '?' and mgmt_role 'N'
        ok = False
        for i in range(7):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['ip'] != '127.0.0.100':
                    continue
                if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                    state_transition_done = False
            if state_transition_done :
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        pgs_list = util.get_pgs_info_list(mgmt_ip, mgmt_port, cluster)
        reds = filter(lambda x: x['color'] == 'RED', pgs_list)

        # Shutdown a randomly chosen RED pgs while it is still isolated
        # NOTE(review): indexes servers by pgs_id - assumes list index == pgs_id; confirm
        server = cluster['servers'][random.choice(reds)['pgs_id']]
        util.log( 'shutdown pgs%d while hanging.' % server['id'] )
        ret = testbase.request_to_shutdown_smr( server )
        self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % server['id'] )
        ret = testbase.request_to_shutdown_redis( server )
        self.assertEqual( ret, 0, 'failed to shutdown redis. id:%d' % server['id'] )

        # Check state F (failure detected by mgmt)
        max_try = 20
        expected = 'F'
        for i in range( 0, max_try):
            util.log('MGMT_IP:%s, MGMT_PORT:%d' % (mgmt_ip, mgmt_port))
            state = util._get_smr_state( server['id'], cluster['cluster_name'], mgmt_ip, mgmt_port )
            if expected == state:
                break;
            time.sleep( 1 )
        self.assertEqual( expected , state, 'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
        util.log( 'succeeded : pgs%d state changed to F.' % server['id'] )

        # Unblock network
        for s in cluster['servers']:
            self.assertTrue(util.iptables_drop('D', '127.0.0.100', s['smr_mgmt_port']), 'delete a bloking role to iptables fail.')

        # Check cluster state: everything except the downed pgs must be normal
        ok = False
        for i in range(10):
            final_state = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

            state_consistency = True
            for s in final_state:
                if s['pgs_id'] == server['id']:
                    continue
                if is_pgs_normal(s) == False:
                    state_consistency = False
            if state_consistency:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

        # Recovery: restart the pgs that was shut down
        util.log( 'restart pgs%d.' % server['id'] )
        ret = testbase.request_to_start_smr( server )
        self.assertEqual( ret, 0, 'failed to start smr. id:%d' % server['id'] )

        ret = testbase.request_to_start_redis( server )
        self.assertEqual( ret, 0, 'failed to start redis. id:%d' % server['id'] )

        wait_count = 20
        ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
        self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

        redis = redis_mgmt.Redis( server['id'] )
        ret = redis.connect( server['ip'], server['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )

        ok = False
        for i in xrange(5):
            ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
            if ok:
                break
            else:
                time.sleep(1)
        self.assertTrue(ok, 'failed to check cluster state')

        # Reset SMR option (slave_idle_timeout) on the restarted server
        # NOTE(review): writes the same 18000 value as the initial setup;
        # presumably re-applies the option lost by the restart - confirm
        t = telnet.Telnet('SMR%d' % server['id'])
        self.assertEqual(t.connect(server['ip'], server['smr_mgmt_port']), 0, 'Failed to connect to smr. ADDR=%s:%d' % (server['ip'], server['smr_mgmt_port']))
        cmd = 'confset slave_idle_timeout_msec 18000'
        util.log('[%s:%d] >> %s' % (server['ip'], server['smr_mgmt_port'], cmd))
        t.write('confset slave_idle_timeout_msec 18000\r\n')
        reply = t.read_until('\r\n').strip()
        util.log('[%s:%d] << %s' % (server['ip'], server['smr_mgmt_port'], reply))
        self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

    # Check state
    self.assertNotEqual(initial_state, None, 'initial_state is None')
    self.assertNotEqual(final_state, None, 'final_state is None')

    # Compare active timestamps before/after; pgs 1 must have restarted
    initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
    final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
    for i in range(len(final_state)):
        msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
        util.log(msg)
        if initial_state[i]['pgs_id'] == 1:
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state')

    self.assertTrue(conf_checker.final_check())

    # Shutdown cluster
    default_cluster.finalize(cluster)
def test_two_slaves_hang(self):
    """Hang both slaves of a 3-copy PG and verify the cluster recovers.

    Injects an 8-second delay fault into both slaves' smr processes, waits
    for the cluster to re-stabilize (with quorum), then writes a fresh key
    range through one replica and reads it back from the other, and finally
    checks cluster role consistency.  Requires smr built with the
    fault-injection ('fi') command.  Returns 0 on success; asserts on failure.
    """
    util.print_frame()

    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values (baseline data before the hang)
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # timestamp before hang
    ts_before1 = util.get_timestamp_of_pgs(s1)
    self.assertNotEqual(ts_before1, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1))
    ts_before2 = util.get_timestamp_of_pgs(s2)
    self.assertNotEqual(ts_before2, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2))

    # hang both slaves via their smr mgmt ports
    smr1 = smr_mgmt.SMR(s1['id'])
    ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
    # BUGFIX: diagnostic used to say 'master' for a slave connection
    self.assertEqual(ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))
    smr2 = smr_mgmt.SMR(s2['id'])
    ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
    # BUGFIX: diagnostic used to report s1's address for the s2 connection
    self.assertEqual(ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']))

    smr1.write('fi delay sleep 1 8000\r\n')
    reply = smr1.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        # unconditional failure: the binary lacks fault-injection support
        self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')

    smr2.write('fi delay sleep 1 8000\r\n')
    time.sleep(7)

    # wait (up to 20s) for the cluster to become stable again, with quorum
    success = False
    for i in xrange(20):
        ret = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port, check_quorum=True)
        if ret:
            success = True
            break
        time.sleep(1)
    self.assertEqual(success, True, 'unstable cluster')

    # get master, slave1, slave2 - roles may have moved during the hang
    m, s1, s2 = util.get_mss(self.cluster)

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

    # set new values through one replica...
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res))

    # ...and check they replicated to the other
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        # first read consumes the RESP bulk-length line, second the value
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i))

    # check consistency of roles across the cluster
    self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

    return 0
def test_5_mgmt_is_isolated_with_lconn(self):
    """Isolate a slave from management while slaves are in 'lconn' state.

    Per round (3 rounds): put every virtual-ip non-master pgs into 'role
    lconn', block the first such slave's smr mgmt port for 127.0.0.100,
    verify mgmt marks it '?'/'N', unblock, and verify the cluster returns
    to a normal, quorum-satisfying state.
    """
    util.print_frame()

    util.iptables_print_list()

    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    self.leader_cm = cluster['servers'][0]

    # MGMT endpoint (confmaster on the first server's real address)
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

    # Print initial state of cluster
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Set SMR option (slave_idle_timeout) on every server
    util.log('\n\n\n ### Set SMR option ###')
    for s in cluster['servers']:
        t = telnet.Telnet('SMR%d' % s['id'])
        self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0, 'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
        cmd = 'confset slave_idle_timeout_msec 18000'
        util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
        t.write('confset slave_idle_timeout_msec 18000\r\n')
        reply = t.read_until('\r\n').strip()
        util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
        self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

    # Network isolation test
    for loop_cnt in range(3):
        # Get master
        master = util.get_server_by_role_and_pg( cluster['servers'], 'master', 0 )

        # Put every virtual-ip slave into lconn; remember the first one
        first_slave = None
        for s in cluster['servers']:
            if s == master:
                continue

            # Skip non-virtual host
            if s.has_key('real_ip') == False:
                continue

            if first_slave == None:
                first_slave = s

            # 'role lconn'
            util.log( 'role lconn pgs%d while hanging.' % s['id'] )

            ret = util.role_lconn_addr( s['real_ip'], s['smr_mgmt_port'] )
            self.assertEqual( ret, '+OK\r\n', 'role lconn failed. reply="%s"' % (ret[:-2]) )
            util.log( 'succeeded : cmd="role lconn", reply="%s"' % (ret[:-2]) )
            time.sleep(0.5)

        # Block network: only the first slave's smr mgmt port
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
        self.assertTrue(util.iptables_drop('A', '127.0.0.100', first_slave['smr_mgmt_port']), 'add a bloking role to iptables fail.')

        for i in range(6):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Check cluster state: the isolated slave must show '?'/'N'
        ok = False
        for i in range(10):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['pgs_id'] != first_slave['id']:
                    continue
                if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                    state_transition_done = False
            if state_transition_done :
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        # Unblock network
        self.assertTrue(util.iptables_drop('D', '127.0.0.100', first_slave['smr_mgmt_port']), 'delete a bloking role to iptables fail.')

        # Check cluster state: all pgses except pgs_id 1 must be normal
        ok = False
        for i in range(7):
            final_state = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

            state_consistency = True
            for s in final_state:
                if s['pgs_id'] == 1:
                    continue
                if is_pgs_normal(s) == False:
                    state_consistency = False
            if state_consistency:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

        ok = False
        for i in xrange(5):
            ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
            if ok:
                break
            else:
                time.sleep(1)
        self.assertTrue(ok, 'failed to check cluster state')

    # Check state
    self.assertNotEqual(initial_state, None, 'initial_state is None')
    self.assertNotEqual(final_state, None, 'final_state is None')

    # Log timestamp transitions for debugging (no assertion here)
    initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
    final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
    for i in range(len(final_state)):
        msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
        util.log(msg)

    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state')

    self.assertTrue(conf_checker.final_check())

    # Shutdown cluster
    default_cluster.finalize(cluster)
def test_all_pgs_hang(self):
    """Hang the master and both slaves simultaneously, then verify recovery.

    Injects an 8-second delay fault into all three smr processes of a
    3-copy PG, waits for the cluster to stabilize, writes a new key range
    through the master's redis, and reads it back from every replica.
    Requires smr built with the fault-injection ('fi') command.
    Returns 0 on success; asserts on failure.
    """
    util.print_frame()

    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values (baseline data before the hang)
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang: connect to all three smr mgmt ports
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))
    smr_slave1 = smr_mgmt.SMR(s1['id'])
    ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
    # NOTE(review): message says 'master' but this connects to slave1 - copy-paste text
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (s1['ip'], s1['smr_mgmt_port']))
    smr_slave2 = smr_mgmt.SMR(s2['id'])
    ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
    # NOTE(review): same copy-paste 'master' text for slave2
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % (s2['ip'], s2['smr_mgmt_port']))

    # timestamps captured before the hang (not asserted on below)
    m_ts = util.get_timestamp_of_pgs(m)
    s1_ts = util.get_timestamp_of_pgs(s1)
    s2_ts = util.get_timestamp_of_pgs(s2)

    smr_master.write('fi delay sleep 1 8000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        # unconditional failure: the binary lacks fault-injection support
        self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')

    smr_slave1.write('fi delay sleep 1 8000\r\n')
    smr_slave2.write('fi delay sleep 1 8000\r\n')

    time.sleep(10)

    # check consistency: poll up to 10s for the cluster to stabilize
    ok = False
    for try_cnt in xrange(20):
        ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
        if ok:
            break
        time.sleep(0.5)
    self.assertTrue(ok, 'Unstable cluster state')

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

    # set values through the (former) master's redis after recovery
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis0.write(cmd)
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

    # check new values (m)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        # first read consumes the RESP bulk-length line, second the value
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i))

    # check new values (s2)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s2['id'], res[:-2], i))

    # check consistency of roles across the cluster
    ok = False
    for try_cnt in range(0, 10):
        ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
        print ok
        if ok:
            break
        time.sleep(1)
    self.assertEqual(ok, True, 'role consistency fail')

    return 0
def test_7_dirty_network_fi(self): util.print_frame() clnts = [] try: util.iptables_print_list() # Add forwarding role self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'add a forwarding role to iptables fail.') cluster_name = 'network_isolation_cluster_1' cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0] util.log(util.json_to_str(cluster)) self.leader_cm = cluster['servers'][0] # MGMT mgmt_ip = cluster['servers'][0]['real_ip'] mgmt_port = cluster['servers'][0]['cm_port'] # Create cluster conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster, conf={'cm_context':'applicationContext-fi.xml'}) self.assertIsNotNone(conf_checker, 'failed to initialize cluster') # Print initial state of cluster util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ') initial_state = [] self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state') # Start crc16 client for s in cluster['servers']: c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['redis_port'], 600, verbose=False) c.start() clnts.append(c) # Network isolation test cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'me', 'yj', 'bj', 'mg'], ['lconn', 'slave', 'master', 'setquorum'], [True, False], 1) for fi in cmfi: # Block network util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi)) ret = block_network(cluster, mgmt_ip, mgmt_port) self.assertTrue(ret, '[%s] failed to block network.' % str(fi)) for i in xrange(4): util.log('waiting... %d' % (i + 1)) time.sleep(1) # Check cluster state ok = False for i in xrange(10): isolated_states = [] util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True) state_transition_done = True for s in isolated_states: if s['ip'] != '127.0.0.100': continue if s['active_role'] != '?' 
or s['mgmt_role'] != 'N': state_transition_done = False if state_transition_done: ok = True break time.sleep(1) self.assertTrue(ok, 'Fail, state transition') # Fault injection try: self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port), "Confmaster command fail. fi: %s" % str(fi)) except ValueError as e: self.fail("Confmaster command error. cmd: \"%s\", reply: \"%s\"" % (cmd, reply)) # Unblock network util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi)) ret = unblock_network(cluster, mgmt_ip, mgmt_port, None) self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi)) for i in xrange(4): util.log('waiting... %d' % (i + 1)) time.sleep(1) # Check cluster state ok = False for i in xrange(10): isolated_states = [] ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True) if ok: break time.sleep(1) self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi)) check_cluster = False # 'bj', 'slave' if fi[0] == 'bj' and fi[1] == 'slave': m, s1, s2 = util.get_mss(cluster) ret = util.role_lconn(s1) self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi)) check_cluster = True # 'me', 'lconn' elif fi[0] == 'me': m, s1, s2 = util.get_mss(cluster) ret = util.role_lconn(m) self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi)) check_cluster = True # 'setquorum' elif fi[1] == 'setquorum': m, s1, s2 = util.get_mss(cluster) ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'], 'fi delay sleep 1 8000\r\n', timeout=20) self.assertEqual("+OK\r\n", ret, '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret)) check_cluster = True if check_cluster: # Check cluster state ok = False for i in xrange(20): isolated_states = [] ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True) if ok: break time.sleep(1) self.assertTrue(ok, '[%s] Fail. unstable cluster.' 
% str(fi)) # Check fault injection ok = False for i in xrange(10): count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port) if count == 0: ok = True break time.sleep(0.5) self.assertTrue(ok, "[%s] fail. failt injection had not been triggered." % str(fi)) for c in clnts: self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi)) for c in clnts: self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi)) # Go back to initial configuration cmfi.init() for fi in cmfi: try: self.assertTrue(fi_confmaster.fi_add(fi, 0, mgmt_ip, mgmt_port), "Confmaster command fail. fi: %s" % str(fi)) except ValueError as e: self.fail("Confmaster command error. cmd: \"%s\", reply: \"%s\"" % (cmd, reply)) # Wait until workflows done ret = util.await(60, True)( lambda cinfo: cinfo['wf'] == 0, lambda : util.cluster_info(mgmt_ip, mgmt_port, cluster['cluster_name'])) self.assertTrue(ret, 'There are still some workflows.') self.assertTrue(conf_checker.final_check()) # Shutdown cluster default_cluster.finalize(cluster) finally: for c in clnts: c.quit() for c in clnts: c.join() # Delete forwarding role self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'add a forwarding role to iptables fail.')
def master_hang(self):
    """Hang the master SMR and verify a forced master election.

    Injects a 10-second delay fault into the master's replicator, waits
    for a slave to be promoted, writes new keys through the promoted
    master and reads them back from the other replica, then checks that
    the hung server rejoins as a slave with all new data replicated.
    Returns 0 on success (assertions fail otherwise).
    """
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2 (cluster may be a 2-copy or 3-copy setup)
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang: make the master's replicator sleep via fault injection
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (m['ip'], m['smr_mgmt_port']))
    smr.write('fi delay sleep 1 10000\r\n')
    reply = smr.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        # fault injection only exists in gcov builds
        self.fail('make sure that smr has compiled with gcov option.')

    time.sleep(5)

    # wait for forced master election
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_MASTER:
            success = True
            break
        if len(self.cluster['servers']) == 3:
            role = util.get_role_of_server(s2)
            if role == c.ROLE_MASTER:
                success = True
                break
        time.sleep(1)
    util.log('server state transition after hang')
    util.log_server_state(self.cluster)
    self.assertEqual(success, True, 'failed to forced master election')

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s1['ip'], s1['redis_port']))

    # set new values through the surviving replica
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s'
                         % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                         % (s2['ip'], s2['redis_port']))

        # check new values on the other replica
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')  # discard first line of the bulk reply
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i,
                             'failed to get values from redis2. %s != %d'
                             % (res, i))

    # check if the hanging server recovered and joined as a slave
    time.sleep(7)
    role = util.get_role_of_server(m)
    self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    # check new values on the recovered server
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')  # discard first line of the bulk reply
        res = redis0.read_until('\r\n')
        # message fixed: this loop reads from redis0, not redis2
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis0. %s != %d'
                         % (res[:-2], i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                           self.mgmt_port),
        True, 'role consistency fail')

    return 0
def test_1_role_change(self): util.print_frame() self.load_gen_list = {} # Start load generator util.log("Start load_generator") for i in range(self.max_load_generator): ip, port = util.get_rand_gateway(self.cluster) load_gen = load_generator.LoadGenerator(i, ip, port) load_gen.start() self.load_gen_list[i] = load_gen # Loop (smr: 3 copy) for i in range(30): target_server = util.get_server_by_role(self.cluster['servers'], 'slave') self.assertNotEquals(target_server, None, 'Get slave fail.') target = target_server['id'] print '' util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target)) # Get old timestamp util.log_server_state(self.cluster) old_timestamp_list = [] for s in self.cluster['servers']: ts = util.get_timestamp_of_pgs(s) old_timestamp_list.append(ts) # Role change master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target) self.assertNotEqual(master, -1, 'role_change error.') while target == master: target = (target + 1) % 3 util.log('Change role success.') # Wait until role change finished for s in self.cluster['servers']: max_try_cnt = 20 ok = False for try_cnt in range(max_try_cnt): try: pong = util.pingpong(s['ip'], s['redis_port']) if pong != None and pong == '+PONG\r\n': ok = True break except: pass time.sleep(0.2) self.assertTrue(ok, 'redis state error.') # Get new timestamp util.log_server_state(self.cluster) new_timestamp_list = [] for s in self.cluster['servers']: ts = util.get_timestamp_of_pgs(s) new_timestamp_list.append(ts) # Compare old timestamps and new timestamps for i in range(3): self.assertNotEqual( old_timestamp_list[i], new_timestamp_list[i], 'Timestamp is not changed. 
%d->%d' % (old_timestamp_list[i], new_timestamp_list[i])) # Cheeck Consistency for load_gen_id, load_gen in self.load_gen_list.items(): self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change') # Loop (smr: 2 copy) self.__del_server(self.cluster['servers'][0]) servers = [self.cluster['servers'][1], self.cluster['servers'][2]] normal_state = False for i in xrange(20): normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True) if normal_state: break time.sleep(0.5) self.assertTrue(normal_state, "Unstable cluster state") for i in range(30): print '' util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target)) s = util.get_server_by_role(servers, 'slave') target = s['id'] # Get old timestamp util.log_server_state(self.cluster) old_timestamp_list = [] for s in servers: ts = util.get_timestamp_of_pgs(s) old_timestamp_list.append(ts) # Role change master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target) self.assertNotEqual(master, -1, 'role_change error.') while target == master: target = (target) % 2 + 1 util.log('Change role success.') # Wait until role change finished for s in servers: max_try_cnt = 20 ok = False for try_cnt in range(max_try_cnt): pong = util.pingpong(s['ip'], s['redis_port']) if pong != None and pong == '+PONG\r\n': ok = True break time.sleep(0.1) self.assertTrue(ok, 'redis state error.') # Get new timestamp util.log_server_state(self.cluster) new_timestamp_list = [] for s in servers: ts = util.get_timestamp_of_pgs(s) new_timestamp_list.append(ts) # Compare old timestamps and new timestamps for i in range(2): self.assertNotEqual( old_timestamp_list[i], new_timestamp_list[i], 'Timestamp is not changed. 
%d->%d' % (old_timestamp_list[i], new_timestamp_list[i])) # Cheeck Consistency for load_gen_id, load_gen in self.load_gen_list.items(): self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change') # Go back to initial configuration self.assertTrue( util.install_pgs(self.cluster, self.cluster['servers'][0], self.leader_cm, rm_ckpt=False), 'failed to recover pgs.')
def master_failover_while_hang(self):
    """Fail over a hung master and verify data consistency afterwards.

    Delegates the hang-and-failover sequence to self.failover_while_hang,
    then writes new keys through the recovered master's redis and reads
    them back from both that server and (in 3-copy setups) a slave.
    Returns 0 on success (assertions fail otherwise).
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2 (cluster may be a 2-copy or 3-copy setup)
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    self.failover_while_hang(m)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis1 = redis_mgmt.Redis(m['id'])
    ret = redis1.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s'
                         % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                         % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')  # discard first line of the bulk reply
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i,
                             'failed to get values from redis2. %s != %d'
                             % (res, i))
        util.log('succeeded : check values with set/get operations with pgs%d and pgs%d.'
                 % (m['id'], s2['id']))

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')  # discard first line of the bulk reply
        res = redis0.read_until('\r\n')
        # message fixed: this loop reads from redis0, not redis2
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis0. %s != %d'
                         % (res[:-2], i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                           self.mgmt_port),
        True, 'role consistency fail')

    return 0
def test_4_role_change_with_failover(self):
    """Role change combined with a PGS failure, five times in a row.

    Each iteration shuts down a random PGS, waits for failure detection,
    runs a role change on a surviving slave under client load, checks
    quorum, recovers the downed PGS, re-checks cluster state and quorum,
    and verifies load-generator consistency. Returns 0 on success.
    """
    util.print_frame()

    loop_cnt = 0
    while loop_cnt < 5:
        util.log('')
        util.log('Loop:%d' % loop_cnt)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        target = random.choice(self.cluster['servers'])

        # bgsave
        ret = util.bgsave(target)
        self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

        # shutdown
        util.log('shutdown pgs%d(%s:%d)'
                 % (target['id'], target['ip'], target['smr_base_port']))
        ret = testbase.request_to_shutdown_smr(target)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
        ret = testbase.request_to_shutdown_redis(target)
        self.assertEqual(ret, 0, 'failed to shutdown redis')

        # wait until the confmaster detects the failure ('N' state)
        r = ''
        expected = 'N'
        for fc_cnt in xrange(20):
            r = util.get_smr_role_of_cm(target, self.leader_cm)
            if r == expected:
                break
            time.sleep(0.5)
        self.assertEqual(r, expected, 'failure detection error.')

        running_servers = []
        for s in self.cluster['servers']:
            if s != target:
                running_servers.append(s)

        # Get old timestamp
        old_timestamps = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # Start load generator
        self.load_gen_list = {}
        util.log('start load generator')
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

        # Role change
        master_id = util.role_change(self.leader_cm,
                                     self.cluster['cluster_name'], s1['id'])
        self.assertNotEqual(master_id, -1, 'role_change failed')

        util.log("States (after role change)")
        util.log_server_state(self.cluster)

        # Check - get new timestamp
        new_timestamps = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs(s)
            new_timestamps[s['id']] = ts

        # Check - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps[s['id']]
            self.assertNotEqual(
                old_ts, new_ts,
                'Timestamp of a running server has not changed. %d->%d'
                % (old_ts, new_ts))

        # Check quorum (one copy is down, so quorum is 1)
        m = self.cluster['servers'][master_id]
        expected = 1
        ok = self.__check_quorum(m, expected)
        self.assertTrue(
            ok, 'unexpected quorum(after role change). expected:%s' % (expected))

        # recovery
        util.log('recovery pgs%d(%s:%d)'
                 % (target['id'], target['ip'], target['smr_base_port']))
        ret = testbase.request_to_start_smr(target)
        self.assertEqual(ret, 0, 'failed to start smr')
        util.log('start smr-replicator done')

        ret = testbase.request_to_start_redis(target, 60)
        self.assertEqual(ret, 0, 'failed to start redis')
        util.log('start redis-arc done')

        ret = testbase.wait_until_finished_to_set_up_role(target, max_try=300)
        self.assertEqual(ret, 0,
                         'failed to role change. smr_id:%d' % (target['id']))

        util.log("States (after recovery)")
        util.log_server_state(self.cluster)

        # Check cluster state
        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'],
                                              self.leader_cm['ip'],
                                              self.leader_cm['cm_port'],
                                              check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        # Check quorum (all copies back, so quorum is 2)
        expected = 2
        ok = self.__check_quorum(m, expected)
        self.assertTrue(
            ok, 'unexpected quorum(after recovery). expected:%s' % (expected))

        # Check Consistency
        util.log('stop load generator')
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(),
                            'Inconsistent after migration')
            self.load_gen_list.pop(i, None)

        loop_cnt += 1

    return 0
def test_large_scale_master_election(self):
    """Bring up a 32-PG / 96-PGS cluster and verify initial elections.

    Builds the cluster description (ports derived from pgs_id), registers
    it with the confmaster, copies binaries, starts all replicators,
    redis instances, and four gateways, then checks every cluster's
    state and quorum through the leader confmaster.
    """
    util.print_frame()

    # initialize cluster information
    pgs_id = 10
    cluster = {
        'cluster_name': 'large_scale',
        'keyspace_size': 8192,
        'quorum_policy': '0:1',
        'slots': [],
        'pg_id_list': [],
        'servers': []
    }
    pg_max = 32
    pgs_per_pg = 3
    for pg_id in range(pg_max):
        cluster['pg_id_list'].append(pg_id)
        # slot range per PG: [8192/pg_max*pg_id, 8192/pg_max*(pg_id+1)-1]
        cluster['slots'].append(8192 / pg_max * pg_id)
        if pg_id == pg_max - 1:
            cluster['slots'].append(8191)
        else:
            cluster['slots'].append(8192 / pg_max * (pg_id + 1) - 1)

        for pgs in range(pgs_per_pg):
            smr_base_port = 15000 + pgs_id * 20
            smr_mgmt_port = smr_base_port + 3
            gateway_port = smr_base_port + 10
            redis_port = smr_base_port + 9

            server = {}
            server['id'] = pgs_id
            pgs_id = pgs_id + 1
            server['cluster_name'] = cluster['cluster_name']
            server['ip'] = self.cluster['servers'][0]['ip']
            server['pm_name'] = self.cluster['servers'][0]['pm_name']
            server['cm_port'] = None
            server['pg_id'] = pg_id
            server['smr_base_port'] = smr_base_port
            server['smr_mgmt_port'] = smr_mgmt_port
            server['gateway_port'] = gateway_port
            server['redis_port'] = redis_port
            server['zk_port'] = 2181
            cluster['servers'].append(server)

    # send initialize commands to confmaster
    testbase.initialize_cluster(cluster, self.leader_cm)

    # set up pgs binaries
    try:
        for server in cluster['servers']:
            server_id = server['id']  # renamed from 'id' (shadowed builtin)
            util.log('copy binaries, server_id=%d' % server_id)
            util.copy_smrreplicator(server_id)
            util.copy_gw(server_id)
            util.copy_redis_server(server_id)
            util.copy_cluster_util(server_id)
    except IOError as e:
        util.log(e)
        util.log('Error: can not find file or read data')
        self.fail('Error: can not find file or read data')
    except Exception:
        # previously swallowed silently; setup cannot continue without binaries
        self.fail('Error: file open error.')

    # cleanup servers`s directories
    for server in cluster['servers']:
        ret = testbase.cleanup_pgs_log_and_ckpt(cluster['cluster_name'], server)
        self.assertEqual(
            ret, 0, 'failed to cleanup_test_environment, id=%d' % server['id'])

    # start pgs
    for server in cluster['servers']:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(
            ret, 0, 'failed to request_to_start_smr, id=%d' % server['id'])

    for server in cluster['servers']:
        ret = testbase.request_to_start_redis(server, check=False)
        # message fixed: this loop starts redis, not smr
        self.assertEqual(
            ret, 0, 'failed to request_to_start_redis, id=%d' % server['id'])

    for server in cluster['servers']:
        ret = testbase.wait_until_finished_to_set_up_role(server)
        self.assertEqual(ret, 0, 'failed to role set up, id=%d' % server['id'])

    for i in range(4):
        server = cluster['servers'][i]
        ret = testbase.request_to_start_gateway(cluster['cluster_name'],
                                                server, self.leader_cm)
        self.assertEqual(
            ret, 0, 'failed to request_to_start_gateway, id=%d' % server['id'])

    clusters = cluster_ls()
    self.assertNotEqual(len(clusters), 0, 'There is no cluster.')

    ok = True
    # loop variable renamed from 'c' to avoid shadowing the constants module
    for cluster_name in clusters:
        if not util.check_cluster(str(cluster_name), self.leader_cm['ip'],
                                  self.leader_cm['cm_port'],
                                  check_quorum=True):
            ok = False
    self.assertEqual(ok, True, 'failed to initialize roles of pgs')
def test_two_slaves_hang(self):
    """Hang both slaves simultaneously and verify they rejoin unchanged.

    Injects an 8-second delay fault into both slaves of a 3-copy PG and
    asserts each comes back as a slave with an unchanged timestamp (no
    restart), then verifies replication by writing through one slave's
    redis and reading from the other. Returns 0 on success.
    """
    util.print_frame()
    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # timestamp before hang (used later to prove the slaves did not restart)
    ts_before1 = util.get_timestamp_of_pgs(s1)
    self.assertNotEqual(ts_before1, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d'
                        % (s1['id'], ts_before1))
    ts_before2 = util.get_timestamp_of_pgs(s2)
    self.assertNotEqual(ts_before2, -1,
                        'failed to get a timestamp of pgs(%d), ts_before:%d'
                        % (s2['id'], ts_before2))

    # hang both slaves via fault injection
    smr1 = smr_mgmt.SMR(s1['id'])
    ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (s1['ip'], s1['smr_mgmt_port']))
    smr2 = smr_mgmt.SMR(s2['id'])
    ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
    # message fixed: this connection is to s2, not s1
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (s2['ip'], s2['smr_mgmt_port']))

    smr1.write('fi delay sleep 1 8000\r\n')
    reply = smr1.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        # fault injection only exists in gcov builds
        self.fail('make sure that smr has compiled with gcov option.')
    smr2.write('fi delay sleep 1 8000\r\n')
    time.sleep(7)

    # wait for rejoin as a slave
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(s1)
            if ts_after != -1 and ts_before1 == ts_after:
                success = True
                break
        time.sleep(1)
    # message fixed: this check concerns s1, not s2
    self.assertEqual(success, True, 'failed to rejoin as a slave. %s:%d'
                     % (s1['ip'], s1['smr_mgmt_port']))

    success = False
    for i in range(20):
        role = util.get_role_of_server(s2)
        if role == c.ROLE_SLAVE:
            ts_after = util.get_timestamp_of_pgs(s2)
            if ts_after != -1 and ts_before2 == ts_after:
                success = True
                break
        time.sleep(1)
    self.assertEqual(success, True, 'failed to rejoin as a slave. %s:%d'
                     % (s2['ip'], s2['smr_mgmt_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s2['ip'], s2['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s'
                         % (cmd[:-2], res))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')  # discard first line of the bulk reply
        res = redis2.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis2. %s != %d' % (res, i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                           self.mgmt_port),
        True, 'role consistency fail')

    return 0
def pgs_add_and_del(self, upgrade_server, type):
    """Detach a PGS from the cluster and re-attach it under client load.

    Issues pgs_leave/pgs_join through the leader confmaster, writes keys
    through a gateway while the PGS is out, waits for a stable cluster,
    and verifies the rejoined PGS caught up on the new keys.
    Note: parameter name 'type' (shadows the builtin) is kept for
    caller compatibility. Returns 0 on success.
    """
    util.print_frame()
    util.log('[start] add and del pgs%d. type:%s' % (upgrade_server['id'], type))
    util.log_server_state(self.cluster)

    # start load generator
    load_gen_list = {}
    for i in range(len(self.cluster['servers'])):
        server = self.cluster['servers'][i]
        load_gen = load_generator.LoadGenerator(server['id'], server['ip'],
                                                server['gateway_port'])
        load_gen.start()
        load_gen_list[i] = load_gen

    # detach pgs from cluster
    cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'],
                                   upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK',
                     'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

    # set new values while the pgs is detached
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway('0')
    gw.connect(ip, port)
    for i in range(0, 50):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to gw(%s:%d). cmd:%s, res:%s'
                         % (ip, port, cmd[:-2], res[:-2]))

    # attach pgs to cluster
    cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'],
                                  upgrade_server['id'])
    ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    self.assertEqual(jobj['msg'], '+OK',
                     'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret))
    util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
    time.sleep(3)

    # wait until the cluster becomes stable again
    stable = False
    for i in xrange(20):
        stable = util.check_cluster(self.cluster['cluster_name'],
                                    self.leader_cm['ip'],
                                    self.leader_cm['cm_port'])
        if stable:
            break
        time.sleep(0.5)
    self.assertTrue(stable, 'Unstable cluster')

    # check new values on the rejoined pgs
    redis = redis_mgmt.Redis(upgrade_server['id'])
    ret = redis.connect(upgrade_server['ip'], upgrade_server['redis_port'])
    self.assertEqual(ret, 0, 'failed : connect to smr%d(%s:%d)'
                     % (upgrade_server['id'], upgrade_server['ip'],
                        upgrade_server['redis_port']))

    for i in range(0, 50):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis.write(cmd)
        redis.read_until('\r\n')  # discard first line of the bulk reply
        res = redis.read_until('\r\n')
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis%d. %s != %d'
                         % (upgrade_server['id'], res, i))
    util.log('succeeded : check values with get operations on pgs%d.'
             % (upgrade_server['id']))

    # shutdown load generators
    for i in range(len(load_gen_list)):
        load_gen_list[i].quit()
        load_gen_list[i].join()

    util.log_server_state(self.cluster)
    return 0
def master_hang(self):
    """Hang the master SMR and verify a forced master election.

    Same scenario as the compact variant elsewhere in this file: inject
    a 10-second delay fault into the master, wait for a slave promotion,
    write/read data through the survivors, then check the hung server
    rejoins as a slave with all new data. Returns 0 on success.
    """
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2 (cluster may be a 2-copy or 3-copy setup)
    if len(self.cluster['servers']) == 3:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang: make the master's replicator sleep via fault injection
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d'
                     % (m['ip'], m['smr_mgmt_port']))
    smr.write('fi delay sleep 1 10000\r\n')
    reply = smr.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        # fault injection only exists in gcov builds
        self.fail('make sure that smr has compiled with gcov option.')

    time.sleep(5)

    # wait for forced master election
    success = False
    for i in range(20):
        role = util.get_role_of_server(s1)
        if role == c.ROLE_MASTER:
            success = True
            break
        if len(self.cluster['servers']) == 3:
            role = util.get_role_of_server(s2)
            if role == c.ROLE_MASTER:
                success = True
                break
        time.sleep(1)
    util.log('server state transition after hang')
    util.log_server_state(self.cluster)
    self.assertEqual(success, True, 'failed to forced master election')

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (s1['ip'], s1['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to set values to redis1. cmd:%s, res:%s'
                         % (cmd[:-2], res))

    if len(self.cluster['servers']) == 3:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                         % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')  # discard first line of the bulk reply
            res = redis2.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i,
                             'failed to get values from redis2. %s != %d'
                             % (res, i))

    # check if the hanging server recovered and joined as a slave
    time.sleep(7)
    role = util.get_role_of_server(m)
    self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis(%s:%d).'
                     % (m['ip'], m['redis_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    # check new values on the recovered server
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')  # discard first line of the bulk reply
        res = redis0.read_until('\r\n')
        # message fixed: this loop reads from redis0, not redis2
        self.assertEqual(res, '%d\r\n' % i,
                         'failed to get values from redis0. %s != %d'
                         % (res[:-2], i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                           self.mgmt_port),
        True, 'role consistency fail')

    return 0
def consistent_after_failover(self):
    """Kill and restart all three copies, then verify data consistency.

    Writes keys through a gateway, shuts down master and both slaves,
    confirms the confmaster marks them 'F', restarts them, waits for a
    new master election and role setup, asserts exactly one master and
    two slaves, then writes more keys through the new master and checks
    both slaves hold the full key set.
    """
    # renamed from 'max' to avoid shadowing the builtin
    max_keys = 10000
    wait_count = 15
    key = 'caf'

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # set value
    ip, port = util.get_rand_gateway(self.cluster)
    # NOTE(review): other tests pass a server id to Gateway(); here the
    # gateway ip is passed instead - confirm this is intended.
    gw = gateway_mgmt.Gateway(ip)
    gw.connect(ip, port)
    for i in range(0, max_keys):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')
    time.sleep(5)

    # shutdown
    servers = [master, slave1, slave2]
    for server in servers:
        util.log('before shutdown pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0,
                         'failed to shutdown smr, server:%d' % server['id'])
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEqual(ret, 0, 'failed to shutdown redis')
    time.sleep(5)

    # check state F
    for server in servers:
        state = self.get_expected_smr_state(server, 'F')
        self.assertEqual('F', state,
                         'server%d - state:%s' % (server['id'], state))

    # recovery
    for server in servers:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % server['id'])

        ret = testbase.request_to_start_redis(server, False)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % server['id'])

        util.log('after restart pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)
    time.sleep(5)

    # wait for master election; the role checks below will fail if the
    # cluster never becomes healthy within the window
    for i in xrange(10):
        ret = util.check_cluster(self.cluster['cluster_name'],
                                 self.leader_cm['ip'],
                                 self.leader_cm['cm_port'])
        if ret:
            break
        time.sleep(1)

    # check state
    for server in servers:
        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEqual(ret, 0,
                         'failed to role change. server:%d' % (server['id']))

        state = self.get_expected_smr_state(server, 'N')
        role = util.get_role_of_server(server)
        self.assertEqual('N', state,
                         'server%d - state:%s, role:%s'
                         % (server['id'], state, role))

    the_number_of_master = 0
    the_number_of_slave = 0
    for server in servers:
        role = util.get_role_of_server(server)
        if role == c.ROLE_MASTER:
            the_number_of_master = the_number_of_master + 1
        elif role == c.ROLE_SLAVE:
            the_number_of_slave = the_number_of_slave + 1
    self.assertTrue(
        1 == the_number_of_master and 2 == the_number_of_slave,
        'failed to set roles, the number of master:%d, the number of slave:%d'
        % (the_number_of_master, the_number_of_slave))

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # connect to a master`s redis and set data
    redis = redis_mgmt.Redis(master['id'])
    ret = redis.connect(master['ip'], master['redis_port'])
    self.assertEqual(ret, 0,
                     'failed to connect to redis, server:%d' % master['id'])
    for i in range(max_keys, max_keys * 2):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n',
                         'failed to get response, server:%d' % master['id'])
    redis.disconnect()

    # check slaves`s data
    slaves = [slave1, slave2]
    for slave in slaves:
        slave_redis = redis_mgmt.Redis(slave['id'])
        ret = slave_redis.connect(slave['ip'], slave['redis_port'])
        self.assertEqual(ret, 0,
                         'failed to connect to redis, server:%d' % slave['id'])
        for i in range(0, max_keys * 2):
            cmd = 'get %s%d\r\n' % (key, i)
            slave_redis.write(cmd)
            slave_redis.read_until('\r\n')  # discard first line of the bulk reply
            res = slave_redis.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i,
                             'inconsistent, server:%d, expected %d but %s'
                             % (slave['id'], i, res))
        slave_redis.disconnect()
def test_large_scale_master_election( self ):
    """Bring up a large cluster and verify initial role assignment.

    Builds a cluster definition of 32 PGs x 3 PGS (96 servers, all on
    the first configured machine), registers it with the confmaster,
    deploys binaries, starts all SMR/redis processes plus four
    gateways, and then checks every known cluster (including quorum)
    via util.check_cluster.
    """
    util.print_frame()

    # initialize cluster information
    pgs_id = 10
    cluster = {
        'cluster_name' : 'large_scale',
        'keyspace_size' : 8192,
        'quorum_policy' : '0:1',
        'slots' : [],
        'pg_id_list' : [],
        'servers' : []
    }
    pg_max = 32
    pgs_per_pg = 3

    for pg_id in range(pg_max):
        cluster['pg_id_list'].append(pg_id)
        # Slot ranges partition the 8192-slot keyspace evenly per PG.
        # NOTE: relies on Python 2 integer division ('/' on ints).
        cluster['slots'].append(8192 / pg_max * pg_id)
        if pg_id == pg_max - 1:
            cluster['slots'].append(8191)
        else:
            cluster['slots'].append(8192 / pg_max * (pg_id + 1) - 1)

        # One server entry per PGS; ports are derived from the PGS id.
        for pgs in range(pgs_per_pg):
            smr_base_port = 15000 + pgs_id * 20
            smr_mgmt_port = smr_base_port + 3
            gateway_port = smr_base_port + 10
            redis_port = smr_base_port + 9

            server = {}
            server['id'] = pgs_id        # renamed local below: avoid shadowing builtin `id`
            pgs_id = pgs_id + 1
            server['cluster_name'] = cluster['cluster_name']
            server['ip'] = self.cluster['servers'][0]['ip']
            server['pm_name'] = self.cluster['servers'][0]['pm_name']
            server['cm_port'] = None
            server['pg_id'] = pg_id
            server['smr_base_port'] = smr_base_port
            server['smr_mgmt_port'] = smr_mgmt_port
            server['gateway_port'] = gateway_port
            server['redis_port'] = redis_port
            server['zk_port'] = 2181

            cluster['servers'].append(server)

    # send initialize commands to confmaster
    testbase.initialize_cluster(cluster, self.leader_cm)

    # set up pgs binaries
    try:
        for server in cluster['servers']:
            server_id = server['id']
            util.log('copy binaries, server_id=%d' % server_id)

            util.copy_smrreplicator( server_id )
            util.copy_gw( server_id )
            util.copy_redis_server( server_id )
            util.copy_cluster_util( server_id )
    except IOError as e:
        util.log(e)
        util.log('Error: can not find file or read data')
        self.assertEqual(0, 1, 'Error: can not find file or read data')
    except:
        # NOTE(review): bare except only logs and continues; any other
        # setup failure is silently swallowed here — confirm intent.
        util.log('Error: file open error.')

    # cleanup servers`s directories
    for server in cluster['servers']:
        ret = testbase.cleanup_pgs_log_and_ckpt( cluster['cluster_name'], server )
        self.assertEqual(ret, 0, 'failed to cleanup_test_environment, id=%d' % server['id'])

    # start pgs
    for server in cluster['servers']:
        ret = testbase.request_to_start_smr( server )
        self.assertEqual(ret, 0, 'failed to request_to_start_smr, id=%d' % server['id'])

    for server in cluster['servers']:
        ret = testbase.request_to_start_redis( server, check=False )
        # Fixed copy-pasted message: this assertion guards redis startup.
        self.assertEqual(ret, 0, 'failed to request_to_start_redis, id=%d' % server['id'])

    for server in cluster['servers']:
        ret = testbase.wait_until_finished_to_set_up_role(server)
        self.assertEqual(ret, 0, 'failed to role set up, id=%d' % server['id'])

    # start four gateways on the first four servers
    for i in range(4):
        server = cluster['servers'][i]
        ret = testbase.request_to_start_gateway( cluster['cluster_name'], server, self.leader_cm )
        self.assertEqual(ret, 0, 'failed to request_to_start_gateway, id=%d' % server['id'])

    # every known cluster must pass the state/quorum check
    clusters = cluster_ls()
    self.assertNotEqual(len(clusters), 0, 'There is no cluster.')

    ok = True
    # Renamed loop variable (was `c`): it shadowed the constants module `c`.
    for cluster_name in clusters:
        if not util.check_cluster(str(cluster_name), self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True):
            ok = False
    self.assertEqual(ok, True, 'failed to initialize roles of pgs')
def test_1_mgmt_is_isolated(self):
    """Network-isolation test for the management path of every SMR.

    Five times in a row: add iptables DROP rules that block outbound
    traffic to 127.0.0.100 on each server's smr_mgmt_port, poll until
    the confmaster sees the isolated PGS as active_role '?' with
    mgmt_role 'N', then remove the rules and poll until active and
    mgmt roles agree again. Finally asserts that each PGS kept the
    same active_ts across the cycle (i.e. no unnecessary failover).

    Requires passwordless sudo for iptables (via util.sudo).
    """
    util.print_frame()

    # Log current iptables rules and the shape of a util.sudo result,
    # for debugging the environment before the test proper starts.
    out = util.sudo('iptables -L')
    util.log('====================================================================')
    util.log('out : %s' % out)
    util.log('out.return_code : %d' % out.return_code)
    util.log('out.stderr : %s' % out.stderr)
    util.log('out.succeeded : %s' % out.succeeded)

    # Pick the dedicated isolation cluster from the global config.
    # NOTE: filter(...)[0] requires Python 2 (filter returns a list).
    cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
    util.log(util.json_to_str(cluster))

    # MGMT: confmaster address taken from the first server entry
    mgmt_ip = cluster['servers'][0]['real_ip']
    mgmt_port = cluster['servers'][0]['cm_port']

    # Create cluster
    ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
    self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

    # Print initial state of cluster; check_cluster fills initial_state
    # with one dict per PGS (used later for the active_ts comparison).
    util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
    initial_state = []
    self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

    # Set SMR option (slave_idle_timeout) on every server over telnet
    util.log('\n\n\n ### Set SMR option ###')
    for s in cluster['servers']:
        t = telnet.Telnet('SMR%d' % s['id'])
        self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0,
                'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
        # NOTE(review): `cmd` is only used for logging; the write below
        # repeats the literal command string.
        cmd = 'confset slave_idle_timeout_msec 18000'
        util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
        t.write('confset slave_idle_timeout_msec 18000\r\n')
        reply = t.read_until('\r\n').strip()
        util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
        self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

    # Network isolation test, repeated 5 times
    for cnt in range(5):
        # Block network: DROP outbound packets to 127.0.0.100 on each
        # server's management port.
        util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
        for s in cluster['servers']:
            out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -p tcp --dport %d -j DROP' % (s['smr_mgmt_port']))
            # NOTE(review): the string below is a no-op expression
            # statement serving as an inline comment.
            """Loopback Address Range (Reference : RFC3330)
               127.0.0.0/8 - This block is assigned for use as the Internet host loopback address.
               A datagram sent by a higher level protocol to an address anywhere within this block
               should loop back inside the host. This is ordinarily implemented using only
               127.0.0.1/32 for loopback, but no addresses within this block should ever appear
               on any network anywhere [RFC1700, page 5]. """
            self.assertTrue(out.succeeded, 'add a bloking role to iptables fail. output:%s' % out)

        for i in range(4):
            util.log('waiting... %d' % (i + 1))
            time.sleep(1)

        # Check cluster state: poll up to 7 times until every PGS on
        # 127.0.0.100 is seen as active_role '?' and mgmt_role 'N'.
        ok = False
        for i in range(7):
            isolated_states = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
            time.sleep(1)

            state_transition_done = True
            for s in isolated_states:
                if s['ip'] != '127.0.0.100':
                    continue
                if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                    state_transition_done = False

            if state_transition_done:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state transition')

        # Unblock network: delete the DROP rules added above.
        util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
        for s in cluster['servers']:
            out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -p tcp --dport %d -j DROP' % (s['smr_mgmt_port']))
            self.assertTrue(out.succeeded, 'delete a bloking role to iptables fail. output:%s' % out)

        # Check cluster state: poll up to 7 times until active_role and
        # mgmt_role agree for every PGS.
        ok = False
        for i in range(7):
            final_state = []
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

            state_consistency = True
            for s in final_state:
                if s['active_role'] != s['mgmt_role']:
                    state_consistency = False

            if state_consistency:
                ok = True
                break
            time.sleep(1)
        self.assertTrue(ok, 'Fail, state consistency')

        # Check state: active_ts of every PGS must be unchanged from the
        # initial snapshot, i.e. isolation caused no role restart.
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'],
                    final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

    # Shutdown cluster
    ret = default_cluster.finalize( cluster )
    self.assertEqual(ret, 0, 'failed to TestMaintenance.finalize')