def test_quorum_policy_of_hanging_master(self):
    """Check the quorum of a hung master and of the master elected in its place.

    The test sends ``fi delay sleep 1 15000`` to the master's SMR management
    port, waits until one of the two slaves reports ``c.ROLE_MASTER``, shuts
    down every confmaster, and then asserts that the old master reports
    quorum 2 while the new master reports quorum 1. The confmasters are
    recovered at the end.

    Returns:
        0 when every assertion passed.
    """
    util.print_frame()

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    for node, msg in ((m, 'master is None.'),
                      (s1, 'slave1 is None.'),
                      (s2, 'slave2 is None.')):
        self.assertNotEqual(node, None, msg)

    # hang (presumably a fault-injection sleep inside the master -- confirm)
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    master_addr = (m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % master_addr)
    smr.write('fi delay sleep 1 15000\r\n')
    time.sleep(5)

    # wait for forced master election: up to 7 rounds, slave1 is asked first
    new_master = None
    success = False
    rounds_left = 7
    while rounds_left > 0 and not success:
        for candidate in (s1, s2):
            if util.get_role_of_server(candidate) == c.ROLE_MASTER:
                new_master = candidate
                success = True
                break
        else:
            # neither slave has been promoted yet
            time.sleep(1)
        rounds_left -= 1
    self.assertEqual(success, True, 'failed to forced master election')

    # shutdown confmaster
    for server in self.cluster['servers']:
        util.shutdown_cm(server['id'])

    # wait until hanging master wake up
    time.sleep(5)

    # check quorum policy of the master that was hanging
    hanging_quorum = util.get_quorum(m)
    self.assertEqual(
        2, hanging_quorum,
        'invalid quorum of haning master, expected:%d, but:%d' % (2, hanging_quorum))
    util.log('succeeded : quorum of haning master=%d' % hanging_quorum)

    # check quorum policy of the newly elected master
    elected_quorum = util.get_quorum(new_master)
    self.assertNotEqual(None, elected_quorum, 'failed : find new master')
    self.assertEqual(
        1, elected_quorum,
        'invalid quorum of new master, expected:%d, but:%d' % (1, elected_quorum))
    util.log('succeeded : quorum of new master=%d' % elected_quorum)

    # Go back to initial configuration
    # Recover Confmaster
    recovered = util.recover_confmaster(self.cluster, [0, 1, 2], 0)
    self.assertTrue(recovered, 'failed to recover confmaster')
    return 0
def test_quorum_policy_of_hanging_master(self):
    """Check the quorum of a hung master and of its elected replacement.

    The test sends ``fi delay sleep 1 15000`` to the master's SMR management
    port, waits until one of the two slaves reports ``c.ROLE_MASTER``, and
    shuts down every confmaster. It then asserts that both the old master
    and the new master report ``self.quorum_policy[1]``.

    Returns:
        0 when every assertion passed.
    """
    util.print_frame()

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    for node, msg in ((m, 'master is None.'),
                      (s1, 'slave1 is None.'),
                      (s2, 'slave2 is None.')):
        self.assertNotEqual(node, None, msg)

    # hang (presumably a fault-injection sleep inside the master -- confirm)
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    master_addr = (m['ip'], m['smr_mgmt_port'])
    self.assertEqual(ret, 0, 'failed to connect to master. %s:%d' % master_addr)
    smr.write('fi delay sleep 1 15000\r\n')
    time.sleep(5)

    # wait for forced master election: up to 7 rounds, slave1 is asked first
    new_master = None
    success = False
    rounds_left = 7
    while rounds_left > 0 and not success:
        for candidate in (s1, s2):
            if util.get_role_of_server(candidate) == c.ROLE_MASTER:
                new_master = candidate
                success = True
                break
        else:
            # neither slave has been promoted yet
            time.sleep(1)
        rounds_left -= 1
    self.assertEqual(success, True, 'failed to forced master election')

    # shutdown confmaster
    for server in self.cluster['servers']:
        util.shutdown_cm(server['id'])

    # wait until hanging master wake up
    time.sleep(5)

    # check quorum policy of the master that was hanging
    hanging_quorum = util.get_quorum(m)
    self.assertEqual(
        self.quorum_policy[1], hanging_quorum,
        'invalid quorum of haning master, expected:%d, but:%d'
        % (self.quorum_policy[1], hanging_quorum))
    util.log('succeeded : quorum of haning master=%d' % hanging_quorum)

    # check quorum policy of the newly elected master
    elected_quorum = util.get_quorum(new_master)
    self.assertNotEqual(None, elected_quorum, 'failed : find new master')
    self.assertEqual(
        self.quorum_policy[1], elected_quorum,
        'invalid quorum of new master, expected:%d, but:%d'
        % (self.quorum_policy[1], elected_quorum))
    util.log('succeeded : quorum of new master=%d' % elected_quorum)
    return 0
def __check_quorum(self, m, expected):
    """Poll the quorum of ``m`` until it equals ``expected``.

    Sleeps one second first, then queries ``util.get_quorum(m)`` up to ten
    times, 0.5 seconds apart, logging every observation.

    Args:
        m: server entry; ``m['id']`` is printed in the log line.
        expected: quorum value to wait for.

    Returns:
        True as soon as the observed quorum matches, False after ten
        mismatches.
    """
    time.sleep(1)
    try_cnt = 0
    while try_cnt < 10:
        observed = util.get_quorum(m)
        matched = observed == expected
        if matched:
            template = 'quorum: %d, master: %d, try_cnt: %d, OK'
        else:
            template = 'quorum: %d, master: %d, try_cnt: %d'
        util.log(template % (observed, m['id'], try_cnt))
        if matched:
            return True
        time.sleep(0.5)
        try_cnt += 1
    return False
def test_quorum(self):
    """Verify that the master's quorum follows the number of running slaves.

    Expected quorum sequence: 2 with both slaves up, 1 after slave1's SMR is
    shut down, 0 after slave2's SMR is shut down, then 1 and 2 again as
    slave1 and slave2 are restarted in that order.

    While the slaves are being shut down, the master is looked up again by
    role before every quorum query. The recovery checks reuse the master
    found by the last such lookup.
    """
    util.print_frame()

    def poll_quorum(master, expected, refresh_master=False, max_try=20):
        """Poll once per second until the quorum equals ``expected``.

        Asserts on the last observed value and returns the master that was
        queried last.
        """
        quorum = None
        for _ in range(max_try):
            if refresh_master:
                master = util.get_server_by_role(self.cluster['servers'], 'master')
            quorum = util.get_quorum(master)
            if quorum == expected:
                break
            time.sleep(1)
        # %s keeps the failure message printable even if the quorum is not
        # an integer, so the assertion itself reports the mismatch.
        self.assertEqual(quorum, expected,
                         'quorum:%s, expected:%s' % (quorum, expected))
        return master

    def shutdown_smr(server):
        """Shut down the SMR of ``server`` and give the cluster a second."""
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % server['id'])
        time.sleep(1)

    def recover(server):
        """Restart SMR and redis of ``server`` and wait for its role setup."""
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr')
        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis')
        ret = testbase.wait_until_finished_to_set_up_role(server)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))
        time.sleep(1)

    master, slave1, slave2 = util.get_mss(self.cluster)
    poll_quorum(master, 2)

    shutdown_smr(slave1)
    poll_quorum(master, 1, refresh_master=True)

    shutdown_smr(slave2)
    master = poll_quorum(master, 0, refresh_master=True)

    # recovery
    recover(slave1)
    poll_quorum(master, 1)

    # recovery
    recover(slave2)
    poll_quorum(master, 2)
def test_quorum_with_left_pgs(self):
    """Check quorum handling when the master PGS is forced out of the cluster.

    Flow: start one load generator per server, issue ``pgs_leave ... forced``
    for the master, assert that the left master still reports quorum 2, send
    it ``role lconn``, wait for a slave to become master, verify the
    master/slave/lconn counts and that each reported role matches the role
    recorded by the confmaster, and assert quorum 1 on the new master.
    Finally the load generators are stopped and the PGS is re-joined.

    The load generators are stopped in a ``finally`` clause so that a failed
    assertion does not leave them running.

    Returns:
        0 when every assertion passed.
    """
    util.print_frame()
    servers = self.cluster['servers']

    def assert_left_master_quorum(master):
        """Assert that the left master reports quorum 2."""
        quorum = util.get_quorum(master)
        self.assertEqual(
            2, quorum,
            'invalid quorum of left master, expected:%d, but:%d' % (2, quorum))
        util.log('succeeded : quorum of left master=%d' % quorum)

    def wait_until_pgs_removed(master):
        """Poll redis ``info stats`` (up to 10 times) until the PGS looks idle."""
        addr = (master['id'], master['ip'], master['redis_port'])
        for _attempt in range(10):
            # NOTE(review): a new connection is opened on every attempt and
            # never closed here -- confirm whether redis_mgmt.Redis offers a
            # close/disconnect call that should be used.
            redis = redis_mgmt.Redis(master['id'])
            ret = redis.connect(master['ip'], master['redis_port'])
            self.assertEqual(ret, 0, 'failed : connect to smr%d(%s:%d)' % addr)
            util.log('succeeded : connect to smr%d(%s:%d)' % addr)

            redis.write('info stats\r\n')
            # Skip six reply lines; the seventh is the one that is parsed
            # (presumably a traffic counter -- confirm which field).
            for _skipped in range(6):
                redis.read_until('\r\n')
            res = redis.read_until('\r\n')
            self.assertNotEqual(
                res, '',
                'failed : get reply of "info stats" from redis%d(%s:%d)' % addr)
            util.log('succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"'
                     % (addr + (res[:-2],)))
            if int(res.split(':')[1]) <= 100:
                return True
            time.sleep(1)
        return False

    def assert_roles_match_cc():
        """Assert that each heartbeat-enabled server reports the role stored in cc."""
        for s in servers:
            smr_info = util.get_smr_info(s, self.leader_cm)
            cc_role = smr_info['smr_Role']
            if smr_info['hb'] == 'N':
                continue
            real_role = util.roleNumberToChar(util.get_role_of_server(s))
            self.assertEqual(
                real_role, cc_role,
                'failed : each role is difference, real=%s, cc=%s' % (real_role, cc_role))
            util.log('succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s'
                     % (real_role, cc_role))

    load_gens = []
    try:
        # start load generators
        for server in servers:
            load_gen = load_generator.LoadGenerator(
                server['id'], server['ip'], server['gateway_port'])
            load_gen.start()
            load_gens.append(load_gen)

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d forced\r\n' % (m['cluster_name'], m['id'])
        ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
        jobj = json.loads(ret)
        # [:-2] strips the trailing "\r\n" for readable messages
        self.assertEqual(jobj['msg'], '+OK',
                         'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # check quorum policy
        assert_left_master_quorum(m)

        # check if pgs is removed (only polled when it is no longer master)
        if util.get_role_of_server(m) != c.ROLE_MASTER:
            self.assertTrue(wait_until_pgs_removed(m), 'failed : pgs does not removed.')
        util.log('pgs is removed')

        # Give the confmaster up to 10 rounds to catch up with the real
        # roles; stop as soon as one full round shows no mismatch.
        for _round in range(10):
            consistent = True
            for s in servers:
                smr_info = util.get_smr_info(s, self.leader_cm)
                cc_role = smr_info['smr_Role']
                if smr_info['hb'] == 'N':
                    continue
                real_role = util.roleNumberToChar(util.get_role_of_server(s))
                if real_role != cc_role:
                    consistent = False
                    time.sleep(0.5)
            if consistent:
                break

        # check states of all pgs in pg
        assert_roles_match_cc()

        # check quorum policy
        assert_left_master_quorum(m)

        # 'role lconn' to master
        cmd = 'role lconn\r\n'
        ret = util.cmd_to_smr(m, cmd)
        self.assertEqual(ret, '+OK\r\n',
                         'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # wait for master election: up to 10 rounds, slave1 is asked first
        new_master = None
        for _round in range(10):
            for candidate in (s1, s2):
                if util.get_role_of_server(candidate) == c.ROLE_MASTER:
                    new_master = candidate
                    break
            if new_master is not None:
                break
            time.sleep(1)
        self.assertTrue(new_master is not None, 'failed to elect new master')
        util.log('succeeded : elect new master, master_id=%d' % new_master['id'])

        time.sleep(1)

        # check the numbers of master, slave, and lconn
        cnt_master = 0
        cnt_slave = 0
        cnt_lconn = 0
        for s in servers:
            role = util.get_role_of_server(s)
            if role == c.ROLE_MASTER:
                cnt_master += 1
            elif role == c.ROLE_SLAVE:
                cnt_slave += 1
            elif role == c.ROLE_LCONN:
                cnt_lconn += 1
        self.assertEqual(cnt_master, 1,
                         'failed : the number of master is %s, expected 1' % cnt_master)
        self.assertEqual(cnt_slave, 1,
                         'failed : the number of slave is %s, expected 1' % cnt_slave)
        self.assertEqual(cnt_lconn, 1,
                         'failed : the number of lconn is %s, expected 1' % cnt_lconn)

        # check states of all pgs in pg
        assert_roles_match_cc()

        # check quorum policy
        quorum_of_new_master = util.get_quorum(new_master)
        self.assertNotEqual(None, quorum_of_new_master, 'failed : find new master')
        self.assertEqual(
            1, quorum_of_new_master,
            'invalid quorum of new master, expected:%d, but:%d' % (1, quorum_of_new_master))
        util.log('succeeded : quorum of new master=%d' % quorum_of_new_master)
    finally:
        # shutdown load generators, also when an assertion above failed
        for load_gen in load_gens:
            load_gen.quit()
            load_gen.join()

    # Go back to initial configuration
    rejoined = util.pgs_join(self.leader_cm['ip'], self.leader_cm['cm_port'],
                             m['cluster_name'], m['id'])
    self.assertTrue(rejoined, 'failed to recover pgs, (pgs_join)')
    return 0
def test_quorum(self):
    """Verify the master's quorum against ``self.quorum_policy`` as slaves go down and up.

    Expected quorum sequence: ``quorum_policy[1]`` with both slaves up and
    after slave1's SMR is shut down, ``quorum_policy[0]`` after slave2's SMR
    is shut down as well, then ``quorum_policy[1]`` again after each of the
    two recoveries. The same master entry is queried throughout.
    """
    util.print_frame()

    def poll_quorum(master, expected, max_try=20):
        """Poll once per second until the quorum equals ``expected``, then assert."""
        quorum = None
        for _ in range(max_try):
            quorum = util.get_quorum(master)
            if quorum == expected:
                break
            time.sleep(1)
        # %s keeps the failure message printable even if the quorum is not
        # an integer, so the assertion itself reports the mismatch.
        self.assertEqual(quorum, expected,
                         'quorum:%s, expected:%s' % (quorum, expected))

    def shutdown_smr(server):
        """Shut down the SMR of ``server`` and give the cluster a second."""
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr, server:%d' % server['id'])
        time.sleep(1)

    def recover(server):
        """Restart SMR and redis of ``server`` and wait for its role setup."""
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr')
        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis')
        ret = testbase.wait_until_finished_to_set_up_role(server)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))
        time.sleep(1)

    master, slave1, slave2 = self.get_mss()
    poll_quorum(master, self.quorum_policy[1])

    shutdown_smr(slave1)
    poll_quorum(master, self.quorum_policy[1])

    shutdown_smr(slave2)
    poll_quorum(master, self.quorum_policy[0])

    # recovery
    recover(slave1)
    poll_quorum(master, self.quorum_policy[1])

    # recovery
    recover(slave2)
    poll_quorum(master, self.quorum_policy[1])
def test_quorum_with_left_pgs(self):
    """Check quorum handling when the master PGS leaves the cluster.

    Flow: start one load generator per server, issue ``pgs_leave`` for the
    master, wait until redis ``info stats`` looks idle, verify that each
    reported role matches the role recorded by the confmaster, assert that
    the left master reports ``self.quorum_policy[1]``, send it ``role lconn``,
    wait for a slave to become master, verify the master/slave/lconn counts,
    and assert ``self.quorum_policy[1]`` on the new master.

    The load generators are stopped in a ``finally`` clause so that a failed
    assertion does not leave them running.

    Returns:
        0 when every assertion passed.
    """
    util.print_frame()
    servers = self.cluster['servers']

    def wait_until_pgs_removed(master):
        """Poll redis ``info stats`` (up to 10 times) until the PGS looks idle."""
        addr = (master['id'], master['ip'], master['redis_port'])
        for _attempt in range(10):
            # NOTE(review): a new connection is opened on every attempt and
            # never closed here -- confirm whether redis_mgmt.Redis offers a
            # close/disconnect call that should be used.
            redis = redis_mgmt.Redis(master['id'])
            ret = redis.connect(master['ip'], master['redis_port'])
            self.assertEqual(ret, 0, 'failed : connect to smr%d(%s:%d)' % addr)
            util.log('succeeded : connect to smr%d(%s:%d)' % addr)

            redis.write('info stats\r\n')
            # Skip six reply lines; the seventh is the one that is parsed
            # (presumably a traffic counter -- confirm which field).
            for _skipped in range(6):
                redis.read_until('\r\n')
            res = redis.read_until('\r\n')
            self.assertNotEqual(
                res, '',
                'failed : get reply of "info stats" from redis%d(%s:%d)' % addr)
            util.log('succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"'
                     % (addr + (res[:-2],)))
            if int(res.split(':')[1]) <= 100:
                return True
            time.sleep(1)
        return False

    def assert_roles_match_cc():
        """Assert that each heartbeat-enabled server reports the role stored in cc."""
        for s in servers:
            real_role = util.roleNumberToChar(util.get_role_of_server(s))
            smr_info = util.get_smr_info(s, self.leader_cm)
            cc_role = smr_info['smr_Role']
            if smr_info['hb'] == 'N':
                continue
            self.assertEqual(
                real_role, cc_role,
                'failed : each role is difference, real=%s, cc=%s' % (real_role, cc_role))
            util.log('succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s'
                     % (real_role, cc_role))

    load_gens = []
    try:
        # start load generators
        for server in servers:
            load_gen = load_generator.LoadGenerator(
                server['id'], server['ip'], server['gateway_port'])
            load_gen.start()
            load_gens.append(load_gen)

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d\r\n' % (m['cluster_name'], m['id'])
        ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'], cmd)
        jobj = json.loads(ret)
        # [:-2] strips the trailing "\r\n" for readable messages
        self.assertEqual(jobj['msg'], '+OK',
                         'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # check if pgs is removed
        self.assertTrue(wait_until_pgs_removed(m), 'failed : pgs does not removed.')
        util.log('succeeded : pgs is removed')

        # check states of all pgs in pg
        assert_roles_match_cc()

        # check quorum policy
        quorum_of_left_master = util.get_quorum(m)
        self.assertEqual(
            self.quorum_policy[1], quorum_of_left_master,
            'invalid quorum of left master, expected:%d, but:%d'
            % (self.quorum_policy[1], quorum_of_left_master))
        util.log('succeeded : quorum of left master=%d' % quorum_of_left_master)

        # 'role lconn' to master
        cmd = 'role lconn\r\n'
        ret = util.cmd_to_smr(m, cmd)
        self.assertEqual(ret, '+OK\r\n',
                         'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # wait for master election: up to 10 rounds, slave1 is asked first
        new_master = None
        for _round in range(10):
            for candidate in (s1, s2):
                if util.get_role_of_server(candidate) == c.ROLE_MASTER:
                    new_master = candidate
                    break
            if new_master is not None:
                break
            time.sleep(1)
        self.assertTrue(new_master is not None, 'failed to elect new master')
        util.log('succeeded : elect new master, master_id=%d' % new_master['id'])

        time.sleep(1)

        # check the numbers of master, slave, and lconn
        cnt_master = 0
        cnt_slave = 0
        cnt_lconn = 0
        for s in servers:
            role = util.get_role_of_server(s)
            if role == c.ROLE_MASTER:
                cnt_master += 1
            elif role == c.ROLE_SLAVE:
                cnt_slave += 1
            elif role == c.ROLE_LCONN:
                cnt_lconn += 1
        self.assertEqual(cnt_master, 1,
                         'failed : the number of master is %s, expected 1' % cnt_master)
        self.assertEqual(cnt_slave, 1,
                         'failed : the number of slave is %s, expected 1' % cnt_slave)
        self.assertEqual(cnt_lconn, 1,
                         'failed : the number of lconn is %s, expected 1' % cnt_lconn)

        # check states of all pgs in pg
        assert_roles_match_cc()

        # check quorum policy
        quorum_of_new_master = util.get_quorum(new_master)
        self.assertNotEqual(None, quorum_of_new_master, 'failed : find new master')
        self.assertEqual(
            self.quorum_policy[1], quorum_of_new_master,
            'invalid quorum of new master, expected:%d, but:%d'
            % (self.quorum_policy[1], quorum_of_new_master))
        util.log('succeeded : quorum of new master=%d' % quorum_of_new_master)
    finally:
        # shutdown load generators, also when an assertion above failed
        for load_gen in load_gens:
            load_gen.quit()
            load_gen.join()

    return 0
def test_7_dirty_network_fi(self):
    """Network-isolation test combined with confmaster fault injection.

    Setup: add iptables NAT rules that redirect 127.0.0.100 to 127.0.0.1,
    start the ``network_isolation_cluster_1`` cluster with the
    ``applicationContext-fi.xml`` confmaster context, and start one CRC16
    client per server.

    For every fault-injection spec yielded by ``ConfmasterWfFi``:
      1. block the network and wait until every 127.0.0.100 server shows
         ``active_role == '?'`` and ``mgmt_role == 'N'``;
      2. register the fault injection on the confmaster;
      3. unblock the network and wait for ``util.check_cluster`` to pass;
      4. for some specs, trigger an extra state change (``role lconn``,
         slave restart with ``setquorum 0``, or an SMR delay) and wait for
         a stable cluster again;
      5. check that the fault injection count has dropped to 0.

    Teardown: finalize the cluster, delete the NAT rules, and check the
    consistency reported by every client. The clients are always stopped
    in ``finally``.

    NOTE(review): the NAT rules and the cluster are only cleaned up when all
    assertions pass; a mid-test failure leaves them in place.
    """
    util.print_frame()

    clnts = []

    try:
        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding role
        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'add a forwarding role to iptables fail. output:%s' % out)

        cluster_name = 'network_isolation_cluster_1'
        # A list comprehension (instead of filter(...)[0]) also works where
        # filter() returns an iterator.
        cluster = [x for x in config.clusters if x['cluster_name'] == cluster_name][0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        def wait_and_log(seconds=4):
            """Sleep ``seconds`` seconds, logging a tick each second."""
            for tick in range(seconds):
                util.log('waiting... %d' % (tick + 1))
                time.sleep(1)

        def wait_until_stable(max_try):
            """Poll ``util.check_cluster`` once per second; return its last result."""
            ok = False
            for _ in range(max_try):
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                        [], check_quorum=True)
                if ok:
                    break
                time.sleep(1)
            return ok

        def wait_until_isolated(max_try=10):
            """Wait until every 127.0.0.100 server is reported as isolated."""
            for _ in range(max_try):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                                   isolated_states, check_quorum=True)
                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue
                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False
                if state_transition_done:
                    return True
                time.sleep(1)
            return False

        def poll_quorum(master, expected, max_try=20):
            """Poll the quorum once per second; return the last observed value."""
            q = -1
            for _ in range(max_try):
                q = util.get_quorum(master)
                if q == expected:
                    break
                time.sleep(1)
            return q

        # Create cluster
        ret = default_cluster.initialize_starting_up_smr_before_redis(
            cluster, conf={'cm_context':'applicationContext-fi.xml'})
        self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(
            util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port,
                               initial_state, check_quorum=True),
            'failed to check cluster state')

        # Start crc16 client
        for s in cluster['servers']:
            clnt = load_generator_crc16.Crc16Client(
                s['id'], s['ip'], s['gateway_port'], 3000, verbose=False)
            clnt.start()
            clnts.append(clnt)

        # Network isolation test
        cmfi = fi_confmaster.ConfmasterWfFi(
            ['ra', 'qa', 'me', 'yj', 'bj', 'mg'],
            ['lconn', 'slave', 'master', 'setquorum'],
            [True, False], 1)

        for fi in cmfi:
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
            ret = block_network(cluster, mgmt_ip, mgmt_port)
            self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

            wait_and_log()

            # Check cluster state
            self.assertTrue(wait_until_isolated(), 'Fail, state transition')

            # Fault injection
            try:
                added = fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port)
            except ValueError as e:
                # The previous handler formatted undefined names (cmd, reply)
                # and raised NameError; report what is actually known here.
                self.fail('Confmaster command error. fi: "%s", error: "%s"' % (str(fi), e))
            self.assertTrue(added, "Confmaster command fail. fi: %s" % str(fi))

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
            ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
            self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

            wait_and_log()

            # Check cluster state
            self.assertTrue(wait_until_stable(10), '[%s] Fail. unstable cluster.' % str(fi))

            recheck_cluster = False

            # 'bj', 'slave'
            if fi[0] == 'bj' and fi[1] == 'slave':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(s1)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                recheck_cluster = True
            # 'me', 'lconn'
            elif fi[0] == 'me' and fi[1] == 'lconn':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.role_lconn(m)
                self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                recheck_cluster = True
            # 'qa', 'setquorum'
            elif fi[0] == 'qa' and fi[1] == 'setquorum':
                m, s1, s2 = util.get_mss(cluster)

                # shutdown
                ret = testbase.request_to_shutdown_smr(s1)
                self.assertEqual(0, ret, '[%s] failed to shutdown smr%d' % (str(fi), s1['id']))
                ret = testbase.request_to_shutdown_redis(s1)
                self.assertEqual(0, ret, '[%s] failed to shutdown redis%d' % (str(fi), s1['id']))

                # Check quorum
                self.assertEqual(1, poll_quorum(m, 1), "[%s] check quorum fail." % str(fi))

                # Modify quorum
                ret = util.cmd_to_smr_addr(m['ip'], m['smr_mgmt_port'], 'setquorum 0\r\n')
                self.assertEqual("+OK\r\n", ret, '[%s] "setquorum 0" fail.' % str(fi))

                # Check quorum: it is still expected to read 1 afterwards
                self.assertEqual(1, poll_quorum(m, 1), "[%s] check quorum fail." % str(fi))

                # recovery
                ret = testbase.request_to_start_smr(s1)
                self.assertEqual(0, ret, '[%s] failed to start smr' % str(fi))
                ret = testbase.request_to_start_redis(s1, max_try=120)
                self.assertEqual(0, ret, '[%s] failed to start redis' % str(fi))
                ret = testbase.wait_until_finished_to_set_up_role(s1, 11)
                self.assertEqual(0, ret, '[%s] failed to role change. smr_id:%d' % (str(fi), s1['id']))

                recheck_cluster = True
            # 'setquorum'
            elif fi[1] == 'setquorum':
                m, s1, s2 = util.get_mss(cluster)
                ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'],
                                           'fi delay sleep 1 8000\r\n', timeout=20)
                self.assertEqual("+OK\r\n", ret,
                                 '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                recheck_cluster = True

            if recheck_cluster:
                # Check cluster state
                self.assertTrue(wait_until_stable(20), '[%s] Fail. unstable cluster.' % str(fi))

            # Check fault injection
            ok = False
            for _ in range(10):
                count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                if count == 0:
                    ok = True
                    break
                time.sleep(0.5)
            self.assertTrue(ok, "[%s] fail. failt injection had not been triggered." % str(fi))

        # Shutdown cluster
        ret = default_cluster.finalize( cluster )
        self.assertEqual(ret, 0, '[%s] failed to TestMaintenance.finalize' % str(fi))

        # Delete forwarding role
        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)

        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'delete a forwarding role to iptables fail. output:%s' % out)

        for clnt in clnts:
            self.assertTrue(clnt.is_consistency(), '[%s] data consistency error!' % str(fi))

    finally:
        # Ask every client to quit first, then wait for each of them.
        for clnt in clnts:
            clnt.quit()
        for clnt in clnts:
            clnt.join()