Example #1
    def test_all_pgs_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr_master = smr_mgmt.SMR( m['id'] )
        ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr_slave1 = smr_mgmt.SMR( s1['id'] )
        ret = smr_slave1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )
        smr_slave2 = smr_mgmt.SMR( s2['id'] )
        ret = smr_slave2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        m_ts = util.get_timestamp_of_pgs( m )
        s1_ts = util.get_timestamp_of_pgs( s1 )
        s2_ts = util.get_timestamp_of_pgs( s2 )

        smr_master.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has been compiled with the gcov option.' )

        smr_slave1.write( 'fi delay sleep 1 8000\r\n' )
        smr_slave2.write( 'fi delay sleep 1 8000\r\n' )

        time.sleep( 10 )

        # check consistency
        ok = False
        for try_cnt in xrange(20):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
            if ok:
                break
            time.sleep(0.5)
        self.assertTrue(ok, 'Unstable cluster state')

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        # set values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write( cmd )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # check new values (m)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res[:-2], i) )

        # check new values (s1)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write( cmd )
            redis1.read_until( '\r\n' )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) )

        # check new values (s2)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s2['id'], res[:-2], i) )

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
            print ok
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0
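
The hang above is injected by sending the SMR fault-injection command "fi delay sleep 1 <msec>" over the management port; the command is only available in gcov test builds, hence the '-ERR not supported' check. Below is a minimal helper sketch of that repeated pattern, assuming the smr_mgmt.SMR interface used above (connect/write/read_until/disconnect) and assuming the last argument is the delay in milliseconds; inject_hang is an illustrative name, not an existing helper.

import smr_mgmt   # management client from this test suite

def inject_hang(test, server, msec):
    # Connect to the SMR management port of 'server' and make the
    # replicator sleep for roughly 'msec' milliseconds.
    smr = smr_mgmt.SMR(server['id'])
    ret = smr.connect(server['ip'], server['smr_mgmt_port'])
    test.assertEqual(ret, 0, 'failed to connect to smr. %s:%d' %
                     (server['ip'], server['smr_mgmt_port']))
    smr.write('fi delay sleep 1 %d\r\n' % msec)
    reply = smr.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        test.fail('make sure that smr has been compiled with the gcov option.')
    smr.disconnect()

With such a helper the test above could simply call inject_hang(self, m, 8000) for the master and each slave before sleeping.
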
Example #2
    def failover_while_hang( self, server ):
        # timestamp before hang
        ts_before = util.get_timestamp_of_pgs( server )
        self.assertNotEqual( ts_before, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (server['id'], ts_before) )

        # hang
        util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.' % (server['id'], server['ip'], server['smr_mgmt_port']))
        smr = smr_mgmt.SMR( server['id'] )
        ret = smr.connect( server['ip'], server['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to smr. %s:%d' % (server['ip'], server['smr_mgmt_port']) )
        smr.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has been compiled with the gcov option.' )

        time.sleep( 4 )

        # check state F
        max_try = 20
        expected = 'F'
        for i in range( 0, max_try):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEquals( expected , state,
                           'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
        util.log( 'succeeded : pgs%d state changed to F.' % server['id'] )

        # shutdown
        util.log( 'shutdown pgs%d while hanging.' % server['id'] )
        ret = testbase.request_to_shutdown_smr( server )
        self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % server['id'] )
        ret = testbase.request_to_shutdown_redis( server )
        self.assertEquals( ret, 0, 'failed to shutdown redis. id:%d' % server['id'] )

        # check state F
        max_try = 20
        expected = 'F'
        for i in range( 0, max_try):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEquals( expected , state,
                           'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
        util.log( 'succeeded : pgs%d state changed to F.' % server['id'] )

        # recovery
        util.log( 'restart pgs%d.' % server['id'] )
        ret = testbase.request_to_start_smr( server )
        self.assertEqual( ret, 0, 'failed to start smr. id:%d' % server['id'] )

        ret = testbase.request_to_start_redis( server )
        self.assertEqual( ret, 0, 'failed to start redis. id:%d' % server['id'] )

        wait_count = 20
        ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
        self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

        redis = redis_mgmt.Redis( server['id'] )
        ret = redis.connect( server['ip'], server['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        # check state N
        max_try = 20
        expected = 'N'
        for i in range( 0, max_try):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEquals( expected , state,
                           'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
        util.log( 'succeeded : pgs%d state changed to N.' % server['id'] )

        # wait for rejoin as a slave
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( server )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( server )
                if ts_after != -1 and ts_before != ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave' )
        util.log( 'succeeded : pgs%d joined as a slave.' % server['id'] )

        return 0
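
The three "check state" loops above are the same poll-until-expected pattern. Here is a compact sketch of it, assuming util.get_smr_state(server, leader_cm) as used above; wait_for_smr_state is an illustrative name.

import time
import util   # helpers from this test suite

def wait_for_smr_state(test, server, leader_cm, expected, max_try=20, interval=1):
    # Poll the leader CM until the PGS reports the expected state ('F', 'N', ...),
    # then assert on the last observed state.
    state = None
    for _ in range(max_try):
        state = util.get_smr_state(server, leader_cm)
        if state == expected:
            break
        time.sleep(interval)
    test.assertEqual(expected, state,
                     'server%d - state:%s, expected:%s' % (server['id'], state, expected))
    return state
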
Example #3
    def test_4_role_change_with_failover(self):
        util.print_frame()

        loop_cnt = 0
        while loop_cnt < 5:
            util.log('')
            util.log('Loop:%d' % loop_cnt)

            util.log("States (before role change)")
            util.log_server_state(self.cluster)

            target = random.choice(self.cluster['servers'])

            # bgsave
            ret = util.bgsave(target)
            self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

            # shutdown
            util.log('shutdown pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_shutdown_smr(target)
            self.assertEqual(ret, 0, 'failed to shutdown smr')

            ret = testbase.request_to_shutdown_redis(target)
            self.assertEquals(ret, 0, 'failed to shutdown redis')

            r = ''
            expected = 'N'
            for fc_cnt in xrange(20):
                r = util.get_smr_role_of_cm(target, self.leader_cm)
                if r == expected:
                    break
                time.sleep(0.5)
            self.assertEquals(r, expected, 'failure detection error.')

            running_servers = []
            for s in self.cluster['servers']:
                if s != target:
                    running_servers.append(s)

            # Get old timestamp
            old_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamps[s['id']] = ts

            # Start load generator
            self.load_gen_list = {}
            util.log('start load generator')
            for i in range(self.max_load_generator):
                ip, port = util.get_rand_gateway(self.cluster)
                load_gen = load_generator.LoadGenerator(i, ip, port)
                load_gen.start()
                self.load_gen_list[i] = load_gen

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

            # Role change
            master_id = util.role_change(self.leader_cm,
                                         self.cluster['cluster_name'],
                                         s1['id'])
            self.assertNotEqual(master_id, -1, 'role_change failed')

            util.log("States (after role change)")
            util.log_server_state(self.cluster)

            # Check - get new timestamp
            new_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamps[s['id']] = ts

            # Check - compare old timestamps and new timestamps
            for s in running_servers:
                old_ts = old_timestamps[s['id']]
                new_ts = new_timestamps[s['id']]
                self.assertNotEqual(
                    old_ts, new_ts,
                    'Timestamp of a running server has not changed. %d->%d' %
                    (old_ts, new_ts))

            # Check quorum
            m = self.cluster['servers'][master_id]
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(
                ok, 'unexpected quorum(after role change). expected:%s' %
                (expected))

            # recovery
            util.log('recovery pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_start_smr(target)
            self.assertEqual(ret, 0, 'failed to start smr')
            util.log('start smr-replicator done')

            ret = testbase.request_to_start_redis(target, 60)
            self.assertEqual(ret, 0, 'failed to start redis')
            util.log('start redis-arc done')

            ret = testbase.wait_until_finished_to_set_up_role(target,
                                                              max_try=300)
            self.assertEquals(
                ret, 0, 'failed to role change. smr_id:%d' % (target['id']))

            util.log("States (after recovery)")
            util.log_server_state(self.cluster)

            # Check cluster state
            normal_state = False
            for i in xrange(20):
                normal_state = util.check_cluster(self.cluster['cluster_name'],
                                                  self.leader_cm['ip'],
                                                  self.leader_cm['cm_port'],
                                                  check_quorum=True)
                if normal_state:
                    break
                time.sleep(0.5)
            self.assertTrue(normal_state, "Unstable cluster state")

            # Check quorum
            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(
                ok,
                'unexpected quorum(after recovery). expected:%s' % (expected))

            # Check consistency
            util.log('stop load generator')
            for i in range(self.max_load_generator):
                self.load_gen_list[i].quit()
            for i in range(self.max_load_generator):
                self.load_gen_list[i].join()
                self.assertTrue(self.load_gen_list[i].isConsistent(),
                                'Inconsistent after migration')
                self.load_gen_list.pop(i, None)

            loop_cnt += 1

        return 0
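
Starting the load generators and later stopping them with a consistency assertion is repeated boilerplate in the role-change tests. Below is a sketch of both halves, assuming the load_generator.LoadGenerator interface used above (start, quit, join, isConsistent); the function names are illustrative, not part of the suite.

import util
import load_generator   # from this test suite

def start_load_generators(cluster, count):
    # Spawn 'count' load generators, each targeting a random gateway.
    gens = {}
    for i in range(count):
        ip, port = util.get_rand_gateway(cluster)
        gen = load_generator.LoadGenerator(i, ip, port)
        gen.start()
        gens[i] = gen
    return gens

def stop_load_generators(test, gens):
    # Stop every generator, wait for it, and assert that its data stayed consistent.
    for i in list(gens.keys()):
        gens[i].quit()
    for i in list(gens.keys()):
        gens[i].join()
        test.assertTrue(gens[i].isConsistent(), 'inconsistent data after load')
        gens.pop(i, None)
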
Example #4
    def test_two_slaves_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs( s1 )
        self.assertNotEqual( ts_before1, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1) )

        ts_before2 = util.get_timestamp_of_pgs( s2 )
        self.assertNotEqual( ts_before2, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2) )

        # hang
        smr1 = smr_mgmt.SMR( s1['id'] )
        ret = smr1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        smr2 = smr_mgmt.SMR( s2['id'] )
        ret = smr2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        smr1.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr1.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has been compiled with the gcov option.' )

        smr2.write( 'fi delay sleep 1 8000\r\n' )
        time.sleep( 7 )

        # wait for rejoin as a slave
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s1 )
                if ts_after != -1 and ts_before1 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s2 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s2 )
                if ts_after != -1 and ts_before2 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
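
The GET checks above call read_until('\r\n') twice per key because redis answers with a bulk string: a '$<length>' header line followed by the value line. Below is a sketch of that verification loop as a helper, assuming the redis_mgmt.Redis connection object used above; assert_values is an illustrative name.

def assert_values(test, conn, key_base, begin, end):
    # Verify that each key key_base+i holds the value i, for i in [begin, end).
    for i in range(begin, end):
        conn.write('get %s%d\r\n' % (key_base, i))
        conn.read_until('\r\n')           # bulk length line, e.g. '$5\r\n'
        res = conn.read_until('\r\n')     # value line
        test.assertEqual(res, '%d\r\n' % i,
                         'unexpected value for %s%d: %s' % (key_base, i, res[:-2]))
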
Example #5
    def test_1_role_change(self):
        util.print_frame()

        self.load_gen_list = {}

        # Start load generator
        util.log("Start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Loop (smr: 3 copy)
        for i in range(30):
            target_server = util.get_server_by_role(self.cluster['servers'],
                                                    'slave')
            self.assertNotEquals(target_server, None, 'Get slave fail.')
            target = target_server['id']

            print ''
            util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state(self.cluster)
            old_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm,
                                      self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target + 1) % 3
            util.log('Change role success.')

            # Wait until role change finished
            for s in self.cluster['servers']:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    try:
                        pong = util.pingpong(s['ip'], s['redis_port'])
                        if pong != None and pong == '+PONG\r\n':
                            ok = True
                            break
                    except:
                        pass
                    time.sleep(0.2)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state(self.cluster)
            new_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(3):
                self.assertNotEqual(
                    old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' %
                    (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(),
                                'Data inconsistency after role_change')

        # Loop (smr: 2 copy)
        self.__del_server(self.cluster['servers'][0])
        servers = [self.cluster['servers'][1], self.cluster['servers'][2]]

        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'],
                                              self.leader_cm['ip'],
                                              self.leader_cm['cm_port'],
                                              check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        for i in range(30):
            s = util.get_server_by_role(servers, 'slave')
            target = s['id']

            print ''
            util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state(self.cluster)
            old_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm,
                                      self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target) % 2 + 1
            util.log('Change role success.')

            # Wait until role change finished
            for s in servers:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong != None and pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state(self.cluster)
            new_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(2):
                self.assertNotEqual(
                    old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' %
                    (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(),
                                'Data inconsistency after role_change')

        # Go back to initial configuration
        self.assertTrue(
            util.install_pgs(self.cluster,
                             self.cluster['servers'][0],
                             self.leader_cm,
                             rm_ckpt=False), 'failed to recover pgs.')
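
The "wait until role change finished" loops above just poll each redis with a ping. Here is a sketch of that wait, assuming util.pingpong(ip, port) returns the raw reply string as in the loops above; wait_for_redis_pong is an illustrative name.

import time
import util   # from this test suite

def wait_for_redis_pong(test, server, max_try=20, interval=0.2):
    # Poll redis until it answers +PONG, tolerating transient connection errors.
    ok = False
    for _ in range(max_try):
        try:
            if util.pingpong(server['ip'], server['redis_port']) == '+PONG\r\n':
                ok = True
                break
        except Exception:
            pass
        time.sleep(interval)
    test.assertTrue(ok, 'redis state error. %s:%d' % (server['ip'], server['redis_port']))
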
Example #6
    def role_change_with_hanging_pgs(self, hanging_servers, running_servers,
                                     target_id, master):
        util.log('hanging_servers:%s' % hanging_servers)
        util.log('running_servers:%s' % running_servers)
        util.log('target_id:%s' % target_id)

        # Initial data
        util.put_some_data(self.cluster, 3, 10)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        # Get old timestamp
        old_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # hang
        for s in hanging_servers:
            smr = smr_mgmt.SMR(s['id'])
            ret = smr.connect(s['ip'], s['smr_mgmt_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to smr. %s:%d' %
                (s['ip'], s['smr_mgmt_port']))
            util.log("PGS '%d' hang" % s['id'])

            smr.write('fi delay sleep 1 13000\r\n')
            reply = smr.read_until('\r\n', 1)
            if reply != None and reply.find('-ERR not supported') != -1:
                self.assertEqual(
                    0, 1, 'make sure that smr has been compiled with the gcov option.')
            smr.disconnect()

        # Role change
        master_id = util.role_change(self.leader_cm,
                                     self.cluster['cluster_name'], target_id)
        self.assertEqual(master_id, -1,
                         'expected role_change to fail, but it succeeded')

        # Check rollback - check quorum
        if master not in hanging_servers:
            expected = 2
            ok = self.__check_quorum(master, expected)
            self.assertTrue(ok,
                            'rollback quorum fail. expected:%s' % (expected))

        # Check rollback - get new timestamp
        new_timestamps_in_hang = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs(s)
            new_timestamps_in_hang[s['id']] = ts

        # Check rollback - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps_in_hang[s['id']]
            self.assertEqual(
                old_ts, new_ts,
                'Timestamp of a running server has changed. %d->%d' %
                (old_ts, new_ts))

        time.sleep(16)
        util.log("States (after role change)")
        util.log_server_state(self.cluster)

        self.load_gen_list = {}

        # Start load generator
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Check quorum
        if master in hanging_servers:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok,
                            'rollback quorum fail. expected:%s' % (expected))

        # Check cluster state
        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'],
                                              self.leader_cm['ip'],
                                              self.leader_cm['cm_port'],
                                              check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        # Check consistency
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(),
                            'Inconsistent after migration')
            self.load_gen_list.pop(i, None)
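
The rollback check above works by snapshotting each replicator's timestamp before the hang and comparing afterwards: an unchanged timestamp means the PGS was not restarted, a changed one means it went through recovery. Below is a sketch of that snapshot-and-compare pair, assuming util.get_timestamp_of_pgs(server) as used above; the function names are illustrative.

import util   # from this test suite

def snapshot_timestamps(servers):
    # Record the current replicator timestamp of each PGS, keyed by id.
    return dict((s['id'], util.get_timestamp_of_pgs(s)) for s in servers)

def assert_timestamps_unchanged(test, servers, old):
    # Fail if any of the given servers was restarted (its timestamp changed).
    for s in servers:
        new_ts = util.get_timestamp_of_pgs(s)
        test.assertEqual(old[s['id']], new_ts,
                         'timestamp of pgs%d changed. %d->%d' % (s['id'], old[s['id']], new_ts))
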
Example #7
    def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
        util.log('hanging_servers:%s' % hanging_servers)
        util.log('running_servers:%s' % running_servers)
        util.log('target_id:%s' % target_id)

        # Initial data
        util.put_some_data(self.cluster, 3, 10)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        # Get old timestamp
        old_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # hang
        for s in hanging_servers:
            smr = smr_mgmt.SMR(s['id'])
            ret = smr.connect(s['ip'], s['smr_mgmt_port'])
            self.assertEqual(ret, 0, 'failed to connect to smr. %s:%d' % (s['ip'], s['smr_mgmt_port']))
            util.log("PGS '%d' hang" % s['id'])

            smr.write('fi delay sleep 1 13000\r\n')
            reply = smr.read_until('\r\n', 1)
            if reply != None and reply.find('-ERR not supported') != -1:
                self.assertEqual(0, 1, 'make sure that smr has been compiled with the gcov option.')
            smr.disconnect()

        # Role change
        master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
        self.assertEqual(master_id, -1, 'expected role_change to fail, but it succeeded')

        # Check rollback - check quorum
        if master not in hanging_servers:
            expected = 1
            ok = self.__check_quorum(master, expected)
            self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

        # Check rollback - get new timestamp
        new_timestamps_in_hang = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamps_in_hang[s['id']] = ts

        # Check rollback - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps_in_hang[s['id']]
            self.assertEqual(old_ts, new_ts, 'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

        time.sleep(16)
        util.log("States (after role change)")
        util.log_server_state( self.cluster )

        self.load_gen_list = {}

        # Start load generator
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Check quorum
        if master in hanging_servers:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

        # Get new timestamp
        new_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamps[s['id']] = ts

        # Compare old timestamps and new timestamps
        for s in self.cluster['servers']:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps[s['id']]
            if master in hanging_servers and len(running_servers) != 0:
                self.assertNotEqual(old_ts, new_ts, 'Timestamp of a hanging server has not changed. %d->%d' % (old_ts, new_ts))
            else:
                self.assertEqual(old_ts, new_ts, 'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

        # Check consistency
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
            self.load_gen_list.pop(i, None)
Example #8
    def test_4_role_change_with_failover(self):
        util.print_frame()

        loop_cnt = 0
        while loop_cnt < 5:
            util.log('')
            util.log('Loop:%d' % loop_cnt)

            util.log("States (before role change)")
            util.log_server_state(self.cluster)

            target = random.choice(self.cluster['servers'])

            # bgsave
            ret = util.bgsave(target)
            self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

            # shutdown
            util.log('shutdown pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_shutdown_smr( target )
            self.assertEqual( ret, 0, 'failed to shutdown smr' )

            ret = testbase.request_to_shutdown_redis( target )
            self.assertEquals( ret, 0, 'failed to shutdown redis' )

            running_servers = []
            for s in self.cluster['servers']:
                if s != target:
                    running_servers.append(s)

            # Get old timestamp
            old_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamps[s['id']] = ts

            # Start load generator
            self.load_gen_list = {}
            util.log('start load generator')
            for i in range(self.max_load_generator):
                ip, port = util.get_rand_gateway(self.cluster)
                load_gen = load_generator.LoadGenerator(i, ip, port)
                load_gen.start()
                self.load_gen_list[i] = load_gen

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

            # Role change
            master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], s1['id'])
            self.assertNotEqual(master_id, -1, 'role_change failed')

            util.log("States (after role change)")
            util.log_server_state(self.cluster)

            # Check - get new timestamp
            new_timestamps= {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamps[s['id']] = ts

            # Check - compare old timestamps and new timestamps
            for s in running_servers:
                old_ts = old_timestamps[s['id']]
                new_ts = new_timestamps[s['id']]
                self.assertNotEqual(old_ts, new_ts, 'Timestamp of a running server has not changed. %d->%d' % (old_ts, new_ts))

            # Check quorum
            m = self.cluster['servers'][master_id]
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after role change). expected:%s' % (expected))

            # recovery
            util.log('recovery pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_start_smr( target )
            self.assertEqual( ret, 0, 'failed to start smr' )
            util.log('start smr-replicator done')

            ret = testbase.request_to_start_redis( target, 60 )
            self.assertEqual( ret, 0, 'failed to start redis' )
            util.log('start redis-arc done')

            ret = testbase.wait_until_finished_to_set_up_role( target, max_try=300)
            self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (target['id']) )

            util.log("States (after recovery)")
            util.log_server_state(self.cluster)

            # Check quorum
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after recovery). expected:%s' % (expected))

            # Check consistency
            util.log('stop load generator')
            for i in range(self.max_load_generator):
                self.load_gen_list[i].quit()
            for i in range(self.max_load_generator):
                self.load_gen_list[i].join()
                self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
                self.load_gen_list.pop(i, None)

            loop_cnt += 1

        return 0
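
Nearly every check in these tests is a bounded poll: call a predicate up to N times with a short sleep between tries. Below is a generic sketch of that loop, shown against util.check_cluster with the signature used above; poll_until is an illustrative name, not an existing helper.

import time
import util   # from this test suite

def poll_until(predicate, max_try=20, interval=0.5):
    # Call predicate() up to max_try times; return True as soon as it succeeds.
    for _ in range(max_try):
        if predicate():
            return True
        time.sleep(interval)
    return False

# Example: wait for the cluster to settle after recovery
#   ok = poll_until(lambda: util.check_cluster(self.cluster['cluster_name'],
#                                              self.leader_cm['ip'],
#                                              self.leader_cm['cm_port'],
#                                              check_quorum=True))
#   self.assertTrue(ok, 'Unstable cluster state')
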
Example #9
    def test_1_role_change(self):
        util.print_frame()

        self.load_gen_list = {}

        # Start load generator
        util.log("Start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Loop (smr: 3 copy)
        target_server = util.get_server_by_role(self.cluster['servers'], 'slave')
        self.assertNotEquals(target_server, None, 'Get slave fail.')
        target = target_server['id']
        for i in range(30):
            print ''
            util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state( self.cluster )
            old_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs( s )
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target + 1) % 3
            util.log('Change role success.')

            # Wait until role change finished
            for s in self.cluster['servers']:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong != None and pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state( self.cluster )
            new_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(3):
                self.assertNotEqual(old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' % (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')

        # Loop (smr: 2 copy)
        self.__del_server(self.cluster['servers'][0])
        servers = [self.cluster['servers'][1], self.cluster['servers'][2]]
        s = util.get_server_by_role(servers, 'slave')
        target = s['id']
        for i in range(30):
            print ''
            util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state( self.cluster )
            old_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs( s )
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target) % 2 + 1
            util.log('Change role success.')

            # Wait until role change finished
            for s in servers:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong != None and pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state( self.cluster )
            new_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(2):
                self.assertNotEqual(old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' % (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')

    def failover_while_hang(self, server):
        # timestamp before hang
        ts_before = util.get_timestamp_of_pgs(server)
        self.assertNotEqual(
            ts_before, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (server['id'], ts_before))

        # hang
        util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.' %
                 (server['id'], server['ip'], server['smr_mgmt_port']))
        smr = smr_mgmt.SMR(server['id'])
        ret = smr.connect(server['ip'], server['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to smr. %s:%d' %
            (server['ip'], server['smr_mgmt_port']))
        smr.write('fi delay sleep 1 10000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has been compiled with the gcov option.')

        time.sleep(4)

        # check state F
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s' %
            (server['id'], state, expected))
        util.log('succeeded : pgs%d state changed to F.' % server['id'])

        # shutdown
        util.log('shutdown pgs%d while hanging.' % server['id'])
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0,
                         'failed to shutdown smr. id:%d' % server['id'])
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEquals(ret, 0,
                          'failed to shutdown redis. id:%d' % server['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s' %
            (server['id'], state, expected))
        util.log('succeeded : pgs%d state changed to F.' % server['id'])

        # recovery
        util.log('restart pgs%d.' % server['id'])
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr. id:%d' % server['id'])

        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis. id:%d' % server['id'])

        wait_count = 20
        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEquals(ret, 0,
                          'failed to role change. smr_id:%d' % (server['id']))

        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        # check state N
        max_try = 20
        expected = 'N'
        for i in range(0, max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s' %
            (server['id'], state, expected))
        util.log('succeeded : pgs%d state changed to N.' % server['id'])

        # wait for rejoin as a slave
        success = False
        for i in range(20):
            role = util.get_role_of_server(server)
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs(server)
                if ts_after != -1 and ts_before != ts_after:
                    success = True
                    break
            time.sleep(1)
        self.assertEqual(success, True, 'failed to rejoin as a slave')
        util.log('succeeded : pgs%d joined as a slave.' % server['id'])

        return 0

    def test_all_pgs_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave1 = smr_mgmt.SMR(s1['id'])
        ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave1. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))
        smr_slave2 = smr_mgmt.SMR(s2['id'])
        ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave2. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        m_ts = util.get_timestamp_of_pgs(m)
        s1_ts = util.get_timestamp_of_pgs(s1)
        s2_ts = util.get_timestamp_of_pgs(s2)

        smr_master.write('fi delay sleep 1 8000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has been compiled with the gcov option.')

        smr_slave1.write('fi delay sleep 1 8000\r\n')
        smr_slave2.write('fi delay sleep 1 8000\r\n')

        time.sleep(10)

        # check consistency
        ok = False
        for try_cnt in xrange(20):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            if ok:
                break
            time.sleep(0.5)
        self.assertTrue(ok, 'Unstable cluster state')

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # set values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # check new values (m)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res[:-2], i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check new values (s2)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s2['id'], res[:-2], i))

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            print ok
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0

    def test_two_slaves_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs(s1)
        self.assertNotEqual(
            ts_before1, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s1['id'], ts_before1))

        ts_before2 = util.get_timestamp_of_pgs(s2)
        self.assertNotEqual(
            ts_before2, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s2['id'], ts_before2))

        # hang
        smr1 = smr_mgmt.SMR(s1['id'])
        ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave1. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        smr2 = smr_mgmt.SMR(s2['id'])
        ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave2. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        smr1.write('fi delay sleep 1 8000\r\n')
        reply = smr1.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has been compiled with the gcov option.')

        smr2.write('fi delay sleep 1 8000\r\n')
        time.sleep(7)

        success = False
        for i in xrange(20):
            ret = util.check_cluster(self.cluster['cluster_name'],
                                     self.mgmt_ip,
                                     self.mgmt_port,
                                     check_quorum=True)
            if ret:
                success = True
                break
            time.sleep(1)
        self.assertEqual(success, True, 'unstable cluster')

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis2. %s != %d' % (res, i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0

    def slave_hang(self):
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # timestamp before hang
        ts_before = util.get_timestamp_of_pgs(s1)
        self.assertNotEqual(
            ts_before, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s1['id'], ts_before))

        # hang
        util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.' %
                 (s1['id'], s1['ip'], s1['smr_mgmt_port']))
        smr = smr_mgmt.SMR(s1['id'])
        ret = smr.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave1. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))
        smr.write('fi delay sleep 1 6000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has been compiled with the gcov option.')
        time.sleep(7)

        # wait for rejoin as a slave
        success = False
        for i in range(20):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs(s1)
                if ts_after != -1 and ts_before != ts_after:
                    success = True
                    break
            time.sleep(1)
        self.assertEqual(success, True, 'failed to rejoin as a slave')

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # check new values
            for i in range(10000, 20000):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write(cmd)
                redis2.read_until('\r\n')
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'failed to get values from redis2. %s != %d' % (res, i))

        # check new values
        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEquals(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis0. %s != %d' % (res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
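
All of the tests above seed data the same way: write raw 'set' commands through a gateway connection and expect '+OK' for every key. Below is a sketch of that seeding loop as a helper, assuming the gateway_mgmt.Gateway interface used above (connect, write, read_until); set_values_via_gateway is an illustrative name, not an existing helper.

import util
import gateway_mgmt   # from this test suite

def set_values_via_gateway(test, cluster, key_base, begin, end):
    # Connect to a random gateway and write key_base+i = i for i in [begin, end).
    ip, port = util.get_rand_gateway(cluster)
    gw = gateway_mgmt.Gateway(cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    test.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))
    for i in range(begin, end):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        test.assertEqual(res, '+OK\r\n',
                         'failed to set values. cmd:%s, res:%s' % (cmd[:-2], res))
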