def test_quorum_policy_of_hanging_master(self):
        """Check the quorum of a hanging master and of its elected successor.

        A delay command is sent to the current master, the test waits for one
        of the slaves to become master, and every confmaster is shut down.
        The old master must then report quorum 2 and the new master quorum 1.
        The confmasters are recovered before returning.

        Returns:
            0 once all assertions have passed.
        """
        util.print_frame()

        # Resolve the current master and both slaves.
        master, slave1, slave2 = util.get_mss(self.cluster)
        for node, message in ((master, 'master is None.'),
                              (slave1, 'slave1 is None.'),
                              (slave2, 'slave2 is None.')):
            self.assertNotEqual(node, None, message)

        # Make the master hang by sending a delay command over its SMR
        # management connection, then pause before polling the slaves.
        smr = smr_mgmt.SMR(master['id'])
        connect_result = smr.connect(master['ip'], master['smr_mgmt_port'])
        self.assertEqual(
            connect_result, 0,
            'failed to connect to master. %s:%d' % (master['ip'], master['smr_mgmt_port']))
        smr.write('fi delay sleep 1 15000\r\n')
        time.sleep(5)

        # Poll both slaves (slave1 first) up to seven times, one second
        # apart, until one of them reports the master role.
        new_master = None
        for _ in range(7):
            for candidate in (slave1, slave2):
                if util.get_role_of_server(candidate) == c.ROLE_MASTER:
                    new_master = candidate
                    break
            if new_master is not None:
                break
            time.sleep(1)
        elected = new_master is not None
        self.assertEqual(elected, True, 'failed to forced master election')

        # Shut down the confmaster of every server in the cluster.
        for server in self.cluster['servers']:
            util.shutdown_cm(server['id'])

        # Wait until the hanging master wakes up.
        time.sleep(5)

        # Quorum reported by the (formerly hanging) old master.
        expected_old = 2
        old_quorum = util.get_quorum(master)
        self.assertEqual(
            expected_old, old_quorum,
            'invalid quorum of haning master, expected:%d, but:%d' % (expected_old, old_quorum))
        util.log('succeeded : quorum of haning master=%d' % old_quorum)

        # Quorum reported by the newly elected master.
        expected_new = 1
        new_quorum = util.get_quorum(new_master)
        self.assertNotEqual(None, new_quorum, 'failed : find new master')
        self.assertEqual(
            expected_new, new_quorum,
            'invalid quorum of new master, expected:%d, but:%d' % (expected_new, new_quorum))
        util.log('succeeded : quorum of new master=%d' % new_quorum)

        # Go back to the initial configuration by recovering the confmasters.
        recovered = util.recover_confmaster(self.cluster, [0, 1, 2], 0)
        self.assertTrue(recovered, 'failed to recover confmaster')

        return 0
    def test_quorum_policy_of_hanging_master(self):
        """Check that a hanging master and its successor follow the quorum policy.

        A delay command is sent to the current master, the test waits for one
        of the slaves to become master, and every confmaster is shut down.
        Both the old and the new master must then report the quorum stored in
        ``self.quorum_policy[1]``.

        Returns:
            0 once all assertions have passed.
        """
        util.print_frame()

        # Resolve the current master and both slaves.
        master, slave1, slave2 = util.get_mss(self.cluster)
        for node, message in ((master, 'master is None.'),
                              (slave1, 'slave1 is None.'),
                              (slave2, 'slave2 is None.')):
            self.assertNotEqual(node, None, message)

        # Make the master hang by sending a delay command over its SMR
        # management connection, then pause before polling the slaves.
        smr = smr_mgmt.SMR(master['id'])
        connect_result = smr.connect(master['ip'], master['smr_mgmt_port'])
        self.assertEqual(
            connect_result, 0,
            'failed to connect to master. %s:%d' % (master['ip'], master['smr_mgmt_port']))
        smr.write('fi delay sleep 1 15000\r\n')
        time.sleep(5)

        # Poll both slaves (slave1 first) up to seven times, one second
        # apart, until one of them reports the master role.
        new_master = None
        for _ in range(7):
            for candidate in (slave1, slave2):
                if util.get_role_of_server(candidate) == c.ROLE_MASTER:
                    new_master = candidate
                    break
            if new_master is not None:
                break
            time.sleep(1)
        elected = new_master is not None
        self.assertEqual(elected, True, 'failed to forced master election')

        # Shut down the confmaster of every server in the cluster.
        for server in self.cluster['servers']:
            util.shutdown_cm(server['id'])

        # Wait until the hanging master wakes up.
        time.sleep(5)

        # Quorum reported by the (formerly hanging) old master.
        old_quorum = util.get_quorum(master)
        expected_old = self.quorum_policy[1]
        self.assertEqual(
            expected_old, old_quorum,
            'invalid quorum of haning master, expected:%d, but:%d' % (expected_old, old_quorum))
        util.log('succeeded : quorum of haning master=%d' % old_quorum)

        # Quorum reported by the newly elected master.
        new_quorum = util.get_quorum(new_master)
        self.assertNotEqual(None, new_quorum, 'failed : find new master')
        expected_new = self.quorum_policy[1]
        self.assertEqual(
            expected_new, new_quorum,
            'invalid quorum of new master, expected:%d, but:%d' % (expected_new, new_quorum))
        util.log('succeeded : quorum of new master=%d' % new_quorum)

        return 0
Exemple #3
0
    def failover(self, server):
        """Shut down the SMR and redis of ``server`` and bring them back up.

        After the shutdown the method polls ``util.get_smr_state`` (up to 20
        times, one second apart) until it reports 'F'; after the restart it
        polls the same way for 'N'. Any failed step fails the test.

        Args:
            server: server description dict; the 'id', 'ip' and 'redis_port'
                keys are read here, the rest is passed on to the helpers.
        """
        # shutdown
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEqual(ret, 0, 'failed to shutdown redis')

        # check state F
        max_try = 20
        expected = 'F'
        for _ in range(max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEqual(
            expected, state, 'server%d - state:%s, expected:%s' %
            (server['id'], state, expected))

        # recovery
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server, 10)
        self.assertEqual(ret, 0,
                         'failed to role change. smr_id:%d' % (server['id']))

        # The redis connection is only a reachability probe: close it once
        # the connect result has been checked instead of leaving it open.
        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')
        redis.disconnect()

        # check state N
        expected = 'N'
        for _ in range(max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        # The role is fetched only to enrich the failure message.
        role = util.get_role_of_server(server)
        self.assertEqual(
            expected, state, 'server%d - state:%s, expected:%s, role:%s' %
            (server['id'], state, expected, role))
    def failover( self, server ):
        """Shut down the SMR and redis of ``server`` and bring them back up.

        After the shutdown the method polls ``util.get_smr_state`` (up to 20
        times, one second apart) until it reports 'F'; after the restart it
        polls the same way for 'N'. Any failed step fails the test.

        Args:
            server: server description dict; the 'id', 'ip' and 'redis_port'
                keys are read here, the rest is passed on to the helpers.
        """
        # shutdown
        ret = testbase.request_to_shutdown_smr( server )
        self.assertEqual( ret, 0, 'failed to shutdown smr' )
        ret = testbase.request_to_shutdown_redis( server )
        self.assertEqual( ret, 0, 'failed to shutdown redis' )

        # check state F
        max_try = 20
        expected = 'F'
        for _ in range( max_try ):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEqual( expected, state,
                          'server%d - state:%s, expected:%s' % (server['id'], state, expected) )

        # recovery
        ret = testbase.request_to_start_smr( server )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( server )
        self.assertEqual( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( server, 10 )
        self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

        # The redis connection is only a reachability probe: close it once
        # the connect result has been checked instead of leaving it open.
        redis = redis_mgmt.Redis( server['id'] )
        ret = redis.connect( server['ip'], server['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )
        redis.disconnect()

        # check state N
        expected = 'N'
        for _ in range( max_try ):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        # The role is fetched only to enrich the failure message.
        role = util.get_role_of_server( server )
        self.assertEqual( expected, state,
                          'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role) )
Exemple #5
0
    def test_upgrade_smr_repeatedly(self):
        """Upgrade randomly chosen servers five times, covering both roles.

        Each round picks a server different from the previous round's pick and
        upgrades it according to its current role. Still within the round, the
        current master and/or the second slave are upgraded as well if no
        random pick has had that role so far.
        """
        util.print_frame()

        execution_count_master = 0
        execution_count_slave = 0
        old_target = None
        for _ in range(5):
            # Never pick the same server twice in a row.
            target = random.choice(self.cluster['servers'])
            while target == old_target:
                target = random.choice(self.cluster['servers'])
            old_target = target

            role = util.get_role_of_server(target)
            if role == c.ROLE_SLAVE:
                ret = util.upgrade_pgs(target, self.leader_cm, self.cluster)
                self.assertTrue(ret,
                                'Failed to upgrade slave pgs%d' % target['id'])
                # Count the upgrade under the role that was actually upgraded
                # (the counters used to be swapped between the two branches).
                execution_count_slave += 1
            elif role == c.ROLE_MASTER:
                ret = util.upgrade_pgs(target, self.leader_cm, self.cluster)
                self.assertTrue(
                    ret, 'Failed to upgrade master pgs%d' % target['id'])
                execution_count_master += 1
            else:
                self.fail('unexpected role:%s' % role)
            time.sleep(1)

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            # Make sure each role gets exercised even when the random picks
            # have not hit it yet.
            # NOTE(review): these fallbacks run inside the loop and do not
            # bump the counters, as in the original - confirm that is intended.
            if execution_count_master == 0:
                ret = util.upgrade_pgs(m, self.leader_cm, self.cluster)
                self.assertTrue(ret,
                                'Failed to upgrade master pgs%d' % m['id'])
            if execution_count_slave == 0:
                ret = util.upgrade_pgs(s2, self.leader_cm, self.cluster)
                self.assertTrue(ret,
                                'Failed to upgrade slave pgs%d' % s2['id'])
Exemple #6
0
    def test_pgs_add_and_del_repeatedly(self):
        """Run ``pgs_add_and_del`` 50 times on randomly chosen servers.

        Each round picks a server different from the previous round's pick,
        looks up its current role and calls ``self.pgs_add_and_del`` with the
        matching role name. Any role other than slave or master fails the test.
        """
        util.print_frame()

        # The per-role counters of the original were never read (and were
        # incremented under the wrong role), so they have been dropped.
        old_target = None
        for _ in range(50):
            # Never pick the same server twice in a row.
            target = random.choice(self.cluster['servers'])
            while target == old_target:
                target = random.choice(self.cluster['servers'])
            old_target = target

            role = util.get_role_of_server(target)
            if role == c.ROLE_SLAVE:
                self.pgs_add_and_del(target, 'slave')
            elif role == c.ROLE_MASTER:
                self.pgs_add_and_del(target, 'master')
            else:
                self.fail('unexpected role:%s' % role)
Exemple #7
0
    def test_pgs_add_and_del_repeatedly( self ):
        """Run ``pgs_add_and_del`` 50 times on randomly chosen servers.

        Each round picks a server different from the previous round's pick,
        looks up its current role and calls ``self.pgs_add_and_del`` with the
        matching role name. Any role other than slave or master fails the test.
        """
        util.print_frame()

        # The per-role counters of the original were never read (and were
        # incremented under the wrong role), so they have been dropped.
        old_target = None
        for _ in range( 50 ):
            # Never pick the same server twice in a row.
            target = random.choice( self.cluster['servers'] )
            while target == old_target:
                target = random.choice( self.cluster['servers'] )
            old_target = target

            role = util.get_role_of_server( target )
            if role == c.ROLE_SLAVE:
                self.pgs_add_and_del( target, 'slave' )
            elif role == c.ROLE_MASTER:
                self.pgs_add_and_del( target, 'master' )
            else:
                self.fail( 'unexpected role:%s' % role )
Exemple #8
0
    def test_upgrade_smr_repeatedly( self ):
        """Upgrade randomly chosen servers five times, covering both roles.

        Each round picks a server different from the previous round's pick and
        upgrades it according to its current role. Still within the round, the
        current master and/or the second slave are upgraded as well if no
        random pick has had that role so far.
        """
        util.print_frame()

        execution_count_master = 0
        execution_count_slave = 0
        old_target = None
        for _ in range( 5 ):
            # Never pick the same server twice in a row.
            target = random.choice( self.cluster['servers'] )
            while target == old_target:
                target = random.choice( self.cluster['servers'] )
            old_target = target

            role = util.get_role_of_server( target )
            if role == c.ROLE_SLAVE:
                ret = util.upgrade_pgs( target, self.leader_cm, self.cluster )
                self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % target['id'])
                # Count the upgrade under the role that was actually upgraded
                # (the counters used to be swapped between the two branches).
                execution_count_slave += 1
            elif role == c.ROLE_MASTER:
                ret = util.upgrade_pgs( target, self.leader_cm, self.cluster )
                self.assertTrue(ret, 'Failed to upgrade master pgs%d' % target['id'])
                execution_count_master += 1
            else:
                self.fail( 'unexpected role:%s' % role )
            time.sleep( 1 )

            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )

            # Make sure each role gets exercised even when the random picks
            # have not hit it yet.
            # NOTE(review): these fallbacks run inside the loop and do not
            # bump the counters, as in the original - confirm that is intended.
            if execution_count_master == 0:
                ret = util.upgrade_pgs( m, self.leader_cm, self.cluster )
                self.assertTrue(ret, 'Failed to upgrade master pgs%d' % m['id'])
            if execution_count_slave == 0:
                ret = util.upgrade_pgs( s2, self.leader_cm, self.cluster )
                self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % s2['id'])
    def test_all_pgs_hang( self ):
        """Hang the master and both slaves at once and verify the recovery.

        Steps:
          1. Set up the 3-copy cluster and write 10000 keys through a gateway.
          2. Send the delay command to the SMR of the master and both slaves.
          3. Wait until one server reports the master role and the other two
             report the slave role.
          4. Write 10000 more keys through the redis of the original master
             and read them back from all three redis instances.
          5. Wait for ``util.check_cluster`` to succeed.

        Returns:
            0 once all assertions have passed.
        """
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang: open one SMR management connection per server. The failure
        # message names the server that could not be reached (it used to say
        # "master" for the slaves as well).
        smr_conns = []
        for node, label in ((m, 'master'), (s1, 'slave1'), (s2, 'slave2')):
            conn = smr_mgmt.SMR( node['id'] )
            ret = conn.connect( node['ip'], node['smr_mgmt_port'] )
            self.assertEqual( ret, 0, 'failed to connect to %s. %s:%d' % (label, node['ip'], node['smr_mgmt_port']) )
            smr_conns.append( conn )
        smr_master, smr_slave1, smr_slave2 = smr_conns

        # Timestamps taken before the hang; the election check below requires
        # the elected server's timestamp to still equal this value.
        m_ts = util.get_timestamp_of_pgs( m )
        s1_ts = util.get_timestamp_of_pgs( s1 )
        s2_ts = util.get_timestamp_of_pgs( s2 )

        smr_master.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply is not None and reply.find('-ERR not supported') != -1:
            self.fail( 'make sure that smr has compiled with gcov option.' )

        smr_slave1.write( 'fi delay sleep 1 8000\r\n' )
        smr_slave2.write( 'fi delay sleep 1 8000\r\n' )

        time.sleep( 10 )

        # wait for forced master election (slave1, slave2, then the old
        # master are probed on every pass)
        success = False
        master = None
        for _ in range( 20 ):
            for node, ts_before in ((s1, s1_ts), (s2, s2_ts), (m, m_ts)):
                role = util.get_role_of_server( node )
                ts = util.get_timestamp_of_pgs( node )
                if role == c.ROLE_MASTER and ts == ts_before:
                    master = node
                    success = True
                    break
            if success:
                break
            time.sleep( 1 )

        # NOTE(review): the refreshed timestamps are not used below; the calls
        # are kept to preserve the original call sequence.
        m_ts = util.get_timestamp_of_pgs( m )
        s1_ts = util.get_timestamp_of_pgs( s1 )
        s2_ts = util.get_timestamp_of_pgs( s2 )

        self.assertEqual( success, True, 'failed to forced master election' )

        # Every server other than the elected master must come back as a slave.
        for s in (m, s1, s2):
            if s == master:
                continue
            # Start from False for each server: reusing the election's
            # `success` flag made this check pass unconditionally.
            rejoined = False
            for _ in range( 20 ):
                role = util.get_role_of_server( s )
                if role == c.ROLE_SLAVE:
                    rejoined = True
                    break
                time.sleep( 1 )
            self.assertEqual( rejoined, True, 'failed to rejoin as a slave, %s:%d' % (s['ip'], s['smr_mgmt_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        # set values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write( cmd )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # check new values on m, s1 and s2 in turn
        for node, redis in ((m, redis0), (s1, redis1), (s2, redis2)):
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis.write( cmd )
                # The first reply line is discarded; the value is compared
                # against the second line.
                redis.read_until( '\r\n' )
                res = redis.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (node['id'], res[:-2], i) )

        # check consistency
        ok = False
        for _ in range( 0, 10 ):
            ok = util.check_cluster( self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port )
            # Parenthesised form prints the same text under the Python 2
            # print statement and the Python 3 print function.
            print( ok )
            if ok:
                break
            time.sleep( 1 )
        self.assertEqual( ok, True, 'role consistency fail' )

        return 0
    def state_transition( self ):
        """Take a slave down and back up, checking its SMR state at each step.

        The slave is expected to be in state 'N' initially, 'F' after its SMR
        and redis have been shut down, and 'N' again after both have been
        restarted. While the slave is down, one key is written 100 times
        through a gateway.
        """
        server = util.get_server_by_role( self.cluster['servers'], 'slave' )
        self.assertNotEqual( server, None, 'failed to get_server_by_role-slave' )

        # get gateway info (the connection itself is opened later, once the
        # slave is down)
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )

        # check initial state
        state = self.get_expected_smr_state( server, 'N' )
        role = util.get_role_of_server( server )
        self.assertEqual( 'N', state,
                          'server%d - state:%s, role:%s, expected:N' % (server['id'], state, role) )

        # shutdown
        ret = testbase.request_to_shutdown_smr( server )
        self.assertEqual( ret, 0, 'failed to shutdown smr' )
        ret = testbase.request_to_shutdown_redis( server )
        self.assertEqual( ret, 0, 'failed to shutdown redis' )
        time.sleep( 3 )

        # check state F
        expected = 'F'
        state = self.get_expected_smr_state( server, expected )
        self.assertEqual( expected, state,
                          'server%d - state:%s, but expected:%s' % (server['id'], state, expected) )

        # set value: overwrite one key 100 times with the current time
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )
        key = 'new_key_haha'
        for _ in range( 0, 100 ):
            timestamp = time.time()
            cmd = 'set %s %f\r\n' % (key, timestamp)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n' )
        gw.disconnect()

        # recovery
        ret = testbase.request_to_start_smr( server )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( server )
        self.assertEqual( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( server, 10 )
        self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )
        time.sleep( 5 )

        # The redis connection is only a reachability probe: close it once
        # the connect result has been checked instead of leaving it open.
        redis = redis_mgmt.Redis( server['id'] )
        ret = redis.connect( server['ip'], server['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )
        redis.disconnect()

        # check state N (polled up to 20 times, one second apart)
        expected = 'N'
        max_try = 20
        for _ in range( 0, max_try ):
            state = self.get_expected_smr_state( server, expected )
            if state == expected:
                break
            time.sleep( 1 )
        # The role is fetched only to enrich the failure message.
        role = util.get_role_of_server( server )
        self.assertEqual( expected, state,
                          'server%d - state:%s, role:%s, but expected:%s' % (server['id'], state, role, expected) )
    def elect_master_randomly( self ):
        """Force master changes until every server has been seen as master.

        After writing 1000 keys through a gateway, the method runs up to 30
        rounds. Each round sends 'role lconn' to the current master, waits
        for exactly one master and two slaves, and checks that each server's
        real role matches the 'smr_Role' from ``util.get_smr_info`` with
        heartbeat 'Y'.

        Returns:
            0 on success; otherwise one of the assertions fails the test.
        """
        # set data
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( '0' )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )
        for i in range( 0, 1000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2]) )
        # The gateway is only needed to seed the data; do not keep it open.
        gw.disconnect()

        # Ids of the servers that have not yet been observed as master.
        server_ids = [server['id'] for server in self.cluster['servers']]

        for try_cnt in range( 30 ):
            # get master, slave1, slave2
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
            util.log( 'master id : %d' % m['id'] )

            # The master seen on the very first round is not ticked off
            # (presumably because this test did not cause its election).
            if try_cnt != 0 and m['id'] in server_ids:
                server_ids.remove( m['id'] )

            smr = smr_mgmt.SMR( m['id'] )
            ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
            self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
            cmd = 'role lconn\r\n'
            smr.write( cmd )
            reply = smr.read_until( '\r\n' )
            self.assertEqual( reply, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]) )
            util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]) )

            # wait until role-change is finished
            for _ in range( 5 ):
                count_master = 0
                count_slave = 0
                for server in self.cluster['servers']:
                    real_role = util.get_role_of_server( server )
                    real_role = util.roleNumberToChar( real_role )
                    if real_role == 'M':
                        count_master = count_master + 1
                    elif real_role == 'S':
                        count_slave = count_slave + 1
                if count_master == 1 and count_slave == 2:
                    break
                time.sleep( 1 )

            # check the number of master and slave
            self.assertEqual( count_master, 1, 'failed : the number of master is not 1, count_master=%d, count_slave=%d' % (count_master, count_slave) )
            self.assertEqual( count_slave, 2, 'failed : the number of slave is not 2, count_master=%d, count_slave=%d' % (count_master, count_slave) )
            util.log( 'succeeded : the number of master is 1 and the number of slave is 2' )

            # check states of all pgs in pg (up to three passes). The loop
            # variable no longer reuses the outer `try_cnt` name.
            for _ in range( 3 ):
                ok = True
                for s in self.cluster['servers']:
                    real_role = util.get_role_of_server( s )
                    real_role = util.roleNumberToChar( real_role )
                    smr_info = util.get_smr_info( s, self.leader_cm )
                    cc_role = smr_info['smr_Role']
                    cc_hb = smr_info['hb']

                    # Judge each server on its own so that one mismatch does
                    # not make the log report a retry for every later server.
                    server_ok = cc_hb == 'Y' and real_role == cc_role
                    if server_ok:
                        util.log( 'succeeded : a role of real pgs is the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb) )
                    else:
                        ok = False
                        util.log( '\n\n**********************************************************\n\nretry: a role of real pgs is not the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb) )

                if ok:
                    break
                time.sleep( 0.5 )

            self.assertTrue( ok, 'failed : role check' )

            if len( server_ids ) == 0:
                util.log( 'succeeded : all smrs have been as a master' )
                return 0

        self.assertEqual( 0, len( server_ids ), 'failed : remains server ids=[%s]' % (','.join('%d' % sid for sid in server_ids)) )
        return 0
    def consistent_after_failover( self ):
        """Restart every server and verify that the slaves hold all the data.

        Steps:
          1. Write 10000 keys through a gateway.
          2. Shut down the SMR and redis of the master and both slaves and
             expect state 'F' for each of them.
          3. Restart them, wait for ``util.check_cluster`` and expect state
             'N' with exactly one master and two slaves.
          4. Write 10000 more keys through the new master's redis and read
             all 20000 keys back from each slave.
        """
        key_count = 10000   # keys written per phase (was named `max`, shadowing the builtin)
        wait_count = 15
        key = 'caf'

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # set value
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( ip )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        for i in range( 0, key_count ):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n' )
        # The gateway is only needed to seed the data; do not keep it open.
        gw.disconnect()
        time.sleep( 5 )

        # shutdown
        servers = [master, slave1, slave2]
        for server in servers:

            util.log('before shutdown pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

            ret = testbase.request_to_shutdown_smr( server )
            self.assertEqual( ret, 0, 'failed to shutdown smr, server:%d' % server['id'] )
            ret = testbase.request_to_shutdown_redis( server )
            self.assertEqual( ret, 0, 'failed to shutdown redis' )
        time.sleep( 5 )

        # check state F
        for server in servers:
            state = self.get_expected_smr_state( server, 'F' )
            self.assertEqual( 'F', state,
                              'server%d - state:%s' % (server['id'], state) )

        # recovery
        for server in servers:
            ret = testbase.request_to_start_smr( server )
            self.assertEqual( ret, 0, 'failed to start smr, server:%d' % server['id'] )

            ret = testbase.request_to_start_redis( server, False )
            self.assertEqual( ret, 0, 'failed to start redis, server:%d' % server['id'] )

            util.log('after restart pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

        time.sleep( 5 )

        # wait for master election; the outcome is not asserted here because
        # the per-server checks below fail if the cluster did not settle
        for _ in range( 10 ):
            ret = util.check_cluster( self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'] )
            if ret:
                break
            time.sleep( 1 )

        # check state
        for server in servers:
            ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
            self.assertEqual( ret, 0, 'failed to role change. server:%d' % (server['id']) )

            state = self.get_expected_smr_state( server, 'N' )
            role = util.get_role_of_server( server )
            self.assertEqual( 'N', state,
                              'server%d - state:%s, role:%s' % (server['id'], state, role) )

        the_number_of_master = 0
        the_number_of_slave = 0
        for server in servers:
            role = util.get_role_of_server( server )
            if role == c.ROLE_MASTER:
                the_number_of_master = the_number_of_master + 1
            elif role == c.ROLE_SLAVE:
                the_number_of_slave = the_number_of_slave + 1
        self.assertTrue( 1 == the_number_of_master and 2 == the_number_of_slave,
                         'failed to set roles, the number of master:%d, the number of slave:%d' %
                         (the_number_of_master, the_number_of_slave) )

        # get master, slave1, and slave2 (roles may have changed)
        master, slave1, slave2 = self.get_mss()

        # connect to the master's redis and set data
        redis = redis_mgmt.Redis( master['id'] )
        ret = redis.connect( master['ip'], master['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis, server:%d' % master['id'] )

        for i in range( key_count, key_count * 2 ):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            redis.write( cmd )
            res = redis.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n',
                              'failed to get response, server:%d' % master['id'] )
        redis.disconnect()

        # check the slaves' data
        for slave in (slave1, slave2):
            slave_redis = redis_mgmt.Redis( slave['id'] )
            ret = slave_redis.connect( slave['ip'], slave['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis, server:%d' % slave['id'] )

            for i in range( 0, key_count * 2 ):
                cmd = 'get %s%d\r\n' % (key, i)
                slave_redis.write( cmd )
                # The first reply line is discarded; the value is compared
                # against the second line.
                slave_redis.read_until( '\r\n' )
                res = slave_redis.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i,
                                  'inconsistent, server:%d, expected %d but %s' % (slave['id'], i, res) )
            slave_redis.disconnect()
Exemple #13
0
    def test_4_PGS_mgen_is_less_than_PG_mgen(self):
        """Rejoin a PGS that missed two master failovers and verify its data.

        Steps performed:
          1. Shut down SMR and Redis of the current master
             (``server_to_join``) and wait until its SMR state is 'F'.
          2. Write keys ``mw0``..``mw9999`` through a gateway, then run
             ``self.failover`` on the current master twice, checking the
             quorum in between.
          3. Restart ``server_to_join``, wait for its role to be set up,
             write keys ``mw10000``..``mw19999`` and read all 20000 keys
             back directly from the Redis of ``server_to_join``.

        Returns:
            0 when every check passed; a failed check raises through the
            ``self.assert*`` calls.
        """
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # initial data
        util.put_some_data(self.cluster)

        # shutdown
        server_to_join = util.get_server_by_role(self.cluster['servers'],
                                                 'master')
        ret = testbase.request_to_shutdown_smr(server_to_join)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
        ret = testbase.request_to_shutdown_redis(server_to_join)
        self.assertEqual(ret, 0, 'failed to shutdown redis')

        # check state F (poll once per second, up to max_try times)
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            state = util.get_smr_state(server_to_join, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEqual(
            expected, state, 'server%d - state:%s, expected:%s' %
            (server_to_join['id'], state, expected))

        # set value
        key_base = 'mw'
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n')

        # master failover 1 (master generation + 1)
        util.log('master failover 1')
        server = util.get_server_by_role(self.cluster['servers'], 'master')
        self.failover(server)

        # check quorum (copy:3, quorum:1, available:2)
        ok = False
        for i in range(10):
            ok = util.check_quorum(self.cluster['cluster_name'],
                                   self.leader_cm['ip'],
                                   self.leader_cm['cm_port'])
            if ok:
                break
            time.sleep(1)
        self.assertTrue(ok, 'Check quorum fail.')

        # master failover 2 (master generation + 1)
        util.log('master failover 2')
        server = util.get_server_by_role(self.cluster['servers'], 'master')
        self.failover(server)

        # recovery
        util.log('master recovery start.')
        ret = testbase.request_to_start_smr(server_to_join)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server_to_join)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server_to_join, 10)
        self.assertEqual(
            ret, 0,
            'failed to role change. smr_id:%d' % (server_to_join['id']))
        util.log('master recovery end successfully.')

        # check state N
        # NOTE(review): this polls `server` (the master picked before
        # failover 2), not `server_to_join` -- confirm that is intended.
        max_try = 20
        expected = 'N'
        for i in range(0, max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(server)
        self.assertEqual(
            expected, state, 'server%d - state:%s, expected:%s, role:%s' %
            (server['id'], state, expected, role))

        time.sleep(5)

        # set value
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n')

        # NOTE(review): the result of this lookup is not used below; the
        # call is kept so the sequence of cluster queries is unchanged.
        server = util.get_server_by_role(self.cluster['servers'], 'master')

        redis = redis_mgmt.Redis(server_to_join['id'])
        ret = redis.connect(server_to_join['ip'], server_to_join['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')

        # check value
        for i in range(0, 20000):
            cmd = 'get %s%d\r\n' % (key_base, i)
            redis.write(cmd)
            # The first reply line is discarded (presumably the bulk-length
            # header); the second line carries the value.
            redis.read_until('\r\n')
            response = redis.read_until('\r\n')
            self.assertEqual(response, '%d\r\n' % (i),
                             'inconsistent %s, %d' % (response[:-2], i))

        # Close both connections; the original left the redis one open.
        redis.disconnect()
        gw.disconnect()
        return 0
Exemple #14
0
    def test_two_slaves_hang(self):
        """Hang both slaves of a 3-copy cluster and verify they rejoin.

        Steps performed:
          1. Set up ``self.cluster_3copy`` and write 10000 keys through a
             gateway.
          2. Record the timestamp of each slave, then send the
             fault-injection command ``fi delay sleep 1 8000`` to the SMR
             of both slaves (intended to make them hang).
          3. Wait until each slave reports the slave role again with an
             unchanged timestamp.
          4. Write 10000 new keys via the Redis of slave1, read them back
             from the Redis of slave2, and check cluster role consistency.

        Returns:
            0 when every check passed; a failed check raises through the
            ``self.assert*`` / ``self.fail`` calls.
        """
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs(s1)
        self.assertNotEqual(
            ts_before1, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s1['id'], ts_before1))

        ts_before2 = util.get_timestamp_of_pgs(s2)
        self.assertNotEqual(
            ts_before2, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s2['id'], ts_before2))

        # hang
        smr1 = smr_mgmt.SMR(s1['id'])
        ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        smr2 = smr_mgmt.SMR(s2['id'])
        ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
        # Report the address of s2 here; the original printed s1's address.
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        smr1.write('fi delay sleep 1 8000\r\n')
        reply = smr1.read_until('\r\n', 1)
        # An '-ERR not supported' reply means fault injection is unavailable.
        if reply is not None and '-ERR not supported' in reply:
            self.fail('make sure that smr has compiled with gcov option.')

        smr2.write('fi delay sleep 1 8000\r\n')
        time.sleep(7)

        # wait for rejoin as a slave (slave1): role must be slave and the
        # timestamp must be the same as before the hang
        success = False
        for i in range(20):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs(s1)
                if ts_after != -1 and ts_before1 == ts_after:
                    success = True
                    break
            time.sleep(1)
        # Report the address of s1 here; the original printed s2's address.
        self.assertEqual(
            success, True, 'failed to rejoin as a slave. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        # wait for rejoin as a slave (slave2)
        success = False
        for i in range(20):
            role = util.get_role_of_server(s2)
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs(s2)
                if ts_after != -1 and ts_before2 == ts_after:
                    success = True
                    break
            time.sleep(1)
        self.assertEqual(
            success, True, 'failed to rejoin as a slave. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            # first reply line is discarded; the second carries the value
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis2. %s != %d' % (res, i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        # Release the client connections opened by this test.
        redis1.disconnect()
        redis2.disconnect()
        gw.disconnect()

        return 0
Exemple #15
0
    def test_all_pgs_hang(self):
        """Hang every PGS of a 3-copy cluster and verify the cluster recovers.

        Steps performed:
          1. Set up ``self.cluster_3copy`` and write 10000 keys through a
             gateway.
          2. Send the fault-injection command ``fi delay sleep 1 8000`` to
             the SMR of the master and of both slaves (intended to make
             them hang).
          3. Wait until one of the three servers reports the master role
             with the timestamp it had before the hang, then wait until
             each of the other two reports the slave role.
          4. Write 10000 new keys via the Redis of the original master,
             read them back from all three Redis instances, and check
             cluster role consistency.

        Returns:
            0 when every check passed; a failed check raises through the
            ``self.assert*`` / ``self.fail`` calls.
        """
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave1 = smr_mgmt.SMR(s1['id'])
        ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))
        smr_slave2 = smr_mgmt.SMR(s2['id'])
        ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        # timestamps before the hang, compared against during election wait
        m_ts = util.get_timestamp_of_pgs(m)
        s1_ts = util.get_timestamp_of_pgs(s1)
        s2_ts = util.get_timestamp_of_pgs(s2)

        smr_master.write('fi delay sleep 1 8000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        # An '-ERR not supported' reply means fault injection is unavailable.
        if reply is not None and '-ERR not supported' in reply:
            self.fail('make sure that smr has compiled with gcov option.')

        smr_slave1.write('fi delay sleep 1 8000\r\n')
        smr_slave2.write('fi delay sleep 1 8000\r\n')

        time.sleep(10)

        # wait for forced master election
        success = False
        master = None
        for i in range(20):
            role = util.get_role_of_server(s1)
            ts = util.get_timestamp_of_pgs(s1)
            if role == c.ROLE_MASTER and ts == s1_ts:
                master = s1
                success = True
                break
            role = util.get_role_of_server(s2)
            ts = util.get_timestamp_of_pgs(s2)
            if role == c.ROLE_MASTER and ts == s2_ts:
                master = s2
                success = True
                break
            role = util.get_role_of_server(m)
            ts = util.get_timestamp_of_pgs(m)
            if role == c.ROLE_MASTER and ts == m_ts:
                master = m
                success = True
                break
            time.sleep(1)

        # NOTE(review): these refreshed timestamps are not read afterwards;
        # the queries are kept so the call sequence is unchanged.
        m_ts = util.get_timestamp_of_pgs(m)
        s1_ts = util.get_timestamp_of_pgs(s1)
        s2_ts = util.get_timestamp_of_pgs(s2)

        self.assertEqual(success, True, 'failed to forced master election')

        servers = [m, s1, s2]
        for s in servers:
            if s != master:
                # Reset per server: `success` is still True from the master
                # election above, which made this check impossible to fail.
                success = False
                for i in range(20):
                    role = util.get_role_of_server(s)
                    if role == c.ROLE_SLAVE:
                        success = True
                        break
                    time.sleep(1)
                self.assertEqual(
                    success, True, 'failed to rejoin as a slave, %s:%d' %
                    (s['ip'], s['smr_mgmt_port']))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # set values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # check new values (m); first reply line is discarded each time
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res, i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check new values (s2)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s2['id'], res[:-2], i))

        # check consistency (retry once per second, up to 10 times)
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            # Parenthesised form prints the same text under the Python 2
            # print statement and also parses as a Python 3 call.
            print(ok)
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        # Release the client connections opened by this test.
        redis0.disconnect()
        redis1.disconnect()
        redis2.disconnect()
        gw.disconnect()

        return 0
Exemple #16
0
    def state_transition(self):
        """Stop and restart one slave PGS, checking its SMR state on the way.

        Steps performed:
          1. Pick a server with the 'slave' role and check its state is 'N'.
          2. Shut down its SMR and Redis and check its state becomes 'F'.
          3. Write the key ``new_key_haha`` 100 times through a gateway.
          4. Restart SMR and Redis, wait for the role to be set up, check
             that Redis accepts a connection, and poll until the state is
             'N' again.

        A failed check raises through the ``self.assert*`` calls; nothing is
        returned.
        """
        server = util.get_server_by_role(self.cluster['servers'], 'slave')
        self.assertNotEqual(server, None,
                            'failed to get_server_by_role-slave')

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])

        # check initial state
        state = self.get_expected_smr_state(server, 'N')
        role = util.get_role_of_server(server)
        self.assertEqual(
            'N', state, 'server%d - state:%s, role:%s, expected:N' %
            (server['id'], state, role))

        # shutdown
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEqual(ret, 0, 'failed to shutdown redis')
        time.sleep(3)

        # check state F
        expected = 'F'
        state = self.get_expected_smr_state(server, expected)
        self.assertEqual(
            expected, state, 'server%d - state:%s, but expected:%s' %
            (server['id'], state, expected))

        # set value: overwrite one key with the current time, 100 times
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))
        key = 'new_key_haha'
        for i in range(0, 100):
            timestamp = time.time()
            cmd = 'set %s %f\r\n' % (key, timestamp)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n')
        gw.disconnect()

        # recovery
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server, 10)
        self.assertEqual(ret, 0,
                         'failed to role change. smr_id:%d' % (server['id']))
        time.sleep(5)

        # The connection is used only to prove that redis is reachable, so
        # close it right away instead of leaving it open.
        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')
        redis.disconnect()

        # check state N
        expected = 'N'
        max_try = 20
        for i in range(0, max_try):
            state = self.get_expected_smr_state(server, expected)
            if state == expected:
                break
            time.sleep(1)
        role = util.get_role_of_server(server)
        self.assertEqual(
            expected, state, 'server%d - state:%s, role:%s, but expected:%s' %
            (server['id'], state, role, expected))
Exemple #17
0
    def consistent_after_failover(self):
        """Restart every PGS and verify roles and data consistency afterwards.

        Steps performed:
          1. Write ``key_count`` keys (prefix ``caf``) through a gateway.
          2. Shut down SMR and Redis of the master and both slaves, check
             that each reaches state 'F', then start them all again.
          3. Wait for the cluster check to pass, then verify every server
             is in state 'N' and that there is exactly one master and two
             slaves.
          4. Write ``key_count`` more keys directly to the new master's
             Redis and read all ``2 * key_count`` keys from each slave.

        A failed check raises through the ``self.assert*`` calls; nothing is
        returned.
        """
        key_count = 10000  # was named `max`, which shadowed the builtin
        wait_count = 15
        key = 'caf'

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # set value
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(ip)
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        for i in range(0, key_count):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n')
        # The gateway is not used again, so close the connection here.
        gw.disconnect()
        time.sleep(5)

        # shutdown
        servers = [master, slave1, slave2]
        for server in servers:

            util.log('before shutdown pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

            ret = testbase.request_to_shutdown_smr(server)
            self.assertEqual(
                ret, 0, 'failed to shutdown smr, server:%d' % server['id'])
            ret = testbase.request_to_shutdown_redis(server)
            self.assertEqual(ret, 0, 'failed to shutdown redis')
        time.sleep(5)

        # check state F
        for server in servers:
            state = self.get_expected_smr_state(server, 'F')
            self.assertEqual('F', state,
                             'server%d - state:%s' % (server['id'], state))

        # recovery
        for server in servers:
            ret = testbase.request_to_start_smr(server)
            self.assertEqual(ret, 0,
                             'failed to start smr, server:%d' % server['id'])

            ret = testbase.request_to_start_redis(server, False)
            self.assertEqual(ret, 0,
                             'failed to start redis, server:%d' % server['id'])

            util.log('after restart pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

        time.sleep(5)

        # wait for master election (best effort; the result is not asserted
        # here, the role checks below decide success)
        for i in range(10):
            ret = util.check_cluster(self.cluster['cluster_name'],
                                     self.leader_cm['ip'],
                                     self.leader_cm['cm_port'])
            if ret:
                break
            time.sleep(1)

        # check state
        for server in servers:
            ret = testbase.wait_until_finished_to_set_up_role(
                server, wait_count)
            self.assertEqual(
                ret, 0, 'failed to role change. server:%d' % (server['id']))

            state = self.get_expected_smr_state(server, 'N')
            role = util.get_role_of_server(server)
            self.assertEqual(
                'N', state,
                'server%d - state:%s, role:%s' % (server['id'], state, role))

        the_number_of_master = 0
        the_number_of_slave = 0
        for server in servers:
            role = util.get_role_of_server(server)
            if role == c.ROLE_MASTER:
                the_number_of_master = the_number_of_master + 1
            elif role == c.ROLE_SLAVE:
                the_number_of_slave = the_number_of_slave + 1
        self.assertTrue(
            1 == the_number_of_master and 2 == the_number_of_slave,
            'failed to set roles, the number of master:%d, the number of slave:%d'
            % (the_number_of_master, the_number_of_slave))

        # get master, slave1, and slave2 (roles may have changed)
        master, slave1, slave2 = self.get_mss()

        # connect to a master`s redis and set data
        redis = redis_mgmt.Redis(master['id'])
        ret = redis.connect(master['ip'], master['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis, server:%d' % master['id'])

        for i in range(key_count, key_count * 2):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            redis.write(cmd)
            res = redis.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to get response, server:%d' % master['id'])
        redis.disconnect()

        # check slaves`s data
        slaves = [slave1, slave2]
        for slave in slaves:
            slave_redis = redis_mgmt.Redis(slave['id'])
            ret = slave_redis.connect(slave['ip'], slave['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis, server:%d' % slave['id'])

            for i in range(0, key_count * 2):
                cmd = 'get %s%d\r\n' % (key, i)
                slave_redis.write(cmd)
                # first reply line is discarded; the second carries the value
                slave_redis.read_until('\r\n')
                res = slave_redis.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'inconsistent, server:%d, expected %d but %s' %
                    (slave['id'], i, res))
            slave_redis.disconnect()
    def test_quorum_with_left_pgs(self):
        """Check quorum values after the master PGS leaves the cluster.

        Steps performed, with load generators running against every server:
          1. Send ``pgs_leave`` for the master to the leader confmaster.
          2. Poll the master's Redis ``info stats`` reply until the number
             on its 7th line is <= 100 (which counter that line holds is
             not visible here -- TODO confirm).
          3. Check that real roles match the roles known to the confmaster
             and that the left master's quorum is ``self.quorum_policy[1]``.
          4. Send ``role lconn`` to the left master, wait until one of the
             slaves becomes master, then expect exactly one master, one
             slave and one lconn, and the new master's quorum to be
             ``self.quorum_policy[1]``.

        Returns:
            0 when every check passed; a failed check raises through the
            ``self.assert*`` calls.
        """
        util.print_frame()

        # start load generators
        # NOTE(review): if an assertion below fails, these generators are
        # never quit/joined; consider a try/finally or addCleanup.
        load_gens = []
        for server in self.cluster['servers']:
            load_gen = load_generator.LoadGenerator(
                server['id'], server['ip'], server['gateway_port'])
            load_gen.start()
            load_gens.append(load_gen)

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d\r\n' % (m['cluster_name'], m['id'])
        ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'],
                              cmd)
        jobj = json.loads(ret)
        self.assertEqual(
            jobj['msg'], '+OK',
            'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # check if pgs is removed
        success = False
        for try_cnt in range(10):
            redis = redis_mgmt.Redis(m['id'])
            ret = redis.connect(m['ip'], m['redis_port'])
            self.assertEqual(
                ret, 0, 'failed : connect to smr%d(%s:%d)' %
                (m['id'], m['ip'], m['redis_port']))
            util.log('succeeded : connect to smr%d(%s:%d)' %
                     (m['id'], m['ip'], m['redis_port']))

            redis.write('info stats\r\n')
            # skip the first six reply lines; the seventh is the one parsed
            for _ in range(6):
                redis.read_until('\r\n')
            res = redis.read_until('\r\n')
            # A fresh connection is opened on every attempt, so close it
            # once the reply has been read.
            redis.disconnect()
            self.assertNotEqual(
                res, '',
                'failed : get reply of "info stats" from redis%d(%s:%d)' %
                (m['id'], m['ip'], m['redis_port']))
            util.log(
                'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"'
                % (m['id'], m['ip'], m['redis_port'], res[:-2]))
            no = int(res.split(':')[1])
            if no <= 100:
                success = True
                break
            time.sleep(1)

        self.assertEqual(success, True, 'failed : pgs does not removed.')
        util.log('succeeded : pgs is removed')

        # check states of all pgs in pg
        for s in self.cluster['servers']:
            real_role = util.get_role_of_server(s)
            real_role = util.roleNumberToChar(real_role)
            smr_info = util.get_smr_info(s, self.leader_cm)
            cc_role = smr_info['smr_Role']
            cc_hb = smr_info['hb']
            # servers whose 'hb' field is 'N' are not compared
            if cc_hb == 'N':
                continue
            self.assertEqual(
                real_role, cc_role,
                'failed : each role is difference, real=%s, cc=%s' %
                (real_role, cc_role))
            util.log(
                'succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s'
                % (real_role, cc_role))

        # check quorum policy
        quorum_of_left_master = util.get_quorum(m)
        self.assertEqual(
            self.quorum_policy[1], quorum_of_left_master,
            'invalid quorum of left master, expected:%d, but:%d' %
            (self.quorum_policy[1], quorum_of_left_master))
        util.log('succeeded : quorum of left master=%d' %
                 quorum_of_left_master)

        # 'role lconn' to master
        cmd = 'role lconn\r\n'
        ret = util.cmd_to_smr(m, cmd)
        self.assertEqual(
            ret, '+OK\r\n',
            'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # wait for master election
        success = False
        new_master = None
        for i in range(10):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_MASTER:
                success = True
                new_master = s1
                break
            role = util.get_role_of_server(s2)
            if role == c.ROLE_MASTER:
                success = True
                new_master = s2
                break
            time.sleep(1)
        self.assertEqual(success, True, 'failed to elect new master')
        util.log('succeeded : elect new master, master_id=%d' %
                 new_master['id'])

        time.sleep(1)
        # check the numbers of master, slave, and lconn
        cnt_master = 0
        cnt_slave = 0
        cnt_lconn = 0
        for s in self.cluster['servers']:
            role = util.get_role_of_server(s)
            if role == c.ROLE_MASTER:
                cnt_master = cnt_master + 1
            elif role == c.ROLE_SLAVE:
                cnt_slave = cnt_slave + 1
            elif role == c.ROLE_LCONN:
                cnt_lconn = cnt_lconn + 1
        self.assertEqual(
            cnt_master, 1,
            'failed : the number of master is %s, expected 1' % cnt_master)
        self.assertEqual(
            cnt_slave, 1,
            'failed : the number of slave is %s, expected 1' % cnt_slave)
        self.assertEqual(
            cnt_lconn, 1,
            'failed : the number of lconn is %s, expected 1' % cnt_lconn)

        # check states of all pgs in pg
        for s in self.cluster['servers']:
            real_role = util.get_role_of_server(s)
            real_role = util.roleNumberToChar(real_role)
            smr_info = util.get_smr_info(s, self.leader_cm)
            cc_role = smr_info['smr_Role']
            cc_hb = smr_info['hb']
            if cc_hb == 'N':
                continue
            self.assertEqual(
                real_role, cc_role,
                'failed : each role is difference, real=%s, cc=%s' %
                (real_role, cc_role))
            util.log(
                'succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s'
                % (real_role, cc_role))

        # check quorum policy
        quorum_of_new_master = util.get_quorum(new_master)
        self.assertNotEqual(None, quorum_of_new_master,
                            'failed : find new master')
        self.assertEqual(
            self.quorum_policy[1], quorum_of_new_master,
            'invalid quorum of new master, expected:%d, but:%d' %
            (self.quorum_policy[1], quorum_of_new_master))
        util.log('succeeded : quorum of new master=%d' % quorum_of_new_master)

        # shutdown load generators
        for load_gen in load_gens:
            load_gen.quit()
            load_gen.join()

        return 0
Exemple #19
0
    def deprecated_test_5_PGS_commit_is_greater_than_PG_commit(self):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # initial data
        util.put_some_data(self.cluster)

        master, s1, s2 = util.get_mss(self.cluster)

        server_to_join = [s1, s2]
        # shutdown slaves
        for i in range(0, 2):
            ret = testbase.request_to_shutdown_smr(server_to_join[i])
            self.assertEqual(
                ret, 0, 'failed to shutdown smr%d' % server_to_join[i]['id'])
            util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

            ret = testbase.request_to_shutdown_redis(server_to_join[i])
            self.assertEquals(ret, 0, 'failed to shutdown redis')
            util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

            # check state F
            max_try = 20
            expected = 'F'
            for j in range(0, max_try):
                state = util.get_smr_state(server_to_join[i], self.leader_cm)
                if expected == state:
                    break
                time.sleep(1)
            self.assertEquals(
                expected, state, 'server%d - state:%s, expected:%s' %
                (server_to_join[i]['id'], state, expected))

        # put more data
        util.put_some_data(self.cluster, 10, 256)

        # bgsave
        ret = util.bgsave(master)
        self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

        # shutdown master
        ret = testbase.request_to_shutdown_smr(master)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
        util.log('succeeded to shutdown master smr, id=%d' % master['id'])
        ret = testbase.request_to_shutdown_redis(master)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        util.log('succeeded to shutdown master redis, id=%d' % master['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            state = util.get_smr_state(master, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s' %
            (master['id'], state, expected))

        # recovery slaves
        for i in range(0, 2):
            ret = testbase.request_to_start_smr(server_to_join[i])
            self.assertEqual(ret, 0, 'failed to start smr')

            ret = testbase.request_to_start_redis(server_to_join[i])
            self.assertEqual(ret, 0, 'failed to start redis')

            ret = testbase.wait_until_finished_to_set_up_role(
                server_to_join[i], 10)
            self.assertEquals(
                ret, 0,
                'failed to role change. smr_id:%d' % (server_to_join[i]['id']))

            # check state N
            max_try = 20
            expected = 'N'
            for j in range(0, max_try):
                state = util.get_smr_state(server_to_join[i], self.leader_cm)
                if expected == state:
                    break
                time.sleep(1)
            role = util.get_role_of_server(server_to_join[i])
            self.assertEquals(
                expected, state, 'server%d - state:%s, expected:%s, role:%s' %
                (server_to_join[i]['id'], state, expected, role))

        # set value
        s = random.choice(server_to_join)
        redis = redis_mgmt.Redis(['id'])
        ret = redis.connect(s['ip'], s['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        key_base = 'key_test'
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            redis.write(cmd)
            res = redis.read_until('\r\n')
            self.assertEquals(res, '+OK\r\n')
        redis.disconnect()

        for i in range(0, 2):
            redis = redis_mgmt.Redis(server_to_join[i]['id'])
            ret = redis.connect(server_to_join[i]['ip'],
                                server_to_join[i]['redis_port'])
            self.assertEquals(ret, 0, 'failed to connect to redis')

            # check value
            for j in range(0, 10000):
                cmd = 'get %s%d\r\n' % (key_base, j)
                redis.write(cmd)
                redis.read_until('\r\n')
                response = redis.read_until('\r\n')
                self.assertEqual(response, '%d\r\n' % (j),
                                 'inconsistent %s, %d' % (response[:-2], j))

        # try to recover master, but failed
        ret = testbase.request_to_start_smr(master)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(master, False)
        self.assertEqual(ret, 0, 'failed to start redis')

        max_try = 3
        expected = 'N'
        for i in range(0, max_try):
            state = util.get_smr_state(master, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(master)
        self.assertNotEqual(
            expected, state, 'server%d - state:%s, expected:not %s, role:%s' %
            (master['id'], state, expected, role))
        util.log(
            'success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.'
        )

        gw.disconnect()
        return 0
    def master_hang(self):
        """Hang the master's SMR and verify failover, replication and rejoin.

        Scenario:
          1. Write keys 0..9999 through a randomly chosen gateway.
          2. Send 'fi delay sleep 1 10000' to the master's SMR management
             port to make it hang.
          3. Poll (up to 20 times, one second apart) until one of the
             slaves reports the master role.
          4. Write keys 10000..19999 through slave1's redis and, in a
             3-server cluster, read them back from slave2's redis.
          5. Check that the hung server reports the slave role and serves
             the new keys.
          6. Check cluster consistency with util.check_cluster.

        Returns:
            0 when every check passes; a failed check raises through the
            unittest assertion methods.
        """
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))
        # the gateway is not used past this point, so release the connection
        gw.disconnect()

        # get master, slave1, slave2 (slave2 only exists with 3 servers)
        has_slave2 = len(self.cluster['servers']) == 3
        if has_slave2:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr = smr_mgmt.SMR(m['id'])
        ret = smr.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr.write('fi delay sleep 1 10000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply is not None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr has compiled with gcov option.')

        time.sleep(5)

        # wait for forced master election
        success = False
        for _ in range(20):
            if util.get_role_of_server(s1) == c.ROLE_MASTER:
                success = True
                break

            if has_slave2 and util.get_role_of_server(s2) == c.ROLE_MASTER:
                success = True
                break
            time.sleep(1)

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        self.assertEqual(success, True, 'failed to forced master election')

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))
        redis1.disconnect()

        if has_slave2:
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # check new values
            for i in range(10000, 20000):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write(cmd)
                # the first reply line is discarded; the value is compared
                # against the second line
                redis2.read_until('\r\n')
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'failed to get values from redis2. %s != %d' % (res, i))
            redis2.disconnect()

        # check if the hanging server recovered and joined as a slave
        time.sleep(7)
        role = util.get_role_of_server(m)
        self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        # check new values on the old master
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis0. %s != %d' % (res[:-2], i))
        redis0.disconnect()

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
    def failover_while_hang(self, server):
        """Shut down and restart a PGS while its SMR is hanging.

        The server's SMR is made to hang with 'fi delay sleep 1 10000'.
        Once the leader confmaster reports state 'F', smr and redis are
        shut down and started again. The server must then reach state 'N'
        and report the slave role with a timestamp that differs from the
        one sampled before the hang.

        Args:
            server: server description dict; the keys 'id', 'ip',
                'smr_mgmt_port' and 'redis_port' are read here.

        Returns:
            0 when every check passes; a failed check raises through the
            unittest assertion methods.
        """

        def wait_for_state(expected, max_try=20):
            # Poll the SMR state once per second, at most max_try times,
            # and fail the test if it never becomes `expected`.
            state = None
            for _ in range(max_try):
                state = util.get_smr_state(server, self.leader_cm)
                if expected == state:
                    break
                time.sleep(1)
            self.assertEqual(
                expected, state, 'server%d - state:%s, expected:%s' %
                (server['id'], state, expected))
            util.log('succeeded : pgs%d state changed to %s.' %
                     (server['id'], expected))

        # timestamp before hang
        ts_before = util.get_timestamp_of_pgs(server)
        self.assertNotEqual(
            ts_before, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (server['id'], ts_before))

        # hang
        util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.' %
                 (server['id'], server['ip'], server['smr_mgmt_port']))
        smr = smr_mgmt.SMR(server['id'])
        ret = smr.connect(server['ip'], server['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (server['ip'], server['smr_mgmt_port']))
        smr.write('fi delay sleep 1 10000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply is not None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr has compiled with gcov option.')

        time.sleep(4)

        # check state F
        wait_for_state('F')

        # shutdown
        util.log('shutdown pgs%d while hanging.' % server['id'])
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0,
                         'failed to shutdown smr. id:%d' % server['id'])
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEqual(ret, 0,
                         'failed to shutdown redis. id:%d' % server['id'])

        # check state F
        wait_for_state('F')

        # recovery
        util.log('restart pgs%d.' % server['id'])
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr. id:%d' % server['id'])

        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis. id:%d' % server['id'])

        wait_count = 20
        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEqual(ret, 0,
                         'failed to role change. smr_id:%d' % (server['id']))

        # connectivity check only: no command is sent, so the connection is
        # released as soon as it has been established
        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')
        redis.disconnect()

        # check state N
        wait_for_state('N')

        # wait for rejoin as a slave
        success = False
        for _ in range(20):
            role = util.get_role_of_server(server)
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs(server)
                # the timestamp must be readable and differ from ts_before
                if ts_after != -1 and ts_before != ts_after:
                    success = True
                    break
            time.sleep(1)
        self.assertEqual(success, True, 'failed to rejoin as a slave')
        util.log('succeeded : pgs%d joined as a slave.' % server['id'])

        return 0
Exemple #22
0
    def __del_server(self, server_to_del):
        """Detach a PGS from the cluster, shut it down and delete it.

        Sequence: bgsave, 'pgs_leave', wait until the 7th reply line of
        'info stats' carries a value <= 100 (skipped when the server still
        reports the master role), 'pgs_lconn', shutdown of smr and redis,
        and finally 'pgs_del'.

        Args:
            server_to_del: server description dict; the keys 'id', 'ip',
                'redis_port' and 'cluster_name' are read here.

        A failed step raises through the unittest assertion methods.
        """
        pgs_id = server_to_del['id']
        addr = (pgs_id, server_to_del['ip'], server_to_del['redis_port'])

        def run_cm_command(cmd):
            # Send cmd to the leader confmaster and require a '+OK' reply.
            ret = util.cm_command(self.leader_cm['ip'],
                                  self.leader_cm['cm_port'], cmd)
            jobj = json.loads(ret)
            self.assertEqual(
                jobj['msg'], '+OK',
                'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
            util.log('succeeded : cmd="%s", reply="%s"' %
                     (cmd[:-2], ret[:-2]))

        # backup data: make sure redis is reachable before bgsave; the
        # connection itself is not needed afterwards
        redis = redis_mgmt.Redis(pgs_id)
        ret = redis.connect(server_to_del['ip'], server_to_del['redis_port'])
        self.assertEqual(ret, 0, 'failed : connect to smr%d(%s:%d)' % addr)
        redis.disconnect()

        # bgsave
        ret = util.bgsave(server_to_del)
        self.assertTrue(ret, 'failed to bgsave. pgs%d' % pgs_id)

        # detach pgs from cluster
        run_cm_command('pgs_leave %s %d\r\n' %
                       (server_to_del['cluster_name'], pgs_id))

        r = util.get_role_of_server(server_to_del)
        # If quorum of left master is larger than 1, info command will be blocked.
        if r != c.ROLE_MASTER:
            # check if pgs is removed
            success = False
            for _ in range(10):
                redis = redis_mgmt.Redis(pgs_id)
                ret = redis.connect(server_to_del['ip'],
                                    server_to_del['redis_port'])
                self.assertEqual(ret, 0,
                                 'failed : connect to smr%d(%s:%d)' % addr)
                util.log('succeeded : connect to smr%d(%s:%d)' % addr)

                try:
                    redis.write('info stats\r\n')
                    # skip the first six reply lines; only the 7th is used
                    for _ in range(6):
                        redis.read_until('\r\n')
                    res = redis.read_until('\r\n')
                finally:
                    # a new connection is opened on every retry, so close it
                    redis.disconnect()

                self.assertNotEqual(
                    res, '',
                    'failed : get reply of "info stats" from redis%d(%s:%d)'
                    % addr)
                util.log(
                    'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"'
                    % (addr + (res[:-2],)))
                # the line is expected to look like '<name>:<number>'
                no = int(res.split(':')[1])
                if no <= 100:
                    success = True
                    break
                time.sleep(1)

            self.assertEqual(success, True, 'failed : pgs does not removed.')
        util.log('pgs is removed')

        # change state of pgs to lconn
        run_cm_command('pgs_lconn %s %d\r\n' %
                       (server_to_del['cluster_name'], pgs_id))

        # shutdown
        ret = testbase.request_to_shutdown_smr(server_to_del)
        self.assertEqual(ret, 0, 'failed : shutdown smr. id:%d' % pgs_id)
        ret = testbase.request_to_shutdown_redis(server_to_del)
        self.assertEqual(ret, 0, 'failed : shutdown redis. id:%d' % pgs_id)
        util.log('succeeded : shutdown pgs%d.' % pgs_id)

        # delete pgs from cluster
        run_cm_command('pgs_del %s %d\r\n' %
                       (server_to_del['cluster_name'], pgs_id))
Exemple #23
0
    def elect_master_randomly(self):
        """Demote the master repeatedly until every server has been elected.

        After writing 1000 keys through a gateway, the method loops up to
        30 times: it sends 'role lconn' to the current master's SMR, waits
        until the cluster reports exactly one master and two slaves, and
        checks that each server's real role matches the role recorded by
        the leader confmaster with heartbeat 'Y'. From the second round on,
        the id of the master found at the start of the round is removed
        from the list of servers that still have to become master.

        Returns:
            0 once the list is empty; a failed check (or a non-empty list
            after 30 rounds) raises through the unittest assertion methods.
        """
        # set data
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway('0')
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))
        for i in range(0, 1000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to gw(%s:%d). cmd:%s, res:%s' %
                (ip, port, cmd[:-2], res[:-2]))
        # the gateway is not used past this point, so release the connection
        gw.disconnect()

        # ids of the servers that have not been seen as an elected master yet
        server_ids = [server['id'] for server in self.cluster['servers']]

        for try_cnt in range(30):
            # get master, slave1, slave2
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
            util.log('master id : %d' % m['id'])

            # the master of the very first round was not elected by this
            # test, so it only counts from the second round on
            if try_cnt != 0 and m['id'] in server_ids:
                server_ids.remove(m['id'])

            smr = smr_mgmt.SMR(m['id'])
            ret = smr.connect(m['ip'], m['smr_mgmt_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to master. %s:%d' %
                (m['ip'], m['smr_mgmt_port']))
            cmd = 'role lconn\r\n'
            smr.write(cmd)
            reply = smr.read_until('\r\n')
            self.assertEqual(
                reply, '+OK\r\n',
                'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]))
            util.log('succeeded : cmd="%s", reply="%s"' %
                     (cmd[:-2], reply[:-2]))

            # wait until role-change is finished
            for _ in range(5):
                count_master = 0
                count_slave = 0
                for server in self.cluster['servers']:
                    real_role = util.get_role_of_server(server)
                    real_role = util.roleNumberToChar(real_role)
                    if real_role == 'M':
                        count_master = count_master + 1
                    elif real_role == 'S':
                        count_slave = count_slave + 1
                if count_master == 1 and count_slave == 2:
                    break
                time.sleep(1)

            # check the number of master and slave
            self.assertEqual(
                count_master, 1,
                'failed : the number of master is not 1, count_master=%d, count_slave=%d'
                % (count_master, count_slave))
            self.assertEqual(
                count_slave, 2,
                'failed : the number of slave is not 2, count_master=%d, count_slave=%d'
                % (count_master, count_slave))
            util.log(
                'succeeded : the number of master is 1 and the number of slave is 2'
            )

            # check states of all pgs in pg (up to 3 attempts)
            all_ok = False
            for _ in range(3):
                all_ok = True
                for s in self.cluster['servers']:
                    real_role = util.get_role_of_server(s)
                    real_role = util.roleNumberToChar(real_role)
                    smr_info = util.get_smr_info(s, self.leader_cm)
                    cc_role = smr_info['smr_Role']
                    cc_hb = smr_info['hb']

                    # judge each server on its own so that one mismatch does
                    # not make the following servers be logged as failures
                    if cc_hb == 'Y' and real_role == cc_role:
                        util.log(
                            'succeeded : a role of real pgs is the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s'
                            % (s['id'], real_role, cc_role, cc_hb))
                    else:
                        all_ok = False
                        util.log(
                            '\n\n**********************************************************\n\nretry: a role of real pgs is not the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s'
                            % (s['id'], real_role, cc_role, cc_hb))

                if all_ok:
                    break
                time.sleep(0.5)

            self.assertTrue(all_ok, 'failed : role check')

            if len(server_ids) == 0:
                util.log('succeeded : all smrs have been as a master')
                return 0

        self.assertEqual(
            0, len(server_ids), 'failed : remains server ids=[%s]' %
            (','.join('%d' % sid for sid in server_ids)))
        return 0
    def test_quorum_with_left_pgs( self ):
        """Check quorum values after the master leaves the cluster.

        With one load generator running per server, the master is detached
        with 'pgs_leave ... forced' and its quorum is expected to stay 2.
        After 'role lconn' is sent to it, one of the slaves must become
        master, the cluster must show one master, one slave and one lconn,
        and the new master's quorum must be 1. The load generators are
        stopped even when a check fails. Finally the detached server is
        re-joined with util.pgs_join.

        Returns:
            0 when every check passes; a failed check raises through the
            unittest assertion methods.
        """
        util.print_frame()

        load_gens = []
        try:
            # start load generators
            for server in self.cluster['servers']:
                load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
                load_gen.start()
                load_gens.append(load_gen)

            # get master, slave1, slave2
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )

            # detach pgs from cluster
            cmd = 'pgs_leave %s %d forced\r\n' % (m['cluster_name'], m['id'])
            ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
            jobj = json.loads(ret)
            self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
            util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

            # check quorum policy
            quorum_of_left_master = util.get_quorum( m )
            self.assertEqual(2, quorum_of_left_master,
                              'invalid quorum of left master, expected:%d, but:%d' % (2, quorum_of_left_master) )
            util.log( 'succeeded : quorum of left master=%d' % quorum_of_left_master )

            # check if pgs is removed (only probed when the left server no
            # longer reports the master role)
            r = util.get_role_of_server(m)
            if r != c.ROLE_MASTER:
                success = False
                for _ in range( 10 ):
                    redis = redis_mgmt.Redis( m['id'] )
                    ret = redis.connect( m['ip'], m['redis_port'] )
                    self.assertEqual( ret, 0, 'failed : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )
                    util.log( 'succeeded : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )

                    try:
                        redis.write( 'info stats\r\n' )
                        # skip the first six reply lines; only the 7th is used
                        for _ in range( 6 ):
                            redis.read_until( '\r\n' )
                        res = redis.read_until( '\r\n' )
                    finally:
                        # a new connection is opened on every retry, so close it
                        redis.disconnect()

                    self.assertNotEqual( res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )
                    util.log( 'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"' % (m['id'], m['ip'], m['redis_port'], res[:-2]) )
                    # the line is expected to look like '<name>:<number>'
                    no = int( res.split(':')[1] )
                    if no <= 100:
                        success = True
                        break

                    time.sleep( 1 )

                self.assertEqual( success, True, 'failed : pgs does not removed.' )
            util.log( 'pgs is removed' )

            # wait (up to 10 attempts) until the real role of every server
            # with a heartbeat matches the role recorded by the confmaster
            for _ in range(10):
                consistent = True
                for s in self.cluster['servers']:
                    smr_info = util.get_smr_info( s, self.leader_cm )
                    if smr_info['hb'] == 'N':
                        continue

                    real_role = util.get_role_of_server( s )
                    real_role = util.roleNumberToChar( real_role )
                    if real_role != smr_info['smr_Role']:
                        consistent = False
                        break
                if consistent:
                    break
                time.sleep(0.5)

            # check states of all pgs in pg
            for s in self.cluster['servers']:
                smr_info = util.get_smr_info( s, self.leader_cm )
                cc_role = smr_info['smr_Role']
                cc_hb = smr_info['hb']
                if cc_hb == 'N':
                    continue

                real_role = util.get_role_of_server( s )
                real_role = util.roleNumberToChar( real_role )
                self.assertEqual( real_role, cc_role,
                                  'failed : each role is difference, real=%s, cc=%s' % (real_role, cc_role) )
                util.log( 'succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s' % (real_role, cc_role) )

            # check quorum policy
            quorum_of_left_master = util.get_quorum( m )
            self.assertEqual(2, quorum_of_left_master,
                              'invalid quorum of left master, expected:%d, but:%d' % (2, quorum_of_left_master) )
            util.log( 'succeeded : quorum of left master=%d' % quorum_of_left_master )

            # 'role lconn' to master
            cmd = 'role lconn\r\n'
            ret = util.cmd_to_smr( m, cmd )
            self.assertEqual( ret, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
            util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

            # wait for master election
            new_master = None
            for _ in range( 10 ):
                for candidate in (s1, s2):
                    if util.get_role_of_server( candidate ) == c.ROLE_MASTER:
                        new_master = candidate
                        break
                if new_master is not None:
                    break
                time.sleep( 1 )
            self.assertEqual( new_master is not None, True, 'failed to elect new master' )
            util.log( 'succeeded : elect new master, master_id=%d' % new_master['id'] )

            time.sleep( 1 )
            # check the numbers of master, slave, and lconn
            cnt_master = 0
            cnt_slave = 0
            cnt_lconn = 0
            for s in self.cluster['servers']:
                role = util.get_role_of_server( s )
                if role == c.ROLE_MASTER:
                    cnt_master = cnt_master + 1
                elif role == c.ROLE_SLAVE:
                    cnt_slave = cnt_slave + 1
                elif role == c.ROLE_LCONN:
                    cnt_lconn = cnt_lconn + 1
            self.assertEqual( cnt_master, 1, 'failed : the number of master is %s, expected 1' % cnt_master )
            self.assertEqual( cnt_slave, 1, 'failed : the number of slave is %s, expected 1' % cnt_slave )
            self.assertEqual( cnt_lconn, 1, 'failed : the number of lconn is %s, expected 1' % cnt_lconn )

            # check states of all pgs in pg
            for s in self.cluster['servers']:
                real_role = util.get_role_of_server( s )
                real_role = util.roleNumberToChar( real_role )
                smr_info = util.get_smr_info( s, self.leader_cm )
                cc_role = smr_info['smr_Role']
                cc_hb = smr_info['hb']
                if cc_hb == 'N':
                    continue
                self.assertEqual( real_role, cc_role,
                                  'failed : each role is difference, real=%s, cc=%s' % (real_role, cc_role) )
                util.log( 'succeeded : a role of real pgs is the same with a role in cc, real=%s, cc=%s' % (real_role, cc_role) )

            # check quorum policy
            quorum_of_new_master = util.get_quorum( new_master )
            self.assertNotEqual( None, quorum_of_new_master, 'failed : find new master' )
            self.assertEqual( 1, quorum_of_new_master ,
                              'invalid quorum of new master, expected:%d, but:%d' % (1, quorum_of_new_master) )
            util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master )
        finally:
            # shutdown load generators, also when one of the checks failed
            for load_gen in load_gens:
                load_gen.quit()
                load_gen.join()

        # Go back to initial configuration
        self.assertTrue(util.pgs_join(self.leader_cm['ip'], self.leader_cm['cm_port'], m['cluster_name'], m['id']),
                'failed to recover pgs, (pgs_join)')

        return 0
Exemple #25
0
    def __del_server(self, server_to_del):
        """Detach a PGS from the cluster, shut it down and delete it.

        Sequence: bgsave, 'pgs_leave', wait until the 7th reply line of
        'info stats' carries a value <= 100 (skipped when the server still
        reports the master role), 'pgs_lconn', shutdown of smr and redis,
        and finally 'pgs_del'.

        Args:
            server_to_del: server description dict; the keys 'id', 'ip',
                'redis_port' and 'cluster_name' are read here.

        A failed step raises through the unittest assertion methods.
        """
        pgs_id = server_to_del['id']
        addr = (pgs_id, server_to_del['ip'], server_to_del['redis_port'])

        def run_cm_command(cmd):
            # Send cmd to the leader confmaster and require a '+OK' reply.
            ret = util.cm_command(self.leader_cm['ip'],
                                  self.leader_cm['cm_port'], cmd)
            jobj = json.loads(ret)
            self.assertEqual(
                jobj['msg'], '+OK',
                'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
            util.log('succeeded : cmd="%s", reply="%s"' %
                     (cmd[:-2], ret[:-2]))

        # backup data: make sure redis is reachable before bgsave; the
        # connection itself is not needed afterwards
        redis = redis_mgmt.Redis(pgs_id)
        ret = redis.connect(server_to_del['ip'], server_to_del['redis_port'])
        self.assertEqual(ret, 0, 'failed : connect to smr%d(%s:%d)' % addr)
        redis.disconnect()

        # bgsave
        ret = util.bgsave(server_to_del)
        self.assertTrue(ret, 'failed to bgsave. pgs%d' % pgs_id)

        # detach pgs from cluster
        run_cm_command('pgs_leave %s %d\r\n' %
                       (server_to_del['cluster_name'], pgs_id))

        r = util.get_role_of_server(server_to_del)
        # If quorum of left master is larger than 1, info command will be blocked.
        if r != c.ROLE_MASTER:
            # check if pgs is removed
            success = False
            for _ in range(10):
                redis = redis_mgmt.Redis(pgs_id)
                ret = redis.connect(server_to_del['ip'],
                                    server_to_del['redis_port'])
                self.assertEqual(ret, 0,
                                 'failed : connect to smr%d(%s:%d)' % addr)
                util.log('succeeded : connect to smr%d(%s:%d)' % addr)

                try:
                    redis.write('info stats\r\n')
                    # skip the first six reply lines; only the 7th is used
                    for _ in range(6):
                        redis.read_until('\r\n')
                    res = redis.read_until('\r\n')
                finally:
                    # a new connection is opened on every retry, so close it
                    redis.disconnect()

                self.assertNotEqual(
                    res, '',
                    'failed : get reply of "info stats" from redis%d(%s:%d)'
                    % addr)
                util.log(
                    'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"'
                    % (addr + (res[:-2],)))
                # the line is expected to look like '<name>:<number>'
                no = int(res.split(':')[1])
                if no <= 100:
                    success = True
                    break
                time.sleep(1)

            self.assertEqual(success, True, 'failed : pgs does not removed.')
        util.log('pgs is removed')

        # change state of pgs to lconn
        run_cm_command('pgs_lconn %s %d\r\n' %
                       (server_to_del['cluster_name'], pgs_id))

        # shutdown
        ret = testbase.request_to_shutdown_smr(server_to_del)
        self.assertEqual(ret, 0, 'failed : shutdown smr. id:%d' % pgs_id)
        ret = testbase.request_to_shutdown_redis(server_to_del)
        self.assertEqual(ret, 0, 'failed : shutdown redis. id:%d' % pgs_id)
        util.log('succeeded : shutdown pgs%d.' % pgs_id)

        # delete pgs from cluster
        run_cm_command('pgs_del %s %d\r\n' %
                       (server_to_del['cluster_name'], pgs_id))
Exemple #26
0
    def failover_while_hang( self, server ):
        """Shut down and restart a PGS while its SMR is hanging.

        Steps, in order:
          1. record the PGS timestamp,
          2. stall the SMR with the ``fi delay sleep`` fault-injection command,
          3. wait until ``util.get_smr_state`` reports 'F',
          4. shut down SMR and redis and confirm the state is still 'F',
          5. restart both, wait for state 'N', then wait until the server has
             the slave role with a timestamp different from step 1.

        Args:
            server: server description dict; the keys read here are 'id',
                'ip', 'smr_mgmt_port' and 'redis_port'.

        Returns:
            0 when every step succeeds. A failed step raises through the
            unittest assertion helpers.
        """
        def wait_for_state( expected, max_try=20 ):
            # Poll once per second (at most max_try polls) and assert on the
            # last state that was observed.
            state = None
            for _ in range( max_try ):
                state = util.get_smr_state( server, self.leader_cm )
                if expected == state:
                    break
                time.sleep( 1 )
            self.assertEqual( expected, state,
                              'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
            util.log( 'succeeded : pgs%d state changed to %s.' % (server['id'], expected) )

        # timestamp before hang
        ts_before = util.get_timestamp_of_pgs( server )
        self.assertNotEqual( ts_before, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (server['id'], ts_before) )

        # hang
        util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.' % (server['id'], server['ip'], server['smr_mgmt_port']))
        smr = smr_mgmt.SMR( server['id'] )
        ret = smr.connect( server['ip'], server['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (server['ip'], server['smr_mgmt_port']) )
        smr.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr.read_until( '\r\n', 1 )
        if reply is not None and reply.find('-ERR not supported') != -1:
            # The fault-injection command is unavailable in this smr build.
            self.fail( 'make sure that smr has compiled with gcov option.' )

        time.sleep( 4 )

        # check state F
        wait_for_state( 'F' )

        # shutdown
        util.log( 'shutdown pgs%d while hanging.' % server['id'] )
        ret = testbase.request_to_shutdown_smr( server )
        self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % server['id'] )
        ret = testbase.request_to_shutdown_redis( server )
        self.assertEqual( ret, 0, 'failed to shutdown redis. id:%d' % server['id'] )

        # check state F
        wait_for_state( 'F' )

        # recovery
        util.log( 'restart pgs%d.' % server['id'] )
        ret = testbase.request_to_start_smr( server )
        self.assertEqual( ret, 0, 'failed to start smr. id:%d' % server['id'] )

        ret = testbase.request_to_start_redis( server )
        self.assertEqual( ret, 0, 'failed to start redis. id:%d' % server['id'] )

        wait_count = 20
        ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
        self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

        # The redis connection is only a reachability probe; close it right
        # away instead of leaving the socket open until garbage collection.
        redis = redis_mgmt.Redis( server['id'] )
        ret = redis.connect( server['ip'], server['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )
        redis.disconnect()

        # check state N
        wait_for_state( 'N' )

        # wait for rejoin as a slave
        success = False
        for _ in range( 20 ):
            role = util.get_role_of_server( server )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( server )
                # The timestamp must differ from the one taken before the hang.
                if ts_after != -1 and ts_before != ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave' )
        util.log( 'succeeded : pgs%d joined as a slave.' % server['id'] )

        return 0
    def test_4_PGS_mgen_is_less_than_PG_mgen( self ):
        """Rejoin a PGS after two master failovers happened while it was down.

        The current master is shut down and 10000 keys are written through a
        gateway. ``self.failover`` is then run twice on whichever server is
        master at that moment, and the first server is restarted. After
        another 10000 keys are written, all 20000 keys are read back directly
        from the restarted server's redis.

        Returns:
            0 on success. A failed step raises through the unittest assertion
            helpers.
        """
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        key_base = 'mw'

        def set_keys( begin, end ):
            # Write key_base<i> = i through the gateway for i in [begin, end).
            for i in range( begin, end ):
                gw.write( 'set %s%d %d\r\n' % (key_base, i, i) )
                res = gw.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n' )

        # initial data
        util.put_some_data(self.cluster)

        # shutdown
        server_to_join = util.get_server_by_role( self.cluster['servers'], 'master' )
        ret = testbase.request_to_shutdown_smr( server_to_join )
        self.assertEqual( ret, 0, 'failed to shutdown smr' )
        ret = testbase.request_to_shutdown_redis( server_to_join )
        self.assertEqual( ret, 0, 'failed to shutdown redis' )

        # check state F
        max_try = 20
        expected = 'F'
        for _ in range( max_try ):
            state = util.get_smr_state( server_to_join, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEqual( expected, state,
                          'server%d - state:%s, expected:%s' % (server_to_join['id'], state, expected) )

        # set value
        set_keys( 0, 10000 )

        # master failover 1 (master generation + 1)
        util.log('master failover 1')
        server = util.get_server_by_role( self.cluster['servers'], 'master' )
        self.failover( server )

        # check quorum (copy:3, quorum:1, available:2)
        ok = False
        for _ in range( 10 ):
            ok = util.check_quorum(self.cluster['cluster_name'],
                    self.leader_cm['ip'], self.leader_cm['cm_port'])
            if ok:
                break
            time.sleep(1)
        self.assertTrue( ok, 'Check quorum fail.' )

        # master failover 2 (master generation + 1)
        util.log('master failover 2')
        server = util.get_server_by_role( self.cluster['servers'], 'master' )
        self.failover( server )

        # recovery
        util.log('master recovery start.')
        ret = testbase.request_to_start_smr( server_to_join )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( server_to_join )
        self.assertEqual( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( server_to_join, 10 )
        self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server_to_join['id']) )
        util.log('master recovery end successfully.')

        # check state N
        # NOTE(review): this polls `server` (the target of failover 2), not
        # `server_to_join`, which is the server that was just restarted.
        # Looks like a copy-paste slip - confirm which one is intended.
        expected = 'N'
        for _ in range( max_try ):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        role = util.get_role_of_server( server )
        self.assertEqual( expected, state,
                          'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, role) )

        time.sleep( 5 )

        # set value
        set_keys( 10000, 20000 )

        # NOTE(review): the result of this lookup is not used afterwards.
        server = util.get_server_by_role( self.cluster['servers'], 'master' )

        redis = redis_mgmt.Redis( server_to_join['id'] )
        ret = redis.connect( server_to_join['ip'], server_to_join['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )

        # check value
        try:
            for i in range( 0, 20000 ):
                redis.write( 'get %s%d\r\n' % (key_base, i) )
                # The first reply line is read and discarded; the second one
                # is compared against the expected value.
                redis.read_until( '\r\n' )
                response = redis.read_until( '\r\n' )
                self.assertEqual( response, '%d\r\n' % (i), 'inconsistent %s, %d' % (response[:-2], i) )
        finally:
            # Previously this connection was never closed.
            redis.disconnect()

        gw.disconnect()
        return 0
Exemple #28
0
    def failure_recovery(self, role, wait_count=10, redis_only=False):
        """Fail one PGS, write while it is down, restart it and verify the data.

        A key is written through a gateway, the server selected by ``role`` is
        shut down, the key is overwritten while ``util.get_smr_state`` reports
        'F', and the server is started again. Once it is back in state 'N' the
        key is read directly from that server's redis and must hold the value
        written while it was down.

        Args:
            role: role name handed to ``util.get_server_by_role`` to choose
                the server to fail.
            wait_count: passed through to
                ``testbase.wait_until_finished_to_set_up_role``.
            redis_only: when true only redis is stopped and restarted; the
                SMR shutdown/start requests are skipped.
        """
        def poll_state(target, expected, max_try=20):
            # Poll once per second until the expected state shows up or the
            # tries run out; the last state seen is returned either way.
            state = None
            for _ in range(max_try):
                state = util.get_smr_state(target, self.leader_cm)
                if expected == state:
                    break
                time.sleep(1)
            return state

        time.sleep(2)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        key = 'new_key_haha'
        check_value = '54321'
        try:
            # set value
            gw.write('set %s 12345\r\n' % (key))
            res = gw.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n')

            # shutdown
            server = util.get_server_by_role(self.cluster['servers'], role)

            if not redis_only:
                ret = testbase.request_to_shutdown_smr(server)
                self.assertEqual(ret, 0, 'failed to shutdown smr')

            ret = testbase.request_to_shutdown_redis(server)
            self.assertEqual(ret, 0, 'failed to shutdown redis')

            # check state F
            expected = 'F'
            state = poll_state(server, expected)
            self.assertEqual(
                expected, state, 'server%d - state:%s, expected:%s' %
                (server['id'], state, expected))

            # set value
            gw.write('set %s %s\r\n' % (key, check_value))
            res = gw.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n')
        finally:
            # Close the gateway connection even when a check above fails.
            gw.disconnect()

        # recovery
        if not redis_only:
            ret = testbase.request_to_start_smr(server)
            self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEqual(ret, 0,
                         'failed to role change. smr_id:%d' % (server['id']))

        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')

        try:
            # check state N
            expected = 'N'
            state = poll_state(server, expected)
            # Kept in its own local so the `role` argument is not overwritten.
            current_role = util.get_role_of_server(server)
            self.assertEqual(
                expected, state, 'server%d - state:%s, expected:%s, role:%s' %
                (server['id'], state, expected, current_role))

            # check value
            redis.write('get %s\r\n' % (key))
            # The first reply line is discarded; the second carries the value.
            redis.read_until('\r\n')
            response = redis.read_until('\r\n')
            self.assertEqual(response, '%s\r\n' % (check_value),
                             'inconsistent %s, %s' % (response, check_value))
        finally:
            # Previously this connection was never closed.
            redis.disconnect()
Exemple #29
0
    def test_two_slaves_hang( self ):
        """Stall both slaves of a 3-copy cluster and verify they keep serving.

        After 10000 keys are written through a gateway, both slaves receive
        the ``fi delay sleep`` fault-injection command. The test then waits
        until each slave reports the slave role with the same timestamp it had
        before the hang, writes 10000 new keys through slave1's redis, reads
        them back from slave2's redis and finally runs ``util.check_cluster``.

        Returns:
            0 on success. A failed step raises through the unittest assertion
            helpers.
        """
        def wait_for_slave( pgs, ts_before ):
            # Poll up to 20 times, one second apart, until the server reports
            # the slave role with an unchanged timestamp.
            rejoined = False
            for _ in range( 20 ):
                if util.get_role_of_server( pgs ) == c.ROLE_SLAVE:
                    ts_after = util.get_timestamp_of_pgs( pgs )
                    if ts_after != -1 and ts_before == ts_after:
                        rejoined = True
                        break
                time.sleep( 1 )
            # Report the address of the server that was actually polled.
            self.assertEqual( rejoined, True, 'failed to rejoin as a slave. %s:%d' % (pgs['ip'], pgs['smr_mgmt_port']) )

        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        try:
            for i in range( 0, 10000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                gw.write( cmd )
                res = gw.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )
        finally:
            # The gateway is not used again below; it used to stay open.
            gw.disconnect()

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs( s1 )
        self.assertNotEqual( ts_before1, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1) )

        ts_before2 = util.get_timestamp_of_pgs( s2 )
        self.assertNotEqual( ts_before2, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2) )

        # hang
        smr1 = smr_mgmt.SMR( s1['id'] )
        ret = smr1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        smr2 = smr_mgmt.SMR( s2['id'] )
        ret = smr2.connect( s2['ip'], s2['smr_mgmt_port'] )
        # This message used to print slave1's address for a slave2 failure.
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        smr1.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr1.read_until( '\r\n', 1 )
        if reply is not None and reply.find('-ERR not supported') != -1:
            # The fault-injection command is unavailable in this smr build.
            self.fail( 'make sure that smr has compiled with gcov option.' )

        smr2.write( 'fi delay sleep 1 8000\r\n' )
        time.sleep( 7 )

        # wait for rejoin as a slave
        wait_for_slave( s1, ts_before1 )
        wait_for_slave( s2, ts_before2 )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        try:
            # set new values
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis1.write( cmd )
                res = redis1.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

            # check new values
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write( cmd )
                # First reply line is discarded; the second carries the value.
                redis2.read_until( '\r\n' )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )
        finally:
            # Previously neither redis connection was closed.
            redis1.disconnect()
            redis2.disconnect()

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
    def deprecated_test_5_PGS_commit_is_greater_than_PG_commit( self ):
        """Check that an old master that is ahead of the group cannot rejoin.

        Both slaves are shut down, more data is written and saved on the
        master with ``util.bgsave``, and then the master is shut down too.
        The slaves are restarted and receive 10000 keys, which are verified on
        both of them. Finally the old master is started again and the test
        passes only if ``util.get_smr_state`` does NOT report 'N' for it
        within three polls.

        The ``deprecated_`` prefix keeps unittest from collecting this method.

        Returns:
            0 on success. A failed step raises through the unittest assertion
            helpers.
        """
        def poll_state( target, expected, max_try ):
            # Poll once per second until the expected state shows up or the
            # tries run out; the last state seen is returned either way.
            state = None
            for _ in range( max_try ):
                state = util.get_smr_state( target, self.leader_cm )
                if expected == state:
                    break
                time.sleep( 1 )
            return state

        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # initial data
        util.put_some_data(self.cluster)

        master, s1, s2 = util.get_mss(self.cluster)

        server_to_join = [s1, s2]
        # shutdown slaves
        for slave in server_to_join:
            ret = testbase.request_to_shutdown_smr( slave )
            self.assertEqual( ret, 0, 'failed to shutdown smr%d' % slave['id'])
            util.log('succeeded to shutdown smr%d' % slave['id'])

            ret = testbase.request_to_shutdown_redis( slave )
            self.assertEqual( ret, 0, 'failed to shutdown redis' )
            util.log('succeeded to shutdown redis%d' % slave['id'])

            # check state F
            expected = 'F'
            state = poll_state( slave, expected, 20 )
            self.assertEqual( expected, state,
                              'server%d - state:%s, expected:%s' % (slave['id'], state, expected) )

        # put more data
        util.put_some_data(self.cluster, 10, 256)

        # bgsave
        ret = util.bgsave(master)
        self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

        # shutdown master
        ret = testbase.request_to_shutdown_smr( master )
        self.assertEqual( ret, 0, 'failed to shutdown smr' )
        util.log('succeeded to shutdown master smr, id=%d' % master['id'])
        ret = testbase.request_to_shutdown_redis( master )
        self.assertEqual( ret, 0, 'failed to shutdown redis' )
        util.log('succeeded to shutdown master redis, id=%d' % master['id'])

        # check state F
        expected = 'F'
        state = poll_state( master, expected, 20 )
        self.assertEqual( expected, state,
                          'server%d - state:%s, expected:%s' % (master['id'], state, expected) )

        # recovery slaves
        for slave in server_to_join:
            ret = testbase.request_to_start_smr( slave )
            self.assertEqual( ret, 0, 'failed to start smr' )

            ret = testbase.request_to_start_redis( slave )
            self.assertEqual( ret, 0, 'failed to start redis' )

            ret = testbase.wait_until_finished_to_set_up_role( slave, 10 )
            self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (slave['id']) )

            # check state N
            expected = 'N'
            state = poll_state( slave, expected, 20 )
            role = util.get_role_of_server( slave )
            self.assertEqual( expected, state,
                              'server%d - state:%s, expected:%s, role:%s' % (slave['id'], state, expected, role) )

        # set value
        s = random.choice(server_to_join)
        # The original passed the list literal ['id'] here instead of the
        # chosen server's id.
        redis = redis_mgmt.Redis( s['id'] )
        ret = redis.connect( s['ip'], s['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )

        key_base = 'key_test'
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            redis.write( cmd )
            res = redis.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n' )
        redis.disconnect()

        for slave in server_to_join:
            redis = redis_mgmt.Redis( slave['id'] )
            ret = redis.connect( slave['ip'], slave['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis' )

            # check value
            try:
                for j in range(0, 10000):
                    cmd = 'get %s%d\r\n' % (key_base, j)
                    redis.write( cmd )
                    # First reply line is discarded; the second is the value.
                    redis.read_until( '\r\n' )
                    response = redis.read_until( '\r\n' )
                    self.assertEqual( response, '%d\r\n' % (j), 'inconsistent %s, %d' % (response[:-2], j) )
            finally:
                # Each per-server connection used to be left open.
                redis.disconnect()

        # try to recover master, but failed
        ret = testbase.request_to_start_smr( master )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( master, False )
        self.assertEqual( ret, 0, 'failed to start redis' )

        # Only three polls here: the old master is expected NOT to reach 'N'.
        expected = 'N'
        state = poll_state( master, expected, 3 )
        role = util.get_role_of_server( master )
        self.assertNotEqual( expected, state,
                             'server%d - state:%s, expected:not %s, role:%s' % (master['id'], state, expected, role) )
        util.log('success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.')

        gw.disconnect()
        return 0
Exemple #31
0
    def master_hang( self ):
        """Stall the master and verify that a slave takes over.

        After 10000 keys are written through a gateway, the master's SMR
        receives the ``fi delay sleep`` fault-injection command. The test
        waits for one of the slaves to report the master role, writes 10000
        new keys through slave1's redis, reads them back from slave2 (3-server
        clusters only) and, after a further wait, from the old master, which
        must by then report the slave role. ``util.check_cluster`` runs last.

        Both 3-server and smaller clusters are handled; slave2 is only touched
        when ``self.cluster['servers']`` has exactly three entries.

        Returns:
            0 on success. A failed step raises through the unittest assertion
            helpers.
        """
        def check_new_values( conn, label ):
            # Read keys 10000..19999 from conn and compare each with its index.
            # The last argument in the message was sliced by the master check
            # and unsliced by the slave2 check; it is now sliced for both.
            for i in range( 10000, 20000 ):
                conn.write( 'get %s%d\r\n' % (self.key_base, i) )
                # First reply line is discarded; the second carries the value.
                conn.read_until( '\r\n' )
                res = conn.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i, 'failed to get values from %s. %s != %d' % (label, res[:-2], i) )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        try:
            for i in range( 0, 10000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                gw.write( cmd )
                res = gw.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )
        finally:
            # The gateway is not used again below; it used to stay open.
            gw.disconnect()

        # get master, slave1, slave2
        three_copy = len(self.cluster['servers']) == 3
        s2 = None
        if three_copy:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr = smr_mgmt.SMR( m['id'] )
        ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr.read_until( '\r\n', 1 )
        if reply is not None and reply.find('-ERR not supported') != -1:
            # The fault-injection command is unavailable in this smr build.
            self.fail( 'make sure that smr has compiled with gcov option.' )

        time.sleep( 5 )

        # wait for forced master election
        success = False
        for _ in range( 20 ):
            if util.get_role_of_server( s1 ) == c.ROLE_MASTER:
                success = True
                break

            if three_copy and util.get_role_of_server( s2 ) == c.ROLE_MASTER:
                success = True
                break
            time.sleep( 1 )

        # Log the states before asserting so a failed election is diagnosable.
        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        self.assertEqual( success, True, 'failed to forced master election' )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        # set new values
        try:
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis1.write( cmd )
                res = redis1.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )
        finally:
            redis1.disconnect()

        if three_copy:
            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # check new values
            try:
                check_new_values( redis2, 'redis2' )
            finally:
                redis2.disconnect()

        # check if the hanging server recovered and joined as a slave
        time.sleep( 7 )
        role = util.get_role_of_server( m )
        self.assertEqual( role, c.ROLE_SLAVE, 'failed to join as a slave' )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        # check new values (the failure message used to name redis2 here)
        try:
            check_new_values( redis0, 'redis0' )
        finally:
            redis0.disconnect()

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
    def failure_recovery( self, role, wait_count=10, redis_only=False ):
        """Fail one PGS, write while it is down, restart it and verify the data.

        A key is written through a gateway, the server selected by ``role`` is
        shut down, the key is overwritten while ``util.get_smr_state`` reports
        'F', and the server is started again. Once it is back in state 'N' the
        key is read directly from that server's redis and must hold the value
        written while it was down.

        Args:
            role: role name handed to ``util.get_server_by_role`` to choose
                the server to fail.
            wait_count: passed through to
                ``testbase.wait_until_finished_to_set_up_role``.
            redis_only: when true only redis is stopped and restarted; the
                SMR shutdown/start requests are skipped.
        """
        time.sleep( 2 )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set value
        key = 'new_key_haha'
        cmd = 'set %s 12345\r\n' % (key)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n' )

        # shutdown
        server = util.get_server_by_role( self.cluster['servers'], role )

        if not redis_only:
            ret = testbase.request_to_shutdown_smr( server )
            self.assertEqual( ret, 0, 'failed to shutdown smr' )

        ret = testbase.request_to_shutdown_redis( server )
        self.assertEqual( ret, 0, 'failed to shutdown redis' )

        # check state F: poll once per second, at most max_try times
        max_try = 20
        expected = 'F'
        for _ in range( max_try ):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEqual( expected, state,
                          'server%d - state:%s, expected:%s' % (server['id'], state, expected) )

        # set value
        check_value = '54321'
        cmd = 'set %s %s\r\n' % (key, check_value)
        gw.write( cmd )
        res = gw.read_until( '\r\n' )
        self.assertEqual( res, '+OK\r\n' )
        gw.disconnect()

        # recovery
        if not redis_only:
            ret = testbase.request_to_start_smr( server )
            self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( server )
        self.assertEqual( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
        self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

        redis = redis_mgmt.Redis( server['id'] )
        ret = redis.connect( server['ip'], server['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis' )

        # check state N: poll once per second, at most max_try times
        expected = 'N'
        for _ in range( max_try ):
            state = util.get_smr_state( server, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        # Kept in its own local so the `role` argument is not overwritten.
        current_role = util.get_role_of_server( server )
        self.assertEqual( expected, state,
                          'server%d - state:%s, expected:%s, role:%s' % (server['id'], state, expected, current_role) )

        # check value
        cmd = 'get %s\r\n' % (key)
        redis.write( cmd )
        # The first reply line is discarded; the second carries the value.
        redis.read_until( '\r\n' )
        response = redis.read_until( '\r\n' )
        self.assertEqual( response, '%s\r\n' % (check_value), 'inconsistent %s, %s' % (response, check_value) )