Example #1
def unblock_network(cluster, mgmt_ip, mgmt_port, final_state):
    # Unblock
    if not util.iptables_drop('D', '127.0.0.100'):
        util.log('failed to delete a blocking rule from iptables.')
        return False

    # Check cluster state
    for i in range(3):
        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state)
        time.sleep(1)

    return True
Example #2
def unblock_network(cluster, mgmt_ip, mgmt_port, final_state):
    # Unblock
    out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
    if not out.succeeded:
        util.log('failed to delete a blocking rule from iptables. output:%s' % out)
        return False

    # Check cluster state
    for i in range(3):
        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state)
        time.sleep(1)

    return True
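
The helper util.iptables_drop used in Examples #1 and #3 appears to wrap the raw iptables invocation shown in Examples #2 and #4. A minimal sketch of such a wrapper, assuming util.sudo semantics as above and an optional destination-port filter (see the call sites in Example #10); this is an illustration, not the project's actual implementation:

def iptables_drop(op, ip, port=None):
    # op: 'A' to append the DROP rule, 'D' to delete it.
    cmd = 'iptables -%s OUTPUT -d %s' % (op, ip)
    if port != None:
        cmd += ' -p tcp --dport %d' % port
    cmd += ' -j DROP'
    out = util.sudo(cmd)
    if not out.succeeded:
        util.log('iptables -%s failed. output:%s' % (op, out))
    return out.succeeded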
Example #3
def block_network(cluster, mgmt_ip, mgmt_port):
    # Block
    if not util.iptables_drop('A', '127.0.0.100'):
        util.log('failed to add a blocking rule to iptables.')
        return False

    for i in range(4):
        util.log('waiting... %d' % (i + 1))
        time.sleep(1)

    # Check cluster state
    for i in range(2):
        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port)
        time.sleep(1)

    return True
Example #4
def block_network(cluster, mgmt_ip, mgmt_port):
    # Block
    out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
    if not out.succeeded:
        util.log('failed to add a blocking rule to iptables. output:%s' % out)
        return False

    for i in range(4):
        util.log('waiting... %d' % (i + 1))
        time.sleep(1)

    # Check cluster state
    for i in range(2):
        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port)
        time.sleep(1)

    return True
Example #5
    def __check_cluster_state(self):
        leader_cm = classify_cm(self.__servers)['leader'][0]
        for cluster in self.__cmi['cluster']:
            ret = util.await(30)(
                lambda x: x,
                lambda: util.check_cluster(
                    cluster['clusterName'].encode('ascii'), leader_cm['ip'],
                    leader_cm['port'], None, True))
            if ret == False:
                return False
        return True
Example #6
    def __check_cluster_state(self):
        leader_cm = classify_cm(self.__servers)['leader'][0]
        for cluster in self.__cmi['cluster']:
            ret = util.await(30)(
                lambda x: x,
                lambda: util.check_cluster(cluster['clusterName'].encode('ascii'), leader_cm['ip'], leader_cm['port'], None, True))
            if ret == False:
                return False
        return True
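
Examples #5 and #6 use util.await(timeout) as a small polling combinator: the first argument tests a result, the second produces it. A plausible sketch under the assumption of one-second polling (the real interval is not visible in these examples):

def await(timeout):
    # Returns a function that calls 'produce' until 'predicate' accepts
    # its result or 'timeout' attempts have been made; the last result
    # is returned either way (assumed semantics).
    def poll(predicate, produce):
        ret = produce()
        for _ in range(timeout - 1):
            if predicate(ret):
                break
            time.sleep(1)
            ret = produce()
        return ret
    return poll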
Example #7
    def slave_failover_while_hang( self ):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        self.failover_while_hang( s1 )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # check new values
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write( cmd )
                redis2.read_until( '\r\n' )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )
            util.log( 'succeeded : check values with set/get operations with pgs%d and pgs%d.' % (s1['id'], s2['id']) )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis0. %s != %d' % (res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')
        return 0
Example #8
    def master_and_slave_hang( self ):
        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr_master = smr_mgmt.SMR( m['id'] )
        ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr_slave = smr_mgmt.SMR( s1['id'] )
        ret = smr_slave.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

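        # 'fi delay sleep 1 10000' is an smr fault-injection command that puts the
        # process to sleep (about 10 seconds here), simulating a hang; it is only
        # available in builds with fault injection enabled, hence the
        # '-ERR not supported' check below.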
        smr_master.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.fail( 'make sure that smr was compiled with the gcov option.' )

        smr_slave.write( 'fi delay sleep 1 10000\r\n' )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        time.sleep( 5 )

        if len(self.cluster['servers']) == 3:
            # wait for forced master election
            success = True
            for i in range( 15 ):
                state = []
                util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], state)
                s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
                role = s2_state['active_role']
                if role != 'M':
                    success = False
                    break
                time.sleep( 1 )

            util.log( '' )
            util.log( 'Expect pgs2 to be the master. PG.COPY: 3, PG.Q: 2' )
            util.log( '' )
            util.log_server_state( self.cluster )

            self.assertEqual( success, True, 'failed to check copy-quorum' )

            ok = False
            for i in xrange(10):
                ok = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'])
                if ok:
                    break
            self.assertTrue( ok, 'Cluster state is not normal!' )

            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # set new values
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis2.write( cmd )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values to redis2. cmd:%s, res:%s' % (cmd[:-2], res) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis1(%s:%d).' % (s1['ip'], s1['redis_port']) )

        if len(self.cluster['servers']) != 3:
            # set new values
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis0.write( cmd )
                res = redis0.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values to redis0. cmd:%s, res:%s' % (cmd[:-2], res) )

        # check new values (m)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i) )

        # check new values (s1)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write( cmd )
            redis1.read_until( '\r\n' )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
Example #9
    def test_all_pgs_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr_master = smr_mgmt.SMR( m['id'] )
        ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr_slave1 = smr_mgmt.SMR( s1['id'] )
        ret = smr_slave1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )
        smr_slave2 = smr_mgmt.SMR( s2['id'] )
        ret = smr_slave2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        m_ts = util.get_timestamp_of_pgs( m )
        s1_ts = util.get_timestamp_of_pgs( s1 )
        s2_ts = util.get_timestamp_of_pgs( s2 )

        smr_master.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.fail( 'make sure that smr was compiled with the gcov option.' )

        smr_slave1.write( 'fi delay sleep 1 8000\r\n' )
        smr_slave2.write( 'fi delay sleep 1 8000\r\n' )

        time.sleep( 10 )

        # check consistency
        ok = False
        for try_cnt in xrange(20):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
            if ok:
                break
            time.sleep(0.5)
        self.assertTrue(ok, 'Unstable cluster state')

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        # set values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write( cmd )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # check new values (m)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i) )

        # check new values (s1)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write( cmd )
            redis1.read_until( '\r\n' )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) )

        # check new values (s2)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s2['id'], res[:-2], i) )

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
            util.log('check_cluster: %s' % ok)
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0
Example #10
    def test_1_mgmt_is_isolated(self):
        util.print_frame()

        util.iptables_print_list()

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Set SMR option (slave_idle_timeout)
        util.log('\n\n\n ### Set SMR option ###')
        for s in cluster['servers']:
            t = telnet.Telnet('SMR%d' % s['id'])
            self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0,
                    'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
            cmd = 'confset slave_idle_timeout_msec 18000'
            util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
            t.write(cmd + '\r\n')
            reply = t.read_until('\r\n').strip()
            util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
            self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

        # Network isolation test
        for cnt in range(5):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
            for s in cluster['servers']:
                """Loopback Address Range (Reference : RFC3330)

                127.0.0.0/8 - This block is assigned for use as the Internet host
                  loopback address.  A datagram sent by a higher level protocol to an
                  address anywhere within this block should loop back inside the host.
                  This is ordinarily implemented using only 127.0.0.1/32 for loopback,
                  but no addresses within this block should ever appear on any network
                  anywhere [RFC1700, page 5].
                """
                self.assertTrue(util.iptables_drop('A', '127.0.0.100', s['smr_mgmt_port']), 'failed to add a blocking rule to iptables.')

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue

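                    # '?' = active role unknown; 'N' = not joined in the mgmt view
                    # (assumed meanings, inferred from the isolation scenario).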
                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
            for s in cluster['servers']:
                self.assertTrue(util.iptables_drop('D', '127.0.0.100', s['smr_mgmt_port']), 'failed to delete a blocking rule from iptables.')

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                all_green = True
                for s in final_state:
                    if is_pgs_normal(s) == False:
                        all_green = False

                if all_green:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
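
Example #10 (and Example #14 below) relies on a helper is_pgs_normal. A plausible standalone definition, inferred from the role-consistency loops in Examples #12 and #16; treat it as an illustration, not the project's actual code:

def is_pgs_normal(pgs_state):
    # A PGS is taken to be normal when the role it actually plays matches
    # the role the management layer expects, and that role is master or slave.
    return (pgs_state['active_role'] == pgs_state['mgmt_role']
            and pgs_state['active_role'] in ('M', 'S'))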
Example #11
    def role_change_with_hanging_pgs(self, hanging_servers, running_servers,
                                     target_id, master):
        util.log('hanging_servers:%s' % hanging_servers)
        util.log('running_servers:%s' % running_servers)
        util.log('target_id:%s' % target_id)

        # Initial data
        util.put_some_data(self.cluster, 3, 10)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        # Get old timestamp
        old_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # hang
        for s in hanging_servers:
            smr = smr_mgmt.SMR(s['id'])
            ret = smr.connect(s['ip'], s['smr_mgmt_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to smr. %s:%d' %
                (s['ip'], s['smr_mgmt_port']))
            util.log("PGS '%d' hang" % s['id'])

            smr.write('fi delay sleep 1 13000\r\n')
            reply = smr.read_until('\r\n', 1)
            if reply != None and reply.find('-ERR not supported') != -1:
                self.fail(
                    'make sure that smr was compiled with the gcov option.')
            smr.disconnect()

        # Role change
        master_id = util.role_change(self.leader_cm,
                                     self.cluster['cluster_name'], target_id)
        self.assertEqual(master_id, -1,
                         'expected role_change to fail, but it succeeded')

        # Check rollback - check quorum
        if master not in hanging_servers:
            expected = 2
            ok = self.__check_quorum(master, expected)
            self.assertTrue(ok,
                            'rollback quorum fail. expected:%s' % (expected))

        # Check rollback - get new timestamp
        new_timestamps_in_hang = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs(s)
            new_timestamps_in_hang[s['id']] = ts

        # Check rollback - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps_in_hang[s['id']]
            self.assertEqual(
                old_ts, new_ts,
                'Timestamp of a running server has changed. %d->%d' %
                (old_ts, new_ts))

        time.sleep(16)
        util.log("States (after role change)")
        util.log_server_state(self.cluster)

        self.load_gen_list = {}

        # Start load generator
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Check quorum
        if master in hanging_servers:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok,
                            'rollback quorum fail. expected:%s' % (expected))

        # Check cluster state
        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'],
                                              self.leader_cm['ip'],
                                              self.leader_cm['cm_port'],
                                              check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        # Check consistency
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(),
                            'Inconsistent after migration')
            self.load_gen_list.pop(i, None)
Example #12
    def test_5_mgmt_is_isolated_with_master_failover(self):
        util.print_frame()

        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding role (127.0.0.100 -> 127.0.0.1)
        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        for loop_cnt in range(3):
            master, slave1, slave2 = util.get_mss(cluster)
            self.assertNotEqual(master, None, 'there is no master')
            self.assertNotEqual(slave1, None, 'there is no slave1')
            self.assertNotEqual(slave2, None, 'there is no slave2')

            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
            out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'failed to add a blocking rule to iptables. output:%s' % out)

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(10):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Shutdown master
            util.log( 'shutdown pgs%d while hanging.' % master['id'] )
            ret = testbase.request_to_shutdown_smr( master )
            self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % master['id'] )
            ret = testbase.request_to_shutdown_redis( master )
            self.assertEqual( ret, 0, 'failed to shutdown redis. id:%d' % master['id'] )

            # Check state F
            max_try = 20
            expected = 'F'
            for i in range( 0, max_try):
                util.log('MGMT_IP:%s, MGMT_PORT:%d' % (mgmt_ip, mgmt_port))
                state = util._get_smr_state( master['id'], cluster['cluster_name'], mgmt_ip, mgmt_port )
                if expected == state:
                    break
                time.sleep( 1 )
            self.assertEqual( expected, state,
                               'master%d - state:%s, expected:%s' % (master['id'], state, expected) )
            util.log( 'succeeded : pgs%d state changed to F.' % master['id'] )

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % loop_cnt)
            out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'failed to delete a blocking rule from iptables. output:%s' % out)

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == master['id']:
                        continue

                    if s['active_role'] != s['mgmt_role']:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

            # Recovery
            util.log( 'restart pgs%d.' % master['id'] )
            ret = testbase.request_to_start_smr( master )
            self.assertEqual( ret, 0, 'failed to start smr. id:%d' % master['id'] )

            ret = testbase.request_to_start_redis( master )
            self.assertEqual( ret, 0, 'failed to start redis. id:%d' % master['id'] )

            wait_count = 20
            ret = testbase.wait_until_finished_to_set_up_role( master, wait_count )
            self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (master['id']) )

            redis = redis_mgmt.Redis( master['id'] )
            ret = redis.connect( master['ip'], master['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis' )

            ok = False
            for i in xrange(5):
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
                if ok:
                    break
                else:
                    time.sleep(1)

            self.assertTrue(ok, 'failed to check cluster state')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(0, 3):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        for i in range(3, 6):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state')

        # Shutdown cluster
        ret = default_cluster.finalize( cluster )
        self.assertEqual(ret, 0, 'failed to TestMaintenance.finalize')

        # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete a forwarding rule from iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete a forwarding rule from iptables. output:%s' % out)
Example #13
    def test_6_repeat_isolation_and_no_opinion_linepay(self):
        util.print_frame()

        util.iptables_print_list()

        # Add forwarding role
        self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'failed to add a forwarding rule to iptables.')
        self.assertTrue(util.iptables_redirect('A', '127.0.0.101', '127.0.0.1'), 'failed to add a forwarding rule to iptables.')
        self.assertTrue(util.iptables_redirect('A', '127.0.0.102', '127.0.0.1'), 'failed to add a forwarding rule to iptables.')

        cluster_name = 'no_opinion'
        cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        loop_cnt = 0
        while (loop_cnt < 20):
            loop_cnt += 1
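            # Stagger the DROP rules across the three virtual IPs with uneven
            # gaps so that confmaster opinions about each PGS arrive piecemeal
            # (the 'no opinion' race this test is named after; inferred from
            # the cluster name and the timing below).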
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
            self.assertTrue(util.iptables_drop('A', '127.0.0.102'), 'failed to add a blocking rule to iptables.')

            for i in range(1):
                util.log('waiting... %d' % (i + 1))
                time.sleep(0.1)

            self.assertTrue(util.iptables_drop('A', '127.0.0.100'), 'failed to add a blocking rule to iptables.')

            for i in range(3):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1.2)

            self.assertTrue(util.iptables_drop('A', '127.0.0.101'), 'failed to add a blocking rule to iptables.')

            for i in range(1):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % loop_cnt)
            self.assertTrue(util.iptables_drop('D', '127.0.0.102'), 'failed to delete a blocking rule from iptables.')
            for i in range(0):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)
            self.assertTrue(util.iptables_drop('D', '127.0.0.100'), 'failed to delete a blocking rule from iptables.')
            for i in range(0):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)
            self.assertTrue(util.iptables_drop('D', '127.0.0.101'), 'failed to delete a blocking rule from iptables.')
            for i in range(3):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Print state of cluster
            util.log('\n ### STATE OF CLUSTER ### ')
            cluster_state = False
            for i in range(10):
                cluster_state = util.check_cluster(cluster_name, mgmt_ip, mgmt_port, initial_state, check_quorum=True)
                if cluster_state == True:
                    break
                else:
                    time.sleep(1)
            self.assertTrue(cluster_state, 'failed to check cluster state')

            all_in_f = True
            for s in cluster['servers']:
                if checkLastState(mgmt_ip, s['cm_port'], cluster_name, 0, 'F') == False:
                    all_in_f = False
                    break

            self.assertFalse(all_in_f, "PGS0's last state remains in F")

        # Delete forwarding role
        self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'failed to delete a forwarding rule from iptables.')
        self.assertTrue(util.iptables_redirect('D', '127.0.0.101', '127.0.0.1'), 'failed to delete a forwarding rule from iptables.')
        self.assertTrue(util.iptables_redirect('D', '127.0.0.102', '127.0.0.1'), 'failed to delete a forwarding rule from iptables.')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
Example #14
    def test_2_some_pgs_is_isolated(self):
        util.print_frame()

        util.iptables_print_list()

        # Add forwarding role (127.0.0.100 -> 127.0.0.1)
        self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'failed to add a forwarding rule to iptables.')

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_2', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        # Place master on virtual ip address in order to cause master election.
        pg_id = 0
        m = util.get_server_by_role_and_pg(cluster['servers'], 'master', pg_id)
        s = util.get_server_by_role_and_pg(cluster['servers'], 'slave', pg_id)
        if m.has_key('ip') == True and m.has_key('real_ip') == False:
            ret = util.role_change(cluster['servers'][0], cluster['cluster_name'], s['id'])
            self.assertNotEqual(ret, -1, 'failed to change pgs%d to a master' % s['id'])

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        for cnt in range(3):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
            self.assertTrue(util.iptables_drop('A', '127.0.0.100'), 'failed to add a blocking rule to iptables.')

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['pgs_id'] == 1:
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
            self.assertTrue(util.iptables_drop('D', '127.0.0.100'), 'failed to delete a blocking rule from iptables.')

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == 1:
                        continue

                    if is_pgs_normal(s) == False:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
        self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'failed to delete a forwarding rule from iptables.')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
Example #15
    def test_7_dirty_network_fi(self):
        util.print_frame()
        clnts = []

        try:
            out = util.sudo('iptables -L')
            util.log('====================================================================')
            util.log('out : %s' % out)
            util.log('out.return_code : %d' % out.return_code)
            util.log('out.stderr : %s' % out.stderr)
            util.log('out.succeeded : %s' % out.succeeded)

            # Add forwarding role
            out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)
            out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

            cluster_name = 'network_isolation_cluster_1'
            cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
            util.log(util.json_to_str(cluster))

            self.leader_cm = cluster['servers'][0]

            # MGMT
            mgmt_ip = cluster['servers'][0]['real_ip']
            mgmt_port = cluster['servers'][0]['cm_port']

            # Create cluster
            ret = default_cluster.initialize_starting_up_smr_before_redis( cluster, 
                    conf={'cm_context':'applicationContext-fi.xml'})
            self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

            # Print initial state of cluster
            util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
            initial_state = []
            self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

            # Start crc16 client
            for s in cluster['servers']:
                c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['gateway_port'], 3000, verbose=False)
                c.start()
                clnts.append(c)

            # Network isolation test
            cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'qa', 'me', 'yj', 'bj', 'mg'], 
                                                ['lconn', 'slave', 'master', 'setquorum'], [True, False], 1)
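            # Each element of cmfi is one fault-injection spec combining a
            # workflow code ('ra', 'qa', 'me', 'yj', 'bj', 'mg'), a step name,
            # a boolean, and a count (inferred from how 'fi' is unpacked below).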

            for fi in cmfi:
                # Block network
                util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
                ret = block_network(cluster, mgmt_ip, mgmt_port)
                self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)

                    state_transition_done = True
                    for s in isolated_states:
                        if s['ip'] != '127.0.0.100':
                            continue

                        if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                            state_transition_done = False

                    if state_transition_done:
                        ok = True
                        break
                    time.sleep(1)
                self.assertTrue(ok, 'Fail, state transition')

                # Fault injection
                try:
                    self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port), 
                            "Confmaster command fail. fi: %s" % str(fi))
                except ValueError as e:
                    self.fail('Confmaster command error. fi: %s, error: %s' % (str(fi), str(e)))

                # Unblock network
                util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
                ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
                self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                    if ok:
                        break
                    time.sleep(1)
                self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                check_cluster = False

                # 'bj', 'slave'
                if fi[0] == 'bj' and fi[1] == 'slave':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(s1)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'me', 'lconn'
                elif fi[0] == 'me' and fi[1] == 'lconn':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(m)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'qa', 'setquorum'
                elif fi[0] == 'qa' and fi[1] == 'setquorum':
                    m, s1, s2 = util.get_mss(cluster)

                    # shutdown
                    ret = testbase.request_to_shutdown_smr(s1)
                    self.assertEqual(0, ret, '[%s] failed to shutdown smr%d' % (str(fi), s1['id']))
                    ret = testbase.request_to_shutdown_redis(s1)
                    self.assertEqual(0, ret, '[%s] failed to shutdown redis%d' % (str(fi), s1['id']))

                    # Check quorum
                    q = -1
                    for q_cnt in xrange(20):
                        q = util.get_quorum(m)
                        if q == 1:
                            break
                        time.sleep(1)
                    self.assertEqual(1, q, "[%s] check quorum fail." % str(fi))

                    # Modify quorum
                    ret = util.cmd_to_smr_addr(m['ip'], m['smr_mgmt_port'], 'setquorum 0\r\n')
                    self.assertEqual("+OK\r\n", ret, '[%s] "setquorum 0" fail.' % str(fi))

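                    # The setquorum workflow under test is expected to restore
                    # the quorum to 1 on its own, which is why the re-check
                    # below still expects 1 rather than 0 (inferred from the
                    # assertion).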
                    # Check quorum
                    q = -1
                    for q_cnt in xrange(20):
                        q = util.get_quorum(m)
                        if q == 1:
                            break
                        time.sleep(1)
                    self.assertEqual(1, q, "[%s] check quorum fail." % str(fi))

                    # recovery
                    ret = testbase.request_to_start_smr(s1)
                    self.assertEqual(0, ret, '[%s] failed to start smr' % str(fi))
                    ret = testbase.request_to_start_redis(s1, max_try=120)
                    self.assertEqual(0, ret, '[%s] failed to start redis' % str(fi))
                    ret = testbase.wait_until_finished_to_set_up_role(s1, 11)
                    self.assertEqual(0, ret, '[%s] failed to role change. smr_id:%d' % (str(fi), s1['id']))

                    check_cluster = True

                # 'setquorum'
                elif fi[1] == 'setquorum':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'], 'fi delay sleep 1 8000\r\n', timeout=20)
                    self.assertEqual("+OK\r\n", ret, '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                    check_cluster = True

                if check_cluster:
                    # Check cluster state
                    ok = False
                    for i in xrange(20):
                        isolated_states = []
                        ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                        if ok:
                            break
                        time.sleep(1)
                    self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                # Check fault injection
                ok = False
                for i in xrange(10):
                    count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                    if count == 0:
                        ok = True
                        break
                    time.sleep(0.5)
                self.assertTrue(ok, "[%s] fail. fault injection was not triggered." % str(fi))

            # Shutdown cluster
            ret = default_cluster.finalize( cluster )
            self.assertEqual(ret, 0, '[%s] failed to TestMaintenance.finalize' % str(fi))

            # Delete forwarding role
            out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to delete a forwarding rule from iptables. output:%s' % out)
            out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to delete a forwarding rule from iptables. output:%s' % out)

            for c in clnts:
                self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

        finally:
            for c in clnts:
                c.quit()
            for c in clnts:
                c.join()
Example #16
    def test_2_some_pgs_is_isolated(self):
        util.print_frame()

        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding role (127.0.0.100 -> 127.0.0.1)
        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_2', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

        # Place master on virtual ip address in order to cause master election.
        pg_id = 0
        m = util.get_server_by_role_and_pg(cluster['servers'], 'master', pg_id)
        s = util.get_server_by_role_and_pg(cluster['servers'], 'slave', pg_id)
        if m.has_key('ip') == True and m.has_key('real_ip') == False:
            ret = util.role_change(cluster['servers'][0], cluster['cluster_name'], s['id'])
            self.assertNotEqual(ret, -1, 'failed to change pgs%d to a master' % s['id'])

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        for cnt in range(3):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
            out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'failed to add a blocking rule to iptables. output:%s' % out)

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['pgs_id'] == 1:
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
            out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'failed to delete a blocking rule from iptables. output:%s' % out)

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == 1:
                        continue

                    if s['active_role'] != s['mgmt_role']:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        # Shutdown cluster
        ret = default_cluster.finalize( cluster )
        self.assertEqual(ret, 0, 'failed to finalize cluster')

        # Delete forwarding rule (127.0.0.100 -> 127.0.0.1)
        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)
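The add/delete pairs above repeat the same sudo-and-assert pattern for every
iptables rule. A small wrapper in the spirit of util.iptables_drop() used in
other examples could factor it out. The sketch below is illustrative only; it
assumes nothing beyond util.sudo() and its .succeeded attribute as used above.

def iptables_rule(op, rule):
    # op is 'A' (append) or 'D' (delete); returns True when iptables succeeded
    out = util.sudo('iptables -%s %s' % (op, rule))
    if not out.succeeded:
        util.log('iptables -%s %s failed. output:%s' % (op, rule, out))
    return out.succeeded

# usage:
#   iptables_rule('A', 'OUTPUT -d 127.0.0.100 -j DROP')
#   iptables_rule('D', 'OUTPUT -d 127.0.0.100 -j DROP')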
Example #17
0
    def test_4_role_change_with_failover(self):
        util.print_frame()

        loop_cnt = 0
        while loop_cnt < 5:
            util.log('')
            util.log('Loop:%d' % loop_cnt)

            util.log("States (before role change)")
            util.log_server_state(self.cluster)

            target = random.choice(self.cluster['servers'])

            # bgsave
            ret = util.bgsave(target)
            self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

            # shutdown
            util.log('shutdown pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_shutdown_smr( target )
            self.assertEqual( ret, 0, 'failed to shutdown smr' )

            ret = testbase.request_to_shutdown_redis( target )
            self.assertEquals( ret, 0, 'failed to shutdown redis' )

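            # failure detection: give the CM up to ~10s (20 polls x 0.5s) to
            # mark the downed PGS 'N'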
            r = ''
            expected = 'N'
            for fc_cnt in xrange(20):
                r = util.get_smr_role_of_cm(target, self.leader_cm)
                if r == expected:
                    break
                time.sleep(0.5)
            self.assertEquals(r, expected, 'failure detection error.')

            running_servers = []
            for s in self.cluster['servers']:
                if s != target:
                    running_servers.append(s)

            # Get old timestamp
            old_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamps[s['id']] = ts

            # Start load generator
            self.load_gen_list = {}
            util.log('start load generator')
            for i in range(self.max_load_generator):
                ip, port = util.get_rand_gateway(self.cluster)
                load_gen = load_generator.LoadGenerator(i, ip, port)
                load_gen.start()
                self.load_gen_list[i] = load_gen

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

            # Role change
            master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], s1['id'])
            self.assertNotEqual(master_id, -1, 'role_change failed')

            util.log("States (after role change)")
            util.log_server_state(self.cluster)

            # Check - get new timestamp
            new_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamps[s['id']] = ts

            # Check - compare old timestamps and new timestamps
            for s in running_servers:
                old_ts = old_timestamps[s['id']]
                new_ts = new_timestamps[s['id']]
                self.assertNotEqual(old_ts, new_ts, 'Timestamp of a running server has not changed. %d->%d' % (old_ts, new_ts))

            # Check quorum
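            # (one copy is down, so at most one slave can ack: expect quorum 1)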
            m = self.cluster['servers'][master_id]
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after role change). expected:%s' % (expected))

            # recovery
            util.log('recovery pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_start_smr( target )
            self.assertEqual( ret, 0, 'failed to start smr' )
            util.log('start smr-replicator done')

            ret = testbase.request_to_start_redis( target, 60 )
            self.assertEqual( ret, 0, 'failed to start redis' )
            util.log('start redis-arc done')

            ret = testbase.wait_until_finished_to_set_up_role( target, max_try=300)
            self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (target['id']) )

            util.log("States (after recovery)")
            util.log_server_state(self.cluster)

            # Check cluster state
            normal_state = False
            for i in xrange(20):
                normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True)
                if normal_state:
                    break
                time.sleep(0.5)
            self.assertTrue(normal_state, "Unstable cluster state")

            # Check quorum
            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after recovery). expected:%s' % (expected))

            # Check consistency
            util.log('stop load generator')
            for i in range(self.max_load_generator):
                self.load_gen_list[i].quit()
            for i in range(self.max_load_generator):
                self.load_gen_list[i].join()
                self.assertTrue(self.load_gen_list[i].isConsistent(), 'inconsistent data after role change')
                self.load_gen_list.pop(i, None)

            loop_cnt += 1

        return 0
Example #18
0
    def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
        util.log('hanging_servers:%s' % hanging_servers)
        util.log('running_servers:%s' % running_servers)
        util.log('target_id:%s' % target_id)

        # Initial data
        util.put_some_data(self.cluster, 3, 10)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        # Get old timestamp
        old_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # hang
        for s in hanging_servers:
            smr = smr_mgmt.SMR(s['id'])
            ret = smr.connect(s['ip'], s['smr_mgmt_port'])
            self.assertEqual(ret, 0, 'failed to connect to smr. %s:%d' % (s['ip'], s['smr_mgmt_port']))
            util.log("PGS '%d' hang" % s['id'])

            smr.write('fi delay sleep 1 13000\r\n')
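            # 'fi delay sleep 1 13000' is a fault-injection command that
            # stalls the replicator for 13 seconds; it exists only in test
            # builds, which the -ERR check below verifies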
            reply = smr.read_until('\r\n', 1)
            if reply != None and reply.find('-ERR not supported') != -1:
                self.fail('make sure that smr is compiled with the gcov option.')
            smr.disconnect()

        # Role change
        master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
        self.assertEqual(master_id, -1, 'We expected that role_change failed, but success')
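        # with a quorum participant hanging, the two-phase role change cannot
        # commit; it must return -1 and roll back, which the quorum and
        # timestamp checks below verify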

        # Check rollback - check quorum
        if master not in hanging_servers:
            expected = 2
            ok = self.__check_quorum(master, expected)
            self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

        # Check rollback - get new timestamp
        new_timestamps_in_hang = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamps_in_hang[s['id']] = ts

        # Check rollback - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps_in_hang[s['id']]
            self.assertEqual(old_ts, new_ts, 'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

        time.sleep(16)
        util.log("States (after role change)")
        util.log_server_state( self.cluster )

        self.load_gen_list = {}

        # Start load generator
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Check quorum
        if master in hanging_servers:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

        # Check cluster state
        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        # Check consistency
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(), 'inconsistent data after role change')
            self.load_gen_list.pop(i, None)
Example #19
0
    def test_1_role_change(self):
        util.print_frame()

        self.load_gen_list = {}

        # Start load generator
        util.log("Start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Loop (smr: 3 copy)
        for i in range(30):
            target_server = util.get_server_by_role(self.cluster['servers'], 'slave')
            self.assertNotEquals(target_server, None, 'failed to get a slave.')
            target = target_server['id']

            print ''
            util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state( self.cluster )
            old_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs( s )
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target + 1) % 3
            util.log('Change role success.')

            # Wait until role change finished
            for s in self.cluster['servers']:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    try:
                        pong = util.pingpong(s['ip'], s['redis_port'])
                        if pong == '+PONG\r\n':
                            ok = True
                            break
                    except:
                        pass
                    time.sleep(0.2)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state( self.cluster )
            new_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(3):
                self.assertNotEqual(old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' % (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')

        # Loop (smr: 2 copy)
        self.__del_server(self.cluster['servers'][0])
        servers = [self.cluster['servers'][1], self.cluster['servers'][2]]

        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        for i in range(30):
            s = util.get_server_by_role(servers, 'slave')
            target = s['id']

            print ''
            util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state( self.cluster )
            old_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs( s )
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm, self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = target % 2 + 1
            util.log('Change role success.')

            # Wait until role change finished
            for s in servers:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state( self.cluster )
            new_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(2):
                self.assertNotEqual(old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' % (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(), 'Data inconsistency after role_change')
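Both copy modes above busy-wait for redis to answer PING after a role change.
A helper could express that wait once. The sketch below is hypothetical; it
assumes only util.pingpong(ip, port) returning '+PONG\r\n' on success, as the
loops above do.

def wait_for_pong(server, max_try=20, interval=0.2):
    # return True once redis at server answers +PONG, False on timeout
    for _ in range(max_try):
        try:
            if util.pingpong(server['ip'], server['redis_port']) == '+PONG\r\n':
                return True
        except:
            pass
        time.sleep(interval)
    return False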
    def test_6_repeat_isolation_and_no_opinion_linepay(self):
        util.print_frame()

        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding rules
        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)
        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.101 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)
        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.101 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.102 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)
        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.102 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)
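        # 127.0.0.100-102 are now DNAT'ed to 127.0.0.1, so the three "remote"
        # nodes of the no_opinion cluster actually run locally; dropping
        # traffic to one virtual address later isolates exactly one node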

        cluster_name = 'no_opinion'
        cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertEqual(0, ret, 'failed to initialize cluster')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        loop_cnt = 0
        while loop_cnt < 20:
            loop_cnt += 1
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
            out = util.sudo('iptables -A OUTPUT -d 127.0.0.102 -j DROP')
            self.assertTrue(out.succeeded, 'failed to add a blocking rule to iptables. output:%s' % out)

            util.log('waiting... 1')
            time.sleep(0.1)

            out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'failed to add a blocking rule to iptables. output:%s' % out)

            for i in range(3):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1.2)

            out = util.sudo('iptables -A OUTPUT -d 127.0.0.101 -j DROP')
            self.assertTrue(out.succeeded, 'failed to add a blocking rule to iptables. output:%s' % out)

            util.log('waiting... 1')
            time.sleep(1)

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % loop_cnt)
            out = util.sudo('iptables -D OUTPUT -d 127.0.0.102 -j DROP')
            self.assertTrue(out.succeeded, 'failed to delete the blocking rule from iptables. output:%s' % out)
            out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'failed to delete the blocking rule from iptables. output:%s' % out)
            out = util.sudo('iptables -D OUTPUT -d 127.0.0.101 -j DROP')
            self.assertTrue(out.succeeded, 'failed to delete the blocking rule from iptables. output:%s' % out)
            for i in range(3):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Print state of cluster
            util.log('\n ### STATE OF CLUSTER ### ')
            cluster_state = False
            for i in range(10):
                cluster_state = util.check_cluster(cluster_name, mgmt_ip, mgmt_port, initial_state, check_quorum=True)
                if cluster_state == True:
                    break
                else:
                    time.sleep(1)
            self.assertTrue(cluster_state, 'failed to check cluster state')
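            # an isolated CM has no opinion, so it must not be able to pin
            # PGS0's last state at 'F' on its own; at least one CM has to
            # report something other than 'F'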

            all_in_f = True
            for s in cluster['servers']:
                if checkLastState(mgmt_ip, s['cm_port'], cluster_name, 0, 'F') == False:
                    all_in_f = False
                    break

            self.assertFalse(all_in_f, "PGS0's last state remains F")

        # Shutdown cluster
        ret = default_cluster.finalize( cluster )
        self.assertEqual(ret, 0, 'failed to finalize cluster')

        # Delete forwarding rules
        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)
        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.101 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)
        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.101 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.102 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)
        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.102 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)
Example #21
0
    def consistent_after_failover(self):
        max_keys = 10000
        wait_count = 15
        key = 'caf'

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # set value
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(ip)
        gw.connect(ip, port)

        for i in range(0, max_keys):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEquals(res, '+OK\r\n')
        time.sleep(5)

        # shutdown
        servers = [master, slave1, slave2]
        for server in servers:

            util.log('before shutdown pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

            ret = testbase.request_to_shutdown_smr(server)
            self.assertEqual(
                ret, 0, 'failed to shutdown smr, server:%d' % server['id'])
            ret = testbase.request_to_shutdown_redis(server)
            self.assertEquals(ret, 0, 'failed to shutdown redis')
        time.sleep(5)

        # check state F
        for server in servers:
            state = self.get_expected_smr_state(server, 'F')
            self.assertEquals('F', state,
                              'server%d - state:%s' % (server['id'], state))

        # recovery
        for server in servers:
            ret = testbase.request_to_start_smr(server)
            self.assertEqual(ret, 0,
                             'failed to start smr, server:%d' % server['id'])

            ret = testbase.request_to_start_redis(server, False)
            self.assertEqual(ret, 0,
                             'failed to start redis, server:%d' % server['id'])

            util.log('after restart pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

        time.sleep(5)

        # wait for master election
        for i in xrange(10):
            ret = util.check_cluster(self.cluster['cluster_name'],
                                     self.leader_cm['ip'],
                                     self.leader_cm['cm_port'])
            if ret:
                break
            time.sleep(1)

        # check state
        for server in servers:
            ret = testbase.wait_until_finished_to_set_up_role(
                server, wait_count)
            self.assertEquals(
                ret, 0, 'failed to role change. server:%d' % (server['id']))

            state = self.get_expected_smr_state(server, 'N')
            role = util.get_role_of_server(server)
            self.assertEquals(
                'N', state,
                'server%d - state:%s, role:%s' % (server['id'], state, role))

        the_number_of_master = 0
        the_number_of_slave = 0
        for server in servers:
            role = util.get_role_of_server(server)
            if role == c.ROLE_MASTER:
                the_number_of_master = the_number_of_master + 1
            elif role == c.ROLE_SLAVE:
                the_number_of_slave = the_number_of_slave + 1
        self.assertTrue(
            1 == the_number_of_master and 2 == the_number_of_slave,
            'failed to set roles, the number of master:%d, the number of slave:%d'
            % (the_number_of_master, the_number_of_slave))

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # connect to a master`s redis and set data
        redis = redis_mgmt.Redis(master['id'])
        ret = redis.connect(master['ip'], master['redis_port'])
        self.assertEquals(
            ret, 0, 'failed to connect to redis, server:%d' % master['id'])

        for i in range(max_keys, max_keys * 2):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            redis.write(cmd)
            res = redis.read_until('\r\n')
            self.assertEquals(
                res, '+OK\r\n',
                'failed to get response, server:%d' % master['id'])
        redis.disconnect()

        # check slaves`s data
        slaves = [slave1, slave2]
        for slave in slaves:
            slave_redis = redis_mgmt.Redis(slave['id'])
            ret = slave_redis.connect(slave['ip'], slave['redis_port'])
            self.assertEquals(
                ret, 0, 'failed to connect to redis, server:%d' % slave['id'])

            for i in range(0, max_keys * 2):
                cmd = 'get %s%d\r\n' % (key, i)
                slave_redis.write(cmd)
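                # a GET reply is '$<len>\r\n<value>\r\n'; discard the length
                # line, then compare the value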
                trash = slave_redis.read_until('\r\n')
                res = slave_redis.read_until('\r\n')
                self.assertEquals(
                    res, '%d\r\n' % i,
                    'inconsistent, server:%d, expected %d but %s' %
                    (slave['id'], i, res))
            slave_redis.disconnect()
Example #22
0
    def pgs_add_and_del(self, upgrade_server, type):
        util.print_frame()

        util.log('[start] add and del pgs%d. type:%s' %
                 (upgrade_server['id'], type))
        util.log_server_state(self.cluster)

        # start load generator
        load_gen_list = {}
        for i in range(len(self.cluster['servers'])):
            server = self.cluster['servers'][i]
            load_gen = load_generator.LoadGenerator(server['id'], server['ip'],
                                                    server['gateway_port'])
            load_gen.start()
            load_gen_list[i] = load_gen

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'],
                                       upgrade_server['id'])
        ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'],
                              cmd)
        jobj = json.loads(ret)
        self.assertEqual(
            jobj['msg'], '+OK',
            'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # set new values
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway('0')
        gw.connect(ip, port)
        for i in range(0, 50):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to gw(%s:%d). cmd:%s, res:%s' %
                (ip, port, cmd[:-2], res[:-2]))

        # attach pgs to cluster
        cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'],
                                      upgrade_server['id'])
        ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'],
                              cmd)
        jobj = json.loads(ret)
        self.assertEqual(jobj['msg'], '+OK',
                         'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        time.sleep(3)

        stable = False
        for i in xrange(20):
            stable = util.check_cluster(self.cluster['cluster_name'],
                                        self.leader_cm['ip'],
                                        self.leader_cm['cm_port'])
            if stable:
                break
            time.sleep(0.5)
        self.assertTrue(stable, 'Unstable cluster')

        # check new values
        redis = redis_mgmt.Redis(upgrade_server['id'])
        ret = redis.connect(upgrade_server['ip'], upgrade_server['redis_port'])
        self.assertEquals(
            ret, 0, 'failed to connect to redis%d(%s:%d)' %
            (upgrade_server['id'], upgrade_server['ip'],
             upgrade_server['redis_port']))

        for i in range(0, 50):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis.write(cmd)
            redis.read_until('\r\n')
            res = redis.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis%d. %s != %d' %
                (upgrade_server['id'], res, i))
        util.log('succeeded : check values with get operations on pgs%d.' %
                 (upgrade_server['id']))

        # shutdown load generators
        for i in range(len(load_gen_list)):
            load_gen_list[i].quit()
            load_gen_list[i].join()

        util.log_server_state(self.cluster)

        return 0
Example #23
0
    def test_3_some_pgs_is_isolated_2copy(self):
        util.print_frame()

        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        # Add forwarding role (127.0.0.100 -> 127.0.0.1)
        self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'failed to add a forwarding rule to iptables.')

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1_2copy', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        # MGMT
        mgmt_ip = cluster['servers'][0]['ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        # Place master on real ip address
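        # (with 2 copies the master must stay on the real, reachable address:
        #  blocking the virtual address then leaves a lone master running
        #  with quorum 0, which is exactly what the checks below expect)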
        for pg_id in [0, 1]:
            m = util.get_server_by_role_and_pg(cluster['servers'], 'master', pg_id)
            s = util.get_server_by_role_and_pg(cluster['servers'], 'slave', pg_id)
            if m.has_key('ip') and m.has_key('real_ip'):
                if m['ip'] != m['real_ip']:
                    ret = util.role_change(cluster['servers'][0], cluster['cluster_name'], s['id'])
                    self.assertNotEquals(ret, -1, 'failed to change pgs%d to master' % s['id'])

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        for cnt in range(3):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
            self.assertTrue(util.iptables_drop('A', '127.0.0.100'), 'failed to add a blocking rule to iptables.')

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['pgs_id'] == 0 or s['pgs_id'] == 1:
                        continue

                    if s['active_role'] != 'M' or s['mgmt_role'] != 'M':
                        state_transition_done = False

                    if s['quorum'] != 0:
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'state transition failed')

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
            self.assertTrue(util.iptables_drop('D', '127.0.0.100'), 'failed to delete the blocking rule from iptables.')

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                if util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True) == False:
                    time.sleep(1)
                    continue

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == 1:
                        continue

                    if is_pgs_normal(s) == False:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'state consistency check failed')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        # Delete forwarding role (127.0.0.100 -> 127.0.0.1)
        self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'failed to delete the forwarding rule from iptables.')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
    def master_and_slave_hang(self):
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave = smr_mgmt.SMR(s1['id'])
        ret = smr_slave.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        smr_master.write('fi delay sleep 1 10000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr is compiled with the gcov option.')

        smr_slave.write('fi delay sleep 1 10000\r\n')

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        time.sleep(5)

        if len(self.cluster['servers']) == 3:
            # forced master election: pgs2 should become master; verify it stays 'M'
            success = True
            for i in range(15):
                state = []
                util.check_cluster(self.cluster['cluster_name'],
                                   self.leader_cm['ip'],
                                   self.leader_cm['cm_port'], state)
                s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
                role = s2_state['active_role']
                if role != 'M':
                    success = False
                    break
                time.sleep(1)

            util.log('')
            util.log('expect pgs2 to be the master. PG.COPY: 3, PG.Q: 2')
            util.log('')
            util.log_server_state(self.cluster)

            self.assertEqual(success, True, 'failed to check copy-quorum')

            ok = False
            for i in xrange(10):
                ok = util.check_cluster(self.cluster['cluster_name'],
                                        self.leader_cm['ip'],
                                        self.leader_cm['cm_port'])
                if ok:
                    break
            self.assertTrue(ok, 'Cluster state is not normal!')

            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # set new values
            for i in range(10000, 20000):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis2.write(cmd)
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '+OK\r\n',
                    'failed to set values to redis2. cmd:%s, res:%s' %
                    (cmd[:-2], res))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis1(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        if len(self.cluster['servers']) != 3:
            # set new values
            for i in range(10000, 20000):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis0.write(cmd)
                res = redis0.read_until('\r\n')
                self.assertEqual(
                    res, '+OK\r\n',
                    'failed to set values to redis0. cmd:%s, res:%s' %
                    (cmd[:-2], res))

        # check new values (m)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res, i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
Example #25
0
    def test_4_mgmt_is_isolated_with_red_failover(self):
        util.print_frame()

        util.iptables_print_list()

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port)

        # Master must be the first pgs, cluster['servers'][0].
        to_be_master = cluster['servers'][0]
        m = util.get_server_by_role_and_pg(cluster['servers'], 'master', to_be_master['pg_id'])
        master_id = -1
        if m['id'] != to_be_master['id']:
            try_cnt = 0
            while master_id != to_be_master['id'] and try_cnt < 20:
                master_id = util.role_change(cluster['servers'][0], cluster['cluster_name'], to_be_master['id'])
                try_cnt += 1
                time.sleep(1)
            self.assertEquals(master_id, to_be_master['id'], 'failed to change pgs%d to master' % to_be_master['id'])

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Set SMR option (slave_idle_timeout)
        util.log('\n\n\n ### Set SMR option ###')
        for s in cluster['servers']:
            t = telnet.Telnet('SMR%d' % s['id'])
            self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0,
                    'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
            cmd = 'confset slave_idle_timeout_msec 18000'
            util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
            t.write('confset slave_idle_timeout_msec 18000\r\n')
            reply = t.read_until('\r\n').strip()
            util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
            self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)
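        # a long slave_idle_timeout (18s) presumably keeps slaves from
        # dropping the master link while the management network is blocked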

        # Network isolation test
        for loop_cnt in range(3):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
            for s in cluster['servers']:
                self.assertTrue(util.iptables_drop('A', '127.0.0.100', s['smr_mgmt_port']), 'failed to add a blocking rule to iptables.')

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'state transition failed')

            pgs_list = util.get_pgs_info_list(mgmt_ip, mgmt_port, cluster)
            reds = filter(lambda x: x['color'] == 'RED', pgs_list)
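            # RED appears to mark PGSs the CM currently considers failed or
            # unreachable; pick one and shut it down for real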

            # Shutdown
            server = cluster['servers'][random.choice(reds)['pgs_id']]
            util.log( 'shutdown pgs%d while hanging.' % server['id'] )
            ret = testbase.request_to_shutdown_smr( server )
            self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % server['id'] )
            ret = testbase.request_to_shutdown_redis( server )
            self.assertEqual( ret, 0, 'failed to shutdown redis. id:%d' % server['id'] )

            # Check state F
            max_try = 20
            expected = 'F'
            for i in range( 0, max_try):
                util.log('MGMT_IP:%s, MGMT_PORT:%d' % (mgmt_ip, mgmt_port))
                state = util._get_smr_state( server['id'], cluster['cluster_name'], mgmt_ip, mgmt_port )
                if expected == state:
                    break
                time.sleep( 1 )
            self.assertEqual( expected , state,
                               'server%d - state:%s, expected:%s' % (server['id'], state, expected) )
            util.log( 'succeeded : pgs%d state changed to F.' % server['id'] )

            # Unblock network
            for s in cluster['servers']:
                self.assertTrue(util.iptables_drop('D', '127.0.0.100', s['smr_mgmt_port']), 'failed to delete the blocking rule from iptables.')

            # Check cluster state
            ok = False
            for i in range(10):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == server['id']:
                        continue

                    if is_pgs_normal(s) == False:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'state consistency check failed')

            # Recovery
            util.log( 'restart pgs%d.' % server['id'] )
            ret = testbase.request_to_start_smr( server )
            self.assertEqual( ret, 0, 'failed to start smr. id:%d' % server['id'] )

            ret = testbase.request_to_start_redis( server )
            self.assertEqual( ret, 0, 'failed to start redis. id:%d' % server['id'] )

            wait_count = 20
            ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
            self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (server['id']) )

            redis = redis_mgmt.Redis( server['id'] )
            ret = redis.connect( server['ip'], server['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis' )

            ok = False
            for i in xrange(5):
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
                if ok:
                    break
                else:
                    time.sleep(1)
            self.assertTrue(ok, 'failed to check cluster state')

            # Reset SMR option (slave_idle_timeout)
            t = telnet.Telnet('SMR%d' % server['id'])
            self.assertEqual(t.connect(server['ip'], server['smr_mgmt_port']), 0,
                    'Failed to connect to smr. ADDR=%s:%d' % (server['ip'], server['smr_mgmt_port']))
            cmd = 'confset slave_idle_timeout_msec 18000'
            util.log('[%s:%d] >> %s' % (server['ip'], server['smr_mgmt_port'], cmd))
            t.write('confset slave_idle_timeout_msec 18000\r\n')
            reply = t.read_until('\r\n').strip()
            util.log('[%s:%d] << %s' % (server['ip'], server['smr_mgmt_port'], reply))
            self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            if initial_state[i]['pgs_id'] == 1:
                self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
    def test_two_slaves_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs(s1)
        self.assertNotEqual(
            ts_before1, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s1['id'], ts_before1))

        ts_before2 = util.get_timestamp_of_pgs(s2)
        self.assertNotEqual(
            ts_before2, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s2['id'], ts_before2))

        # hang
        smr1 = smr_mgmt.SMR(s1['id'])
        ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        smr2 = smr_mgmt.SMR(s2['id'])
        ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        smr1.write('fi delay sleep 1 8000\r\n')
        reply = smr1.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr is compiled with the gcov option.')

        smr2.write('fi delay sleep 1 8000\r\n')
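        # both slaves now stall for 8 seconds; sleep through most of the
        # hang, then poll until the cluster settles again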
        time.sleep(7)

        success = False
        for i in xrange(20):
            ret = util.check_cluster(self.cluster['cluster_name'],
                                     self.mgmt_ip,
                                     self.mgmt_port,
                                     check_quorum=True)
            if ret:
                success = True
                break
            time.sleep(1)
        self.assertEqual(success, True, 'unstable cluster')

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis2. %s != %d' % (res, i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
Example #27
0
    def test_5_mgmt_is_isolated_with_lconn(self):
        util.print_frame()

        util.iptables_print_list()

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Set SMR option (slave_idle_timeout)
        util.log('\n\n\n ### Set SMR option ###')
        for s in cluster['servers']:
            t = telnet.Telnet('SMR%d' % s['id'])
            self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0,
                    'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
            cmd = 'confset slave_idle_timeout_msec 18000'
            util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
            t.write('confset slave_idle_timeout_msec 18000\r\n')
            reply = t.read_until('\r\n').strip()
            util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
            self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)

        # Network isolation test
        for loop_cnt in range(3):
            # Get master
            master = util.get_server_by_role_and_pg( cluster['servers'], 'master', 0 )

            first_slave = None
            for s in cluster['servers']:
                if s == master:
                    continue

                # Skip non-virtual host
                if s.has_key('real_ip') == False:
                    continue

                if first_slave == None:
                    first_slave = s

                # 'role lconn'
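                # (lconn leaves the replicator connected but without a role,
                #  presumably detaching it from replication before the cut)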
                util.log( 'role lconn pgs%d while hanging.' % s['id'] )

                ret = util.role_lconn_addr( s['real_ip'], s['smr_mgmt_port']  )
                self.assertEqual( ret, '+OK\r\n', 'role lconn failed. reply="%s"' % (ret[:-2]) )
                util.log( 'succeeded : cmd="role lconn", reply="%s"' % (ret[:-2]) )
                time.sleep(0.5)

            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
            self.assertTrue(util.iptables_drop('A', '127.0.0.100', first_slave['smr_mgmt_port']), 'failed to add a blocking rule to iptables.')

            for i in range(6):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(10):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['pgs_id'] != first_slave['id']:
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'state transition failed')

            # Unblock network
            self.assertTrue(util.iptables_drop('D', '127.0.0.100', first_slave['smr_mgmt_port']), 'failed to delete the blocking rule from iptables.')

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == 1:
                        continue

                    if is_pgs_normal(s) == False:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

            ok = False
            for i in xrange(5):
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
                if ok:
                    break
                else:
                    time.sleep(1)
            self.assertTrue(ok, 'failed to check cluster state')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)

        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state')

        self.assertTrue(conf_checker.final_check())

        # Shutdown cluster
        default_cluster.finalize(cluster)
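
The state-consistency loop above calls an is_pgs_normal helper that is defined elsewhere in the suite. A minimal sketch, assuming a PGS counts as normal when the role reported by the replicator agrees with the role recorded by confmaster and is either master or slave (the 'active_role'/'mgmt_role' field names and the 'M'/'S' codes are inferred from the checks above):

def is_pgs_normal(state):
    # Sketch only; the real helper lives elsewhere in the test suite.
    # '?' and 'N' mark unreachable/detached PGSes in the isolation checks
    # above, so a normal PGS must agree on a concrete role.
    active = state['active_role']
    mgmt = state['mgmt_role']
    return active == mgmt and active in ('M', 'S')
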
    def test_all_pgs_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave1 = smr_mgmt.SMR(s1['id'])
        ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave1. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))
        smr_slave2 = smr_mgmt.SMR(s2['id'])
        ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave2. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        m_ts = util.get_timestamp_of_pgs(m)
        s1_ts = util.get_timestamp_of_pgs(s1)
        s2_ts = util.get_timestamp_of_pgs(s2)

        smr_master.write('fi delay sleep 1 8000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has been compiled with the gcov option.')

        smr_slave1.write('fi delay sleep 1 8000\r\n')
        smr_slave2.write('fi delay sleep 1 8000\r\n')

        time.sleep(10)

        # check consistency
        ok = False
        for try_cnt in xrange(20):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            if ok:
                break
            time.sleep(0.5)
        self.assertTrue(ok, 'Unstable cluster state')

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # set values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # check new values (m)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res, i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check new values (s2)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s2['id'], res[:-2], i))

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            print ok
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0
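
The three per-replica verification loops above repeat one pattern: write a GET, discard the RESP bulk-length header line, then compare the value line. A small helper against the redis_mgmt.Redis interface used here would fold them into one call (a sketch, not part of the original suite):

def assert_values(test, conn, key_base, begin, end):
    # Expect conn to hold key_base<i> == i for every i in [begin, end).
    # read_until is called twice per GET because a bulk reply arrives as
    # '$<len>\r\n<value>\r\n'; the first line is only the length header.
    for i in range(begin, end):
        conn.write('get %s%d\r\n' % (key_base, i))
        conn.read_until('\r\n')  # discard the '$<len>' header
        res = conn.read_until('\r\n')
        test.assertEqual(res, '%d\r\n' % i,
                         'unexpected value for %s%d: %s' % (key_base, i, res[:-2]))

With it, each loop collapses to a single call such as assert_values(self, redis1, self.key_base, 10000, 20000).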
Example #29
0
    def test_7_dirty_network_fi(self):
        util.print_frame()
        clnts = []

        try:
            util.iptables_print_list()

            # Add forwarding role
            self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'failed to add a forwarding rule to iptables.')

            cluster_name = 'network_isolation_cluster_1'
            cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
            util.log(util.json_to_str(cluster))

            self.leader_cm = cluster['servers'][0]

            # MGMT
            mgmt_ip = cluster['servers'][0]['real_ip']
            mgmt_port = cluster['servers'][0]['cm_port']

            # Create cluster
            conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster, 
                    conf={'cm_context':'applicationContext-fi.xml'})
            self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

            # Print initial state of cluster
            util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
            initial_state = []
            self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

            # Start crc16 client
            for s in cluster['servers']:
                c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['redis_port'], 600, verbose=False)
                c.start()
                clnts.append(c)

            # Network isolation test
            cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'me', 'yj', 'bj', 'mg'], 
                                                ['lconn', 'slave', 'master', 'setquorum'], [True, False], 1)

            for fi in cmfi:
                # Block network
                util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
                ret = block_network(cluster, mgmt_ip, mgmt_port)
                self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)

                    state_transition_done = True
                    for s in isolated_states:
                        if s['ip'] != '127.0.0.100':
                            continue

                        if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                            state_transition_done = False

                    if state_transition_done:
                        ok = True
                        break
                    time.sleep(1)
                self.assertTrue(ok, 'Fail, state transition')

                # Fault injection
                try:
                    self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port), 
                            "Confmaster command fail. fi: %s" % str(fi))
                except ValueError as e:
                    self.fail("Confmaster command error. cmd: \"%s\", reply: \"%s\"" % (cmd, reply))

                # Unblock network
                util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
                ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
                self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                    if ok:
                        break
                    time.sleep(1)
                self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                check_cluster = False

                # 'bj', 'slave'
                if fi[0] == 'bj' and fi[1] == 'slave':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(s1)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'me', 'lconn'
                elif fi[0] == 'me':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(m)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'setquorum'
                elif fi[1] == 'setquorum':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'], 'fi delay sleep 1 8000\r\n', timeout=20)
                    self.assertEqual("+OK\r\n", ret, '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                    check_cluster = True

                if check_cluster:
                    # Check cluster state
                    ok = False
                    for i in xrange(20):
                        isolated_states = []
                        ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                        if ok:
                            break
                        time.sleep(1)
                    self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                # Check fault injection
                ok = False
                for i in xrange(10):
                    count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                    if count == 0:
                        ok = True
                        break
                    time.sleep(0.5)
                self.assertTrue(ok, "[%s] fail. failt injection had not been triggered." % str(fi))

                for c in clnts:
                    self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

            for c in clnts:
                self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

            # Go back to initial configuration
            cmfi.init()
            for fi in cmfi:
                try:
                    self.assertTrue(fi_confmaster.fi_add(fi, 0, mgmt_ip, mgmt_port),
                            "Confmaster command fail. fi: %s" % str(fi))
                except ValueError as e:
                    self.fail("Confmaster command error. cmd: \"%s\", reply: \"%s\"" % (cmd, reply))

            # Wait until workflows done
            ret = util.await(60, True)(
                    lambda cinfo: cinfo['wf'] == 0,
                    lambda : util.cluster_info(mgmt_ip, mgmt_port, cluster['cluster_name']))
            self.assertTrue(ret, 'There are still some workflows.')

            self.assertTrue(conf_checker.final_check())

            # Shutdown cluster
            default_cluster.finalize(cluster)

        finally:
            for c in clnts:
                c.quit()
            for c in clnts:
                c.join()

            # Delete forwarding role
            self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'failed to delete the forwarding rule from iptables.')
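
util.iptables_redirect itself is not shown in this excerpt. Judging from the 'A'/'D' first argument (the iptables append/delete flags) and the util.sudo wrapper used later in this file, a plausible sketch of the helper is:

def iptables_redirect(op, from_ip, to_ip):
    # op is 'A' (append) or 'D' (delete). The nat/OUTPUT/DNAT rule below
    # is an assumption about how the virtual address is forwarded; the
    # real helper in util may use different chains or matches.
    out = sudo('iptables -t nat -%s OUTPUT -d %s -j DNAT --to-destination %s'
               % (op, from_ip, to_ip))
    return out.succeeded
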
    def master_hang(self):
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr = smr_mgmt.SMR(m['id'])
        ret = smr.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr.write('fi delay sleep 1 10000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has been compiled with the gcov option.')

        time.sleep(5)

        # wait for forced master election
        success = False
        for i in range(20):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_MASTER:
                success = True
                break

            if len(self.cluster['servers']) == 3:
                role = util.get_role_of_server(s2)
                if role == c.ROLE_MASTER:
                    success = True
                    break
            time.sleep(1)

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        self.assertEqual(success, True, 'failed to forced master election')

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # check new values
            for i in range(10000, 20000):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write(cmd)
                redis2.read_until('\r\n')
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'failed to get values from redis2. %s != %d' % (res, i))

        # check if the hanging server recovered and joined as a slave
        time.sleep(7)
        role = util.get_role_of_server(m)
        self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEquals(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis0. %s != %d' % (res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
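
The hang recipe used here (connect to the SMR management port, send a delay fault, and treat '-ERR not supported' as the sign of a non-gcov build) recurs throughout this file. A helper capturing the pattern, written against the smr_mgmt interface exactly as it is called above, might look like:

def hang_smr(test, server, msec):
    # Inject a delay fault so the replicator sleeps once for msec ms.
    # Fault injection is only available in gcov builds, hence the check.
    smr = smr_mgmt.SMR(server['id'])
    ret = smr.connect(server['ip'], server['smr_mgmt_port'])
    test.assertEqual(ret, 0, 'failed to connect to smr. %s:%d'
                     % (server['ip'], server['smr_mgmt_port']))
    smr.write('fi delay sleep 1 %d\r\n' % msec)
    reply = smr.read_until('\r\n', 1)
    if reply != None and reply.find('-ERR not supported') != -1:
        test.fail('make sure that smr has been compiled with the gcov option.')
    return smr
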
Example #31
0
    def test_1_role_change(self):
        util.print_frame()

        self.load_gen_list = {}

        # Start load generator
        util.log("Start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Loop (smr: 3 copy)
        for i in range(30):
            target_server = util.get_server_by_role(self.cluster['servers'],
                                                    'slave')
            self.assertNotEquals(target_server, None, 'Get slave fail.')
            target = target_server['id']

            print ''
            util.log("(3 copy) Loop:%d, target pgs:%d" % (i, target))

            # Get old timestamp
            util.log_server_state(self.cluster)
            old_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm,
                                      self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target + 1) % 3
            util.log('Change role success.')

            # Wait until role change finished
            for s in self.cluster['servers']:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    try:
                        pong = util.pingpong(s['ip'], s['redis_port'])
                        if pong != None and pong == '+PONG\r\n':
                            ok = True
                            break
                    except:
                        pass
                    time.sleep(0.2)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state(self.cluster)
            new_timestamp_list = []
            for s in self.cluster['servers']:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(3):
                self.assertNotEqual(
                    old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' %
                    (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(),
                                'Data inconsistency after role_change')

        # Loop (smr: 2 copy)
        self.__del_server(self.cluster['servers'][0])
        servers = [self.cluster['servers'][1], self.cluster['servers'][2]]

        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'],
                                              self.leader_cm['ip'],
                                              self.leader_cm['cm_port'],
                                              check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        for i in range(30):
            print ''
            util.log("(2 copy) Loop:%d, target pgs:%d" % (i, target))

            s = util.get_server_by_role(servers, 'slave')
            target = s['id']

            # Get old timestamp
            util.log_server_state(self.cluster)
            old_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamp_list.append(ts)

            # Role change
            master = util.role_change(self.leader_cm,
                                      self.cluster['cluster_name'], target)
            self.assertNotEqual(master, -1, 'role_change error.')
            while target == master:
                target = (target) % 2 + 1
            util.log('Change role success.')

            # Wait until role change finished
            for s in servers:
                max_try_cnt = 20
                ok = False
                for try_cnt in range(max_try_cnt):
                    pong = util.pingpong(s['ip'], s['redis_port'])
                    if pong != None and pong == '+PONG\r\n':
                        ok = True
                        break
                    time.sleep(0.1)
                self.assertTrue(ok, 'redis state error.')

            # Get new timestamp
            util.log_server_state(self.cluster)
            new_timestamp_list = []
            for s in servers:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamp_list.append(ts)

            # Compare old timestamps and new timestamps
            for i in range(2):
                self.assertNotEqual(
                    old_timestamp_list[i], new_timestamp_list[i],
                    'Timestamp is not changed. %d->%d' %
                    (old_timestamp_list[i], new_timestamp_list[i]))

            # Check consistency
            for load_gen_id, load_gen in self.load_gen_list.items():
                self.assertTrue(load_gen.isConsistent(),
                                'Data inconsistency after role_change')

        # Go back to initial configuration
        self.assertTrue(
            util.install_pgs(self.cluster,
                             self.cluster['servers'][0],
                             self.leader_cm,
                             rm_ckpt=False), 'failed to recover pgs.')
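
util.await, used above to wait for the workflow count to reach zero, is a small polling combinator: await(timeout)(predicate, producer) repeatedly evaluates producer() until predicate accepts the result or the timeout elapses. A sketch consistent with the call sites in this file (the 1-second period and the verbose behavior are assumptions):

import time

# Python 2 sketch; 'await' became a reserved word in Python 3.5+.
def await(timeout, verbose=False):
    # Poll producer() once a second for up to timeout seconds. Return
    # True as soon as predicate(result) holds, False on timeout.
    def poll(predicate, producer):
        for i in range(timeout):
            result = producer()
            if verbose:
                print 'await %d/%d: %s' % (i + 1, timeout, result)
            if predicate(result):
                return True
            time.sleep(1)
        return False
    return poll
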
    def master_failover_while_hang(self):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        self.failover_while_hang(m)

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis1 = redis_mgmt.Redis(m['id'])
        ret = redis1.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # check new values
            for i in range(10000, 20000):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write(cmd)
                redis2.read_until('\r\n')
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'failed to get values from redis2. %s != %d' % (res, i))
            util.log(
                'succeeded : check values with set/get operations with pgs%d and pgs%d.'
                % (m['id'], s2['id']))

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEquals(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis0. %s != %d' % (res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
Example #33
0
    def test_4_role_change_with_failover(self):
        util.print_frame()

        loop_cnt = 0
        while loop_cnt < 5:
            util.log('')
            util.log('Loop:%d' % loop_cnt)

            util.log("States (before role change)")
            util.log_server_state(self.cluster)

            target = random.choice(self.cluster['servers'])

            # bgsave
            ret = util.bgsave(target)
            self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

            # shutdown
            util.log('shutdown pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_shutdown_smr(target)
            self.assertEqual(ret, 0, 'failed to shutdown smr')

            ret = testbase.request_to_shutdown_redis(target)
            self.assertEquals(ret, 0, 'failed to shutdown redis')

            r = ''
            expected = 'N'
            for fc_cnt in xrange(20):
                r = util.get_smr_role_of_cm(target, self.leader_cm)
                if r == expected:
                    break
                time.sleep(0.5)
            self.assertEquals(r, expected, 'failure detection error.')

            running_servers = []
            for s in self.cluster['servers']:
                if s != target:
                    running_servers.append(s)

            # Get old timestamp
            old_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamps[s['id']] = ts

            # Start load generator
            self.load_gen_list = {}
            util.log('start load generator')
            for i in range(self.max_load_generator):
                ip, port = util.get_rand_gateway(self.cluster)
                load_gen = load_generator.LoadGenerator(i, ip, port)
                load_gen.start()
                self.load_gen_list[i] = load_gen

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

            # Role change
            master_id = util.role_change(self.leader_cm,
                                         self.cluster['cluster_name'],
                                         s1['id'])
            self.assertNotEqual(master_id, -1, 'role_change failed')

            util.log("States (after role change)")
            util.log_server_state(self.cluster)

            # Check - get new timestamp
            new_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                new_timestamps[s['id']] = ts

            # Check - compare old timestamps and new timestamps
            for s in running_servers:
                old_ts = old_timestamps[s['id']]
                new_ts = new_timestamps[s['id']]
                self.assertNotEqual(
                    old_ts, new_ts,
                    'Timestamp of a running server has not changed. %d->%d' %
                    (old_ts, new_ts))

            # Check quorum
            m = self.cluster['servers'][master_id]
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(
                ok, 'unexpected quorum(after role change). expected:%s' %
                (expected))

            # recovery
            util.log('recovery pgs%d(%s:%d)' %
                     (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_start_smr(target)
            self.assertEqual(ret, 0, 'failed to start smr')
            util.log('start smr-replicator done')

            ret = testbase.request_to_start_redis(target, 60)
            self.assertEqual(ret, 0, 'failed to start redis')
            util.log('start redis-arc done')

            ret = testbase.wait_until_finished_to_set_up_role(target,
                                                              max_try=300)
            self.assertEquals(
                ret, 0, 'failed to role change. smr_id:%d' % (target['id']))

            util.log("States (after recovery)")
            util.log_server_state(self.cluster)

            # Check cluster state
            normal_state = False
            for i in xrange(20):
                normal_state = util.check_cluster(self.cluster['cluster_name'],
                                                  self.leader_cm['ip'],
                                                  self.leader_cm['cm_port'],
                                                  check_quorum=True)
                if normal_state:
                    break
                time.sleep(0.5)
            self.assertTrue(normal_state, "Unstable cluster state")

            # Check quorum
            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(
                ok,
                'unexpected quorum(after recovery). expected:%s' % (expected))

            # Check consistency
            util.log('stop load generator')
            for i in range(self.max_load_generator):
                self.load_gen_list[i].quit()
            for i in range(self.max_load_generator):
                self.load_gen_list[i].join()
                self.assertTrue(self.load_gen_list[i].isConsistent(),
                                'Inconsistent after migration')
                self.load_gen_list.pop(i, None)

            loop_cnt += 1

        return 0
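
__check_quorum, called twice above, compares the master's current quorum with an expected value; the suite defines it elsewhere. A sketch, assuming the replicator's management port answers a 'getquorum' query with the quorum as a bare integer (both the command and the reply format are assumptions about the protocol):

    def __check_quorum(self, master, expected):
        # Hypothetical sketch of the helper used above.
        reply = util.cmd_to_smr_addr(master['ip'], master['smr_mgmt_port'],
                                     'getquorum\r\n')
        try:
            return int(reply.strip()) == expected
        except ValueError:
            return False
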
Example #34
0
    def test_large_scale_master_election(self):
        util.print_frame()

        # initialize cluster information
        pgs_id = 10
        cluster = {
            'cluster_name': 'large_scale',
            'keyspace_size': 8192,
            'quorum_policy': '0:1',
            'slots': [],
            'pg_id_list': [],
            'servers': []
        }
        pg_max = 32
        pgs_per_pg = 3
        for pg_id in range(pg_max):
            cluster['pg_id_list'].append(pg_id)
            cluster['slots'].append(8192 / pg_max * pg_id)
            if pg_id == pg_max - 1:
                cluster['slots'].append(8191)
            else:
                cluster['slots'].append(8192 / pg_max * (pg_id + 1) - 1)

            for pgs in range(pgs_per_pg):
                smr_base_port = 15000 + pgs_id * 20
                smr_mgmt_port = smr_base_port + 3
                gateway_port = smr_base_port + 10
                redis_port = smr_base_port + 9

                server = {}
                server['id'] = pgs_id
                pgs_id = pgs_id + 1
                server['cluster_name'] = cluster['cluster_name']
                server['ip'] = self.cluster['servers'][0]['ip']
                server['pm_name'] = self.cluster['servers'][0]['pm_name']
                server['cm_port'] = None
                server['pg_id'] = pg_id
                server['smr_base_port'] = smr_base_port
                server['smr_mgmt_port'] = smr_mgmt_port
                server['gateway_port'] = gateway_port
                server['redis_port'] = redis_port
                server['zk_port'] = 2181

                cluster['servers'].append(server)

        # send initialize commands to confmaster
        testbase.initialize_cluster(cluster, self.leader_cm)

        # set up pgs binaries
        try:
            for server in cluster['servers']:
                id = server['id']

                util.log('copy binaries, server_id=%d' % id)
                util.copy_smrreplicator(id)
                util.copy_gw(id)
                util.copy_redis_server(id)
                util.copy_cluster_util(id)

        except IOError as e:
            util.log(e)
            util.log('Error: can not find file or read data')
            self.assertEqual(0, 1, 'Error: can not find file or read data')

        except:
            util.log('Error: file open error.')

        # clean up servers' directories
        for server in cluster['servers']:
            ret = testbase.cleanup_pgs_log_and_ckpt(cluster['cluster_name'],
                                                    server)
            self.assertEqual(
                ret, 0,
                'failed to cleanup_test_environment, id=%d' % server['id'])

        # start pgs
        for server in cluster['servers']:
            ret = testbase.request_to_start_smr(server)
            self.assertEqual(
                ret, 0, 'failed to request_to_start_smr, id=%d' % server['id'])

        for server in cluster['servers']:
            ret = testbase.request_to_start_redis(server, check=False)
            self.assertEqual(
                ret, 0, 'failed to request_to_start_redis, id=%d' % server['id'])

        for server in cluster['servers']:
            ret = testbase.wait_until_finished_to_set_up_role(server)
            self.assertEqual(ret, 0,
                             'failed to role set up, id=%d' % server['id'])

        for i in range(4):
            server = cluster['servers'][i]
            ret = testbase.request_to_start_gateway(cluster['cluster_name'],
                                                    server, self.leader_cm)
            self.assertEqual(
                ret, 0,
                'failed to request_to_start_gateway, id=%d' % server['id'])

        clusters = cluster_ls()
        self.assertNotEqual(len(clusters), 0, 'There is no cluster.')

        ok = True
        for c in clusters:
            if not util.check_cluster(str(c),
                                      self.leader_cm['ip'],
                                      self.leader_cm['cm_port'],
                                      check_quorum=True):
                ok = False

        self.assertEqual(ok, True, 'failed to initialize roles of pgs')
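
A quick check of the slot arithmetic above: with keyspace_size 8192 and pg_max 32, integer division gives each PG a contiguous block of 8192 / 32 = 256 slots, so the slots list pairs PG 0 with 0..255, PG 1 with 256..511, and PG 31 with 7936..8191:

# Python 2 integer division, as in the test above.
pg_max = 32
slots = []
for pg_id in range(pg_max):
    slots.append(8192 / pg_max * pg_id)                # range start
    if pg_id == pg_max - 1:
        slots.append(8191)                             # last PG ends at the keyspace edge
    else:
        slots.append(8192 / pg_max * (pg_id + 1) - 1)  # range end
assert slots[0:2] == [0, 255] and slots[-2:] == [7936, 8191]
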
Example #35
0
    def test_two_slaves_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs( s1 )
        self.assertNotEqual( ts_before1, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1) )

        ts_before2 = util.get_timestamp_of_pgs( s2 )
        self.assertNotEqual( ts_before2, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2) )

        # hang
        smr1 = smr_mgmt.SMR( s1['id'] )
        ret = smr1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        smr2 = smr_mgmt.SMR( s2['id'] )
        ret = smr2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        smr1.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr1.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has been compiled with the gcov option.' )

        smr2.write( 'fi delay sleep 1 8000\r\n' )
        time.sleep( 7 )

        # wait for rejoin as a slave
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s1 )
                if ts_after != -1 and ts_before1 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s2 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s2 )
                if ts_after != -1 and ts_before2 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
Example #36
0
    def pgs_add_and_del( self, upgrade_server, type ):
        util.print_frame()

        util.log( '[start] add and del pgs%d. type:%s' % (upgrade_server['id'], type) )
        util.log_server_state( self.cluster )

        # start load generator
        load_gen_list = {}
        for i in range( len(self.cluster['servers']) ):
            server = self.cluster['servers'][i]
            load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
            load_gen.start()
            load_gen_list[i] = load_gen

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
        ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
        jobj = json.loads(ret)
        self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
        util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

        # set new values
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway( '0' )
        gw.connect( ip, port )
        for i in range( 0, 50 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2]) )

        # attach pgs to cluster
        cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'], upgrade_server['id'])
        ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
        jobj = json.loads(ret)
        self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret) )
        util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
        time.sleep( 3 )

        stable = False
        for i in xrange(20):
            stable = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'])
            if stable:
                break
            time.sleep(0.5)
        self.assertTrue(stable, 'Unstable cluster')

        # check new values
        redis = redis_mgmt.Redis( upgrade_server['id'] )
        ret = redis.connect( upgrade_server['ip'], upgrade_server['redis_port'] )
        self.assertEquals( ret, 0, 'failed : connect to redis%d(%s:%d)' % (upgrade_server['id'], upgrade_server['ip'], upgrade_server['redis_port']) )

        for i in range( 0, 50 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis.write( cmd )
            redis.read_until( '\r\n' )
            res = redis.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis%d. %s != %d' % (upgrade_server['id'], res, i) )
        util.log( 'succeeded : check values with get operations on pgs%d.' % (upgrade_server['id']) )

        # shutdown load generators
        for i in range( len(load_gen_list) ):
            load_gen_list[i].quit()
            load_gen_list[i].join()

        util.log_server_state( self.cluster )

        return 0
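
The pgs_leave/pgs_join replies above are JSON documents whose 'msg' field carries the status, as the json.loads checks show. A small wrapper (a sketch, not part of the suite, assuming json is imported as in the code above) removes the repeated parse-and-assert:

def cm_ok(test, leader_cm, cmd):
    # Send a confmaster command and assert the JSON reply carries '+OK'.
    ret = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
    jobj = json.loads(ret)
    test.assertEqual(jobj['msg'], '+OK',
                     'failed : cmd="%s", reply="%s"' % (cmd.strip(), ret.strip()))
    return jobj
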
Example #37
0
    def master_hang( self ):
        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr = smr_mgmt.SMR( m['id'] )
        ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has been compiled with the gcov option.' )

        time.sleep( 5 )

        # wait for forced master election
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_MASTER:
                success = True
                break

            if len(self.cluster['servers']) == 3:
                role = util.get_role_of_server( s2 )
                if role == c.ROLE_MASTER:
                    success = True
                    break
            time.sleep( 1 )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        self.assertEqual( success, True, 'failed to forced master election' )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # check new values
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write( cmd )
                redis2.read_until( '\r\n' )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )

        # check if the hanging server recovered and joined as a slave
        time.sleep( 7 )
        role = util.get_role_of_server( m )
        self.assertEqual( role, c.ROLE_SLAVE, 'failed to join as a slave' )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis0. %s != %d' % (res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
    def consistent_after_failover( self ):
        max = 10000
        wait_count = 15
        key = 'caf'

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # set value
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( ip )
        gw.connect( ip, port )

        for i in range( 0, max ):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEquals( res, '+OK\r\n' )
        time.sleep( 5 )

        # shutdown
        servers = [master, slave1, slave2]
        for server in servers:

            util.log('before shutdown pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

            ret = testbase.request_to_shutdown_smr( server )
            self.assertEqual( ret, 0, 'failed to shutdown smr, server:%d' % server['id'] )
            ret = testbase.request_to_shutdown_redis( server )
            self.assertEquals( ret, 0, 'failed to shutdown redis' )
        time.sleep( 5 )

        # check state F
        for server in servers:
            state = self.get_expected_smr_state( server, 'F' )
            self.assertEquals( 'F', state,
                               'server%d - state:%s' % (server['id'], state) )

        # recovery
        for server in servers:
            ret = testbase.request_to_start_smr( server )
            self.assertEqual( ret, 0, 'failed to start smr, server:%d' % server['id'] )

            ret = testbase.request_to_start_redis( server, False )
            self.assertEqual( ret, 0, 'failed to start redis, server:%d' % server['id']  )

            util.log('after restart pgs%d' % server['id'])
            for s in servers:
                self.getseq_log(s)

        time.sleep( 5 )

        # wait for master election
        for i in xrange(10):
            ret = util.check_cluster( self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'] )
            if ret:
                break
            time.sleep(1)

        # check state
        for server in servers:
            ret = testbase.wait_until_finished_to_set_up_role( server, wait_count )
            self.assertEquals( ret, 0, 'failed to role change. server:%d' % (server['id']) )

            state = self.get_expected_smr_state( server, 'N' )
            role = util.get_role_of_server( server )
            self.assertEquals( 'N', state,
                               'server%d - state:%s, role:%s' % (server['id'], state, role) )

        the_number_of_master = 0
        the_number_of_slave = 0
        for server in servers:
            role = util.get_role_of_server( server )
            if role == c.ROLE_MASTER:
                the_number_of_master = the_number_of_master + 1
            elif role == c.ROLE_SLAVE:
                the_number_of_slave = the_number_of_slave + 1
        self.assertTrue( 1 == the_number_of_master and 2 == the_number_of_slave,
                           'failed to set roles, the number of master:%d, the number of slave:%d' %
                           (the_number_of_master, the_number_of_slave) )

        # get master, slave1, and slave2
        master, slave1, slave2 = self.get_mss()

        # connect to a master`s redis and set data
        redis = redis_mgmt.Redis( master['id'] )
        ret = redis.connect( master['ip'], master['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % master['id'] )

        for i in range( max, max*2 ):
            cmd = 'set %s%d %d\r\n' % (key, i, i)
            redis.write( cmd )
            res = redis.read_until( '\r\n' )
            self.assertEquals( res, '+OK\r\n',
                               'failed to get response, server:%d' % master['id'] )
        redis.disconnect()

        # check slaves`s data
        slaves = [slave1, slave2]
        for slave in slaves:
            slave_redis = redis_mgmt.Redis( slave['id'] )
            ret = slave_redis.connect( slave['ip'], slave['redis_port'] )
            self.assertEquals( ret, 0, 'failed to connect to redis, server:%d' % slave['id'] )

            for i in range( 0, max*2 ):
                cmd = 'get %s%d\r\n' % (key, i)
                slave_redis.write( cmd )
                trash = slave_redis.read_until( '\r\n' )
                res = slave_redis.read_until( '\r\n' )
                self.assertEquals( res, '%d\r\n' % i,
                                   'inconsistent, server:%d, expected %d but %s' % (slave['id'], i, res)  )
            slave_redis.disconnect()
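
getseq_log, called before and after each shutdown above, records the replicators' log sequence numbers so that data-loss failures can be diagnosed later. A sketch, assuming the management port accepts a 'getseq log' query (the command string is inferred from the helper's name) and that the reply is only logged, not parsed:

    def getseq_log(self, server):
        # Hypothetical sketch: dump smr log sequence info for debugging.
        reply = util.cmd_to_smr_addr(server['ip'], server['smr_mgmt_port'],
                                     'getseq log\r\n')
        util.log('getseq pgs%d: %s' % (server['id'], reply.strip()))
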
Example #39
0
    def test_large_scale_master_election( self ):
        util.print_frame()

        # initialize cluster information
        pgs_id = 10
        cluster = {
                    'cluster_name' : 'large_scale',
                    'keyspace_size' : 8192,
                    'quorum_policy' : '0:1',
                    'slots' : [],
                    'pg_id_list' : [],
                    'servers' : []
                  }
        pg_max = 32
        pgs_per_pg = 3
        for pg_id in range(pg_max):
            cluster['pg_id_list'].append(pg_id)
            cluster['slots'].append(8192 / pg_max * pg_id)
            if pg_id == pg_max - 1:
                cluster['slots'].append(8191)
            else:
                cluster['slots'].append(8192 / pg_max * (pg_id + 1) - 1)

            for pgs in range(pgs_per_pg):
                smr_base_port = 15000 + pgs_id * 20
                smr_mgmt_port = smr_base_port + 3
                gateway_port = smr_base_port + 10
                redis_port = smr_base_port + 9

                server = {}
                server['id'] = pgs_id
                pgs_id = pgs_id + 1
                server['cluster_name'] = cluster['cluster_name']
                server['ip'] = self.cluster['servers'][0]['ip']
                server['pm_name'] = self.cluster['servers'][0]['pm_name']
                server['cm_port'] = None
                server['pg_id'] = pg_id
                server['smr_base_port'] = smr_base_port
                server['smr_mgmt_port'] = smr_mgmt_port
                server['gateway_port'] = gateway_port
                server['redis_port'] = redis_port
                server['zk_port'] = 2181

                cluster['servers'].append(server)

        # send initialize commands to confmaster
        testbase.initialize_cluster(cluster, self.leader_cm)

        # set up pgs binaries
        try:
            for server in cluster['servers']:
                id = server['id']

                util.log('copy binaries, server_id=%d' % id)
                util.copy_smrreplicator( id )
                util.copy_gw( id )
                util.copy_redis_server( id )
                util.copy_cluster_util( id )

        except IOError as e:
            util.log(e)
            util.log('Error: cannot find file or read data')
            self.assertEqual(0, 1, 'Error: cannot find file or read data')

        except:
            util.log('Error: file open error.')
            self.assertEqual(0, 1, 'Error: file open error.')

        # clean up servers' directories
        for server in cluster['servers']:
            ret = testbase.cleanup_pgs_log_and_ckpt( cluster['cluster_name'], server )
            self.assertEqual(ret, 0, 'failed to cleanup_test_environment, id=%d' % server['id'])

        # start pgs
        for server in cluster['servers']:
            ret = testbase.request_to_start_smr( server )
            self.assertEqual(ret, 0, 'failed to request_to_start_smr, id=%d' % server['id'])

        for server in cluster['servers']:
            ret = testbase.request_to_start_redis( server, check=False )
            self.assertEqual(ret, 0, 'failed to request_to_start_redis, id=%d' % server['id'])

        for server in cluster['servers']:
            ret = testbase.wait_until_finished_to_set_up_role(server)
            self.assertEqual(ret, 0, 'failed to role set up, id=%d' % server['id'])

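        # gateways are started on the first four servers only; presumably one
        # gateway per pgs (96 of them here) is unnecessary for an election test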
        for i in range(4):
            server = cluster['servers'][i]
            ret = testbase.request_to_start_gateway( cluster['cluster_name'], server, self.leader_cm )
            self.assertEqual(ret, 0, 'failed to request_to_start_gateway, id=%d' % server['id'])

        clusters = cluster_ls()
        self.assertNotEqual(len(clusters), 0, 'There is no cluster.')

        ok = True
        for c in clusters:
            if not util.check_cluster(str(c), self.leader_cm['ip'], self.leader_cm['cm_port'], check_quorum=True):
                ok = False

        self.assertEqual(ok, True, 'failed to initialize roles of pgs')
Example #40
0
    def test_1_mgmt_is_isolated(self):
        util.print_frame()

        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertEqual(0, ret, 'failed to initialize cluster')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Set SMR option (slave_idle_timeout)
        util.log('\n\n\n ### Set SMR option ###')
        for s in cluster['servers']:
            t = telnet.Telnet('SMR%d' % s['id'])
            self.assertEqual(t.connect(s['ip'], s['smr_mgmt_port']), 0,
                    'Failed to connect to smr. ADDR=%s:%d' % (s['ip'], s['smr_mgmt_port']))
            cmd = 'confset slave_idle_timeout_msec 18000'
            util.log('[%s:%d] >> %s' % (s['ip'], s['smr_mgmt_port'], cmd))
            t.write(cmd + '\r\n')
            reply = t.read_until('\r\n').strip()
            util.log('[%s:%d] << %s' % (s['ip'], s['smr_mgmt_port'], reply))
            self.assertEqual(reply, '+OK', 'Failed to set slave_idle_timeout, REPLY=%s' % reply)
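        # Note: slave_idle_timeout_msec is raised to 18s before the network is
        # blocked below; presumably this tunes how long slave links may sit
        # idle during the outage (an inference from the test flow, not
        # documented smr behavior).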

        # Network isolation test
        for cnt in range(5):
            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % cnt)
            for s in cluster['servers']:
                out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -p tcp --dport %d -j DROP' % (s['smr_mgmt_port']))
                """Loopback Address Range (Reference : RFC3330)

                127.0.0.0/8 - This block is assigned for use as the Internet host
                  loopback address.  A datagram sent by a higher level protocol to an
                  address anywhere within this block should loop back inside the host.
                  This is ordinarily implemented using only 127.0.0.1/32 for loopback,
                  but no addresses within this block should ever appear on any network
                  anywhere [RFC1700, page 5].
                """
                self.assertTrue(out.succeeded, 'failed to add a blocking rule to iptables. output:%s' % out)

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(7):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % cnt)
            for s in cluster['servers']:
                out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -p tcp --dport %d -j DROP' % (s['smr_mgmt_port']))
                self.assertTrue(out.succeeded, 'failed to delete a blocking rule from iptables. output:%s' % out)

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['active_role'] != s['mgmt_role']:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

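        # active_ts presumably timestamps the last role change of each pgs;
        # unchanged values across all five block/unblock cycles indicate that
        # no spurious failover occurred (an inference from this assertion).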
        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(len(final_state)):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        # Shutdown cluster
        ret = default_cluster.finalize( cluster )
        self.assertEqual(ret, 0, 'failed to finalize cluster')
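The block/unblock cycle above repeats the same poll-until-predicate loop twice. A minimal helper sketch of that pattern (hypothetical, not part of the original suite; it assumes only the util.check_cluster signature and the time module already used above):
def poll_cluster_state(cluster_name, mgmt_ip, mgmt_port, predicate, attempts=7):
    # Snapshot the cluster state once per second until predicate(states)
    # holds; return the matching snapshot, or None when attempts run out.
    for i in range(attempts):
        states = []
        util.check_cluster(cluster_name, mgmt_ip, mgmt_port, states, check_quorum=True)
        if predicate(states):
            return states
        time.sleep(1)
    return None
With it, the final consistency check could read: states = poll_cluster_state(cluster['cluster_name'], mgmt_ip, mgmt_port, lambda ss: all(s['active_role'] == s['mgmt_role'] for s in ss)), followed by self.assertTrue(states is not None, 'Fail, state consistency').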