Code example #1
    def test_2_role_change_with_hanging_pgs(self):
        util.print_frame()

        i = 0
        while i < 5:
            util.log('')
            util.log('Loop:%d' % i)

            # get master, slave1, slave2
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            hang = random.choice(self.cluster['servers'])
            if hang == m:
                hanging_servers = [m]
                running_servers = [s1, s2]
                type = 'master'
            else:
                hanging_servers = [s1]
                running_servers = [m, s2]
                type = 'slave'
            s = random.choice([s1, s2])

            util.log('hanging pgs(id:%d, type:%s), expected_master_id:%d' % (hang['id'], type, s['id']))
            self.role_change_with_hanging_pgs(hanging_servers, running_servers, s['id'], m)

            i += 1
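
Almost every example in this listing opens with the same three assertions that util.get_mss() returned a full master/slave/slave triple. A small helper along the following lines could factor that boilerplate out; it is a sketch only (get_mss_or_fail is not part of the nbase-arc test utilities) and assumes the util module imported by these tests.

    def get_mss_or_fail(self):
        # Hypothetical helper, not in the original test classes: fetch master, slave1 and
        # slave2 from self.cluster and fail immediately if any of them is missing.
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
        return m, s1, s2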
Code example #2
    def test_quorum_policy_of_hanging_master( self ):
        util.print_frame()

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        # hang
        smr = smr_mgmt.SMR( m['id'] )
        ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr.write( 'fi delay sleep 1 15000\r\n' )
        time.sleep( 5 )

        # wait for forced master election
        success = False
        new_master = None
        for i in range( 7 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_MASTER:
                success = True
                new_master = s1
                break
            role = util.get_role_of_server( s2 )
            if role == c.ROLE_MASTER:
                success = True
                new_master = s2
                break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to forced master election' )

        # shutdown confmaster
        for server in self.cluster['servers']:
            util.shutdown_cm( server['id'] )

        # wait until hanging master wake up
        time.sleep( 5 )

        # check quorum policy
        quorum_of_hanging_master = util.get_quorum( m )
        self.assertEqual( 2, quorum_of_hanging_master,
                          'invalid quorum of hanging master, expected:%d, but:%d' % (2, quorum_of_hanging_master) )
        util.log( 'succeeded : quorum of hanging master=%d' % quorum_of_hanging_master )

        # check quorum policy
        quorum_of_new_master = util.get_quorum( new_master )
        self.assertNotEqual( None, quorum_of_new_master, 'failed : find new master' )
        self.assertEqual( 1, quorum_of_new_master ,
                          'invalid quorum of new master, expected:%d, but:%d' % (1, quorum_of_new_master) )
        util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master )

        # Go back to initial configuration
        # Recover Confmaster
        self.assertTrue(util.recover_confmaster(self.cluster, [0,1,2], 0), 'failed to recover confmaster')

        return 0
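
The hang in this test is induced with the SMR fault-injection command 'fi delay sleep 1 <msec>', a pattern that recurs in the later examples. A minimal sketch of that step as a reusable helper, assuming the smr_mgmt.SMR connect/write/read_until interface used throughout this listing (the helper itself is hypothetical):

    def hang_pgs(self, server, msec):
        # Sketch only: make the given PGS sleep for `msec` milliseconds via SMR fault injection.
        smr = smr_mgmt.SMR(server['id'])
        ret = smr.connect(server['ip'], server['smr_mgmt_port'])
        self.assertEqual(ret, 0, 'failed to connect to smr. %s:%d' % (server['ip'], server['smr_mgmt_port']))
        smr.write('fi delay sleep 1 %d\r\n' % msec)
        reply = smr.read_until('\r\n', 1)
        if reply is not None and reply.find('-ERR not supported') != -1:
            self.fail('make sure that smr has been compiled with the gcov option.')
        smr.disconnect()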
Code example #3
File: test_upgrade.py  Project: ducky-hong/nbase-arc
    def test_upgrade_slave_smr( self ):
        util.print_frame()

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        ret = util.upgrade_pgs( s1, self.leader_cm, self.cluster )
        self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % s1['id'])
Code example #4
    def test_3_role_change_while_all_pgs_hanging(self):
        util.print_frame()

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        hanging_servers = [m, s1, s2]
        running_servers = []
        s = random.choice([s1, s2])

        self.role_change_with_hanging_pgs(hanging_servers, running_servers, s['id'], m)
        return 0
Code example #5
File: test_upgrade.py  Project: ducky-hong/nbase-arc
    def test_upgrade_smr_repeatedly( self ):
        util.print_frame()

        execution_count_master = 0
        execution_count_slave = 0
        old_target = None
        for cnt in range( 5 ):
            target = random.choice( self.cluster['servers'] )
            while target == old_target:
                target = random.choice( self.cluster['servers'] )
            old_target = target

            role = util.get_role_of_server( target )
            if role == c.ROLE_SLAVE:
                ret = util.upgrade_pgs( target, self.leader_cm, self.cluster )
                self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % target['id'])

                execution_count_slave = execution_count_slave + 1
            elif role == c.ROLE_MASTER:
                ret = util.upgrade_pgs( target, self.leader_cm, self.cluster )
                self.assertTrue(ret, 'Failed to upgrade master pgs%d' % target['id'])
                execution_count_master = execution_count_master + 1
            else:
                self.fail( 'unexpected role:%s' % role )
            time.sleep( 1 )

            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )

            if execution_count_master == 0:
                ret = util.upgrade_pgs( m, self.leader_cm, self.cluster )
                self.assertTrue(ret, 'Failed to upgrade master pgs%d' % m['id'])
            if execution_count_slave == 0:
                ret = util.upgrade_pgs( s2, self.leader_cm, self.cluster )
                self.assertTrue(ret, 'Failed to upgrade slave pgs%d' % s2['id'])
Code example #6
    def test_7_dirty_network_fi(self):
        util.print_frame()
        clnts = []

        try:
            out = util.sudo('iptables -L')
            util.log('====================================================================')
            util.log('out : %s' % out)
            util.log('out.return_code : %d' % out.return_code)
            util.log('out.stderr : %s' % out.stderr)
            util.log('out.succeeded : %s' % out.succeeded)

            # Add forwarding rule
            out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)
            out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

            cluster_name = 'network_isolation_cluster_1'
            cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
            util.log(util.json_to_str(cluster))

            self.leader_cm = cluster['servers'][0]

            # MGMT
            mgmt_ip = cluster['servers'][0]['real_ip']
            mgmt_port = cluster['servers'][0]['cm_port']

            # Create cluster
            ret = default_cluster.initialize_starting_up_smr_before_redis( cluster, 
                    conf={'cm_context':'applicationContext-fi.xml'})
            self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

            # Print initial state of cluster
            util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
            initial_state = []
            self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

            # Start crc16 client
            for s in cluster['servers']:
                c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['redis_port'], 600, verbose=False)
                c.start()
                clnts.append(c)

            # Network isolation test
            cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'me', 'yj', 'bj', 'mg'], 
                                                ['lconn', 'slave', 'master', 'setquorum'], [True, False], 1)

            for fi in cmfi:
                # Block network
                util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
                ret = block_network(cluster, mgmt_ip, mgmt_port)
                self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)

                    state_transition_done = True
                    for s in isolated_states:
                        if s['ip'] != '127.0.0.100':
                            continue

                        if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                            state_transition_done = False

                    if state_transition_done:
                        ok = True
                        break
                    time.sleep(1)
                self.assertTrue(ok, 'Fail, state transition')

                # Fault injection
                try:
                    self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port), 
                            "Confmaster command fail. fi: %s" % str(fi))
                except ValueError as e:
                    self.fail("Confmaster command error. cmd: \"%s\", reply: \"%s\"" % (cmd, reply))

                # Unblock network
                util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
                ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
                self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                    if ok:
                        break
                    time.sleep(1)
                self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                check_cluster = False

                # 'bj', 'slave'
                if fi[0] == 'bj' and fi[1] == 'slave':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(s1)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'me', 'lconn'
                elif fi[0] == 'me':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(m)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'setquorum'
                elif fi[1] == 'setquorum':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'], 'fi delay sleep 1 8000\r\n', timeout=20)
                    self.assertEqual("+OK\r\n", ret, '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                    check_cluster = True

                if check_cluster:
                    # Check cluster state
                    ok = False
                    for i in xrange(20):
                        isolated_states = []
                        ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                        if ok:
                            break
                        time.sleep(1)
                    self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                # Check fault injection
                ok = False
                for i in xrange(10):
                    count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                    if count == 0:
                        ok = True
                        break
                    time.sleep(0.5)
                self.assertTrue(ok, "[%s] fail. failt injection had not been triggered." % str(fi))

                for c in clnts:
                    self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

            # Shutdown cluster
            ret = default_cluster.finalize( cluster )
            self.assertEqual(ret, 0, '[%s] failed to TestMaintenance.finalize' % str(fi))

            # Delete forwarding rule
            out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)
            out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)

            for c in clnts:
                self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

        finally:
            for c in clnts:
                c.quit()
            for c in clnts:
                c.join()
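
The "check cluster state" wait loop appears three times in the test above. It can be expressed as one compact helper built only from util.check_cluster() as called above; this is a hypothetical refactoring, not code from the project:

    def wait_for_stable_cluster(self, cluster_name, mgmt_ip, mgmt_port, max_try=10):
        # Poll check_cluster() until it reports a stable cluster (with quorum) or give up.
        for i in xrange(max_try):
            states = []
            if util.check_cluster(cluster_name, mgmt_ip, mgmt_port, states, check_quorum=True):
                return True
            time.sleep(1)
        return False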
Code example #7
    def test_4_role_change_with_failover(self):
        util.print_frame()

        loop_cnt = 0
        while loop_cnt < 5:
            util.log('')
            util.log('Loop:%d' % loop_cnt)

            util.log("States (before role change)")
            util.log_server_state(self.cluster)

            target = random.choice(self.cluster['servers'])

            # bgsave
            ret = util.bgsave(target)
            self.assertTrue(ret, 'failed to bgsave. pgs:%d' % target['id'])

            # shutdown
            util.log('shutdown pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_shutdown_smr( target )
            self.assertEqual( ret, 0, 'failed to shutdown smr' )

            ret = testbase.request_to_shutdown_redis( target )
            self.assertEquals( ret, 0, 'failed to shutdown redis' )

            running_servers = []
            for s in self.cluster['servers']:
                if s != target:
                    running_servers.append(s)

            # Get old timestamp
            old_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs(s)
                old_timestamps[s['id']] = ts

            # Start load generator
            self.load_gen_list = {}
            util.log('start load generator')
            for i in range(self.max_load_generator):
                ip, port = util.get_rand_gateway(self.cluster)
                load_gen = load_generator.LoadGenerator(i, ip, port)
                load_gen.start()
                self.load_gen_list[i] = load_gen

            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

            # Role change
            master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], s1['id'])
            self.assertNotEqual(master_id, -1, 'role_change failed')

            util.log("States (after role change)")
            util.log_server_state(self.cluster)

            # Check - get new timestamp
            new_timestamps = {}
            for s in running_servers:
                ts = util.get_timestamp_of_pgs( s )
                new_timestamps[s['id']] = ts

            # Check - compare old timestamps and new timestamps
            for s in running_servers:
                old_ts = old_timestamps[s['id']]
                new_ts = new_timestamps[s['id']]
                self.assertNotEqual(old_ts, new_ts, 'Timestamp of a running server has not changed. %d->%d' % (old_ts, new_ts))

            # Check quorum
            m = self.cluster['servers'][master_id]
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after role change). expected:%s' % (expected))

            # recovery
            util.log('recovery pgs%d(%s:%d)' % (target['id'], target['ip'], target['smr_base_port']))
            ret = testbase.request_to_start_smr( target )
            self.assertEqual( ret, 0, 'failed to start smr' )
            util.log('start smr-replicator done')

            ret = testbase.request_to_start_redis( target, 60 )
            self.assertEqual( ret, 0, 'failed to start redis' )
            util.log('start redis-arc done')

            ret = testbase.wait_until_finished_to_set_up_role( target, max_try=300)
            self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (target['id']) )

            util.log("States (after recovery)")
            util.log_server_state(self.cluster)

            # Check quorum
            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'unexpected quorum(after recovery). expected:%s' % (expected))

            # Check consistency
            util.log('stop load generator')
            for i in range(self.max_load_generator):
                self.load_gen_list[i].quit()
            for i in range(self.max_load_generator):
                self.load_gen_list[i].join()
                self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
                self.load_gen_list.pop(i, None)

            loop_cnt += 1

        return 0
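
This test (and the next one) relies on self.__check_quorum(), which is not included in this listing. A plausible sketch of what it verifies, built only from util.get_quorum() as used in the other examples here; this is an assumption, not the project's actual implementation:

    def __check_quorum(self, master, expected, max_try=20):
        # Poll the master's quorum value until it matches `expected`; return True on success.
        for i in range(max_try):
            quorum = util.get_quorum(master)
            if quorum == expected:
                return True
            time.sleep(1)
        return False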
Code example #8
    def role_change_with_hanging_pgs(self, hanging_servers, running_servers, target_id, master):
        util.log('hanging_servers:%s' % hanging_servers)
        util.log('running_servers:%s' % running_servers)
        util.log('target_id:%s' % target_id)

        # Initial data
        util.put_some_data(self.cluster, 3, 10)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        # Get old timestamp
        old_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # hang
        for s in hanging_servers:
            smr = smr_mgmt.SMR(s['id'])
            ret = smr.connect(s['ip'], s['smr_mgmt_port'])
            self.assertEqual(ret, 0, 'failed to connect to smr. %s:%d' % (s['ip'], s['smr_mgmt_port']))
            util.log("PGS '%d' hang" % s['id'])

            smr.write('fi delay sleep 1 13000\r\n')
            reply = smr.read_until('\r\n', 1)
            if reply != None and reply.find('-ERR not supported') != -1:
                self.assertEqual(0, 1, 'make sure that smr has compiled with gcov option.')
            smr.disconnect()

        # Role change
        master_id = util.role_change(self.leader_cm, self.cluster['cluster_name'], target_id)
        self.assertEqual(master_id, -1, 'We expected that role_change failed, but success')

        # Check rollback - check quorum
        if master not in hanging_servers:
            expected = 1
            ok = self.__check_quorum(master, expected)
            self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

        # Check rollback - get new timestamp
        new_timestamps_in_hang = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamps_in_hang[s['id']] = ts

        # Check rollback - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps_in_hang[s['id']]
            self.assertEqual(old_ts, new_ts, 'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

        time.sleep(16)
        util.log("States (after role change)")
        util.log_server_state( self.cluster )

        self.load_gen_list = {}

        # Start load generator
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Check quorum
        if master in hanging_servers:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            expected = 1
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok, 'rollback quorum fail. expected:%s' % (expected))

        # Get new timestamp
        new_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs( s )
            new_timestamps[s['id']] = ts

        # Compare old timestamps and new timestamps
        for s in self.cluster['servers']:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps[s['id']]
            if master in hanging_servers and len(running_servers) != 0:
                self.assertNotEqual(old_ts, new_ts, 'Timestamp of a hanging server has not changed. %d->%d' % (old_ts, new_ts))
            else:
                self.assertEqual(old_ts, new_ts, 'Timestamp of a running server has changed. %d->%d' % (old_ts, new_ts))

        # Check consistency
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
            self.load_gen_list.pop(i, None)
Code example #9
    def test_quorum_policy_of_hanging_master(self):
        util.print_frame()

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        # hang
        smr = smr_mgmt.SMR(m['id'])
        ret = smr.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr.write('fi delay sleep 1 15000\r\n')
        time.sleep(5)

        # wait for forced master election
        success = False
        new_master = None
        for i in range(7):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_MASTER:
                success = True
                new_master = s1
                break
            role = util.get_role_of_server(s2)
            if role == c.ROLE_MASTER:
                success = True
                new_master = s2
                break
            time.sleep(1)
        self.assertEqual(success, True, 'failed to forced master election')

        # shutdown confmaster
        for server in self.cluster['servers']:
            util.shutdown_cm(server['id'])

        # wait until hanging master wake up
        time.sleep(5)

        # check quorum policy
        quorum_of_hanging_master = util.get_quorum(m)
        self.assertEqual(
            2, quorum_of_hanging_master,
            'invalid quorum of hanging master, expected:%d, but:%d' %
            (2, quorum_of_hanging_master))
        util.log('succeeded : quorum of hanging master=%d' %
                 quorum_of_hanging_master)

        # check quorum policy
        quorum_of_new_master = util.get_quorum(new_master)
        self.assertNotEqual(None, quorum_of_new_master,
                            'failed : find new master')
        self.assertEqual(
            1, quorum_of_new_master,
            'invalid quorum of new master, expected:%d, but:%d' %
            (1, quorum_of_new_master))
        util.log('succeeded : quorum of new master=%d' % quorum_of_new_master)

        return 0
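
The "wait for forced master election" loop above polls each slave's role in turn. The same pattern can be written as a small helper; this is a sketch only, assuming util.get_role_of_server() and the constant c.ROLE_MASTER shown in these examples:

    def wait_for_new_master(self, candidates, max_try=20):
        # Return the first candidate that reports the MASTER role, or None on timeout.
        for i in range(max_try):
            for s in candidates:
                if util.get_role_of_server(s) == c.ROLE_MASTER:
                    return s
            time.sleep(1)
        return None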
Code example #10
    def test_quorum(self):
        util.print_frame()

        master, slave1, slave2 = util.get_mss(self.cluster)

        expected = 2
        max_try = 20
        for i in range(0, max_try):
            quorum = util.get_quorum(master)
            if quorum == expected:
                break
            time.sleep(1)
        self.assertEquals(quorum, expected,
                          'quorum:%d, expected:%d' % (quorum, expected))

        ret = testbase.request_to_shutdown_smr(slave1)
        self.assertEqual(ret, 0,
                         'failed to shutdown smr, server:%d' % slave1['id'])
        time.sleep(1)

        expected = 1
        max_try = 20
        for i in range(0, max_try):
            master = util.get_server_by_role(self.cluster['servers'], 'master')
            quorum = util.get_quorum(master)
            if quorum == expected:
                break
            time.sleep(1)
        self.assertEquals(quorum, expected,
                          'quorum:%d, expected:%d' % (quorum, expected))

        ret = testbase.request_to_shutdown_smr(slave2)
        self.assertEqual(ret, 0,
                         'failed to shutdown smr, server:%d' % slave2['id'])
        time.sleep(1)

        expected = 0
        max_try = 20
        for i in range(0, max_try):
            master = util.get_server_by_role(self.cluster['servers'], 'master')
            quorum = util.get_quorum(master)
            if quorum == expected:
                break
            time.sleep(1)
        self.assertEquals(quorum, expected,
                          'quorum:%d, expected:%d' % (quorum, expected))

        # recovery
        ret = testbase.request_to_start_smr(slave1)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(slave1)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(slave1)
        self.assertEquals(ret, 0,
                          'failed to role change. smr_id:%d' % (slave1['id']))
        time.sleep(1)

        expected = 1
        max_try = 20
        for i in range(0, max_try):
            quorum = util.get_quorum(master)
            if quorum == expected:
                break
            time.sleep(1)
        self.assertEquals(quorum, expected,
                          'quorum:%d, expected:%d' % (quorum, expected))

        # recovery
        ret = testbase.request_to_start_smr(slave2)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(slave2)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(slave2)
        self.assertEquals(ret, 0,
                          'failed to role change. smr_id:%d' % (slave2['id']))
        time.sleep(1)

        expected = 2
        max_try = 20
        for i in range(0, max_try):
            quorum = util.get_quorum(master)
            if quorum == expected:
                break
            time.sleep(1)
        self.assertEquals(quorum, expected,
                          'quorum:%d, expected:%d' % (quorum, expected))
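
The recovery sequence in this test (start smr, start redis, wait for the role to be set up) is repeated for both slaves and reappears in several later examples. A sketch of it as a single helper, using only the testbase calls already shown above (hypothetical refactoring, not part of the suite):

    def recover_pgs(self, server):
        # Restart a PGS and wait until confmaster finishes setting up its role again.
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr')
        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis')
        ret = testbase.wait_until_finished_to_set_up_role(server)
        self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % server['id'])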
Code example #11
    def deprecated_test_5_PGS_commit_is_greater_than_PG_commit(self):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # initial data
        util.put_some_data(self.cluster)

        master, s1, s2 = util.get_mss(self.cluster)

        server_to_join = [s1, s2]
        # shutdown slaves
        for i in range(0, 2):
            ret = testbase.request_to_shutdown_smr(server_to_join[i])
            self.assertEqual(
                ret, 0, 'failed to shutdown smr%d' % server_to_join[i]['id'])
            util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

            ret = testbase.request_to_shutdown_redis(server_to_join[i])
            self.assertEquals(ret, 0, 'failed to shutdown redis')
            util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

            # check state F
            max_try = 20
            expected = 'F'
            for j in range(0, max_try):
                state = util.get_smr_state(server_to_join[i], self.leader_cm)
                if expected == state:
                    break
                time.sleep(1)
            self.assertEquals(
                expected, state, 'server%d - state:%s, expected:%s' %
                (server_to_join[i]['id'], state, expected))

        # put more data
        util.put_some_data(self.cluster, 10, 256)

        # bgsave
        ret = util.bgsave(master)
        self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

        # shutdown master
        ret = testbase.request_to_shutdown_smr(master)
        self.assertEqual(ret, 0, 'failed to shutdown smr')
        util.log('succeeded to shutdown master smr, id=%d' % master['id'])
        ret = testbase.request_to_shutdown_redis(master)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        util.log('succeeded to shutdown master redis, id=%d' % master['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            state = util.get_smr_state(master, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s' %
            (master['id'], state, expected))

        # recovery slaves
        for i in range(0, 2):
            ret = testbase.request_to_start_smr(server_to_join[i])
            self.assertEqual(ret, 0, 'failed to start smr')

            ret = testbase.request_to_start_redis(server_to_join[i])
            self.assertEqual(ret, 0, 'failed to start redis')

            ret = testbase.wait_until_finished_to_set_up_role(
                server_to_join[i], 10)
            self.assertEquals(
                ret, 0,
                'failed to role change. smr_id:%d' % (server_to_join[i]['id']))

            # check state N
            max_try = 20
            expected = 'N'
            for j in range(0, max_try):
                state = util.get_smr_state(server_to_join[i], self.leader_cm)
                if expected == state:
                    break
                time.sleep(1)
            role = util.get_role_of_server(server_to_join[i])
            self.assertEquals(
                expected, state, 'server%d - state:%s, expected:%s, role:%s' %
                (server_to_join[i]['id'], state, expected, role))

        # set value
        s = random.choice(server_to_join)
        redis = redis_mgmt.Redis(s['id'])
        ret = redis.connect(s['ip'], s['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        key_base = 'key_test'
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            redis.write(cmd)
            res = redis.read_until('\r\n')
            self.assertEquals(res, '+OK\r\n')
        redis.disconnect()

        for i in range(0, 2):
            redis = redis_mgmt.Redis(server_to_join[i]['id'])
            ret = redis.connect(server_to_join[i]['ip'],
                                server_to_join[i]['redis_port'])
            self.assertEquals(ret, 0, 'failed to connect to redis')

            # check value
            for j in range(0, 10000):
                cmd = 'get %s%d\r\n' % (key_base, j)
                redis.write(cmd)
                redis.read_until('\r\n')
                response = redis.read_until('\r\n')
                self.assertEqual(response, '%d\r\n' % (j),
                                 'inconsistent %s, %d' % (response[:-2], j))

        # try to recover master, but failed
        ret = testbase.request_to_start_smr(master)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(master, False)
        self.assertEqual(ret, 0, 'failed to start redis')

        max_try = 3
        expected = 'N'
        for i in range(0, max_try):
            state = util.get_smr_state(master, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(master)
        self.assertNotEqual(
            expected, state, 'server%d - state:%s, expected:not %s, role:%s' %
            (master['id'], state, expected, role))
        util.log(
            'success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.'
        )

        gw.disconnect()
        return 0
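
The raw set/get verification loops in this test (and in the hang/failover examples below) all speak the Redis protocol directly over redis_mgmt.Redis. Two helper sketches that capture the pattern, assuming the same write/read_until interface; they are not part of the original suite:

    def set_keys(self, conn, key_base, start, end):
        # Write `end - start` string keys and expect +OK for each one.
        for i in range(start, end):
            conn.write('set %s%d %d\r\n' % (key_base, i, i))
            res = conn.read_until('\r\n')
            self.assertEqual(res, '+OK\r\n', 'failed to set %s%d, res:%s' % (key_base, i, res))

    def check_keys(self, conn, key_base, start, end):
        # Read the keys back; the first read_until() consumes the bulk-length line ($N\r\n).
        for i in range(start, end):
            conn.write('get %s%d\r\n' % (key_base, i))
            conn.read_until('\r\n')
            res = conn.read_until('\r\n')
            self.assertEqual(res, '%d\r\n' % i, 'inconsistent value for %s%d: %s' % (key_base, i, res[:-2]))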
Code example #12
    def test_quorum( self ):
        util.print_frame()

        master, slave1, slave2 = util.get_mss(self.cluster)

        expected = 2
        max_try = 20
        for i in range( 0, max_try ):
            quorum = util.get_quorum( master )
            if quorum == expected:
                break
            time.sleep( 1 )
        self.assertEquals( quorum, expected,
                           'quorum:%d, expected:%d' % (quorum, expected) )

        ret = testbase.request_to_shutdown_smr( slave1 )
        self.assertEqual( ret, 0, 'failed to shutdown smr, server:%d' % slave1['id'] )
        time.sleep( 1 )

        expected = 1
        max_try = 20
        for i in range( 0, max_try ):
            master = util.get_server_by_role( self.cluster['servers'], 'master' )
            quorum = util.get_quorum( master )
            if quorum == expected:
                break
            time.sleep( 1 )
        self.assertEquals( quorum, expected,
                           'quorum:%d, expected:%d' % (quorum, expected) )

        ret = testbase.request_to_shutdown_smr( slave2 )
        self.assertEqual( ret, 0, 'failed to shutdown smr, server:%d' % slave2['id'] )
        time.sleep( 1 )

        expected = 0
        max_try = 20
        for i in range( 0, max_try ):
            master = util.get_server_by_role( self.cluster['servers'], 'master' )
            quorum = util.get_quorum( master )
            if quorum == expected:
                break
            time.sleep( 1 )
        self.assertEquals( quorum, expected,
                           'quorum:%d, expected:%d' % (quorum, expected) )

        # recovery
        ret = testbase.request_to_start_smr( slave1 )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( slave1 )
        self.assertEqual( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( slave1 )
        self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (slave1['id']) )
        time.sleep( 1 )

        expected = 1
        max_try = 20
        for i in range( 0, max_try ):
            quorum = util.get_quorum( master )
            if quorum == expected:
                break
            time.sleep( 1 )
        self.assertEquals( quorum, expected,
                           'quorum:%d, expected:%d' % (quorum, expected) )

        # recovery
        ret = testbase.request_to_start_smr( slave2 )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( slave2 )
        self.assertEqual( ret, 0, 'failed to start redis' )

        ret = testbase.wait_until_finished_to_set_up_role( slave2 )
        self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (slave2['id']) )
        time.sleep( 1 )

        expected = 2
        max_try = 20
        for i in range( 0, max_try ):
            quorum = util.get_quorum( master )
            if quorum == expected:
                break
            time.sleep( 1 )
        self.assertEquals( quorum, expected,
                           'quorum:%d, expected:%d' % (quorum, expected) )
Code example #13
    def test_7_dirty_network_fi(self):
        util.print_frame()
        clnts = []

        try:
            out = util.sudo('iptables -L')
            util.log('====================================================================')
            util.log('out : %s' % out)
            util.log('out.return_code : %d' % out.return_code)
            util.log('out.stderr : %s' % out.stderr)
            util.log('out.succeeded : %s' % out.succeeded)

            # Add forwarding rule
            out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)
            out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

            cluster_name = 'network_isolation_cluster_1'
            cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
            util.log(util.json_to_str(cluster))

            self.leader_cm = cluster['servers'][0]

            # MGMT
            mgmt_ip = cluster['servers'][0]['real_ip']
            mgmt_port = cluster['servers'][0]['cm_port']

            # Create cluster
            ret = default_cluster.initialize_starting_up_smr_before_redis( cluster, 
                    conf={'cm_context':'applicationContext-fi.xml'})
            self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

            # Print initial state of cluster
            util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
            initial_state = []
            self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

            # Start crc16 client
            for s in cluster['servers']:
                c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['gateway_port'], 3000, verbose=False)
                c.start()
                clnts.append(c)

            # Network isolation test
            cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'qa', 'me', 'yj', 'bj', 'mg'], 
                                                ['lconn', 'slave', 'master', 'setquorum'], [True, False], 1)

            for fi in cmfi:
                # Block network
                util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
                ret = block_network(cluster, mgmt_ip, mgmt_port)
                self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)

                    state_transition_done = True
                    for s in isolated_states:
                        if s['ip'] != '127.0.0.100':
                            continue

                        if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                            state_transition_done = False

                    if state_transition_done:
                        ok = True
                        break
                    time.sleep(1)
                self.assertTrue(ok, 'Fail, state transition')

                # Fault injection
                try:
                    self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port), 
                            "Confmaster command fail. fi: %s" % str(fi))
                except ValueError as e:
                    self.fail("Confmaster command error. cmd: \"%s\", reply: \"%s\"" % (cmd, reply))

                # Unblock network
                util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
                ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
                self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                    if ok:
                        break
                    time.sleep(1)
                self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                check_cluster = False

                # 'bj', 'slave'
                if fi[0] == 'bj' and fi[1] == 'slave':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(s1)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'me', 'lconn'
                elif fi[0] == 'me' and fi[1] == 'lconn':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(m)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'qa', 'setquorum'
                elif fi[0] == 'qa' and fi[1] == 'setquorum':
                    m, s1, s2 = util.get_mss(cluster)

                    # shutdown
                    ret = testbase.request_to_shutdown_smr(s1)
                    self.assertEqual(0, ret, '[%s] failed to shutdown smr%d' % (str(fi), s1['id']))
                    ret = testbase.request_to_shutdown_redis(s1)
                    self.assertEqual(0, ret, '[%s] failed to shutdown redis%d' % (str(fi), s1['id']))

                    # Check quorum
                    q = -1
                    for q_cnt in xrange(20):
                        q = util.get_quorum(m)
                        if q == 1:
                            break
                        time.sleep(1)
                    self.assertEquals(1, q, "[%s] check quorum fail." % str(fi))

                    # Modify quorum
                    ret = util.cmd_to_smr_addr(m['ip'], m['smr_mgmt_port'], 'setquorum 0\r\n')
                    self.assertEqual("+OK\r\n", ret, '[%s] "setquorum 0" fail.' % str(fi))

                    # Check quorum
                    q = -1
                    for q_cnt in xrange(20):
                        q = util.get_quorum(m)
                        if q == 1:
                            break
                        time.sleep(1)
                    self.assertEquals(1, q, "[%s] check quorum fail." % str(fi))

                    # recovery
                    ret = testbase.request_to_start_smr(s1)
                    self.assertEqual(0, ret, '[%s] failed to start smr' % str(fi))
                    ret = testbase.request_to_start_redis(s1, max_try=120)
                    self.assertEqual(0, ret, '[%s] failed to start redis' % str(fi))
                    ret = testbase.wait_until_finished_to_set_up_role(s1, 11)
                    self.assertEqual(0, ret, '[%s] failed to role change. smr_id:%d' % (str(fi), s1['id']))

                    check_cluster = True

                # 'setquorum'
                elif fi[1] == 'setquorum':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'], 'fi delay sleep 1 8000\r\n', timeout=20)
                    self.assertEqual("+OK\r\n", ret, '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                    check_cluster = True

                if check_cluster:
                    # Check cluster state
                    ok = False
                    for i in xrange(20):
                        isolated_states = []
                        ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                        if ok:
                            break
                        time.sleep(1)
                    self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                # Check fault injection
                ok = False
                for i in xrange(10):
                    count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                    if count == 0:
                        ok = True
                        break
                    time.sleep(0.5)
                self.assertTrue(ok, "[%s] fail. failt injection had not been triggered." % str(fi))

            # Shutdown cluster
            ret = default_cluster.finalize( cluster )
            self.assertEqual(ret, 0, '[%s] failed to TestMaintenance.finalize' % str(fi))

            # Delete forwarding rule
            out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)
            out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
            self.assertTrue(out.succeeded, 'failed to delete the forwarding rule from iptables. output:%s' % out)

            for c in clnts:
                self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

        finally:
            for c in clnts:
                c.quit()
            for c in clnts:
                c.join()
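
Both dirty-network tests finish each iteration by waiting for the injected confmaster fault to be consumed. As a standalone helper that wait might look like this (sketch only, using fi_confmaster.fi_count() exactly as called above):

    def wait_until_fi_consumed(self, fi, mgmt_ip, mgmt_port, max_try=10):
        # Wait until the fault-injection counter reaches 0, i.e. the fault has been triggered.
        for i in xrange(max_try):
            if fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port) == 0:
                return True
            time.sleep(0.5)
        return False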
Code example #14
    def master_hang(self):
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr = smr_mgmt.SMR(m['id'])
        ret = smr.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr.write('fi delay sleep 1 10000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has compiled with gcov option.')

        time.sleep(5)

        # wait for forced master election
        success = False
        for i in range(20):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_MASTER:
                success = True
                break

            if len(self.cluster['servers']) == 3:
                role = util.get_role_of_server(s2)
                if role == c.ROLE_MASTER:
                    success = True
                    break
            time.sleep(1)

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        self.assertEqual(success, True, 'failed to forced master election')

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # check new values
            for i in range(10000, 20000):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write(cmd)
                redis2.read_until('\r\n')
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'failed to get values from redis2. %s != %d' % (res, i))

        # check if the hanging server recovered and joined as a slave
        time.sleep(7)
        role = util.get_role_of_server(m)
        self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEquals(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis0. %s != %d' % (res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
Code example #15
    def master_failover_while_hang(self):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        self.failover_while_hang(m)

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis1 = redis_mgmt.Redis(m['id'])
        ret = redis1.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # check new values
            for i in range(10000, 20000):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write(cmd)
                redis2.read_until('\r\n')
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'failed to get values from redis2. %s != %d' % (res, i))
            util.log(
                'succeeded : check values with set/get operations with pgs%d and pgs%d.'
                % (m['id'], s2['id']))

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEquals(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis0. %s != %d' % (res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
Code example #16
    def test_all_pgs_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave1 = smr_mgmt.SMR(s1['id'])
        ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave1. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))
        smr_slave2 = smr_mgmt.SMR(s2['id'])
        ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave2. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        m_ts = util.get_timestamp_of_pgs(m)
        s1_ts = util.get_timestamp_of_pgs(s1)
        s2_ts = util.get_timestamp_of_pgs(s2)

        smr_master.write('fi delay sleep 1 8000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has compiled with gcov option.')

        smr_slave1.write('fi delay sleep 1 8000\r\n')
        smr_slave2.write('fi delay sleep 1 8000\r\n')
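        # The 'fi delay sleep 1 8000' fault-injection command presumably puts each SMR process to sleep for 8000 ms, so all three replicas of the PG hang at the same time.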

        time.sleep(10)

        # wait for forced master election
        success = False
        master = None
        for i in range(20):
            role = util.get_role_of_server(s1)
            ts = util.get_timestamp_of_pgs(s1)
            if role == c.ROLE_MASTER and ts == s1_ts:
                master = s1
                success = True
                break
            role = util.get_role_of_server(s2)
            ts = util.get_timestamp_of_pgs(s2)
            if role == c.ROLE_MASTER and ts == s2_ts:
                master = s2
                success = True
                break
            role = util.get_role_of_server(m)
            ts = util.get_timestamp_of_pgs(m)
            if role == c.ROLE_MASTER and ts == m_ts:
                master = m
                success = True
                break
            time.sleep(1)

        m_ts = util.get_timestamp_of_pgs(m)
        s1_ts = util.get_timestamp_of_pgs(s1)
        s2_ts = util.get_timestamp_of_pgs(s2)

        self.assertEqual(success, True, 'failed to forced master election')

        servers = [m, s1, s2]
        for s in servers:
            if s != master:
                success = False
                for i in range(20):
                    role = util.get_role_of_server(s)
                    if role == c.ROLE_SLAVE:
                        success = True
                        break
                    time.sleep(1)
                self.assertEqual(
                    success, True, 'failed to rejoin as a slave, %s:%d' %
                    (s['ip'], s['smr_mgmt_port']))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # set values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # check new values (m)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res, i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check new values (s2)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s2['id'], res[:-2], i))

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            print ok
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0
コード例 #17
0
    def test_two_slaves_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs(s1)
        self.assertNotEqual(
            ts_before1, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s1['id'], ts_before1))

        ts_before2 = util.get_timestamp_of_pgs(s2)
        self.assertNotEqual(
            ts_before2, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s2['id'], ts_before2))

        # hang
        smr1 = smr_mgmt.SMR(s1['id'])
        ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave1. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        smr2 = smr_mgmt.SMR(s2['id'])
        ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave2. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        smr1.write('fi delay sleep 1 8000\r\n')
        reply = smr1.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has compiled with gcov option.')

        smr2.write('fi delay sleep 1 8000\r\n')
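        # Only the two slaves are hung here; the master keeps serving, so no failover is expected and both slaves should rejoin with their original timestamps once they wake up.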
        time.sleep(7)

        # wait for rejoin as a slave
        success = False
        for i in range(20):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs(s1)
                if ts_after != -1 and ts_before1 == ts_after:
                    success = True
                    break
            time.sleep(1)
        self.assertEqual(
            success, True, 'failed to rejoin as a slave. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        success = False
        for i in range(20):
            role = util.get_role_of_server(s2)
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs(s2)
                if ts_after != -1 and ts_before2 == ts_after:
                    success = True
                    break
            time.sleep(1)
        self.assertEqual(
            success, True, 'failed to rejoin as a slave. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis2. %s != %d' % (res, i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
コード例 #18
0
    def master_and_slave_hang(self):
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave = smr_mgmt.SMR(s1['id'])
        ret = smr_slave.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        smr_master.write('fi delay sleep 1 10000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has compiled with gcov option.')

        smr_slave.write('fi delay sleep 1 10000\r\n')
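        # With the master and one slave hung, only one replica remains; in the 3-copy case the check below expects that it is NOT promoted, since promotion would presumably violate the copy-quorum (c:3, q:1, a:1).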

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        time.sleep(5)

        if len(self.cluster['servers']) == 3:
            # wait for forced master election
            success = True
            for i in range(15):
                state = []
                util.check_cluster(self.cluster['cluster_name'],
                                   self.leader_cm['ip'],
                                   self.leader_cm['cm_port'], state)
                s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
                role = s2_state['active_role']
                if role == 'M':
                    success = False
                    break
                time.sleep(1)

            util.log('')
            util.log(
                'It expects %s:%d to be a slave; it cannot become a master because that would violate the copy-quorum. (c:3, q:1, a:1)'
                % (s2['ip'], s2['smr_mgmt_port']))
            util.log('')
            util.log_server_state(self.cluster)

            self.assertEqual(success, True, 'failed to check copy-quorum')

            ok = False
            for i in xrange(10):
                ok = util.check_cluster(self.cluster['cluster_name'],
                                        self.leader_cm['ip'],
                                        self.leader_cm['cm_port'])
                if ok:
                    break
            self.assertTrue(ok, 'Cluster state is not normal!')

            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # set new values
            for i in range(10000, 20000):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis2.write(cmd)
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '+OK\r\n',
                    'failed to set values to redis2. cmd:%s, res:%s' %
                    (cmd[:-2], res))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis1(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        if len(self.cluster['servers']) != 3:
            # set new values
            for i in range(10000, 20000):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis0.write(cmd)
                res = redis0.read_until('\r\n')
                self.assertEqual(
                    res, '+OK\r\n',
                    'failed to set values to redis0. cmd:%s, res:%s' %
                    (cmd[:-2], res))

        # check new values (m)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res, i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
コード例 #19
0
    def test_5_transfer_pgs_to_another_machine(self):
        util.print_frame()

        self.load_gen_list = {}

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # increase master generation number
        util.log('failover in order to increase master generation number.')
        max = 0
        for i in range(5):
            key_base = 'key'
            for i in range(max, max+10000):
                cmd = 'set %s%d %d\r\n' % (key_base, i, i)
                gw.write( cmd )
                res = gw.read_until( '\r\n' )
                self.assertEquals( res, '+OK\r\n' )
            max = max + 10000

            m = util.get_server_by_role(self.cluster['servers'], 'master')
            util.log('failover pgs%d' %  m['id'])
            ret = util.failover(m, self.leader_cm)
            self.assertTrue(ret, 'failed to failover pgs%d' % m['id'])

        # start load generator
        util.log("start load_generator")
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            self.load_gen_list[i] = load_generator.LoadGenerator(i, ip, port)
            self.load_gen_list[i].start()

        time.sleep(5) # generate load for 5 sec
        util.log("started load_generator")

        m, s1, s2 = util.get_mss(self.cluster)
        servers = [m, s1, s2]

        # bgsave
        for s in servers:
            ret = util.bgsave(s)
            self.assertTrue(ret, 'failed to bgsave. pgs%d' % s['id'])

        new_servers = [config.server4, config.server5]
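        # Migration flow exercised below: seed each new PGS with a checkpoint taken from the current master, join it to the PG as a slave, then delete the original PGS one by one so the PG ends up on the new machines.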

        # add new slaves
        for s in new_servers:
            util.log('delete pgs%d`s check point.' % s['id'])
            util.del_dumprdb(s['id'])

            ret = util.cluster_util_getdump(s['id'], m['ip'], m['redis_port'], 'dump.rdb', 0, 8191)
            self.assertEqual(True, ret,
                'failed : util.cluster_util_getdump returns false, src=%s:%d dest_pgsid=%d' % (
                m['ip'], m['redis_port'], s['id']))

            ret = util.pgs_add(self.cluster, s, self.leader_cm, 0, rm_ckpt=False)
            self.assertEqual(True, ret, 'failed : util.pgs_add returns false, pgsid=%d' % s['id'])
            util.log('succeeded : add a new slave, pgsid=%d' % s['id'])

            # check consistency
            ok = True
            for j in range(self.max_load_generator):
                if self.load_gen_list[j].isConsistent() == False:
                    ok = False
                    break
            if not ok:
                break;

        for server_to_del in servers:
            for s in servers:
                util.pingpong( s['ip'], s['smr_mgmt_port'] )
            for s in new_servers:
                util.pingpong( s['ip'], s['smr_mgmt_port'] )
            self.__del_server(server_to_del)
            util.log('succeeded : delete pgs%d' % server_to_del['id'])

        new_m = util.get_server_by_role(new_servers, 'master')
        new_s = util.get_server_by_role(new_servers, 'slave')
        self.assertNotEqual( new_m, None, 'master is None.' )
        self.assertNotEqual( new_s, None, 'slave is None.' )

        for s in new_servers:
            util.pingpong( s['ip'], s['smr_mgmt_port'] )

        time.sleep(5) # generate load for 5 sec
        # check consistency of load_generator
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(), 'Inconsistent after migration')
            self.load_gen_list.pop(i, None)
コード例 #20
0
ファイル: test_pgs_hanging.py プロジェクト: iloview/nbase-arc
    def test_two_slaves_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs( s1 )
        self.assertNotEqual( ts_before1, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s1['id'], ts_before1) )

        ts_before2 = util.get_timestamp_of_pgs( s2 )
        self.assertNotEqual( ts_before2, -1, 'failed to get a timestamp of pgs(%d), ts_before:%d' % (s2['id'], ts_before2) )

        # hang
        smr1 = smr_mgmt.SMR( s1['id'] )
        ret = smr1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        smr2 = smr_mgmt.SMR( s2['id'] )
        ret = smr2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        smr1.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr1.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )

        smr2.write( 'fi delay sleep 1 8000\r\n' )
        time.sleep( 7 )

        # wait for rejoin as a slave
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s1 )
                if ts_after != -1 and ts_before1 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s2 )
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs( s2 )
                if ts_after != -1 and ts_before2 == ts_after:
                    success = True
                    break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to rejoin as a slave. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
コード例 #21
0
    def test_7_dirty_network_fi(self):
        util.print_frame()
        clnts = []

        try:
            util.iptables_print_list()

            # Add forwarding rule
            self.assertTrue(util.iptables_redirect('A', '127.0.0.100', '127.0.0.1'), 'failed to add a forwarding rule to iptables.')
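            # Redirecting 127.0.0.100 to 127.0.0.1 gives the test an alias address that block_network/unblock_network below can presumably cut and restore to emulate a network partition without touching a real interface.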

            cluster_name = 'network_isolation_cluster_1'
            cluster = filter(lambda x: x['cluster_name'] == cluster_name, config.clusters)[0]
            util.log(util.json_to_str(cluster))

            self.leader_cm = cluster['servers'][0]

            # MGMT
            mgmt_ip = cluster['servers'][0]['real_ip']
            mgmt_port = cluster['servers'][0]['cm_port']

            # Create cluster
            conf_checker = default_cluster.initialize_starting_up_smr_before_redis( cluster, 
                    conf={'cm_context':'applicationContext-fi.xml'})
            self.assertIsNotNone(conf_checker, 'failed to initialize cluster')

            # Print initial state of cluster
            util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
            initial_state = []
            self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

            # Start crc16 client
            for s in cluster['servers']:
                c = load_generator_crc16.Crc16Client(s['id'], s['ip'], s['redis_port'], 600, verbose=False)
                c.start()
                clnts.append(c)

            # Network isolation test
            cmfi = fi_confmaster.ConfmasterWfFi(['ra', 'me', 'yj', 'bj', 'mg'], 
                                                ['lconn', 'slave', 'master', 'setquorum'], [True, False], 1)
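            # ConfmasterWfFi presumably enumerates confmaster fault-injection points as combinations of the workflows and operations listed above; each one is armed with fi_add and later checked with fi_count to confirm it actually fired.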

            for fi in cmfi:
                # Block network
                util.log('\n\n\n ### BLOCK NETWORK, %s ### ' % str(fi))
                ret = block_network(cluster, mgmt_ip, mgmt_port)
                self.assertTrue(ret, '[%s] failed to block network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)

                    state_transition_done = True
                    for s in isolated_states:
                        if s['ip'] != '127.0.0.100':
                            continue

                        if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                            state_transition_done = False

                    if state_transition_done:
                        ok = True
                        break
                    time.sleep(1)
                self.assertTrue(ok, 'Fail, state transition')

                # Fault injection
                try:
                    self.assertTrue(fi_confmaster.fi_add(fi, 1, mgmt_ip, mgmt_port), 
                            "Confmaster command fail. fi: %s" % str(fi))
                except ValueError as e:
                    self.fail("Confmaster command error. fi: \"%s\", error: \"%s\"" % (str(fi), str(e)))

                # Unblock network
                util.log('\n\n\n ### UNBLOCK NETWORK, %s ### ' % str(fi))
                ret = unblock_network(cluster, mgmt_ip, mgmt_port, None)
                self.assertTrue(ret, '[%s] failed to unblock network.' % str(fi))

                for i in xrange(4):
                    util.log('waiting... %d' % (i + 1))
                    time.sleep(1)

                # Check cluster state
                ok = False
                for i in xrange(10):
                    isolated_states = []
                    ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                    if ok:
                        break
                    time.sleep(1)
                self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                check_cluster = False

                # 'bj', 'slave'
                if fi[0] == 'bj' and fi[1] == 'slave':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(s1)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'me', 'lconn'
                elif fi[0] == 'me':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.role_lconn(m)
                    self.assertEqual("+OK\r\n", ret, '[%s] role lconn fail.' % str(fi))
                    check_cluster = True
                # 'setquorum'
                elif fi[1] == 'setquorum':
                    m, s1, s2 = util.get_mss(cluster)
                    ret = util.cmd_to_smr_addr(s1['ip'], s1['smr_mgmt_port'], 'fi delay sleep 1 8000\r\n', timeout=20)
                    self.assertEqual("+OK\r\n", ret, '[%s] "fi delay sleep 1 8000" fail. ret: "%s"' % (str(fi), ret))
                    check_cluster = True

                if check_cluster:
                    # Check cluster state
                    ok = False
                    for i in xrange(20):
                        isolated_states = []
                        ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                        if ok:
                            break
                        time.sleep(1)
                    self.assertTrue(ok, '[%s] Fail. unstable cluster.' % str(fi))

                # Check fault injection
                ok = False
                for i in xrange(10):
                    count = fi_confmaster.fi_count(fi, mgmt_ip, mgmt_port)
                    if count == 0:
                        ok = True
                        break
                    time.sleep(0.5)
                self.assertTrue(ok, "[%s] fail. fault injection had not been triggered." % str(fi))

                for c in clnts:
                    self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

            for c in clnts:
                self.assertTrue(c.is_consistency(), '[%s] data consistency error!' % str(fi))

            # Go back to initial configuration
            cmfi.init()
            for fi in cmfi:
                try:
                    self.assertTrue(fi_confmaster.fi_add(fi, 0, mgmt_ip, mgmt_port),
                            "Confmaster command fail. fi: %s" % str(fi))
                except ValueError as e:
                    self.fail("Confmaster command error. fi: \"%s\", error: \"%s\"" % (str(fi), str(e)))

            # Wait until workflows done
            ret = util.await(60, True)(
                    lambda cinfo: cinfo['wf'] == 0,
                    lambda : util.cluster_info(mgmt_ip, mgmt_port, cluster['cluster_name']))
            self.assertTrue(ret, 'There are still some workflows.')

            self.assertTrue(conf_checker.final_check())

            # Shutdown cluster
            default_cluster.finalize(cluster)

        finally:
            for c in clnts:
                c.quit()
            for c in clnts:
                c.join()

            # Delete forwarding rule
            self.assertTrue(util.iptables_redirect('D', '127.0.0.100', '127.0.0.1'), 'failed to delete the forwarding rule from iptables.')
コード例 #22
0
ファイル: test_pgs_hanging.py プロジェクト: iloview/nbase-arc
    def master_hang( self ):
        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr = smr_mgmt.SMR( m['id'] )
        ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )
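        # A 10-second hang should exceed the heartbeat timeout, so confmaster is expected to promote one of the slaves while the old master is unresponsive; the old master should later rejoin as a slave.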

        time.sleep( 5 )

        # wait for forced master election
        success = False
        for i in range( 20 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_MASTER:
                success = True
                break

            if len(self.cluster['servers']) == 3:
                role = util.get_role_of_server( s2 )
                if role == c.ROLE_MASTER:
                    success = True
                    break
            time.sleep( 1 )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        self.assertEqual( success, True, 'failed to forced master election' )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # check new values
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write( cmd )
                redis2.read_until( '\r\n' )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )

        # check if the hanging server recovered and joined as a slave
        time.sleep( 7 )
        role = util.get_role_of_server( m )
        self.assertEqual( role, c.ROLE_SLAVE, 'failed to join as a slave' )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis0. %s != %d' % (res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
コード例 #23
0
    def test_quorum_with_left_pgs( self ):
        util.print_frame()

        # start load generators
        load_gen_list = {}
        for i in range( len(self.cluster['servers']) ):
            server = self.cluster['servers'][i]
            load_gen = load_generator.LoadGenerator(server['id'], server['ip'], server['gateway_port'])
            load_gen.start()
            load_gen_list[i] = load_gen

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d forced\r\n' % (m['cluster_name'], m['id'])
        ret = util.cm_command( self.leader_cm['ip'], self.leader_cm['cm_port'], cmd )
        jobj = json.loads(ret)
        self.assertEqual( jobj['msg'], '+OK', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
        util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
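        # Even after a forced pgs_leave, the detached master presumably keeps its quorum of 2; leaving the cluster by itself does not reconfigure the running PG.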

        # check quorum policy
        quorum_of_left_master = util.get_quorum( m )
        self.assertEqual(2, quorum_of_left_master,
                          'invalid quorum of left master, expected:%d, but:%d' % (2, quorum_of_left_master) )
        util.log( 'succeeded : quorum of left master=%d' % quorum_of_left_master )

        # check if pgs is removed
        r = util.get_role_of_server(m)
        if r != c.ROLE_MASTER:
            success = False
            for try_cnt in range( 10 ):
                redis = redis_mgmt.Redis( m['id'] )
                ret = redis.connect( m['ip'], m['redis_port'] )
                self.assertEquals( ret, 0, 'failed : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )
                util.log( 'succeeded : connect to smr%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )

                redis.write( 'info stats\r\n' )
                for i in range( 6 ):
                    redis.read_until( '\r\n' )
                res = redis.read_until( '\r\n' )
                self.assertNotEqual( res, '', 'failed : get reply of "info stats" from redis%d(%s:%d)' % (m['id'], m['ip'], m['redis_port']) )
                util.log( 'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"' % (m['id'], m['ip'], m['redis_port'], res[:-2]) )
                no = int( res.split(':')[1] )
                if no <= 100:
                    success = True
                    break

                time.sleep( 1 )

            self.assertEquals( success, True, 'failed : pgs was not removed.' )
        util.log( 'pgs is removed' )

        # check states of all pgs in pg
        for i in xrange(10):
            for s in self.cluster['servers']:
                smr_info = util.get_smr_info( s, self.leader_cm )
                cc_role = smr_info['smr_Role']
                cc_hb = smr_info['hb']
                if cc_hb == 'N':
                    continue

                real_role = util.get_role_of_server( s )
                real_role = util.roleNumberToChar( real_role )
                if real_role != cc_role:
                    time.sleep(0.5)
                    continue

        for s in self.cluster['servers']:
            smr_info = util.get_smr_info( s, self.leader_cm )
            cc_role = smr_info['smr_Role']
            cc_hb = smr_info['hb']
            if cc_hb == 'N':
                continue

            real_role = util.get_role_of_server( s )
            real_role = util.roleNumberToChar( real_role )
            self.assertEqual( real_role, cc_role,
                              'failed : each role is different, real=%s, cc=%s' % (real_role, cc_role) )
            util.log( 'succeeded : the role of the real pgs is the same as the role in cc, real=%s, cc=%s' % (real_role, cc_role) )

        # check quorum policy
        quorum_of_left_master = util.get_quorum( m )
        self.assertEqual(2, quorum_of_left_master,
                          'invalid quorum of left master, expected:%d, but:%d' % (2, quorum_of_left_master) )
        util.log( 'succeeded : quorum of left master=%d' % quorum_of_left_master )
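        # 'role lconn' presumably drops the master into the LCONN (no-role) state, which should trigger election of a new master among the remaining members.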

        # 'role lconn' to master
        cmd = 'role lconn\r\n'
        ret = util.cmd_to_smr( m, cmd )
        self.assertEqual( ret, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )
        util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]) )

        # wait for master election
        success = False
        new_master = None
        for i in range( 10 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_MASTER:
                success = True
                new_master = s1
                break
            role = util.get_role_of_server( s2 )
            if role == c.ROLE_MASTER:
                success = True
                new_master = s2
                break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to elect new master' )
        util.log( 'succeeded : elect new master, master_id=%d' % new_master['id'] )

        time.sleep( 1 )
        # check the numbers of master, slave, and lconn
        cnt_master = 0
        cnt_slave = 0
        cnt_lconn = 0
        for s in self.cluster['servers']:
            role = util.get_role_of_server( s )
            if role == c.ROLE_MASTER:
                cnt_master = cnt_master + 1
            elif role == c.ROLE_SLAVE:
                cnt_slave = cnt_slave + 1
            elif role == c.ROLE_LCONN:
                cnt_lconn = cnt_lconn + 1
        self.assertEqual( cnt_master, 1, 'failed : the number of master is %s, expected 1' % cnt_master )
        self.assertEqual( cnt_slave, 1, 'failed : the number of slave is %s, expected 1' % cnt_slave )
        self.assertEqual( cnt_lconn, 1, 'failed : the number of lconn is %s, expected 1' % cnt_lconn )

        # check states of all pgs in pg
        for s in self.cluster['servers']:
            real_role = util.get_role_of_server( s )
            real_role = util.roleNumberToChar( real_role )
            smr_info = util.get_smr_info( s, self.leader_cm )
            cc_role = smr_info['smr_Role']
            cc_hb = smr_info['hb']
            if cc_hb == 'N':
                continue
            self.assertEqual( real_role, cc_role,
                              'failed : each role is different, real=%s, cc=%s' % (real_role, cc_role) )
            util.log( 'succeeded : the role of the real pgs is the same as the role in cc, real=%s, cc=%s' % (real_role, cc_role) )

        # check quorum policy
        quorum_of_new_master = util.get_quorum( new_master )
        self.assertNotEqual( None, quorum_of_new_master, 'failed : find new master' )
        self.assertEqual( 1, quorum_of_new_master ,
                          'invalid quorum of new master, expected:%d, but:%d' % (1, quorum_of_new_master) )
        util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master )

        # shutdown load generators
        for i in range( len(load_gen_list) ):
            load_gen_list[i].quit()
            load_gen_list[i].join()

        # Go back to initial configuration
        self.assertTrue(util.pgs_join(self.leader_cm['ip'], self.leader_cm['cm_port'], m['cluster_name'], m['id']),
                'failed to recover pgs, (pgs_join)')

        return 0
コード例 #24
0
    def test_quorum_with_left_pgs(self):
        util.print_frame()

        # start load generators
        load_gen_list = {}
        for i in range(len(self.cluster['servers'])):
            server = self.cluster['servers'][i]
            load_gen = load_generator.LoadGenerator(server['id'], server['ip'],
                                                    server['gateway_port'])
            load_gen.start()
            load_gen_list[i] = load_gen

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d forced\r\n' % (m['cluster_name'], m['id'])
        ret = util.cm_command(self.leader_cm['ip'], self.leader_cm['cm_port'],
                              cmd)
        jobj = json.loads(ret)
        self.assertEqual(
            jobj['msg'], '+OK',
            'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # check quorum policy
        quorum_of_left_master = util.get_quorum(m)
        self.assertEqual(
            2, quorum_of_left_master,
            'invalid quorum of left master, expected:%d, but:%d' %
            (2, quorum_of_left_master))
        util.log('succeeded : quorum of left master=%d' %
                 quorum_of_left_master)

        # check if pgs is removed
        r = util.get_role_of_server(m)
        if r != c.ROLE_MASTER:
            success = False
            for try_cnt in range(10):
                redis = redis_mgmt.Redis(m['id'])
                ret = redis.connect(m['ip'], m['redis_port'])
                self.assertEquals(
                    ret, 0, 'failed : connect to smr%d(%s:%d)' %
                    (m['id'], m['ip'], m['redis_port']))
                util.log('succeeded : connect to smr%d(%s:%d)' %
                         (m['id'], m['ip'], m['redis_port']))

                redis.write('info stats\r\n')
                for i in range(6):
                    redis.read_until('\r\n')
                res = redis.read_until('\r\n')
                self.assertNotEqual(
                    res, '',
                    'failed : get reply of "info stats" from redis%d(%s:%d)' %
                    (m['id'], m['ip'], m['redis_port']))
                util.log(
                    'succeeded : get reply of "info stats" from redis%d(%s:%d), reply="%s"'
                    % (m['id'], m['ip'], m['redis_port'], res[:-2]))
                no = int(res.split(':')[1])
                if no <= 100:
                    success = True
                    break

                time.sleep(1)

            self.assertEquals(success, True, 'failed : pgs was not removed.')
        util.log('pgs is removed')

        # check states of all pgs in pg
        for i in xrange(10):
            for s in self.cluster['servers']:
                smr_info = util.get_smr_info(s, self.leader_cm)
                cc_role = smr_info['smr_Role']
                cc_hb = smr_info['hb']
                if cc_hb == 'N':
                    continue

                real_role = util.get_role_of_server(s)
                real_role = util.roleNumberToChar(real_role)
                if real_role != cc_role:
                    time.sleep(0.5)
                    continue

        for s in self.cluster['servers']:
            smr_info = util.get_smr_info(s, self.leader_cm)
            cc_role = smr_info['smr_Role']
            cc_hb = smr_info['hb']
            if cc_hb == 'N':
                continue

            real_role = util.get_role_of_server(s)
            real_role = util.roleNumberToChar(real_role)
            self.assertEqual(
                real_role, cc_role,
                'failed : each role is different, real=%s, cc=%s' %
                (real_role, cc_role))
            util.log(
                'succeeded : the role of the real pgs is the same as the role in cc, real=%s, cc=%s'
                % (real_role, cc_role))

        # check quorum policy
        quorum_of_left_master = util.get_quorum(m)
        self.assertEqual(
            2, quorum_of_left_master,
            'invalid quorum of left master, expected:%d, but:%d' %
            (2, quorum_of_left_master))
        util.log('succeeded : quorum of left master=%d' %
                 quorum_of_left_master)

        # 'role lconn' to master
        cmd = 'role lconn\r\n'
        ret = util.cmd_to_smr(m, cmd)
        self.assertEqual(
            ret, '+OK\r\n',
            'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # wait for master election
        success = False
        new_master = None
        for i in range(10):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_MASTER:
                success = True
                new_master = s1
                break
            role = util.get_role_of_server(s2)
            if role == c.ROLE_MASTER:
                success = True
                new_master = s2
                break
            time.sleep(1)
        self.assertEqual(success, True, 'failed to elect new master')
        util.log('succeeded : elect new master, master_id=%d' %
                 new_master['id'])

        time.sleep(1)
        # check the numbers of master, slave, and lconn
        cnt_master = 0
        cnt_slave = 0
        cnt_lconn = 0
        for s in self.cluster['servers']:
            role = util.get_role_of_server(s)
            if role == c.ROLE_MASTER:
                cnt_master = cnt_master + 1
            elif role == c.ROLE_SLAVE:
                cnt_slave = cnt_slave + 1
            elif role == c.ROLE_LCONN:
                cnt_lconn = cnt_lconn + 1
        self.assertEqual(
            cnt_master, 1,
            'failed : the number of master is %s, expected 1' % cnt_master)
        self.assertEqual(
            cnt_slave, 1,
            'failed : the number of slave is %s, expected 1' % cnt_slave)
        self.assertEqual(
            cnt_lconn, 1,
            'failed : the number of lconn is %s, expected 1' % cnt_lconn)

        # check states of all pgs in pg
        for s in self.cluster['servers']:
            real_role = util.get_role_of_server(s)
            real_role = util.roleNumberToChar(real_role)
            smr_info = util.get_smr_info(s, self.leader_cm)
            cc_role = smr_info['smr_Role']
            cc_hb = smr_info['hb']
            if cc_hb == 'N':
                continue
            self.assertEqual(
                real_role, cc_role,
                'failed : each role is different, real=%s, cc=%s' %
                (real_role, cc_role))
            util.log(
                'succeeded : the role of the real pgs is the same as the role in cc, real=%s, cc=%s'
                % (real_role, cc_role))

        # check quorum policy
        quorum_of_new_master = util.get_quorum(new_master)
        self.assertNotEqual(None, quorum_of_new_master,
                            'failed : find new master')
        self.assertEqual(
            1, quorum_of_new_master,
            'invalid quorum of new master, expected:%d, but:%d' %
            (1, quorum_of_new_master))
        util.log('succeeded : quorum of new master=%d' % quorum_of_new_master)

        # shutdown load generators
        for i in range(len(load_gen_list)):
            load_gen_list[i].quit()
            load_gen_list[i].join()

        return 0
コード例 #25
0
ファイル: test_pgs_hanging.py プロジェクト: iloview/nbase-arc
    def master_and_slave_hang( self ):
        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr_master = smr_mgmt.SMR( m['id'] )
        ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr_slave = smr_mgmt.SMR( s1['id'] )
        ret = smr_slave.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )

        smr_master.write( 'fi delay sleep 1 10000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr has compiled with gcov option.' )

        smr_slave.write( 'fi delay sleep 1 10000\r\n' )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        time.sleep( 5 )

        if len(self.cluster['servers']) == 3:
            # wait for forced master election
            success = True
            for i in range( 15 ):
                state = []
                util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'], state)
                s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
                role = s2_state['active_role']
                if role != 'M':
                    success = False
                    break
                time.sleep( 1 )

            util.log( '' )
            util.log( 'It expects that pgs2 is a master. PG.COPY: 3, PG.Q: 2' )
            util.log( '' )
            util.log_server_state( self.cluster )

            self.assertEqual( success, True, 'failed to check copy-quorum' )

            ok = False
            for i in xrange(10):
                ok = util.check_cluster(self.cluster['cluster_name'], self.leader_cm['ip'], self.leader_cm['cm_port'])
                if ok:
                    break
            self.assertTrue( ok, 'Cluster state is not normal!' )

            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # set new values
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis2.write( cmd )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values to redis2. cmd:%s, res:%s' % (cmd[:-2], res) )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis1(%s:%d).' % (s1['ip'], s1['redis_port']) )

        if len(self.cluster['servers']) != 3:
            # set new values
            for i in range( 10000, 20000 ):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis0.write( cmd )
                res = redis0.read_until( '\r\n' )
                self.assertEqual( res, '+OK\r\n', 'failed to set values to redis0. cmd:%s, res:%s' % (cmd[:-2], res) )

        # check new values (m)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i) )

        # check new values (s1)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write( cmd )
            redis1.read_until( '\r\n' )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')

        return 0
コード例 #26
0
    def deprecated_test_5_PGS_commit_is_greater_than_PG_commit( self ):
        util.print_frame()
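
        # Scenario: shut down both slaves, keep writing to the lone master so that its
        # commit sequence runs ahead of what the PG has confirmed, then bring only the
        # slaves back. The old master, whose commit-seq is greater than that of the
        # current master, must be blocked from rejoining as a slave.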

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # initial data
        util.put_some_data(self.cluster)

        master, s1, s2 = util.get_mss(self.cluster)

        server_to_join = [s1, s2]
        # shutdown slaves
        for i in range(0, 2):
            ret = testbase.request_to_shutdown_smr( server_to_join[i] )
            self.assertEqual( ret, 0, 'failed to shutdown smr%d' % server_to_join[i]['id'])
            util.log('succeeded to shutdown smr%d' % server_to_join[i]['id'])

            ret = testbase.request_to_shutdown_redis( server_to_join[i] )
            self.assertEquals( ret, 0, 'failed to shutdown redis' )
            util.log('succeeded to shutdown redis%d' % server_to_join[i]['id'])

            # check state F
            max_try = 20
            expected = 'F'
            for j in range( 0, max_try):
                state = util.get_smr_state( server_to_join[i], self.leader_cm )
                if expected == state:
                    break
                time.sleep( 1 )
            self.assertEquals( expected , state,
                               'server%d - state:%s, expected:%s' % (server_to_join[i]['id'], state, expected) )

        # put more data
        util.put_some_data(self.cluster, 10, 256)

        # bgsave
        ret = util.bgsave(master)
        self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

        # shutdown master
        ret = testbase.request_to_shutdown_smr( master )
        self.assertEqual( ret, 0, 'failed to shutdown smr' )
        util.log('succeeded to shutdown master smr, id=%d' % master['id'])
        ret = testbase.request_to_shutdown_redis( master )
        self.assertEquals( ret, 0, 'failed to shutdown redis' )
        util.log('succeeded to shutdown master redis, id=%d' % master['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for i in range( 0, max_try):
            state = util.get_smr_state( master, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        self.assertEquals( expected , state,
                           'server%d - state:%s, expected:%s' % (master['id'], state, expected) )

        # recovery slaves
        for i in range(0, 2):
            ret = testbase.request_to_start_smr( server_to_join[i] )
            self.assertEqual( ret, 0, 'failed to start smr' )

            ret = testbase.request_to_start_redis( server_to_join[i] )
            self.assertEqual( ret, 0, 'failed to start redis' )

            ret = testbase.wait_until_finished_to_set_up_role( server_to_join[i], 10 )
            self.assertEquals( ret, 0, 'failed to role change. smr_id:%d' % (server_to_join[i]['id']) )

            # check state N
            max_try = 20
            expected = 'N'
            for j in range( 0, max_try):
                state = util.get_smr_state( server_to_join[i], self.leader_cm )
                if expected == state:
                    break
                time.sleep( 1 )
            role = util.get_role_of_server( server_to_join[i] )
            self.assertEquals( expected , state,
                               'server%d - state:%s, expected:%s, role:%s' % (server_to_join[i]['id'], state, expected, role) )

        # set value
        s = random.choice(server_to_join)
        redis = redis_mgmt.Redis( s['id'] )
        ret = redis.connect( s['ip'], s['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis' )

        key_base = 'key_test'
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (key_base, i, i)
            redis.write( cmd )
            res = redis.read_until( '\r\n' )
            self.assertEquals( res, '+OK\r\n' )
        redis.disconnect()

        for i in range(0, 2):
            redis = redis_mgmt.Redis( server_to_join[i]['id'] )
            ret = redis.connect( server_to_join[i]['ip'], server_to_join[i]['redis_port'] )
            self.assertEquals( ret, 0, 'failed to connect to redis' )

            # check value
            for j in range(0, 10000):
                cmd = 'get %s%d\r\n' % (key_base, j)
                redis.write( cmd )
                redis.read_until( '\r\n'  )
                response = redis.read_until( '\r\n'  )
                self.assertEqual( response, '%d\r\n' % (j), 'inconsistent %s, %d' % (response[:-2], j) )

        # try to recover master, but failed
        ret = testbase.request_to_start_smr( master )
        self.assertEqual( ret, 0, 'failed to start smr' )

        ret = testbase.request_to_start_redis( master, False )
        self.assertEqual( ret, 0, 'failed to start redis' )

        max_try = 3
        expected = 'N'
        for i in range( 0, max_try):
            state = util.get_smr_state( master, self.leader_cm )
            if expected == state:
                break
            time.sleep( 1 )
        role = util.get_role_of_server( master )
        self.assertNotEqual( expected, state,
                             'server%d - state:%s, expected:not %s, role:%s' % (master['id'], state, expected, role) )
        util.log('success : the old master that has a greater commit-seq than the current master tried to join as a slave, but it is blocked successfully.')

        gw.disconnect()
        return 0
コード例 #27
0
ファイル: test_pgs_hanging.py プロジェクト: iloview/nbase-arc
    def test_all_pgs_hang( self ):
        util.print_frame()

        self.setup_test_cluster( self.cluster_3copy )

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        # hang
        smr_master = smr_mgmt.SMR( m['id'] )
        ret = smr_master.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
        smr_slave1 = smr_mgmt.SMR( s1['id'] )
        ret = smr_slave1.connect( s1['ip'], s1['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave1. %s:%d' % (s1['ip'], s1['smr_mgmt_port']) )
        smr_slave2 = smr_mgmt.SMR( s2['id'] )
        ret = smr_slave2.connect( s2['ip'], s2['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to slave2. %s:%d' % (s2['ip'], s2['smr_mgmt_port']) )

        m_ts = util.get_timestamp_of_pgs( m )
        s1_ts = util.get_timestamp_of_pgs( s1 )
        s2_ts = util.get_timestamp_of_pgs( s2 )

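        # 'fi delay sleep 1 8000' is a fault-injection command that makes the SMR
        # process sleep for 8000 ms, simulating a hang (it is only available in builds
        # with fault injection enabled, hence the gcov check below).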
        smr_master.write( 'fi delay sleep 1 8000\r\n' )
        reply = smr_master.read_until( '\r\n', 1 )
        if reply is not None and reply.find('-ERR not supported') != -1:
            self.assertEqual( 0, 1, 'make sure that smr is compiled with the gcov option.' )

        smr_slave1.write( 'fi delay sleep 1 8000\r\n' )
        smr_slave2.write( 'fi delay sleep 1 8000\r\n' )

        time.sleep( 10 )

        # check consistency
        ok = False
        for try_cnt in xrange(20):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
            if ok:
                break
            time.sleep(0.5)
        self.assertTrue(ok, 'Unstable cluster state')

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        # set values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write( cmd )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        redis2 = redis_mgmt.Redis( s2['id'] )
        ret = redis2.connect( s2['ip'], s2['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

        # check new values (m)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (m['id'], res, i) )

        # check new values (s1)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write( cmd )
            redis1.read_until( '\r\n' )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s1['id'], res[:-2], i) )

        # check new values (s2)
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write( cmd )
            redis2.read_until( '\r\n' )
            res = redis2.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis(id:%d). %s != %d' % (s2['id'], res[:-2], i) )

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port)
            print ok
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0
コード例 #28
0
    def elect_master_randomly( self ):
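        # Repeatedly force the current master into lconn so that, across the iterations,
        # every PGS in the cluster is elected master at least once.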
        # set data
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway( '0' )
        gw.connect( ip, port )
        for i in range( 0, 1000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to gw(%s:%d). cmd:%s, res:%s' % (ip, port, cmd[:-2], res[:-2]) )

        server_ids = []
        for server in self.cluster['servers']:
            server_ids.append( server['id'] )

        for try_cnt in range( 30 ):
            # get master, slave1, slave2
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
            util.log( 'master id : %d' % m['id'] )

            if try_cnt != 0:
                if m['id'] in server_ids:
                    server_ids.remove( m['id'] )

            smr = smr_mgmt.SMR( m['id'] )
            ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
            self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
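            # 'role lconn' switches the current master into the lconn (logically
            # disconnected) state, which triggers election of a new master.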
            cmd = 'role lconn\r\n'
            smr.write( cmd )
            reply = smr.read_until( '\r\n' )
            self.assertEqual( reply, '+OK\r\n', 'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]) )
            util.log( 'succeeded : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]) )

            # wait until role-change is finished
            for role_change_try_cnt in range( 5 ):
                count_master = 0
                count_slave = 0
                for server in self.cluster['servers']:
                    real_role = util.get_role_of_server( server )
                    real_role = util.roleNumberToChar( real_role )
                    if real_role == 'M':
                        count_master = count_master + 1
                    elif real_role == 'S':
                        count_slave = count_slave + 1
                if count_master == 1 and count_slave == 2:
                    break
                time.sleep( 1 )

            # check the number of master and slave
            self.assertEqual( count_master, 1, 'failed : the number of master is not 1, count_master=%d, count_slave=%d' % (count_master, count_slave) )
            self.assertEqual( count_slave, 2, 'failed : the number of slave is not 2, count_master=%d, count_slave=%d' % (count_master, count_slave) )
            util.log( 'succeeded : the number of master is 1 and the number of slave is 2' )

            # check states of all pgs in pg
            for check_cnt in range( 3 ):
                ok = True
                for s in self.cluster['servers']:
                    real_role = util.get_role_of_server( s )
                    real_role = util.roleNumberToChar( real_role )
                    smr_info = util.get_smr_info( s, self.leader_cm )
                    cc_role = smr_info['smr_Role']
                    cc_hb = smr_info['hb']

                    if cc_hb != 'Y':
                        ok = False
                    if real_role != cc_role:
                        ok = False

                    if ok:
                        util.log( 'succeeded : a role of real pgs is the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb) )
                    else:
                        util.log( '\n\n**********************************************************\n\nretry: a role of real pgs is not the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s' % (s['id'], real_role, cc_role, cc_hb) )

                if ok == False:
                    time.sleep( 0.5 )
                else:
                    break

            self.assertTrue( ok, 'failed : role check' )

            if len( server_ids ) == 0:
                util.log( 'succeeded : all smrs have been as a master' )
                return 0

        self.assertEqual( 0, len( server_ids ) , 'failed : remains server ids=[%s]' % (','.join('%d' % id for id in server_ids))  )
        return 0
コード例 #29
0
ファイル: test_pgs_hanging.py プロジェクト: iloview/nbase-arc
    def slave_failover_while_hang( self ):
        util.print_frame()

        # get gateway info
        ip, port = util.get_rand_gateway( self.cluster )
        gw = gateway_mgmt.Gateway( self.cluster['servers'][0]['id'] )
        ret = gw.connect( ip, port )
        self.assertEqual( ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port) )

        # set values
        for i in range( 0, 10000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write( cmd )
            res = gw.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values. cmd:%s, res:%s' % (cmd, res) )

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )
            self.assertNotEqual( s2, None, 'slave2 is None.' )
        else:
            m, s1 = util.get_mss( self.cluster )
            self.assertNotEqual( m, None, 'master is None.' )
            self.assertNotEqual( s1, None, 'slave1 is None.' )

        util.log( 'server state before hang' )
        util.log_server_state( self.cluster )

        self.failover_while_hang( s1 )

        util.log( 'server state transition after hang' )
        util.log_server_state( self.cluster )

        redis1 = redis_mgmt.Redis( s1['id'] )
        ret = redis1.connect( s1['ip'], s1['redis_port'] )
        self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']) )

        # set new values
        for i in range( 10000, 20000 ):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write( cmd )
            res = redis1.read_until( '\r\n' )
            self.assertEqual( res, '+OK\r\n', 'failed to set values to redis1. cmd:%s, res:%s' % (cmd[:-2], res) )

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis( s2['id'] )
            ret = redis2.connect( s2['ip'], s2['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']) )

            # check new values
            for i in range( 10000, 20000 ):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write( cmd )
                redis2.read_until( '\r\n' )
                res = redis2.read_until( '\r\n' )
                self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis2. %s != %d' % (res, i) )
            util.log( 'succeeded : check values with set/get operations with pgs%d and pgs%d.' % (s1['id'], s2['id']) )

        redis0 = redis_mgmt.Redis( m['id'] )
        ret = redis0.connect( m['ip'], m['redis_port'] )
        self.assertEquals( ret, 0, 'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']) )

        # check new values
        for i in range( 10000, 20000 ):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write( cmd )
            redis0.read_until( '\r\n' )
            res = redis0.read_until( '\r\n' )
            self.assertEqual( res, '%d\r\n' % i, 'failed to get values from redis0. %s != %d' % (res[:-2], i) )

        # check consistency
        self.assertEqual(util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip, self.mgmt_port), True, 'role consistency fail')
        return 0
コード例 #30
0
    def test_5_mgmt_is_isolated_with_master_failover(self):
        util.print_frame()

        out = util.sudo('iptables -L')
        util.log('====================================================================')
        util.log('out : %s' % out)
        util.log('out.return_code : %d' % out.return_code)
        util.log('out.stderr : %s' % out.stderr)
        util.log('out.succeeded : %s' % out.succeeded)

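        # The servers registered with the 127.0.0.100 alias actually listen on 127.0.0.1;
        # the DNAT rules below route traffic to the alias, so that DROPping 127.0.0.100
        # later isolates those servers from the management process.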
        # Add forwarding rule (127.0.0.100 -> 127.0.0.1)
        out = util.sudo('iptables -t nat -A OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -A PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to add a forwarding rule to iptables. output:%s' % out)

        cluster = filter(lambda x: x['cluster_name'] == 'network_isolation_cluster_1', config.clusters)[0]
        util.log(util.json_to_str(cluster))

        self.leader_cm = cluster['servers'][0]

        # MGMT
        mgmt_ip = cluster['servers'][0]['real_ip']
        mgmt_port = cluster['servers'][0]['cm_port']

        # Create cluster
        ret = default_cluster.initialize_starting_up_smr_before_redis( cluster )
        self.assertEqual(0, ret, 'failed to TestMaintenance.initialize')

        # Print initial state of cluster
        util.log('\n\n\n ### INITIAL STATE OF CLUSTER ### ')
        initial_state = []
        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, initial_state, check_quorum=True), 'failed to check cluster state')

        # Network isolation test
        for loop_cnt in range(3):
            master, slave1, slave2 = util.get_mss(cluster)
            self.assertNotEquals(master, None, 'there is no master')
            self.assertNotEquals(slave1, None, 'there is no slave1')
            self.assertNotEquals(slave2, None, 'there is no slave2')

            # Block network
            util.log('\n\n\n ### BLOCK NETWORK, %d ### ' % loop_cnt)
            out = util.sudo('iptables -A OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'failed to add a blocking rule to iptables. output:%s' % out)

            for i in range(4):
                util.log('waiting... %d' % (i + 1))
                time.sleep(1)

            # Check cluster state
            ok = False
            for i in range(10):
                isolated_states = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, isolated_states, check_quorum=True)
                time.sleep(1)

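                # wait until the state transition for the PGSs behind the blocked
                # address (127.0.0.100) is complete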
                state_transition_done = True
                for s in isolated_states:
                    if s['ip'] != '127.0.0.100':
                        continue

                    if s['active_role'] != '?' or s['mgmt_role'] != 'N':
                        state_transition_done = False

                if state_transition_done :
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state transition')

            # Shutdown master
            util.log( 'shutdown pgs%d while hanging.' % master['id'] )
            ret = testbase.request_to_shutdown_smr( master )
            self.assertEqual( ret, 0, 'failed to shutdown smr. id:%d' % master['id'] )
            ret = testbase.request_to_shutdown_redis( master )
            self.assertEqual( ret, 0, 'failed to shutdown redis. id:%d' % master['id'] )

            # Check state F
            max_try = 20
            expected = 'F'
            for i in range( 0, max_try):
                util.log('MGMT_IP:%s, MGMT_PORT:%d' % (mgmt_ip, mgmt_port))
                state = util._get_smr_state( master['id'], cluster['cluster_name'], mgmt_ip, mgmt_port )
                if expected == state:
                    break
                time.sleep( 1 )
            self.assertEqual( expected , state,
                               'master%d - state:%s, expected:%s' % (master['id'], state, expected) )
            util.log( 'succeeded : pgs%d state changed to F.' % master['id'] )

            # Unblock network
            util.log('\n\n\n ### UNBLOCK NETWORK, %d ### ' % loop_cnt)
            out = util.sudo('iptables -D OUTPUT -d 127.0.0.100 -j DROP')
            self.assertTrue(out.succeeded, 'failed to delete the blocking rule from iptables. output:%s' % out)

            # Check cluster state
            ok = False
            for i in range(7):
                final_state = []
                util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, final_state, check_quorum=True)

                state_consistency = True
                for s in final_state:
                    if s['pgs_id'] == master['id']:
                        continue

                    if s['active_role'] != s['mgmt_role']:
                        state_consistency = False

                if state_consistency:
                    ok = True
                    break
                time.sleep(1)
            self.assertTrue(ok, 'Fail, state consistency')

            # Recovery
            util.log( 'restart pgs%d.' % master['id'] )
            ret = testbase.request_to_start_smr( master )
            self.assertEqual( ret, 0, 'failed to start smr. id:%d' % master['id'] )

            ret = testbase.request_to_start_redis( master )
            self.assertEqual( ret, 0, 'failed to start redis. id:%d' % master['id'] )

            wait_count = 20
            ret = testbase.wait_until_finished_to_set_up_role( master, wait_count )
            self.assertEqual( ret, 0, 'failed to role change. smr_id:%d' % (master['id']) )

            redis = redis_mgmt.Redis( master['id'] )
            ret = redis.connect( master['ip'], master['redis_port'] )
            self.assertEqual( ret, 0, 'failed to connect to redis' )

            ok = False
            for i in xrange(5):
                ok = util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True)
                if ok:
                    break
                else:
                    time.sleep(1)

            self.assertTrue(ok, 'failed to check cluster state')

        # Check state
        self.assertNotEqual(initial_state, None, 'initial_state is None')
        self.assertNotEqual(final_state, None, 'final_state is None')

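        # The first three PGSs (sorted by pgs_id) went through the failover, so their
        # active timestamps are expected to have changed; the remaining three should
        # keep their original timestamps.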
        initial_state = sorted(initial_state, key=lambda x: int(x['pgs_id']))
        final_state = sorted(final_state, key=lambda x: int(x['pgs_id']))
        for i in range(0, 3):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertNotEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        for i in range(3, 6):
            msg = 'ts (%d)%d -> (%d)%d' % (initial_state[i]['pgs_id'], initial_state[i]['active_ts'], final_state[i]['pgs_id'], final_state[i]['active_ts'])
            util.log(msg)
            self.assertEqual(initial_state[i]['active_ts'], final_state[i]['active_ts'], msg)

        self.assertTrue(util.check_cluster(cluster['cluster_name'], mgmt_ip, mgmt_port, check_quorum=True), 'failed to check cluster state')

        # Shutdown cluster
        ret = default_cluster.finalize( cluster )
        self.assertEqual(ret, 0, 'failed to TestMaintenance.finalize')

        # Delete forwarding rule (127.0.0.100 -> 127.0.0.1)
        out = util.sudo('iptables -t nat -D OUTPUT -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete a forwarding rule from iptables. output:%s' % out)

        out = util.sudo('iptables -t nat -D PREROUTING -d 127.0.0.100 -p tcp -j DNAT --to-destination 127.0.0.1')
        self.assertTrue(out.succeeded, 'failed to delete a forwarding rule from iptables. output:%s' % out)