def test_quorum_policy_of_hanging_master( self ):
        util.print_frame()

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss( self.cluster )
        self.assertNotEqual( m, None, 'master is None.' )
        self.assertNotEqual( s1, None, 'slave1 is None.' )
        self.assertNotEqual( s2, None, 'slave2 is None.' )

        # hang
        smr = smr_mgmt.SMR( m['id'] )
        ret = smr.connect( m['ip'], m['smr_mgmt_port'] )
        self.assertEqual( ret, 0, 'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']) )
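        # Note: 'fi delay sleep 1 15000' appears to be an smr fault-injection
        # command (apparently available only when smr is built with the gcov
        # option) that stalls the replicator for 15 seconds, long enough for
        # the cluster to judge this master as failed and elect a new one.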
        smr.write( 'fi delay sleep 1 15000\r\n' )
        time.sleep( 5 )

        # wait for forced master election
        success = False
        new_master = None
        for i in range( 7 ):
            role = util.get_role_of_server( s1 )
            if role == c.ROLE_MASTER:
                success = True
                new_master = s1
                break
            role = util.get_role_of_server( s2 )
            if role == c.ROLE_MASTER:
                success = True
                new_master = s2
                break
            time.sleep( 1 )
        self.assertEqual( success, True, 'failed to force master election' )

        # shutdown confmaster
        for server in self.cluster['servers']:
            util.shutdown_cm( server['id'] )
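        # With confmaster down, the quorum presumably can no longer be
        # adjusted, so both the old (hanging) master and the new master are
        # expected to still report the quorum value given by
        # self.quorum_policy[1] when checked below.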

        # wait until the hanging master wakes up
        time.sleep( 5 )

        # check quorum policy
        quorum_of_hanging_master = util.get_quorum( m )
        self.assertEqual( self.quorum_policy[1], quorum_of_hanging_master,
                          'invalid quorum of hanging master, expected:%d, but:%d' % ( self.quorum_policy[1], quorum_of_hanging_master) )
        util.log( 'succeeded : quorum of hanging master=%d' % quorum_of_hanging_master )

        # check quorum policy
        quorum_of_new_master = util.get_quorum( new_master )
        self.assertNotEqual( None, quorum_of_new_master, 'failed to get quorum of new master' )
        self.assertEqual( self.quorum_policy[1], quorum_of_new_master ,
                          'invalid quorum of new master, expected:%d, but:%d' % (self.quorum_policy[1], quorum_of_new_master) )
        util.log( 'succeeded : quorum of new master=%d' % quorum_of_new_master )

        return 0
    def getseq_log(self, s):
        smr = smr_mgmt.SMR(s['id'])
        try:
            ret = smr.connect(s['ip'], s['smr_mgmt_port'])
            if ret != 0:
                return

            smr.write('getseq log\r\n')
            response = smr.read_until('\r\n', 1)
            util.log('getseq log (pgs%d) = %s' % (s['id'], response[:-2]))
            smr.disconnect()
        except IOError:
            pass
    def test_fd_leak(self):
        util.print_frame()

        server = util.get_server_by_role_and_pg(self.cluster['servers'], 'master', 0)
        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')
        smr = smr_mgmt.SMR(server['id'])
        ret = smr.connect(server['ip'], server['smr_mgmt_port'])
        self.assertEquals(ret, 0, 'failed to connect to smr')

        redis.write('info server\r\n')
        res = redis.read_until('process_id:')
        res = redis.read_until('\r\n')
        redis.write('quit\r\n')

        pid = copy.copy(res[:-2])
        num1 = self.numOpenFds(pid)
        print "Initial : Open Fds: %s" % self.numOpenFds(pid)

        smr.write('fi delay sleep 1 1000000\r\n')
        smr.read_until('\r\n')
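        # With the replicator stalled by the long fault-injection delay, the
        # loop below repeatedly connects, sends a ping or a malformed request,
        # and disconnects, printing the redis process's open-fd count at each
        # step; the final assertion checks that the count returns to its
        # initial value (i.e. no descriptor leak).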

        for i in range(5):
            ret = redis.connect(server['ip'], server['redis_port'])
            self.assertEquals(ret, 0, 'failed to connect to redis')
            redis.write('ping\r\n')
            res = redis.read_until('\r\n', 1)
            print "Try Ping : Open Fds: %s" % self.numOpenFds(pid)
            redis.disconnect()
            print "Disconnect : Open Fds: %s" % self.numOpenFds(pid)

            ret = redis.connect(server['ip'], server['redis_port'])
            self.assertEquals(ret, 0, 'failed to connect to redis')
            redis.write('*1\r\nasdf\r\n')
            time.sleep(1)
            res = redis.read_until('\r\n', 1)
            print "Protocol Error : Open Fds: %s" % self.numOpenFds(pid)
            redis.disconnect()
            print "Disconnect : Open Fds: %s" % self.numOpenFds(pid)

        print "End : Open Fds: %s" % self.numOpenFds(pid)

        num2 = self.numOpenFds(pid)
        self.assertEquals(num1, num2)

        # Go back to initial configuration
        self.assertTrue(util.shutdown_pgs(server, self.cluster['servers'][0]),
                'recover pgs fail. (shutdown_pgs)')
        self.assertTrue(util.recover_pgs(server, self.cluster['servers'][0]),
                'recover pgs fail. (recover_pgs)')
def wait_until_finished_to_set_up_role( server, max_try = 20 ):
    smr = smr_mgmt.SMR( server['id'] )
    if smr.connect( server['ip'], server['smr_mgmt_port'] ) != 0:
        util.log('wait_until_finished_to_set_up_role(), server%d(%s:%d) is stopped.' % (server['id'], server['ip'], server['smr_mgmt_port']))
        return -1

    try_count = 0
    for try_count in range( 0, max_try ):
        time.sleep( 0.2 )

        smr.write( 'ping\r\n' )
        response = smr.read_until( '\r\n' )
        data = str.split( response, ' ' )
        role = data[1]
        if role == c.ROLE_MASTER or role == c.ROLE_SLAVE:
            return 0

    util.log('failed to wait until finished to set up a role for %s:%d' % (server['ip'], server['smr_mgmt_port']))
    return -1
    def role_change_with_hanging_pgs(self, hanging_servers, running_servers,
                                     target_id, master):
        util.log('hanging_servers:%s' % hanging_servers)
        util.log('running_servers:%s' % running_servers)
        util.log('target_id:%s' % target_id)

        # Initial data
        util.put_some_data(self.cluster, 3, 10)

        util.log("States (before role change)")
        util.log_server_state(self.cluster)

        # Get old timestamp
        old_timestamps = {}
        for s in self.cluster['servers']:
            ts = util.get_timestamp_of_pgs(s)
            old_timestamps[s['id']] = ts

        # hang
        for s in hanging_servers:
            smr = smr_mgmt.SMR(s['id'])
            ret = smr.connect(s['ip'], s['smr_mgmt_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to master. %s:%d' %
                (s['ip'], s['smr_mgmt_port']))
            util.log("PGS '%d' hang" % s['id'])

            smr.write('fi delay sleep 1 13000\r\n')
            reply = smr.read_until('\r\n', 1)
            if reply != None and reply.find('-ERR not supported') != -1:
                self.assertEqual(
                    0, 1, 'make sure that smr has compiled with gcov option.')
            smr.disconnect()

        # Role change
        master_id = util.role_change(self.leader_cm,
                                     self.cluster['cluster_name'], target_id)
        self.assertEqual(master_id, -1,
                         'expected role_change to fail, but it succeeded')

        # Check rollback - check quorum
        if master not in hanging_servers:
            expected = 2
            ok = self.__check_quorum(master, expected)
            self.assertTrue(ok,
                            'rollback quorum fail. expected:%s' % (expected))

        # Check rollback - get new timestamp
        new_timestamps_in_hang = {}
        for s in running_servers:
            ts = util.get_timestamp_of_pgs(s)
            new_timestamps_in_hang[s['id']] = ts

        # Check rollback - compare old timestamps and new timestamps
        for s in running_servers:
            old_ts = old_timestamps[s['id']]
            new_ts = new_timestamps_in_hang[s['id']]
            self.assertEqual(
                old_ts, new_ts,
                'Timestamp of a running server has changed. %d->%d' %
                (old_ts, new_ts))

        time.sleep(16)
        util.log("States (after role change)")
        util.log_server_state(self.cluster)

        self.load_gen_list = {}

        # Start load generator
        for i in range(self.max_load_generator):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen = load_generator.LoadGenerator(i, ip, port)
            load_gen.start()
            self.load_gen_list[i] = load_gen

        # Check quorum
        if master in hanging_servers:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')

            expected = 2
            ok = self.__check_quorum(m, expected)
            self.assertTrue(ok,
                            'rollback quorum fail. expected:%s' % (expected))

        # Check cluster state
        normal_state = False
        for i in xrange(20):
            normal_state = util.check_cluster(self.cluster['cluster_name'],
                                              self.leader_cm['ip'],
                                              self.leader_cm['cm_port'],
                                              check_quorum=True)
            if normal_state:
                break
            time.sleep(0.5)
        self.assertTrue(normal_state, "Unstable cluster state")

        # Check consistency
        for i in range(self.max_load_generator):
            self.load_gen_list[i].quit()
        for i in range(self.max_load_generator):
            self.load_gen_list[i].join()
            self.assertTrue(self.load_gen_list[i].isConsistent(),
                            'Inconsistent after migration')
            self.load_gen_list.pop(i, None)
    def elect_master_randomly(self):
        # set data
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway('0')
        gw.connect(ip, port)
        for i in range(0, 1000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to gw(%s:%d). cmd:%s, res:%s' %
                (ip, port, cmd[:-2], res[:-2]))

        server_ids = []
        for server in self.cluster['servers']:
            server_ids.append(server['id'])

        for try_cnt in range(30):
            # get master, slave1, slave2
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
            util.log('master id : %d' % m['id'])

            if try_cnt != 0:
                if m['id'] in server_ids:
                    server_ids.remove(m['id'])

            smr = smr_mgmt.SMR(m['id'])
            ret = smr.connect(m['ip'], m['smr_mgmt_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to master. %s:%d' %
                (m['ip'], m['smr_mgmt_port']))
            cmd = 'role lconn\r\n'
            smr.write(cmd)
            reply = smr.read_until('\r\n')
            self.assertEqual(
                reply, '+OK\r\n',
                'failed : cmd="%s", reply="%s"' % (cmd[:-2], reply[:-2]))
            util.log('succeeded : cmd="%s", reply="%s"' %
                     (cmd[:-2], reply[:-2]))

            # wait until role-change is finished
            for role_change_try_cnt in range(5):
                count_master = 0
                count_slave = 0
                for server in self.cluster['servers']:
                    real_role = util.get_role_of_server(server)
                    real_role = util.roleNumberToChar(real_role)
                    if real_role == 'M':
                        count_master = count_master + 1
                    elif real_role == 'S':
                        count_slave = count_slave + 1
                if count_master == 1 and count_slave == 2:
                    break
                time.sleep(1)

            # check the number of masters and slaves
            self.assertEqual(
                count_master, 1,
                'failed : the number of masters is not 1, count_master=%d, count_slave=%d'
                % (count_master, count_slave))
            self.assertEqual(
                count_slave, 2,
                'failed : the number of slaves is not 2, count_master=%d, count_slave=%d'
                % (count_master, count_slave))
            util.log(
                'succeeded : the number of masters is 1 and the number of slaves is 2'
            )

            # check states of all pgs in pg
            for role_check_try_cnt in range(3):
                ok = True
                for s in self.cluster['servers']:
                    real_role = util.get_role_of_server(s)
                    real_role = util.roleNumberToChar(real_role)
                    smr_info = util.get_smr_info(s, self.leader_cm)
                    cc_role = smr_info['smr_Role']
                    cc_hb = smr_info['hb']

                    if cc_hb != 'Y':
                        ok = False
                    if real_role != cc_role:
                        ok = False

                    if ok:
                        util.log(
                            'succeeded : the actual role of pgs matches the role in cc, id=%d, real=%s, cc=%s, hb=%s'
                            % (s['id'], real_role, cc_role, cc_hb))
                    else:
                        util.log(
                            '\n\n**********************************************************\n\nretry: the actual role of pgs does not match the role in cc, id=%d, real=%s, cc=%s, hb=%s'
                            % (s['id'], real_role, cc_role, cc_hb))

                if ok == False:
                    time.sleep(0.5)
                else:
                    break

            self.assertTrue(ok, 'failed : role check')

            if len(server_ids) == 0:
                util.log('succeeded : every smr has served as master')
                return 0

        self.assertEqual(
            0, len(server_ids), 'failed : remains server ids=[%s]' %
            (','.join('%d' % id for id in server_ids)))
        return 0
    def test_migration_with_expire_command(self):
        util.print_frame()

        util.log("start load_generator")
        load_gen_thrd_list = {}
        for i in range(1):
            ip, port = util.get_rand_gateway(self.cluster)
            load_gen_thrd_list[i] = load_generator.LoadGenerator(i, ip, port)
            load_gen_thrd_list[i].start()

        time.sleep(5)  # generate load for 5 sec
        tps = 20000
        src_pg_id = 0
        dst_pg_id = 1
        leader_cm = self.cluster['servers'][0]
        src_master = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                    'master', src_pg_id)
        dst_master = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                    'master', dst_pg_id)

        smr = smr_mgmt.SMR(src_master['id'])
        ret = smr.connect(src_master['ip'], src_master['smr_mgmt_port'])
        if ret != 0:
            util.log('failed to connect to smr(source master)')
            return False

        src_redis = redis_mgmt.Redis(src_master['id'])
        ret = src_redis.connect(src_master['ip'], src_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        ts = time.time()
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~beforeCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~beforeCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~beforeCheckpoint:persist', 20)

        self.setExpireS3Key(src_redis, 'S3:PermanentKey', 0)
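        # Keys tagged ':expired' get a short (10 s) TTL and keys tagged
        # ':persist' a longer one that is removed with a persist command
        # before it fires; after migration, ':persist' keys must still exist
        # on the destination and ':expired' keys must be gone. The 'S3:*'
        # variants presumably exercise the same scenario for the s3 data type.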

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(
            src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(
            src_redis, 'S3:beforeCheckpoint~beforeCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        util.log(">>> migrate test with expire command start(%s), ts:%d" %
                 (time.asctime(), ts))

        ts = time.time()
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireKey(src_redis,
                          'beforeCheckpoint~afterCheckpoint:persist', 20)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:beforeCheckpoint~afterCheckpoint:persist', 20)

        # notify dst_redis of migration start
        util.log(">>> notify dst_redis of migration start (%s)" %
                 time.asctime())
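        # 'migconf migstart <slot range>' presumably tells the destination
        # redis that keys in slots 0-8191 are about to be migrated in, so it
        # starts accepting data for that range.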

        cmd = 'migconf migstart %d-%d\r\n' % (0, 8191)
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        # remote partial checkpoint
        util.log(">>> start remote checkpoint and load (%s)" % time.asctime())
        cmd = "./cluster-util --getandplay %s %d %s %d %d-%d %d" % (
            src_master['ip'], src_master['redis_port'], dst_master['ip'],
            dst_master['redis_port'], 0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd,
                                 True, None, subprocess.PIPE, None)

        ret = p.wait()
        for line in p.stdout:
            if line.find("Checkpoint Sequence Number:") != -1:
                util.log("seqnumber : " + line[line.rfind(":") + 1:])
                seq = int(line[line.rfind(":") + 1:])
            util.log(">>>" + str(line.rstrip()))

        self.assertEqual(0, ret)
        util.log(">>> end remote checkpoint and load (%s)" % time.asctime())

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:beforeCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:beforeCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        # bgsave, to test recovery during migration later on
        util.log(
            ">>> bgsave, to test recovery during migration later on (%s)"
            % time.asctime())
        cmd = 'bgsave\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+Background saving started\r\n')

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:expired',
                          10)
        self.setExpireKey(src_redis, 'afterCheckpoint~afterCheckpoint:persist',
                          20)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~afterCheckpoint:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~afterCheckpoint:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~afterCheckpoint:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~afterCheckpoint:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:expired',
                          10)
        self.setExpireKey(src_redis, 'afterCheckpoint~duringCatchup:persist',
                          100)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:afterCheckpoint~duringCatchup:persist', 100)

        # remote catchup (smr log migration)
        util.log(">>> start remote catchup (%s)" % time.asctime())

        dst_host = dst_master['ip']
        dst_smr_port = dst_master['smr_base_port']
        rle = '1 8192'
        num_part = 8192
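        # 'migrate start' presumably begins smr log catch-up: stream log
        # records from checkpoint sequence 'seq' to the destination smr at
        # 'tps', for the slots described by the run-length-encoded map 'rle'
        # over 'num_part' partitions.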

        smr.write('migrate start %s %d %d %d %d %s\r\n' %
                  (dst_host, dst_smr_port, seq, tps, num_part, rle))
        response = smr.read_until('\r\n')
        if response[:3] != '+OK':
            util.log('failed to execute migrate start command, response:%s' %
                     response)
            return False
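        # Poll 'migrate info' until the gap between the local log sequence
        # (logseq) and the migrated sequence (mig) is small enough (< 500000)
        # to hand ownership over with mig2pc.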

        while True:
            smr.write('migrate info\r\n')
            response = smr.read_until('\r\n')
            seqs = response.split()
            logseq = int(seqs[1].split(':')[1])
            mig = int(seqs[2].split(':')[1])
            util.log('migrate info: %s' % response)
            if (logseq - mig < 500000):
                util.log('Remote catchup almost done. try mig2pc')
                break
            time.sleep(1)

        util.log(">>> sleep until 90 sec pass")
        self.assertFalse(time.time() - ts >= 90)
        time.sleep(90 - (time.time() - ts))

        res = self.persistKey(src_redis,
                              'afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis,
                              'afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:afterCheckpoint~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~duringCatchup:persist', 20)
        self.setExpireS3Key(src_redis,
                            'S3:duringCatchup~duringCatchup:expired', 10)
        self.setExpireS3Key(src_redis,
                            'S3:duringCatchup~duringCatchup:persist', 20)

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(src_redis, 'duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:duringCatchup~duringCatchup:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(src_redis,
                                'S3:duringCatchup~duringCatchup:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:expired', 10)
        self.setExpireKey(src_redis, 'duringCatchup~afterMig2pc:persist', 20)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:expired',
                            10)
        self.setExpireS3Key(src_redis, 'S3:duringCatchup~afterMig2pc:persist',
                            20)

        util.log(">>> remote catchup phase almost done (%s)" % time.asctime())

        # mig2pc
        util.log(">>> start mig2pc (%s)" % time.asctime())

        cmd = 'mig2pc %s %d %d %d %d' % (self.cluster['cluster_name'],
                                         src_pg_id, dst_pg_id, 0, 8191)
        result = util.cm_command(leader_cm['ip'], leader_cm['cm_port'], cmd)
        util.log('mig2pc result : ' + result)
        if not result.startswith('{"state":"success","msg":"+OK"}\r\n'):
            util.log('failed to execute mig2pc command, result:%s' % result)
            return False

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis,
                                'S3:duringCatchup~afterMig2pc:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis,
                                'S3:duringCatchup~afterMig2pc:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:expired', 10)
        self.setExpireKey(dst_redis, 'afterMig2pc~migrateEnd:persist', 20)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired', 10)
        self.setExpireS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist', 20)

        # finish migration
        smr.write('migrate interrupt\r\n')
        response = smr.read_until('\r\n')
        util.log('migrate interrupt: %s' % response)
        smr.disconnect()

        # notify dst_redis of migration end
        util.log(">>> notify dst_redis of migration end (%s)" % time.asctime())

        cmd = 'migconf migend\r\n'
        dst_redis.write(cmd)
        res = dst_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')

        cmd = 'migconf clearstart %d-%d\r\n' % (0, 8191)
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEquals(res, '+OK\r\n')
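        # 'migconf clearstart' appears to let the source redis start purging
        # keys in the migrated range; the actual deletion happens later via
        # cluster-util --rangedel, followed by 'migconf clearend'.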

        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistKey(dst_redis, 'afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:persist')
        self.assertEquals(res, ":1\r\n")
        res = self.persistS3Key(dst_redis, 'S3:afterMig2pc~migrateEnd:expired')
        self.assertEquals(res, ":0\r\n")

        ts = time.time()
        util.log(">>> sleep until 15 sec pass")
        self.assertFalse(time.time() - ts >= 15)
        time.sleep(15 - (time.time() - ts))

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        # delete the migrated range from the source (rangedel)
        util.log(">>> start rangedel (%s)" % time.asctime())
        cmd = "./cluster-util --rangedel %s %d %d-%d %d" % (
            src_master['ip'], src_master['redis_port'], 0, 8191, tps)
        p = util.exec_proc_async(util.cluster_util_dir(src_master['id']), cmd,
                                 True, None, subprocess.PIPE, None)
        ret = p.wait()

        for line in p.stdout:
            util.log(">>>" + str(line.rstrip()))

        cmd = 'migconf clearend\r\n'
        src_redis.write(cmd)
        res = src_redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

        time.sleep(5)  # generate load for 5 sec
        # check consistency of load_generator
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].quit()
        for i in range(len(load_gen_thrd_list)):
            load_gen_thrd_list[i].join()
            self.assertTrue(load_gen_thrd_list[i].isConsistent(),
                            'Inconsistent after migration')

        # kill dst_redis and recover from bgsave
        util.log(">>> kill dst_redis and recover from bgsave (%s)" %
                 time.asctime())

        dst_redis.disconnect()
        ret = testbase.request_to_shutdown_redis(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        ret = testbase.request_to_shutdown_smr(dst_master)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        ret = testbase.request_to_start_smr(dst_master)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % dst_master['id'])

        ret = testbase.request_to_start_redis(dst_master)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % dst_master['id'])

        ret = testbase.wait_until_finished_to_set_up_role(dst_master)
        self.assertEquals(
            ret, 0, 'failed to role change. server:%d' % (dst_master['id']))

        dst_redis = redis_mgmt.Redis(dst_master['id'])
        ret = dst_redis.connect(dst_master['ip'], dst_master['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis,
                         'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis,
                           'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isExist(dst_redis, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis, 'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis, 'S3:PermanentKey')

        # kill dst_slave redis and recover without dump file
        util.log(">>> kill dst_redis and recover without dump file (%s)" %
                 time.asctime())
        dst_slave = util.get_server_by_role_and_pg(self.cluster['servers'],
                                                   'slave', dst_pg_id)

        ret = testbase.request_to_shutdown_redis(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown redis')
        ret = testbase.request_to_shutdown_smr(dst_slave)
        self.assertEquals(ret, 0, 'failed to shutdown smr')
        time.sleep(5)

        ret = testbase.request_to_start_smr(dst_slave)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % dst_slave['id'])

        ret = testbase.request_to_start_redis(dst_slave)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % dst_slave['id'])

        ret = testbase.wait_until_finished_to_set_up_role(dst_slave)
        self.assertEquals(
            ret, 0, 'failed to role change. server:%d' % (dst_slave['id']))

        dst_redis_slave = redis_mgmt.Redis(dst_slave['id'])
        ret = dst_redis_slave.connect(dst_slave['ip'], dst_slave['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~beforeCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~beforeCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~beforeCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'beforeCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:beforeCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~afterCheckpoint:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~afterCheckpoint:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~afterCheckpoint:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'afterCheckpoint~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterCheckpoint~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave,
                         'duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave,
                         'duringCatchup~duringCatchup:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~duringCatchup:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~duringCatchup:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave, 'duringCatchup~afterMig2pc:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~afterMig2pc:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:duringCatchup~afterMig2pc:expired'))

        self.assertTrue(
            self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isExist(dst_redis_slave, 'afterMig2pc~migrateEnd:expired'))
        self.assertTrue(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterMig2pc~migrateEnd:persist'))
        self.assertFalse(
            self.isS3Exist(dst_redis_slave,
                           'S3:afterMig2pc~migrateEnd:expired'))

        self.getS3TTL(dst_redis_slave, 'S3:PermanentKey')

        # Go back to initial configuration
        self.assertTrue(
            util.migration(self.cluster, dst_pg_id, src_pg_id, 0, 8191, 40000),
            'failed to rollback migration')
    def master_hang(self):
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr = smr_mgmt.SMR(m['id'])
        ret = smr.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr.write('fi delay sleep 1 10000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has compiled with gcov option.')

        time.sleep(5)

        # wait for forced master election
        success = False
        for i in range(20):
            role = util.get_role_of_server(s1)
            if role == c.ROLE_MASTER:
                success = True
                break

            if len(self.cluster['servers']) == 3:
                role = util.get_role_of_server(s2)
                if role == c.ROLE_MASTER:
                    success = True
                    break
            time.sleep(1)

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        self.assertEqual(success, True, 'failed to force master election')

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        if len(self.cluster['servers']) == 3:
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # check new values
            for i in range(10000, 20000):
                cmd = 'get %s%d\r\n' % (self.key_base, i)
                redis2.write(cmd)
                redis2.read_until('\r\n')
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '%d\r\n' % i,
                    'failed to get values from redis2. %s != %d' % (res, i))

        # check if the hanging server recovered and joined as a slave
        time.sleep(7)
        role = util.get_role_of_server(m)
        self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEquals(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis0. %s != %d' % (res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
    def failover_while_hang(self, server):
        # timestamp before hang
        ts_before = util.get_timestamp_of_pgs(server)
        self.assertNotEqual(
            ts_before, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (server['id'], ts_before))

        # hang
        util.log('pgs(id:%d, ip:%s, port:%d) is going to hang.' %
                 (server['id'], server['ip'], server['smr_mgmt_port']))
        smr = smr_mgmt.SMR(server['id'])
        ret = smr.connect(server['ip'], server['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (server['ip'], server['smr_mgmt_port']))
        smr.write('fi delay sleep 1 10000\r\n')
        reply = smr.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has compiled with gcov option.')

        time.sleep(4)

        # check state F
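        # While the pgs hangs, its heartbeat presumably fails and confmaster
        # should mark it with state 'F' (failure); the loop below waits for
        # that transition.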
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s' %
            (server['id'], state, expected))
        util.log('succeeded : pgs%d state changed to F.' % server['id'])

        # shutdown
        util.log('shutdown pgs%d while hanging.' % server['id'])
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0,
                         'failed to shutdown smr. id:%d' % server['id'])
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEquals(ret, 0,
                          'failed to shutdown redis. id:%d' % server['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for i in range(0, max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s' %
            (server['id'], state, expected))
        util.log('succeeded : pgs%d state changed to F.' % server['id'])

        # recovery
        util.log('restart pgs%d.' % server['id'])
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr. id:%d' % server['id'])

        ret = testbase.request_to_start_redis(server)
        self.assertEqual(ret, 0, 'failed to start redis. id:%d' % server['id'])

        wait_count = 20
        ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
        self.assertEquals(ret, 0,
                          'failed to role change. smr_id:%d' % (server['id']))

        redis = redis_mgmt.Redis(server['id'])
        ret = redis.connect(server['ip'], server['redis_port'])
        self.assertEquals(ret, 0, 'failed to connect to redis')

        # check state N
        max_try = 20
        expected = 'N'
        for i in range(0, max_try):
            state = util.get_smr_state(server, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEquals(
            expected, state, 'server%d - state:%s, expected:%s' %
            (server['id'], state, expected))
        util.log('succeeded : pgs%d state changed to N.' % server['id'])

        # wait for rejoin as a slave
        success = False
        for i in range(20):
            role = util.get_role_of_server(server)
            if role == c.ROLE_SLAVE:
                ts_after = util.get_timestamp_of_pgs(server)
                if ts_after != -1 and ts_before != ts_after:
                    success = True
                    break
            time.sleep(1)
        self.assertEqual(success, True, 'failed to rejoin as a slave')
        util.log('succeeded : pgs%d joined as a slave.' % server['id'])

        return 0
    def test_all_pgs_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave1 = smr_mgmt.SMR(s1['id'])
        ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))
        smr_slave2 = smr_mgmt.SMR(s2['id'])
        ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

        m_ts = util.get_timestamp_of_pgs(m)
        s1_ts = util.get_timestamp_of_pgs(s1)
        s2_ts = util.get_timestamp_of_pgs(s2)

        smr_master.write('fi delay sleep 1 8000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has compiled with gcov option.')

        smr_slave1.write('fi delay sleep 1 8000\r\n')
        smr_slave2.write('fi delay sleep 1 8000\r\n')

        time.sleep(10)
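        # All three PGSs were stalled for 8 seconds; once they wake up the
        # cluster is expected to converge back to a healthy state, which the
        # check_cluster loop below verifies.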

        # check consistency
        ok = False
        for try_cnt in xrange(20):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            if ok:
                break
            time.sleep(0.5)
        self.assertTrue(ok, 'Unstable cluster state')

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

        # set values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # check new values (m)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res, i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check new values (s2)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s2['id'], res[:-2], i))

        # check consistency
        ok = False
        for try_cnt in range(0, 10):
            ok = util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                                    self.mgmt_port)
            print ok
            if ok:
                break
            time.sleep(1)
        self.assertEqual(ok, True, 'role consistency fail')

        return 0
    def test_two_slaves_hang(self):
        util.print_frame()

        self.setup_test_cluster(self.cluster_3copy)

        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # timestamp before hang
        ts_before1 = util.get_timestamp_of_pgs(s1)
        self.assertNotEqual(
            ts_before1, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s1['id'], ts_before1))

        ts_before2 = util.get_timestamp_of_pgs(s2)
        self.assertNotEqual(
            ts_before2, -1,
            'failed to get a timestamp of pgs(%d), ts_before:%d' %
            (s2['id'], ts_before2))

        # hang
        smr1 = smr_mgmt.SMR(s1['id'])
        ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave1. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

        smr2 = smr_mgmt.SMR(s2['id'])
        ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave2. %s:%d' %
            (s2['ip'], s2['smr_mgmt_port']))

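        # inject a delay fault into both slaves; 8000 is presumably in milliseconds (about 8 seconds)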
        smr1.write('fi delay sleep 1 8000\r\n')
        reply = smr1.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has been compiled with the gcov option.')

        smr2.write('fi delay sleep 1 8000\r\n')
        time.sleep(7)

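        # wait until the cluster reports a stable state again (quorum included)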
        success = False
        for i in xrange(20):
            ret = util.check_cluster(self.cluster['cluster_name'],
                                     self.mgmt_ip,
                                     self.mgmt_port,
                                     check_quorum=True)
            if ret:
                success = True
                break
            time.sleep(1)
        self.assertEqual(success, True, 'unstable cluster')

        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)

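        # connect to the redis of the current slaves (roles were refreshed by get_mss above)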
        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s1['ip'], s1['redis_port']))

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis(%s:%d).' %
            (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis1.write(cmd)
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis1. cmd:%s, res:%s' %
                (cmd[:-2], res))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis2. %s != %d' % (res, i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0

    def master_and_slave_hang(self):
        # get gateway info
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
        ret = gw.connect(ip, port)
        self.assertEqual(ret, 0,
                         'failed to connect to gateway, %s:%d' % (ip, port))

        # set values
        for i in range(0, 10000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values. cmd:%s, res:%s' % (cmd, res))

        # get master, slave1, slave2
        if len(self.cluster['servers']) == 3:
            m, s1, s2 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')
            self.assertNotEqual(s2, None, 'slave2 is None.')
        else:
            m, s1 = util.get_mss(self.cluster)
            self.assertNotEqual(m, None, 'master is None.')
            self.assertNotEqual(s1, None, 'slave1 is None.')

        util.log('server state before hang')
        util.log_server_state(self.cluster)

        # hang
        smr_master = smr_mgmt.SMR(m['id'])
        ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to master. %s:%d' %
            (m['ip'], m['smr_mgmt_port']))
        smr_slave = smr_mgmt.SMR(s1['id'])
        ret = smr_slave.connect(s1['ip'], s1['smr_mgmt_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to slave. %s:%d' %
            (s1['ip'], s1['smr_mgmt_port']))

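        # hang both the master and the slave; 10000 is presumably in milliseconds (about 10 seconds)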
        smr_master.write('fi delay sleep 1 10000\r\n')
        reply = smr_master.read_until('\r\n', 1)
        if reply != None and reply.find('-ERR not supported') != -1:
            self.assertEqual(
                0, 1, 'make sure that smr has been compiled with the gcov option.')

        smr_slave.write('fi delay sleep 1 10000\r\n')

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

        time.sleep(5)

        if len(self.cluster['servers']) == 3:
            # verify that the forced master election promoted pgs2 and that it keeps the master role
            success = True
            for i in range(15):
                state = []
                util.check_cluster(self.cluster['cluster_name'],
                                   self.leader_cm['ip'],
                                   self.leader_cm['cm_port'], state)
                s2_state = filter(lambda s: s['pgs_id'] == s2['id'], state)[0]
                role = s2_state['active_role']
                if role != 'M':
                    success = False
                    break
                time.sleep(1)

            util.log('')
            util.log('expected: pgs2 is the master. PG.COPY: 3, PG.Q: 2')
            util.log('')
            util.log_server_state(self.cluster)

            self.assertEqual(success, True, 'failed to check copy-quorum')

            ok = False
            for i in xrange(10):
                ok = util.check_cluster(self.cluster['cluster_name'],
                                        self.leader_cm['ip'],
                                        self.leader_cm['cm_port'])
                if ok:
                    break
            self.assertTrue(ok, 'Cluster state is not normal!')

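            # connect to the redis of pgs2, which is expected to be the new master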
            redis2 = redis_mgmt.Redis(s2['id'])
            ret = redis2.connect(s2['ip'], s2['redis_port'])
            self.assertEqual(
                ret, 0, 'failed to connect to redis(%s:%d).' %
                (s2['ip'], s2['redis_port']))

            # set new values
            for i in range(10000, 20000):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis2.write(cmd)
                res = redis2.read_until('\r\n')
                self.assertEqual(
                    res, '+OK\r\n',
                    'failed to set values to redis2. cmd:%s, res:%s' %
                    (cmd[:-2], res))

        util.log('server state transition after hang')
        util.log_server_state(self.cluster)

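        # connect to the redis of the original master (m) and slave (s1)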
        redis0 = redis_mgmt.Redis(m['id'])
        ret = redis0.connect(m['ip'], m['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']))

        redis1 = redis_mgmt.Redis(s1['id'])
        ret = redis1.connect(s1['ip'], s1['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis1(%s:%d).' %
            (s1['ip'], s1['redis_port']))

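        # in the 2-copy configuration, write the new keys through the original master instead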
        if len(self.cluster['servers']) != 3:
            # set new values
            for i in range(10000, 20000):
                cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
                redis0.write(cmd)
                res = redis0.read_until('\r\n')
                self.assertEqual(
                    res, '+OK\r\n',
                    'failed to set values to redis0. cmd:%s, res:%s' %
                    (cmd[:-2], res))

        # check new values (m)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis0.write(cmd)
            redis0.read_until('\r\n')
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (m['id'], res[:-2], i))

        # check new values (s1)
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis1.write(cmd)
            redis1.read_until('\r\n')
            res = redis1.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis(id:%d). %s != %d' %
                (s1['id'], res[:-2], i))

        # check consistency
        self.assertEqual(
            util.check_cluster(self.cluster['cluster_name'], self.mgmt_ip,
                               self.mgmt_port), True, 'role consistency fail')

        return 0
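
    # The set/verify loops above repeat the same plain-RESP exchange for every redis
    # connection. A minimal helper sketch (hypothetical, not part of the original test
    # suite) that could factor out the read-back verification is shown below; it assumes
    # the redis_mgmt.Redis objects and the '<key_base><i> -> i' keys used by these tests.
    def _verify_new_keys(self, redis_conn, key_base, start, end):
        for i in range(start, end):
            cmd = 'get %s%d\r\n' % (key_base, i)
            redis_conn.write(cmd)
            redis_conn.read_until('\r\n')        # skip the bulk-length line ($N)
            res = redis_conn.read_until('\r\n')  # the stored value
            self.assertEqual(res, '%d\r\n' % i,
                             'unexpected value. cmd:%s, res:%s' % (cmd[:-2], res[:-2]))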