def master_hang(self):
    """Hang the master's SMR and verify election, replication and rejoin.

    Steps:
      1. Write 10000 keys through a randomly chosen gateway.
      2. Send 'fi delay sleep 1 10000' to the master's SMR management port
         (presumably a 10 second injected delay -- confirm against SMR).
      3. Poll up to 20 times, one second apart, until slave1 (or slave2 on
         a 3-copy cluster) reports the master role.
      4. Write 10000 new keys through slave1's redis and, on a 3-copy
         cluster, read them back through slave2's redis.
      5. Check that the formerly hung server now reports the slave role
         and serves the new keys.
      6. Run util.check_cluster against the management endpoint.

    The cluster is treated as 3-copy when self.cluster['servers'] has three
    entries, otherwise as 2-copy.

    Returns:
        0 on success. Every failed check raises via the unittest asserts.
    """
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0,
                     'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2 (s2 exists only on a 3-copy cluster)
    is_3copy = len(self.cluster['servers']) == 3
    if is_3copy:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang
    smr = smr_mgmt.SMR(m['id'])
    ret = smr.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))
    smr.write('fi delay sleep 1 10000\r\n')
    reply = smr.read_until('\r\n', 1)
    if reply is not None and reply.find('-ERR not supported') != -1:
        # Fault injection was rejected, so the rest of the test is moot.
        self.fail('make sure that smr has compiled with gcov option.')

    time.sleep(5)

    # wait for forced master election
    success = False
    for _ in range(20):
        if util.get_role_of_server(s1) == c.ROLE_MASTER:
            success = True
            break
        if is_3copy and util.get_role_of_server(s2) == c.ROLE_MASTER:
            success = True
            break
        time.sleep(1)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    self.assertTrue(success, 'failed to forced master election')

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values to redis1. cmd:%s, res:%s'
            % (cmd[:-2], res))

    if is_3copy:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).'
            % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            # The first reply line is discarded; the value is on the second.
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis2. %s != %d' % (res, i))

    # check if the hanging server recovered and joined as a slave
    time.sleep(7)
    role = util.get_role_of_server(m)
    self.assertEqual(role, c.ROLE_SLAVE, 'failed to join as a slave')

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    # check new values on the recovered server
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(
            res, '%d\r\n' % i,
            'failed to get values from redis0. %s != %d' % (res[:-2], i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'],
                           self.mgmt_ip, self.mgmt_port),
        True, 'role consistency fail')
    return 0
def master_failover_while_hang(self):
    """Run self.failover_while_hang on the master and verify data afterwards.

    Steps:
      1. Write 10000 keys through a randomly chosen gateway.
      2. Look up master/slaves and call self.failover_while_hang(master).
      3. Write 10000 new keys through the redis of the server that was
         master before step 2.
      4. On a 3-copy cluster, read the new keys back through slave2's redis.
      5. Read the new keys back through a second connection to the
         original master's redis.
      6. Run util.check_cluster against the management endpoint.

    Returns:
        0 on success. Every failed check raises via the unittest asserts.
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0,
                     'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2 (s2 exists only on a 3-copy cluster)
    is_3copy = len(self.cluster['servers']) == 3
    if is_3copy:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    self.failover_while_hang(m)

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis1 = redis_mgmt.Redis(m['id'])
    ret = redis1.connect(m['ip'], m['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values to redis1. cmd:%s, res:%s'
            % (cmd[:-2], res))

    if is_3copy:
        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).'
            % (s2['ip'], s2['redis_port']))

        # check new values
        for i in range(10000, 20000):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis2.write(cmd)
            # The first reply line is discarded; the value is on the second.
            redis2.read_until('\r\n')
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis2. %s != %d' % (res, i))
        util.log(
            'succeeded : check values with set/get operations with pgs%d and pgs%d.'
            % (m['id'], s2['id']))

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

    # check new values through the second connection to the old master
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(
            res, '%d\r\n' % i,
            'failed to get values from redis0. %s != %d' % (res[:-2], i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'],
                           self.mgmt_ip, self.mgmt_port),
        True, 'role consistency fail')
    return 0
def test_two_slaves_hang(self):
    # Purpose: hang both slaves of a 3-copy cluster and verify the cluster
    # settles again and still replicates writes.
    # (Kept as a comment, not a docstring, so the name the test runner
    # displays for this test is unchanged.)
    #
    # Steps:
    #   1. Write 10000 keys through a randomly chosen gateway.
    #   2. Read a timestamp from each slave (fails if either is -1).
    #   3. Send 'fi delay sleep 1 8000' to both slaves' SMR.
    #   4. Poll util.check_cluster(check_quorum=True) up to 20 times.
    #   5. Re-read the roles, write 10000 new keys via slave1's redis and
    #      read them back via slave2's redis.
    #   6. Run util.check_cluster once more. Returns 0 on success.
    util.print_frame()
    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0,
                     'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # timestamp before hang
    ts_before1 = util.get_timestamp_of_pgs(s1)
    self.assertNotEqual(
        ts_before1, -1,
        'failed to get a timestamp of pgs(%d), ts_before:%d'
        % (s1['id'], ts_before1))

    ts_before2 = util.get_timestamp_of_pgs(s2)
    self.assertNotEqual(
        ts_before2, -1,
        'failed to get a timestamp of pgs(%d), ts_before:%d'
        % (s2['id'], ts_before2))

    # hang: connect to both slaves first, then inject the delays
    smr1 = smr_mgmt.SMR(s1['id'])
    ret = smr1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to slave1. %s:%d'
        % (s1['ip'], s1['smr_mgmt_port']))

    smr2 = smr_mgmt.SMR(s2['id'])
    ret = smr2.connect(s2['ip'], s2['smr_mgmt_port'])
    # Report slave2's own address (the message used to print slave1's).
    self.assertEqual(
        ret, 0,
        'failed to connect to slave2. %s:%d'
        % (s2['ip'], s2['smr_mgmt_port']))

    smr1.write('fi delay sleep 1 8000\r\n')
    reply = smr1.read_until('\r\n', 1)
    if reply is not None and reply.find('-ERR not supported') != -1:
        # Fault injection was rejected, so the rest of the test is moot.
        self.fail('make sure that smr has compiled with gcov option.')

    smr2.write('fi delay sleep 1 8000\r\n')
    time.sleep(7)

    # wait until the cluster (including quorum) checks out again
    success = False
    for _ in range(20):
        ret = util.check_cluster(self.cluster['cluster_name'],
                                 self.mgmt_ip, self.mgmt_port,
                                 check_quorum=True)
        if ret:
            success = True
            break
        time.sleep(1)
    self.assertTrue(success, 'unstable cluster')

    # get master, slave1, slave2 again; roles may have changed
    m, s1, s2 = util.get_mss(self.cluster)

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

    # set new values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis1.write(cmd)
        res = redis1.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values to redis1. cmd:%s, res:%s'
            % (cmd[:-2], res))

    # check new values
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        # The first reply line is discarded; the value is on the second.
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(
            res, '%d\r\n' % i,
            'failed to get values from redis2. %s != %d' % (res, i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'],
                           self.mgmt_ip, self.mgmt_port),
        True, 'role consistency fail')
    return 0
def test_all_pgs_hang(self):
    # Purpose: hang every PGS (master and both slaves) of a 3-copy cluster
    # and verify the cluster becomes consistent again and serves data.
    # (Kept as a comment, not a docstring, so the name the test runner
    # displays for this test is unchanged.)
    #
    # Steps:
    #   1. Write 10000 keys through a randomly chosen gateway.
    #   2. Send 'fi delay sleep 1 8000' to the SMR of all three servers.
    #   3. Sleep 10 s, then poll util.check_cluster up to 20 times.
    #   4. Write 10000 new keys via the original master's redis and read
    #      them back from all three redis instances.
    #   5. Poll util.check_cluster up to 10 more times.
    # Returns 0 on success.
    util.print_frame()
    self.setup_test_cluster(self.cluster_3copy)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0,
                     'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2
    m, s1, s2 = util.get_mss(self.cluster)
    self.assertNotEqual(m, None, 'master is None.')
    self.assertNotEqual(s1, None, 'slave1 is None.')
    self.assertNotEqual(s2, None, 'slave2 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang: open all three management connections before injecting
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))

    smr_slave1 = smr_mgmt.SMR(s1['id'])
    ret = smr_slave1.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to slave1. %s:%d'
        % (s1['ip'], s1['smr_mgmt_port']))

    smr_slave2 = smr_mgmt.SMR(s2['id'])
    ret = smr_slave2.connect(s2['ip'], s2['smr_mgmt_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to slave2. %s:%d'
        % (s2['ip'], s2['smr_mgmt_port']))

    # NOTE(review): these timestamps are collected but never read below;
    # the calls are kept so the sequence of requests is unchanged.
    m_ts = util.get_timestamp_of_pgs(m)
    s1_ts = util.get_timestamp_of_pgs(s1)
    s2_ts = util.get_timestamp_of_pgs(s2)

    smr_master.write('fi delay sleep 1 8000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply is not None and reply.find('-ERR not supported') != -1:
        # Fault injection was rejected, so the rest of the test is moot.
        self.fail('make sure that smr has compiled with gcov option.')

    smr_slave1.write('fi delay sleep 1 8000\r\n')
    smr_slave2.write('fi delay sleep 1 8000\r\n')

    time.sleep(10)

    # check consistency
    ok = False
    for _ in range(20):
        ok = util.check_cluster(self.cluster['cluster_name'],
                                self.mgmt_ip, self.mgmt_port)
        if ok:
            break
        time.sleep(0.5)
    self.assertTrue(ok, 'Unstable cluster state')

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (m['ip'], m['redis_port']))

    # set values
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        redis0.write(cmd)
        res = redis0.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values. cmd:%s, res:%s' % (cmd, res))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (s1['ip'], s1['redis_port']))

    redis2 = redis_mgmt.Redis(s2['id'])
    ret = redis2.connect(s2['ip'], s2['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis(%s:%d).' % (s2['ip'], s2['redis_port']))

    # check new values (m)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        # The first reply line is discarded; the value is on the second.
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(
            res, '%d\r\n' % i,
            'failed to get values from redis(id:%d). %s != %d'
            % (m['id'], res, i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(
            res, '%d\r\n' % i,
            'failed to get values from redis(id:%d). %s != %d'
            % (s1['id'], res[:-2], i))

    # check new values (s2)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis2.write(cmd)
        redis2.read_until('\r\n')
        res = redis2.read_until('\r\n')
        self.assertEqual(
            res, '%d\r\n' % i,
            'failed to get values from redis(id:%d). %s != %d'
            % (s2['id'], res[:-2], i))

    # check consistency
    ok = False
    for _ in range(0, 10):
        ok = util.check_cluster(self.cluster['cluster_name'],
                                self.mgmt_ip, self.mgmt_port)
        if ok:
            break
        time.sleep(1)
    self.assertEqual(ok, True, 'role consistency fail')
    return 0
def state_transition(self):
    """Take a slave through the SMR states 'N' -> 'F' -> 'N'.

    A server holding the 'slave' role is checked for state 'N', its smr and
    redis are shut down, state 'F' is checked, 100 writes go through a
    gateway while it is down, and then it is restarted and polled (up to 20
    times, one second apart) until state 'N' is reported again.

    Returns:
        None. Every failed check raises via the unittest asserts.
    """
    target = util.get_server_by_role(self.cluster['servers'], 'slave')
    self.assertNotEqual(target, None, 'failed to get_server_by_role-slave')

    # gateway endpoint; the connection is opened only once the slave is down
    gw_ip, gw_port = util.get_rand_gateway(self.cluster)
    gateway = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])

    # initial state must be N
    current = self.get_expected_smr_state(target, 'N')
    current_role = util.get_role_of_server(target)
    self.assertEqual(
        'N', current,
        'server%d - state:%s, role:%s, expected:N'
        % (target['id'], current, current_role))

    # shutdown smr first, then redis
    rc = testbase.request_to_shutdown_smr(target)
    self.assertEqual(rc, 0, 'failed to shutdown smr')
    rc = testbase.request_to_shutdown_redis(target)
    self.assertEqual(rc, 0, 'failed to shutdown redis')
    time.sleep(3)

    # state must now be F
    wanted = 'F'
    current = self.get_expected_smr_state(target, wanted)
    self.assertEqual(
        wanted, current,
        'server%d - state:%s, but expected:%s'
        % (target['id'], current, wanted))

    # write through the gateway while the slave is down
    rc = gateway.connect(gw_ip, gw_port)
    self.assertEqual(rc, 0,
                     'failed to connect to gateway, %s:%d' % (gw_ip, gw_port))

    key = 'new_key_haha'
    for _ in range(0, 100):
        request = 'set %s %f\r\n' % (key, time.time())
        gateway.write(request)
        answer = gateway.read_until('\r\n')
        self.assertEqual(answer, '+OK\r\n')
    gateway.disconnect()

    # recovery: start smr first, then redis, then wait for the role
    rc = testbase.request_to_start_smr(target)
    self.assertEqual(rc, 0, 'failed to start smr')

    rc = testbase.request_to_start_redis(target)
    self.assertEqual(rc, 0, 'failed to start redis')

    rc = testbase.wait_until_finished_to_set_up_role(target, 10)
    self.assertEqual(rc, 0,
                     'failed to role change. smr_id:%d' % (target['id']))
    time.sleep(5)

    redis_conn = redis_mgmt.Redis(target['id'])
    rc = redis_conn.connect(target['ip'], target['redis_port'])
    self.assertEqual(rc, 0, 'failed to connect to redis')

    # poll until state N is reported again (at most 20 probes)
    wanted = 'N'
    probes_left = 20
    while probes_left > 0:
        current = self.get_expected_smr_state(target, wanted)
        if current == wanted:
            break
        time.sleep(1)
        probes_left -= 1

    current_role = util.get_role_of_server(target)
    self.assertEqual(
        wanted, current,
        'server%d - state:%s, role:%s, but expected:%s'
        % (target['id'], current, current_role, wanted))
def master_and_slave_hang(self):
    """Hang the master and slave1 together and verify the cluster afterwards.

    Steps:
      1. Write 10000 keys through a randomly chosen gateway.
      2. Send 'fi delay sleep 1 10000' to the SMR of the master and slave1.
      3. 3-copy cluster only: for up to 15 one-second rounds require that
         the state list filled in by util.check_cluster shows slave2 with
         active_role 'M', wait for util.check_cluster to succeed, then
         write 10000 new keys through slave2's redis.
      4. 2-copy cluster only: write the 10000 new keys through the original
         master's redis instead.
      5. Read the new keys back from the original master and slave1.
      6. Run util.check_cluster against the management endpoint.

    Returns:
        0 on success. Every failed check raises via the unittest asserts.
    """
    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0,
                     'failed to connect to gateway, %s:%d' % (ip, port))

    # set values
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to set values. cmd:%s, res:%s' % (cmd, res))

    # get master, slave1, slave2 (s2 exists only on a 3-copy cluster)
    is_3copy = len(self.cluster['servers']) == 3
    if is_3copy:
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
    else:
        m, s1 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')

    util.log('server state before hang')
    util.log_server_state(self.cluster)

    # hang
    smr_master = smr_mgmt.SMR(m['id'])
    ret = smr_master.connect(m['ip'], m['smr_mgmt_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to master. %s:%d' % (m['ip'], m['smr_mgmt_port']))

    smr_slave = smr_mgmt.SMR(s1['id'])
    ret = smr_slave.connect(s1['ip'], s1['smr_mgmt_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to slave1. %s:%d'
        % (s1['ip'], s1['smr_mgmt_port']))

    smr_master.write('fi delay sleep 1 10000\r\n')
    reply = smr_master.read_until('\r\n', 1)
    if reply is not None and reply.find('-ERR not supported') != -1:
        # Fault injection was rejected, so the rest of the test is moot.
        self.fail('make sure that smr has compiled with gcov option.')

    smr_slave.write('fi delay sleep 1 10000\r\n')

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    time.sleep(5)

    if is_3copy:
        # wait for forced master election
        # NOTE(review): this fails on the first round in which slave2 is
        # not 'M'; it does not wait for slave2 to become master. Behaviour
        # kept as found -- confirm the intent.
        success = True
        for _ in range(15):
            state = []
            util.check_cluster(self.cluster['cluster_name'],
                               self.leader_cm['ip'],
                               self.leader_cm['cm_port'], state)
            # A list comprehension keeps the [0] lookup valid even where
            # filter() returns an iterator instead of a list.
            s2_state = [entry for entry in state
                        if entry['pgs_id'] == s2['id']][0]
            role = s2_state['active_role']
            if role != 'M':
                success = False
                break
            time.sleep(1)

        util.log('')
        util.log('It expects that pgs2 is a master. PG.COPY: 3, PG.Q: 2')
        util.log('')
        util.log_server_state(self.cluster)
        self.assertTrue(success, 'failed to check copy-quorum')

        ok = False
        for _ in range(10):
            ok = util.check_cluster(self.cluster['cluster_name'],
                                    self.leader_cm['ip'],
                                    self.leader_cm['cm_port'])
            if ok:
                break
            # Pause between probes like the other polling loops in this
            # file; ten back-to-back probes give the cluster no time.
            time.sleep(1)
        self.assertTrue(ok, 'Cluster state is not normal!')

        redis2 = redis_mgmt.Redis(s2['id'])
        ret = redis2.connect(s2['ip'], s2['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed to connect to redis(%s:%d).'
            % (s2['ip'], s2['redis_port']))

        # set new values
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis2.write(cmd)
            res = redis2.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis2. cmd:%s, res:%s'
                % (cmd[:-2], res))

    util.log('server state transition after hang')
    util.log_server_state(self.cluster)

    redis0 = redis_mgmt.Redis(m['id'])
    ret = redis0.connect(m['ip'], m['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis0(%s:%d).' % (m['ip'], m['redis_port']))

    redis1 = redis_mgmt.Redis(s1['id'])
    ret = redis1.connect(s1['ip'], s1['redis_port'])
    self.assertEqual(
        ret, 0,
        'failed to connect to redis1(%s:%d).' % (s1['ip'], s1['redis_port']))

    if not is_3copy:
        # set new values (no slave2 exists, so write via the old master)
        for i in range(10000, 20000):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            redis0.write(cmd)
            res = redis0.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to redis0. cmd:%s, res:%s'
                % (cmd[:-2], res))

    # check new values (m)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis0.write(cmd)
        # The first reply line is discarded; the value is on the second.
        redis0.read_until('\r\n')
        res = redis0.read_until('\r\n')
        self.assertEqual(
            res, '%d\r\n' % i,
            'failed to get values from redis(id:%d). %s != %d'
            % (m['id'], res, i))

    # check new values (s1)
    for i in range(10000, 20000):
        cmd = 'get %s%d\r\n' % (self.key_base, i)
        redis1.write(cmd)
        redis1.read_until('\r\n')
        res = redis1.read_until('\r\n')
        self.assertEqual(
            res, '%d\r\n' % i,
            'failed to get values from redis(id:%d). %s != %d'
            % (s1['id'], res[:-2], i))

    # check consistency
    self.assertEqual(
        util.check_cluster(self.cluster['cluster_name'],
                           self.mgmt_ip, self.mgmt_port),
        True, 'role consistency fail')
    return 0
def consistent_after_failover(self):
    """Restart every PGS and verify roles and data consistency afterwards.

    Steps:
      1. Write 10000 keys ('caf<i>') through a randomly chosen gateway.
      2. Shut down smr and redis of the master and both slaves, one server
         at a time, then check each reports state 'F'.
      3. Start smr and redis of each server again.
      4. Poll util.check_cluster, wait for each server's role, and require
         state 'N' on all of them with exactly one master and two slaves.
      5. Write 10000 more keys through the new master's redis.
      6. Read all 20000 keys back from both slaves.

    Returns:
        None. Every failed check raises via the unittest asserts.
    """
    key_count = 10000
    wait_count = 15
    key = 'caf'

    # get master, slave1, and slave2
    master, slave1, slave2 = self.get_mss()

    # set value
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(ip)
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0,
                     'failed to connect to gateway, %s:%d' % (ip, port))

    for i in range(0, key_count):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')
    # The gateway is not used again below.
    gw.disconnect()
    time.sleep(5)

    # shutdown
    servers = [master, slave1, slave2]
    for server in servers:
        util.log('before shutdown pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)

        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(
            ret, 0, 'failed to shutdown smr, server:%d' % server['id'])
        ret = testbase.request_to_shutdown_redis(server)
        self.assertEqual(ret, 0, 'failed to shutdown redis')
        time.sleep(5)

    # check state F
    for server in servers:
        state = self.get_expected_smr_state(server, 'F')
        self.assertEqual('F', state,
                         'server%d - state:%s' % (server['id'], state))

    # recovery
    for server in servers:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0,
                         'failed to start smr, server:%d' % server['id'])

        ret = testbase.request_to_start_redis(server, False)
        self.assertEqual(ret, 0,
                         'failed to start redis, server:%d' % server['id'])

        util.log('after restart pgs%d' % server['id'])
        for s in servers:
            self.getseq_log(s)

        time.sleep(5)

    # wait for master election; the outcome is verified by the role and
    # state checks that follow, so the result is not asserted here
    for _ in range(10):
        ret = util.check_cluster(self.cluster['cluster_name'],
                                 self.leader_cm['ip'],
                                 self.leader_cm['cm_port'])
        if ret:
            break
        time.sleep(1)

    # check state
    for server in servers:
        ret = testbase.wait_until_finished_to_set_up_role(
            server, wait_count)
        self.assertEqual(
            ret, 0, 'failed to role change. server:%d' % (server['id']))

        state = self.get_expected_smr_state(server, 'N')
        role = util.get_role_of_server(server)
        self.assertEqual(
            'N', state,
            'server%d - state:%s, role:%s' % (server['id'], state, role))

    # exactly one master and two slaves are expected
    roles = [util.get_role_of_server(server) for server in servers]
    the_number_of_master = roles.count(c.ROLE_MASTER)
    the_number_of_slave = roles.count(c.ROLE_SLAVE)
    self.assertTrue(
        1 == the_number_of_master and 2 == the_number_of_slave,
        'failed to set roles, the number of master:%d, the number of slave:%d'
        % (the_number_of_master, the_number_of_slave))

    # get master, slave1, and slave2 (roles may differ from before)
    master, slave1, slave2 = self.get_mss()

    # connect to the master's redis and set data
    redis = redis_mgmt.Redis(master['id'])
    ret = redis.connect(master['ip'], master['redis_port'])
    self.assertEqual(
        ret, 0, 'failed to connect to redis, server:%d' % master['id'])

    for i in range(key_count, key_count * 2):
        cmd = 'set %s%d %d\r\n' % (key, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEqual(
            res, '+OK\r\n',
            'failed to get response, server:%d' % master['id'])
    redis.disconnect()

    # check the slaves' data
    for slave in (slave1, slave2):
        slave_redis = redis_mgmt.Redis(slave['id'])
        ret = slave_redis.connect(slave['ip'], slave['redis_port'])
        self.assertEqual(
            ret, 0, 'failed to connect to redis, server:%d' % slave['id'])

        for i in range(0, key_count * 2):
            cmd = 'get %s%d\r\n' % (key, i)
            slave_redis.write(cmd)
            # The first reply line is discarded; the value is on the second.
            slave_redis.read_until('\r\n')
            res = slave_redis.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'inconsistent, server:%d, expected %d but %s'
                % (slave['id'], i, res))
        slave_redis.disconnect()
def elect_master_randomly(self):
    """Repeatedly force re-election until every server has been elected.

    After writing 1000 keys through a gateway, the method loops up to 30
    times: it sends 'role lconn' to the current master's SMR, waits for one
    master and two slaves to be reported, and compares each server's real
    role with the role and heartbeat returned by util.get_smr_info. From
    the second round on, the id of the master found at the start of the
    round is removed from the list of pending ids.

    Returns:
        0 once no ids are pending. If ids remain after 30 rounds the final
        assertion fails.
    """
    # set data
    gw_ip, gw_port = util.get_rand_gateway(self.cluster)
    gateway = gateway_mgmt.Gateway('0')
    gateway.connect(gw_ip, gw_port)
    for n in range(0, 1000):
        request = 'set %s%d %d\r\n' % (self.key_base, n, n)
        gateway.write(request)
        answer = gateway.read_until('\r\n')
        self.assertEqual(
            answer, '+OK\r\n',
            'failed to set values to gw(%s:%d). cmd:%s, res:%s'
            % (gw_ip, gw_port, request[:-2], answer[:-2]))

    # ids of servers that have not been seen as an elected master yet
    pending_ids = [server['id'] for server in self.cluster['servers']]

    for round_no in range(30):
        # get master, slave1, slave2
        m, s1, s2 = util.get_mss(self.cluster)
        self.assertNotEqual(m, None, 'master is None.')
        self.assertNotEqual(s1, None, 'slave1 is None.')
        self.assertNotEqual(s2, None, 'slave2 is None.')
        util.log('master id : %d' % m['id'])

        # The master found in round 0 is not ticked off; only masters
        # found in later rounds are.
        if round_no != 0 and m['id'] in pending_ids:
            pending_ids.remove(m['id'])

        smr = smr_mgmt.SMR(m['id'])
        rc = smr.connect(m['ip'], m['smr_mgmt_port'])
        self.assertEqual(
            rc, 0,
            'failed to connect to master. %s:%d'
            % (m['ip'], m['smr_mgmt_port']))

        role_cmd = 'role lconn\r\n'
        smr.write(role_cmd)
        reply = smr.read_until('\r\n')
        self.assertEqual(
            reply, '+OK\r\n',
            'failed : cmd="%s", reply="%s"' % (role_cmd[:-2], reply[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"'
                 % (role_cmd[:-2], reply[:-2]))

        # wait until role-change is finished (at most 5 probes)
        for _ in range(5):
            roles = [
                util.roleNumberToChar(util.get_role_of_server(server))
                for server in self.cluster['servers']
            ]
            count_master = roles.count('M')
            count_slave = roles.count('S')
            if count_master == 1 and count_slave == 2:
                break
            time.sleep(1)

        # check the number of master and slave
        self.assertEqual(
            count_master, 1,
            'failed : the number of master is not 1, count_master=%d, count_slave=%d'
            % (count_master, count_slave))
        self.assertEqual(
            count_slave, 2,
            'failed : the number of slave is not 2, count_master=%d, count_slave=%d'
            % (count_master, count_slave))
        util.log(
            'succeeded : the number of master is 1 and the number of slave is 2')

        # check states of all pgs in pg (at most 3 passes)
        for _ in range(3):
            ok = True
            for s in self.cluster['servers']:
                real_role = util.roleNumberToChar(util.get_role_of_server(s))

                smr_info = util.get_smr_info(s, self.leader_cm)
                cc_role = smr_info['smr_Role']
                cc_hb = smr_info['hb']

                # Once a server fails the check, ok stays False for the
                # rest of this pass.
                if cc_hb != 'Y' or real_role != cc_role:
                    ok = False

                if ok:
                    util.log(
                        'succeeded : a role of real pgs is the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s'
                        % (s['id'], real_role, cc_role, cc_hb))
                else:
                    util.log(
                        '\n\n**********************************************************\n\nretry: a role of real pgs is not the same with a role in cc, id=%d, real=%s, cc=%s, hb=%s'
                        % (s['id'], real_role, cc_role, cc_hb))

            if ok:
                break
            time.sleep(0.5)

        self.assertTrue(ok, 'failed : role check')

        if len(pending_ids) == 0:
            util.log('succeeded : all smrs have been as a master')
            return 0

    self.assertEqual(
        0, len(pending_ids),
        'failed : remains server ids=[%s]'
        % (','.join('%d' % sid for sid in pending_ids)))
    return 0
def test_5_transfer_pgs_to_another_machine(self):
    # Purpose: move a partition group onto two new servers while load
    # generators run, then check the load generators stayed consistent.
    # (Kept as a comment, not a docstring, so the name the test runner
    # displays for this test is unchanged.)
    #
    # Steps:
    #   1. Five rounds of "write 10000 keys, then fail over the master".
    #   2. Start self.max_load_generator load generators.
    #   3. bgsave on the three current servers.
    #   4. For config.server4 and config.server5: delete the dump, fetch
    #      one from the master and add the server to the cluster.
    #   5. Delete the three original servers one by one.
    #   6. Require a master and a slave among the new servers.
    #   7. Stop the load generators and assert each one is consistent.
    util.print_frame()

    self.load_gen_list = {}

    # get gateway info
    gw_ip, gw_port = util.get_rand_gateway(self.cluster)
    gateway = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    rc = gateway.connect(gw_ip, gw_port)
    self.assertEqual(rc, 0,
                     'failed to connect to gateway, %s:%d' % (gw_ip, gw_port))

    # increase master generation number
    util.log('failover in order to increase master generation number.')
    key_base = 'key'
    next_key = 0
    for _ in range(5):
        for n in range(next_key, next_key + 10000):
            request = 'set %s%d %d\r\n' % (key_base, n, n)
            gateway.write(request)
            answer = gateway.read_until('\r\n')
            self.assertEqual(answer, '+OK\r\n')
        next_key += 10000

        m = util.get_server_by_role(self.cluster['servers'], 'master')
        util.log('failover pgs%d' % m['id'])
        rc = util.failover(m, self.leader_cm)
        self.assertTrue(rc, 'failed to failover pgs%d' % m['id'])

    # start load generator
    util.log("start load_generator")
    for idx in range(self.max_load_generator):
        lg_ip, lg_port = util.get_rand_gateway(self.cluster)
        generator = load_generator.LoadGenerator(idx, lg_ip, lg_port)
        self.load_gen_list[idx] = generator
        generator.start()

    time.sleep(5)  # generate load for 5 sec
    util.log("started load_generator")

    m, s1, s2 = util.get_mss(self.cluster)
    old_servers = [m, s1, s2]

    # bgsave
    for pgs in old_servers:
        rc = util.bgsave(pgs)
        self.assertTrue(rc, 'failed to bgsave. pgs%d' % pgs['id'])

    new_servers = [config.server4, config.server5]

    # add new slaves
    for pgs in new_servers:
        util.log('delete pgs%d`s check point.' % pgs['id'])
        util.del_dumprdb(pgs['id'])

        rc = util.cluster_util_getdump(pgs['id'], m['ip'], m['redis_port'],
                                       'dump.rdb', 0, 8191)
        self.assertEqual(
            True, rc,
            'failed : util.cluster_util_getdump returns false, src=%s:%d dest_pgsid=%d'
            % (m['ip'], m['redis_port'], pgs['id']))

        rc = util.pgs_add(self.cluster, pgs, self.leader_cm, 0,
                          rm_ckpt=False)
        self.assertEqual(
            True, rc,
            'failed : util.pgs_add returns false, pgsid=%d' % pgs['id'])
        util.log('succeeeded : add a new slave, pgsid=%d' % pgs['id'])

        # check consistency; stop adding servers at the first generator
        # that reports False (the final assertions decide the verdict)
        if any(self.load_gen_list[j].isConsistent() == False
               for j in range(self.max_load_generator)):
            break

    for server_to_del in old_servers:
        for pgs in old_servers:
            util.pingpong(pgs['ip'], pgs['smr_mgmt_port'])
        for pgs in new_servers:
            util.pingpong(pgs['ip'], pgs['smr_mgmt_port'])
        self.__del_server(server_to_del)
        util.log('succeeded : delete pgs%d' % server_to_del['id'])

    new_m = util.get_server_by_role(new_servers, 'master')
    new_s = util.get_server_by_role(new_servers, 'slave')
    self.assertNotEqual(new_m, None, 'master is None.')
    self.assertNotEqual(new_s, None, 'slave is None.')

    for pgs in new_servers:
        util.pingpong(pgs['ip'], pgs['smr_mgmt_port'])

    time.sleep(5)  # generate load for 5 sec

    # check consistency of load_generator: signal all of them first, then
    # join and verify one by one
    for idx in range(self.max_load_generator):
        self.load_gen_list[idx].quit()
    for idx in range(self.max_load_generator):
        generator = self.load_gen_list[idx]
        generator.join()
        self.assertTrue(generator.isConsistent(),
                        'Inconsistent after migration')
        self.load_gen_list.pop(idx, None)
def pgs_add_and_del(self, upgrade_server, type):
    """Detach a PGS from the cluster, write data, re-attach it and verify.

    While one load generator per cluster server is running, the method
    sends 'pgs_leave' for upgrade_server to the leader CM, writes 50 keys
    through a gateway, sends 'pgs_join', waits for util.check_cluster to
    succeed and reads the 50 keys back from upgrade_server's redis.

    Args:
        upgrade_server: server dict; its 'cluster_name', 'id', 'ip' and
            'redis_port' entries are read.
        type: label that is only written to the log.

    Returns:
        0 on success. Every failed check raises via the unittest asserts.
        The load generators are stopped in either case.
    """
    util.print_frame()

    util.log('[start] add and del pgs%d. type:%s'
             % (upgrade_server['id'], type))
    util.log_server_state(self.cluster)

    load_gen_list = {}
    try:
        # start load generator
        for i, server in enumerate(self.cluster['servers']):
            load_gen = load_generator.LoadGenerator(
                server['id'], server['ip'], server['gateway_port'])
            load_gen.start()
            load_gen_list[i] = load_gen

        # detach pgs from cluster
        cmd = 'pgs_leave %s %d\r\n' % (upgrade_server['cluster_name'],
                                       upgrade_server['id'])
        ret = util.cm_command(self.leader_cm['ip'],
                              self.leader_cm['cm_port'], cmd)
        jobj = json.loads(ret)
        self.assertEqual(
            jobj['msg'], '+OK',
            'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))

        # set new values
        ip, port = util.get_rand_gateway(self.cluster)
        gw = gateway_mgmt.Gateway('0')
        gw.connect(ip, port)
        for i in range(0, 50):
            cmd = 'set %s%d %d\r\n' % (self.key_base, i, i)
            gw.write(cmd)
            res = gw.read_until('\r\n')
            self.assertEqual(
                res, '+OK\r\n',
                'failed to set values to gw(%s:%d). cmd:%s, res:%s'
                % (ip, port, cmd[:-2], res[:-2]))

        # attach pgs to cluster
        cmd = 'pgs_join %s %d\r\n' % (upgrade_server['cluster_name'],
                                      upgrade_server['id'])
        ret = util.cm_command(self.leader_cm['ip'],
                              self.leader_cm['cm_port'], cmd)
        jobj = json.loads(ret)
        self.assertEqual(
            jobj['msg'], '+OK',
            'failed : cmd="%s", reply="%s"' % (cmd[:-2], ret))
        util.log('succeeded : cmd="%s", reply="%s"' % (cmd[:-2], ret[:-2]))
        time.sleep(3)

        # wait until the cluster checks out again
        stable = False
        for _ in range(20):
            stable = util.check_cluster(self.cluster['cluster_name'],
                                        self.leader_cm['ip'],
                                        self.leader_cm['cm_port'])
            if stable:
                break
            time.sleep(0.5)
        self.assertTrue(stable, 'Unstable cluster')

        # check new values
        redis = redis_mgmt.Redis(upgrade_server['id'])
        ret = redis.connect(upgrade_server['ip'],
                            upgrade_server['redis_port'])
        self.assertEqual(
            ret, 0,
            'failed : connect to smr%d(%s:%d)'
            % (upgrade_server['id'], upgrade_server['ip'],
               upgrade_server['redis_port']))

        for i in range(0, 50):
            cmd = 'get %s%d\r\n' % (self.key_base, i)
            redis.write(cmd)
            # The first reply line is discarded; the value is on the second.
            redis.read_until('\r\n')
            res = redis.read_until('\r\n')
            self.assertEqual(
                res, '%d\r\n' % i,
                'failed to get values from redis%d. %s != %d'
                % (upgrade_server['id'], res, i))
        util.log('succeeded : check values with get operations on pgs%d.'
                 % (upgrade_server['id']))
    finally:
        # shutdown load generators, also when a check above failed, so
        # that no generator keeps running after this method returns
        for i in range(len(load_gen_list)):
            load_gen_list[i].quit()
            load_gen_list[i].join()

    util.log_server_state(self.cluster)
    return 0
def recovery_with_local_checkpoint_and_remote_log(self, role):
    """Restart a PGS whose replication logs were deleted and verify its data.

    Steps: write 50000 timestamp values through a gateway, run
    ``util.bgsave`` on the selected server, shut down its SMR and redis,
    delete its SMR logs, write 50000 more values while it is down, start it
    again and check that all 100000 values can be read from its redis.

    Args:
        role: role name handed to ``util.get_server_by_role`` to pick the
            server under test.

    A failed check raises through the ``self.assert*`` methods.
    """
    server = util.get_server_by_role(self.cluster['servers'], role)

    # set initial data in order to make an elapsed time for bgsave longer
    self.put_some_data()

    # set value
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(server['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, id:%d' % server['id'])
    timestamp = {}
    key_base = 'key0000000000111111111122222222223333333333444444444455555555556666666666777777777788888888889999999999'
    for i in range(0, 50000):
        timestamp[i] = time.time()
        k = '%s_%d' % (key_base, i)
        cmd = 'set %s %f\r\n' % (k, timestamp[i])
        gw.write(cmd)
        response = gw.read_until('\r\n')
        self.assertNotEqual(response.find('+OK'), -1,
                            'failed to set key value through gateway')

    # generate a check point
    bgsave_ret = util.bgsave(server)
    self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % server['id'])

    # shutdown
    ret = testbase.request_to_shutdown_smr(server)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server)
    self.assertEqual(ret, 0, 'failed to shutdown redis')
    util.log('succeeded : shutdown pgs%d' % (server['id']))

    # delete smr_logs
    ret = util.delete_smr_logs(server['id'])
    self.assertEqual(ret, 0, 'failed to delete smr log, id:%d' % server['id'])
    util.log('succeeded : delete replication logs')

    time.sleep(5)

    # set value
    # Drop the first gateway connection before reconnecting, so the old
    # connection is not left open behind the new one.
    gw.disconnect()
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway')
    for i in range(50000, 100000):
        timestamp[i] = time.time()
        k = '%s_%d' % (key_base, i)
        cmd = 'set %s %f\r\n' % (k, timestamp[i])
        gw.write(cmd)
        response = gw.read_until('\r\n')
        self.assertNotEqual(response.find('+OK'), -1,
                            'failed to set key value through gateway')
    gw.disconnect()

    # recovery
    ret = testbase.request_to_start_smr(server)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(server)
    self.assertEqual(ret, 0, 'failed to start redis')
    time.sleep(5)

    ret = testbase.wait_until_finished_to_set_up_role(server)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))
    util.log('succeeded : recover pgs%d' % server['id'])

    # check value
    recovered_redis = redis_mgmt.Redis(server['id'])
    ret = recovered_redis.connect(server['ip'], server['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')
    for i in range(0, 100000):
        k = '%s_%d' % (key_base, i)
        cmd = 'get %s\r\n' % (k)
        recovered_redis.write(cmd)
        # The first reply line is read and discarded; the value is expected
        # on the second line, formatted with the same '%f' used when writing.
        recovered_redis.read_until('\r\n')
        response = recovered_redis.read_until('\r\n')
        self.assertEqual(response, '%f\r\n' % (timestamp[i]),
                         'inconsistent %s, %f' % (response, timestamp[i]))
    recovered_redis.disconnect()
def test_restart_recovery_with_remote_checkpoint_and_remote_log(self):
    """Restart a slave without a local checkpoint and verify its data.

    Steps: write keys 0..99 through a gateway, delete the slave's dump via
    ``util.del_dumprdb``, run ``util.bgsave`` on the master, shut down the
    slave's SMR, write keys 100..199 while it is down, start the slave
    again and check that all 200 keys can be read from its redis.

    A failed check raises through the ``self.assert*`` methods.
    """
    util.print_frame()
    key_base = 'key'

    target = util.get_server_by_role(self.cluster['servers'], 'slave')
    master = util.get_server_by_role(self.cluster['servers'], 'master')

    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(master['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway')

    # set initial data in order to make an elapsed time for bgsave longer
    self.put_some_data()

    # generate some data
    for i in range(0, 100):
        key = '%s%d' % (key_base, i)
        cmd = 'set %s %d\r\n' % (key, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')
    gw.disconnect()

    # delete a local checkpoint
    util.log('delete pgs%d`s check point.' % target['id'])
    util.del_dumprdb(target['id'])

    # generate a remote check point
    bgsave_ret = util.bgsave(master)
    self.assertTrue(bgsave_ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown
    # NOTE(review): only SMR is shut down here, yet redis is started again
    # during recovery below - presumably redis stops together with SMR;
    # confirm against testbase.
    util.log('shutdown target')
    ret = testbase.request_to_shutdown_smr(target)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    time.sleep(10)

    # generate some data
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway')
    for i in range(100, 200):
        key = '%s%d' % (key_base, i)
        cmd = 'set %s %d\r\n' % (key, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')
    gw.disconnect()

    # recovery
    util.log('recovery target')
    ret = testbase.request_to_start_smr(target)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(target)
    self.assertEqual(ret, 0, 'failed to start redis')
    time.sleep(5)

    ret = testbase.wait_until_finished_to_set_up_role(target)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (target['id']))

    # check value
    recovered_redis = redis_mgmt.Redis(target['id'])
    ret = recovered_redis.connect(target['ip'], target['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')
    for i in range(0, 200):
        key = '%s%d' % (key_base, i)
        cmd = 'get %s\r\n' % (key)
        recovered_redis.write(cmd)
        # The first reply line is read and discarded; the value is expected
        # on the second line.
        recovered_redis.read_until('\r\n')
        response = recovered_redis.read_until('\r\n')
        self.assertEqual(response, '%d\r\n' % i,
                         'inconsistent %s, %d' % (response, i))
    recovered_redis.disconnect()
def failure_recovery(self, role, wait_count=10, redis_only=False):
    """Shut down and restart the server holding ``role``, then verify it.

    A key is written through a gateway, the selected server is shut down,
    the key is overwritten while the server is down, the server is started
    again and the overwritten value is read back from its redis.

    Args:
        role: role name handed to ``util.get_server_by_role`` to pick the
            server under test.
        wait_count: forwarded to
            ``testbase.wait_until_finished_to_set_up_role``.
        redis_only: when true, only redis is shut down and started; the
            SMR shutdown/start requests are skipped.

    A failed check raises through the ``self.assert*`` methods.
    """
    time.sleep(2)

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # set value
    key = 'new_key_haha'
    cmd = 'set %s 12345\r\n' % (key)
    gw.write(cmd)
    res = gw.read_until('\r\n')
    self.assertEqual(res, '+OK\r\n')

    # shutdown
    server = util.get_server_by_role(self.cluster['servers'], role)

    if not redis_only:
        ret = testbase.request_to_shutdown_smr(server)
        self.assertEqual(ret, 0, 'failed to shutdown smr')

    ret = testbase.request_to_shutdown_redis(server)
    self.assertEqual(ret, 0, 'failed to shutdown redis')

    # check state F
    max_try = 20
    expected = 'F'
    for _ in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s' % (server['id'], state, expected))

    # set value
    check_value = '54321'
    cmd = 'set %s %s\r\n' % (key, check_value)
    gw.write(cmd)
    res = gw.read_until('\r\n')
    self.assertEqual(res, '+OK\r\n')
    gw.disconnect()

    # recovery
    if not redis_only:
        ret = testbase.request_to_start_smr(server)
        self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(server)
    self.assertEqual(ret, 0, 'failed to start redis')

    ret = testbase.wait_until_finished_to_set_up_role(server, wait_count)
    self.assertEqual(ret, 0, 'failed to role change. smr_id:%d' % (server['id']))

    redis = redis_mgmt.Redis(server['id'])
    ret = redis.connect(server['ip'], server['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    # check state N
    max_try = 20
    expected = 'N'
    for _ in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    # Looked up only for the failure message; kept in its own local so the
    # `role` argument is not overwritten.
    current_role = util.get_role_of_server(server)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s, role:%s'
        % (server['id'], state, expected, current_role))

    # check value
    cmd = 'get %s\r\n' % (key)
    redis.write(cmd)
    # The first reply line is read and discarded; the value is expected on
    # the second line.
    redis.read_until('\r\n')
    response = redis.read_until('\r\n')
    self.assertEqual(response, '%s\r\n' % (check_value),
                     'inconsistent %s, %s' % (response, check_value))
    redis.disconnect()
def deprecated_test_5_PGS_commit_is_greater_than_PG_commit(self):
    """Check that an old master with extra data cannot rejoin the group.

    Steps: shut down both slaves, write more data and run ``util.bgsave``
    on the master, shut the master down, start the two slaves again, write
    10000 keys through one of them and verify the keys on both, then start
    the old master and expect that its SMR state does not become 'N'.

    Returns:
        0 when every check passed. A failed check raises through the
        ``self.assert*`` methods.
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    master, s1, s2 = util.get_mss(self.cluster)
    server_to_join = [s1, s2]

    # shutdown slaves
    for slave in server_to_join:
        ret = testbase.request_to_shutdown_smr(slave)
        self.assertEqual(ret, 0, 'failed to shutdown smr%d' % slave['id'])
        util.log('succeeded to shutdown smr%d' % slave['id'])

        ret = testbase.request_to_shutdown_redis(slave)
        self.assertEqual(ret, 0, 'failed to shutdown redis')
        util.log('succeeded to shutdown redis%d' % slave['id'])

        # check state F
        max_try = 20
        expected = 'F'
        for _ in range(0, max_try):
            state = util.get_smr_state(slave, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        self.assertEqual(
            expected, state,
            'server%d - state:%s, expected:%s' % (slave['id'], state, expected))

    # put more data
    util.put_some_data(self.cluster, 10, 256)

    # bgsave
    ret = util.bgsave(master)
    self.assertTrue(ret, 'failed to bgsave. pgs%d' % master['id'])

    # shutdown master
    ret = testbase.request_to_shutdown_smr(master)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    util.log('succeeded to shutdown master smr, id=%d' % master['id'])
    ret = testbase.request_to_shutdown_redis(master)
    self.assertEqual(ret, 0, 'failed to shutdown redis')
    util.log('succeeded to shutdown master redis, id=%d' % master['id'])

    # check state F
    max_try = 20
    expected = 'F'
    for _ in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s' % (master['id'], state, expected))

    # recovery slaves
    for slave in server_to_join:
        ret = testbase.request_to_start_smr(slave)
        self.assertEqual(ret, 0, 'failed to start smr')

        ret = testbase.request_to_start_redis(slave)
        self.assertEqual(ret, 0, 'failed to start redis')

        ret = testbase.wait_until_finished_to_set_up_role(slave, 10)
        self.assertEqual(
            ret, 0, 'failed to role change. smr_id:%d' % (slave['id']))

        # check state N
        max_try = 20
        expected = 'N'
        for _ in range(0, max_try):
            state = util.get_smr_state(slave, self.leader_cm)
            if expected == state:
                break
            time.sleep(1)
        role = util.get_role_of_server(slave)
        self.assertEqual(
            expected, state,
            'server%d - state:%s, expected:%s, role:%s'
            % (slave['id'], state, expected, role))

    # set value
    s = random.choice(server_to_join)
    # Use the id of the chosen server; the previous code passed the list
    # literal ['id'] instead of s['id'].
    redis = redis_mgmt.Redis(s['id'])
    ret = redis.connect(s['ip'], s['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    key_base = 'key_test'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        redis.write(cmd)
        res = redis.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')
    redis.disconnect()

    for slave in server_to_join:
        redis = redis_mgmt.Redis(slave['id'])
        ret = redis.connect(slave['ip'], slave['redis_port'])
        self.assertEqual(ret, 0, 'failed to connect to redis')

        # check value
        for j in range(0, 10000):
            cmd = 'get %s%d\r\n' % (key_base, j)
            redis.write(cmd)
            # The first reply line is read and discarded; the value is
            # expected on the second line.
            redis.read_until('\r\n')
            response = redis.read_until('\r\n')
            self.assertEqual(response, '%d\r\n' % (j),
                             'inconsistent %s, %d' % (response[:-2], j))
        # Close each per-slave connection before opening the next one.
        redis.disconnect()

    # try to recover master, but failed
    ret = testbase.request_to_start_smr(master)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(master, False)
    self.assertEqual(ret, 0, 'failed to start redis')

    # Give the old master a few seconds; it is expected NOT to reach 'N'.
    max_try = 3
    expected = 'N'
    for _ in range(0, max_try):
        state = util.get_smr_state(master, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(master)
    self.assertNotEqual(
        expected, state,
        'server%d - state:%s, expected:not %s, role:%s'
        % (master['id'], state, expected, role))
    util.log(
        'success : the old master that has a greater commit-seq than the '
        'current master tried to join as a slave, but it is blocked '
        'successfully.')

    gw.disconnect()
    return 0
def test_4_PGS_mgen_is_less_than_PG_mgen(self):
    """Rejoin a former master after two further master failovers.

    Steps: shut down the current master, write keys 0..9999 through a
    gateway, run ``self.failover`` on the current master twice (with a
    quorum check in between), start the first master again, write keys
    10000..19999 and check that all 20000 keys can be read from the redis
    of the restarted server.

    Returns:
        0 when every check passed. A failed check raises through the
        ``self.assert*`` methods.
    """
    util.print_frame()

    # get gateway info
    ip, port = util.get_rand_gateway(self.cluster)
    gw = gateway_mgmt.Gateway(self.cluster['servers'][0]['id'])
    ret = gw.connect(ip, port)
    self.assertEqual(ret, 0, 'failed to connect to gateway, %s:%d' % (ip, port))

    # initial data
    util.put_some_data(self.cluster)

    # shutdown
    server_to_join = util.get_server_by_role(self.cluster['servers'], 'master')
    ret = testbase.request_to_shutdown_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to shutdown smr')
    ret = testbase.request_to_shutdown_redis(server_to_join)
    self.assertEqual(ret, 0, 'failed to shutdown redis')

    # check state F
    max_try = 20
    expected = 'F'
    for _ in range(0, max_try):
        state = util.get_smr_state(server_to_join, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s'
        % (server_to_join['id'], state, expected))

    # set value
    key_base = 'mw'
    for i in range(0, 10000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

    # master failover 1 (master generation + 1)
    util.log('master failover 1')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # check quorum (copy:3, quorum:1, available:2)
    ok = False
    for _ in range(10):
        ok = util.check_quorum(self.cluster['cluster_name'],
                               self.leader_cm['ip'], self.leader_cm['cm_port'])
        if ok:
            break
        time.sleep(1)
    self.assertTrue(ok, 'Check quorum fail.')

    # master failover 2 (master generation + 1)
    util.log('master failover 2')
    server = util.get_server_by_role(self.cluster['servers'], 'master')
    self.failover(server)

    # recovery
    util.log('master recovery start.')
    ret = testbase.request_to_start_smr(server_to_join)
    self.assertEqual(ret, 0, 'failed to start smr')

    ret = testbase.request_to_start_redis(server_to_join)
    self.assertEqual(ret, 0, 'failed to start redis')

    ret = testbase.wait_until_finished_to_set_up_role(server_to_join, 10)
    self.assertEqual(
        ret, 0, 'failed to role change. smr_id:%d' % (server_to_join['id']))
    util.log('master recovery end successfully.')

    # check state N
    # NOTE(review): this polls `server` (the master used for failover 2),
    # not `server_to_join` that was just restarted - confirm this is the
    # intended target.
    max_try = 20
    expected = 'N'
    for _ in range(0, max_try):
        state = util.get_smr_state(server, self.leader_cm)
        if expected == state:
            break
        time.sleep(1)
    role = util.get_role_of_server(server)
    self.assertEqual(
        expected, state,
        'server%d - state:%s, expected:%s, role:%s'
        % (server['id'], state, expected, role))

    time.sleep(5)

    # set value
    for i in range(10000, 20000):
        cmd = 'set %s%d %d\r\n' % (key_base, i, i)
        gw.write(cmd)
        res = gw.read_until('\r\n')
        self.assertEqual(res, '+OK\r\n')

    # NOTE(review): the result of this lookup is not used afterwards; the
    # call is kept in case it is relied on for its checks - confirm and drop.
    server = util.get_server_by_role(self.cluster['servers'], 'master')

    redis = redis_mgmt.Redis(server_to_join['id'])
    ret = redis.connect(server_to_join['ip'], server_to_join['redis_port'])
    self.assertEqual(ret, 0, 'failed to connect to redis')

    # check value
    for i in range(0, 20000):
        cmd = 'get %s%d\r\n' % (key_base, i)
        redis.write(cmd)
        # The first reply line is read and discarded; the value is expected
        # on the second line.
        redis.read_until('\r\n')
        response = redis.read_until('\r\n')
        self.assertEqual(response, '%d\r\n' % (i),
                         'inconsistent %s, %d' % (response[:-2], i))
    redis.disconnect()

    gw.disconnect()
    return 0