def test_etcd_heartbeat_timeout(startcredis_etcdonly):
    """Test that failure is detected and repaired within a heartbeat timeout.

    Starts a 3-node chain in etcd master mode with a 2-second heartbeat
    timeout, kills the node at index 1 *without* notifying the master, and
    then checks that the driver process completed all ``n`` operations and
    that the master reports a chain of length 2.

    Args:
        startcredis_etcdonly: fixture requested by name; it is defined
            outside this block (presumably it prepares the etcd-only
            environment -- confirm against its definition).
    """
    # Start members with a quick heartbeat timeout.
    common.Start(
        chain=common.MakeChain(3),
        master_mode=MASTER_ETCD,
        heartbeat_interval=1,
        heartbeat_timeout=2)

    # Launch driver process.  Note that it will take a minimum of 10 seconds.
    n = 10
    sleep_secs = 1
    driver = multiprocessing.Process(target=SeqPut, args=(n, sleep_secs))
    driver.start()

    time.sleep(0.1)
    common.KillNode(index=1)  # Don't notify master
    # Heartbeat should expire within 2 sec.
    driver.join()

    assert ops_completed.value == n

    # NOTE: a leftover `pdb.set_trace()` used to sit here; it stopped the
    # test at an interactive prompt, so the checks below never ran unattended.
    chain = master_client.execute_command('MASTER.GET_CHAIN')
    # The chain started with 3 members and one was killed.
    assert len(chain) == 2, 'chain %s' % chain
    Check(ops_completed.value)
def test_etcd_master_recovery(startcredis_etcdonly):
    """Test that the master can recover its state from etcd.

    Records the chain, head and tail reported by the master, kills and
    restarts the master in etcd mode, and asserts that the restarted master
    reports the same three values.  A node is then added to confirm that the
    chain can still grow (length 4).

    Args:
        startcredis_etcdonly: fixture requested by name; defined outside
            this block.
    """
    common.Start(
        chain=common.MakeChain(3),
        master_mode=MASTER_ETCD,
        heartbeat_interval=1,
        heartbeat_timeout=10)

    # Snapshot the master's view before it is killed.
    chain = master_client.execute_command('MASTER.GET_CHAIN')
    head = master_client.execute_command('MASTER.REFRESH_HEAD')
    tail = master_client.execute_command('MASTER.REFRESH_TAIL')
    assert len(chain) == 3, 'chain %s' % chain

    common.KillMaster()
    time.sleep(0.2)
    common.StartMaster(master_mode=MASTER_ETCD)
    time.sleep(0.1)

    # The restarted master must report the same view as before.
    assert chain == master_client.execute_command('MASTER.GET_CHAIN')
    assert head == master_client.execute_command('MASTER.REFRESH_HEAD')
    assert tail == master_client.execute_command('MASTER.REFRESH_TAIL')

    # Sanity check that normal operation can continue.
    # NOTE(review): the other etcd tests in this file pass
    # master_mode=MASTER_ETCD to AddNode; confirm whether it is needed here.
    new_node, _ = common.AddNode(master_client)
    try:
        assert len(master_client.execute_command('MASTER.GET_CHAIN')) == 4
    finally:
        # Always kill the extra node, even when the assertion fails, so the
        # process is not left running.
        new_node.kill()
def test_etcd_kill_middle(startcredis_etcdonly):
    """Test that the tail keeps receiving updates after the middle node dies.

    The node at index 1 of a 3-node chain is killed (with the master
    notified) while a driver process is issuing writes; afterwards all
    writes must have completed and the master must report a 2-member chain.
    """
    # Start members with a quick heartbeat timeout.
    three_node_chain = common.MakeChain(3)
    common.Start(
        chain=three_node_chain,
        master_mode=MASTER_ETCD,
        heartbeat_interval=1,
        heartbeat_timeout=2,
    )

    # Launch driver process.
    n, sleep_secs = 100, 0.1
    writer = multiprocessing.Process(target=SeqPut, args=(n, sleep_secs))
    writer.start()

    time.sleep(0.1)
    middle_port = common.PortForNode(1)
    common.KillNode(index=1, notify=master_client)
    writer.join()

    assert ops_completed.value == n

    reported_chain = master_client.execute_command('MASTER.GET_CHAIN')
    assert len(reported_chain) == 2, 'chain %s' % reported_chain
    Check(ops_completed.value)
def testCkptOnly(self):
    """In checkpoint-only mode, checkpointing works but flushing is refused.

    Issues ``TAIL.CHECKPOINT`` (expected to succeed) and then
    ``HEAD.FLUSH``, which is expected to raise
    ``redis.exceptions.ResponseError`` mentioning that the GCS mode is not
    ``kCkptFlush``.
    """
    common.Start(gcs_mode=common.GCS_CKPTONLY)
    self.ack_client.execute_command('TAIL.CHECKPOINT')
    with self.assertRaises(redis.exceptions.ResponseError) as ctx:
        self.head_client.execute_command('HEAD.FLUSH')
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn('GcsMode is NOT set to kCkptFlush', str(ctx.exception))
def testNormal(self):
    """In the default mode, both checkpointing and flushing are refused.

    Both ``TAIL.CHECKPOINT`` and ``HEAD.FLUSH`` are expected to raise
    ``redis.exceptions.ResponseError`` with a message naming the GCS mode.
    """
    common.Start()
    # By default, the execution mode is kNormal, which disallows flush/ckpt.
    with self.assertRaises(redis.exceptions.ResponseError) as ctx:
        self.ack_client.execute_command('TAIL.CHECKPOINT')
    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn('GcsMode is set to kNormal', str(ctx.exception))

    with self.assertRaises(redis.exceptions.ResponseError) as ctx:
        self.head_client.execute_command('HEAD.FLUSH')
    self.assertIn('GcsMode is NOT set to kCkptFlush', str(ctx.exception))
def BenchVanillaRedis(num_ops):
    """Time ``num_ops`` sequential ``SET`` commands against a 1-node chain.

    The single chain node is driven with plain redis ``SET`` commands (key
    and value are both the decimal string of the loop index), and the
    measured throughput and mean latency are written via ``common.log``.

    Args:
        num_ops: number of ``SET`` commands to issue.
    """
    common.Start(chain=common.MakeChain(1))
    time.sleep(0.1)

    # Just use the chain node as a regular redis server.
    client = AckClient()

    began = time.time()
    # Each index is stringified once and reused as both key and value.
    for key in map(str, range(num_ops)):
        client.execute_command('SET', key, key)
    elapsed = time.time() - began

    writes_per_sec = num_ops * 1.0 / elapsed
    mean_latency_us = elapsed * 1e6 / num_ops
    common.log('throughput %.1f writes/sec; latency (us): mean %.5f std ? num %d'
               % (writes_per_sec, mean_latency_us, num_ops))
def BenchCredis(num_nodes, num_ops, num_clients, master_mode):
    """Run ``SeqPut`` against a freshly started chain and verify the result.

    Args:
        num_nodes: length of the chain to start.
        num_ops: number of operations each driver process is asked to do.
        num_clients: number of driver processes; currently must be 1.
        master_mode: forwarded to ``common.Start``.
    """
    common.Start(chain=common.MakeChain(num_nodes), master_mode=master_mode)
    time.sleep(0.1)

    # TODO(zongheng): ops_completed needs to be changed
    assert num_clients == 1

    workers = [
        multiprocessing.Process(target=SeqPut, args=(num_ops, 0))
        for _ in range(num_clients)
    ]
    # Start every worker before waiting on any of them.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    assert ops_completed.value == num_ops
    Check(ops_completed.value)
def testAck(self):
    """Check that a ``MEMBER.PUT`` is acknowledged on the client's channel.

    A put is sent to the server on port 6370 while a second connection to
    port 6371 is subscribed to ``_CLIENT_ID``; the message received there
    must carry the sequence number returned by the put, which must be 0.
    """
    common.Start()

    host = "127.0.0.1"
    head_client = redis.StrictRedis(host, 6370)
    tail_client = redis.StrictRedis(host, 6371)
    # The ack client needs to be separate, since subscriptions
    # are blocking
    ack_client = redis.StrictRedis(host, 6371)

    subscription = ack_client.pubsub(ignore_subscribe_messages=True)
    subscription.subscribe(_CLIENT_ID)
    time.sleep(0.5)
    # First poll after subscribing; its result is discarded.
    subscription.get_message()

    ssn = head_client.execute_command(
        "MEMBER.PUT", "task_spec", "some_random_value", _CLIENT_ID)
    time.sleep(0.5)
    put_ack = subscription.get_message()

    assert ssn == 0
    # Check the sequence number
    assert int(put_ack["data"]) == ssn
def test_etcd_master_online_recovery(startcredis_etcdonly):
    """Test that SeqPut survives a master restart plus a membership change.

    While SeqPut is running, the master is killed and restarted, then a
    member is killed, then a member is added.  The restarted master should
    be able to recover the chain, with the new member being the tail, and
    no updates should be lost.

    Args:
        startcredis_etcdonly: fixture requested by name; defined outside
            this block.
    """
    common.Start(
        chain=common.MakeChain(3),
        master_mode=MASTER_ETCD,
        heartbeat_interval=1,
        heartbeat_timeout=10)

    # Launch driver process.  Note that it will take a minimum of 10 seconds.
    n = 10
    sleep_secs = 1
    driver = multiprocessing.Process(target=SeqPut, args=(n, sleep_secs))
    driver.start()

    # Restart the master while the driver is running.
    time.sleep(0.1)
    common.KillMaster()
    common.StartMaster(master_mode=MASTER_ETCD)
    time.sleep(0.1)
    assert len(master_client.execute_command('MASTER.GET_CHAIN')) == 3

    # Kill the node at index 1 and tell the master about it.
    time.sleep(0.1)
    common.KillNode(index=1, notify=master_client)
    assert len(master_client.execute_command('MASTER.GET_CHAIN')) == 2

    new_node, _ = common.AddNode(master_client, master_mode=MASTER_ETCD)
    try:
        time.sleep(0.1)
        # One join is enough; the original joined the same process twice.
        driver.join()
        assert len(master_client.execute_command('MASTER.GET_CHAIN')) == 3

        assert ops_completed.value == n
        Check(ops_completed.value)
    finally:
        # Cleanup: kill the added node even if an assertion above failed.
        new_node.kill()
def test_etcd_kill_node_while_master_is_dead(startcredis_etcdonly):
    """Test that SeqPut succeeds when a node dies while the master is down.

    The master is killed and a node is killed WHILE the master is dead.
    The master is then restarted.  No updates should be lost.

    TODO: Fails (3/28) because members are not checked for liveness when the
    master starts up.

    Args:
        startcredis_etcdonly: fixture requested by name; defined outside
            this block.
    """
    # Choose a long heartbeat timeout so that the master never receives
    # heartbeat expiry notifs.
    common.Start(
        chain=common.MakeChain(3),
        master_mode=MASTER_ETCD,
        heartbeat_interval=1,
        heartbeat_timeout=999)

    # Launch driver process.  Note that it will take a minimum of 10 seconds.
    n = 10
    sleep_secs = 1
    driver = multiprocessing.Process(target=SeqPut, args=(n, sleep_secs))
    driver.start()

    time.sleep(0.1)
    common.KillMaster()
    # No `notify=` here: the master is down while this node is killed.
    common.KillNode(index=1)
    common.StartMaster(master_mode=MASTER_ETCD)
    time.sleep(0.2)
    assert len(master_client.execute_command('MASTER.GET_CHAIN')) == 2

    new_node, _ = common.AddNode(master_client, master_mode=MASTER_ETCD)
    try:
        time.sleep(0.1)
        assert len(master_client.execute_command('MASTER.GET_CHAIN')) == 3

        driver.join()

        assert ops_completed.value == n
        Check(ops_completed.value)
    finally:
        # Cleanup: kill the added node even if an assertion above failed.
        new_node.kill()
def testBasics(self):
    """Walk one key through put / checkpoint / flush in kCkptFlush mode.

    Three values are written to ``k1``, interleaved with ``TAIL.CHECKPOINT``
    and ``HEAD.FLUSH`` calls, and the integer each command returns is
    checked at every step.  At the end the native ``GET`` must return
    nothing while credis' ``READ`` still returns the last value.
    """

    def put(value):
        # Write `value` to k1 through the head.
        return self.head_client.execute_command(
            'MEMBER.PUT', 'k1', value, _CLIENT_ID)

    def checkpoint():
        return self.ack_client.execute_command('TAIL.CHECKPOINT')

    def flush():
        return self.head_client.execute_command('HEAD.FLUSH')

    def ack(*command):
        return self.ack_client.execute_command(*command)

    common.Start(gcs_mode=common.GCS_CKPTFLUSH)

    put('v1')
    self.assertEqual(b'v1', ack('READ', 'k1'))
    # 1 entry checkpointed.
    self.assertEqual(1, checkpoint())
    # 0 entry checkpointed.
    self.assertEqual(0, checkpoint())

    put('v2')
    self.assertEqual(1, checkpoint())
    put('v3')

    # Process k1 (first seqnum).  Physically, 0 key has been flushed out of
    # _redis_ memory state, because k1 has 2 dirty writes.
    self.assertEqual(0, flush())
    # Process k1 (second seqnum).
    self.assertEqual(0, flush())
    # It remains in memory because of a dirty write (k1, v3).
    self.assertEqual(b'v3', ack('GET k1'))

    # Now all seqnums checkpointed.
    self.assertEqual(1, checkpoint())
    # Process k1 (3rd seqnum).  1 means it's physically flushed.
    self.assertEqual(1, flush())

    # Check that redis's native GET returns nothing.
    self.assertIsNone(ack('GET k1'))
    # READ is credis' read mechanism, can read checkpoints.
    self.assertEqual(b'v3', ack('READ k1'))
def testCannotFlush(self):
    """``HEAD.FLUSH`` on a freshly started kCkptFlush setup returns 0."""
    common.Start(gcs_mode=common.GCS_CKPTFLUSH)
    self.assertEqual(0, self.head_client.execute_command('HEAD.FLUSH'))
def testCkptFlush(self):
    """Both commands are accepted (neither raises) in kCkptFlush mode."""
    common.Start(gcs_mode=common.GCS_CKPTFLUSH)

    # Checkpoint through the ack client first, then flush through the head.
    for client, command in (
            (self.ack_client, 'TAIL.CHECKPOINT'),
            (self.head_client, 'HEAD.FLUSH'),
    ):
        client.execute_command(command)