async def _test_slow_path_view_change(self):
    """Send writes across a stop/restart of replica 0, then wait for the slow path.

    For every configuration yielded by ``bft.interesting_configs()``:

    1. start all replicas, perform ten known writes and assert that the
       fast path is prevalent;
    2. stop replica 0 and keep sending write requests for at most five
       seconds;
    3. restart replica 0 and wait, through
       ``_wait_for_slow_path_after_view_change``, for the slow path as of
       sequence number 10.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            await network.init()
            network.start_all_replicas()
            protocol = kvbc.SimpleKVBCProtocol(network)

            # Baseline traffic while every replica is up.
            for _ in range(10):
                await protocol.write_known_kv()
            await network.assert_fast_path_prevalent()

            network.stop_replica(0)

            # The cancel scope bounds the writer task to five seconds
            # (presumably the task would otherwise run until cancelled).
            with trio.move_on_after(seconds=5):
                async with trio.open_nursery() as writers:
                    writers.start_soon(
                        protocol.send_indefinite_write_requests)

            network.start_replica(0)
            await self._wait_for_slow_path_after_view_change(
                network, as_of_seq_num=10)
async def _test_read_written_data_after_restart_of_all_nodes(self):
    """Write one key/value pair, restart every replica, and read the pair back.

    Runs once per configuration from ``bft.interesting_configs()`` and
    asserts that the parsed read reply equals ``{key: value}``.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            await network.init()
            network.start_all_replicas()
            protocol = kvbc.SimpleKVBCProtocol(network)

            key = protocol.random_key()
            value = protocol.random_value()
            write_msg = protocol.write_req([], [(key, value)], 0)

            client = network.random_client()
            await client.write(write_msg)

            # Bounce the whole cluster before reading.
            network.stop_all_replicas()
            network.start_all_replicas()

            read_msg = protocol.read_req([key])
            raw_reply = await client.read(read_msg)
            self.assertEqual({key: value}, protocol.parse_reply(raw_reply))
async def _test_state_transfer(self):
    """Run state transfer into a randomly chosen non-zero replica.

    Per configuration: pick a stale node other than replica 0, prime the
    network for state transfer with ``persistency_enabled=False``, start
    the stale node and wait for state transfer to start and then stop
    (with replica 0 as the first argument). A put/get round trip is
    asserted, a third replica (neither 0 nor the stale node) is stopped,
    and the put/get round trip is asserted again.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            protocol = kvbc.SimpleKVBCProtocol(network)

            candidates = network.all_replicas(without={0})
            stale_node = random.choice(candidates)

            # NOTE(review): no explicit init()/start_all_replicas() here;
            # presumably prime_for_state_transfer brings the network up.
            await protocol.prime_for_state_transfer(
                stale_nodes={stale_node},
                persistency_enabled=False
            )

            network.start_replica(stale_node)
            await network.wait_for_state_transfer_to_start()
            await network.wait_for_state_transfer_to_stop(0, stale_node)
            await protocol.assert_successful_put_get(self)

            # Stop one more replica and check that put/get still succeeds.
            others = network.all_replicas(without={0, stale_node})
            victim = random.choice(others)
            network.stop_replica(victim)
            await protocol.assert_successful_put_get(self)
async def _test_single_vc_only_primary_down(self):
    """Stop only replica 0 and expect view 1 to be reported afterwards.

    Per configuration: start all replicas, send random writes, confirm via
    replica 0 that the view is still 0, stop replica 0, send random writes
    again, and wait until a randomly chosen replica other than 0 reports
    view 1.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            await network.init()
            network.start_all_replicas()
            protocol = kvbc.SimpleKVBCProtocol(network)

            initial_primary = 0
            expected_next_primary = 1

            await self._send_random_writes(protocol)
            await network.wait_for_view_change(
                replica_id=initial_primary,
                expected=lambda v: v == initial_primary,
                err_msg="Make sure we are in the initial view "
                        "before crashing the primary."
            )

            network.stop_replica(initial_primary)
            await self._send_random_writes(protocol)

            # Query a replica that is still running.
            observer = random.choice(network.all_replicas(without={0}))
            await network.wait_for_view_change(
                replica_id=observer,
                expected=lambda v: v == expected_next_primary,
                err_msg="Make sure view change has been triggered."
            )
async def _test_checkpoints_saved_and_transferred(self):
    """Verify that a stale replica catches up through state transfer.

    Per configuration: choose a stale node other than replica 0, prime the
    network for state transfer, start the stale node and wait for state
    transfer to start and stop. A quorum including the stale node is then
    forced, the initially written key is read back and compared with the
    known value, and a put/get round trip is asserted.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            protocol = kvbc.SimpleKVBCProtocol(network)

            non_primary_ids = list(set(range(test_config.n)) - {0})
            stale_node = random.choice(non_primary_ids)

            client, known_key, known_kv = \
                await protocol.prime_for_state_transfer(
                    stale_nodes={stale_node})

            # Bring up the replica that has no data and let state transfer
            # run to completion.
            network.start_replica(stale_node)
            await network.wait_for_state_transfer_to_start()
            up_to_date_node = 0
            await network.wait_for_state_transfer_to_stop(
                up_to_date_node, stale_node)

            network.force_quorum_including_replica(stale_node)

            # Reading back the first value confirms that state transfer
            # worked when the log went away.
            kvpairs = await client.read([known_key])
            self.assertDictEqual(dict(known_kv), kvpairs)

            # Newly written data must also be readable after state transfer.
            await protocol.assert_successful_put_get(self)
async def _test_st_when_fetcher_and_sender_crash(self):
    """Run state transfer while non-primary replicas are being crashed.

    Uses configurations from ``bft.interesting_configs(f_min=2)``. A stale
    node other than replica 0 is primed with ``checkpoints_num=4``; the
    remaining replicas (excluding 0 and the stale node) are handed to
    ``_run_state_transfer_while_crashing_non_primary`` as the unstable
    set. Afterwards a quorum including the stale node is forced and the
    initially written key is read back and compared with the known value.
    """
    for params in bft.interesting_configs(f_min=2):
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            protocol = kvbc.SimpleKVBCProtocol(network)

            non_primary_ids = list(set(range(test_config.n)) - {0})
            stale_node = random.choice(non_primary_ids)

            client, known_key, known_kv = \
                await protocol.prime_for_state_transfer(
                    checkpoints_num=4,
                    stale_nodes={stale_node}
                )

            # Everything except the primary and the stale node may crash.
            unstable_replicas = list(
                set(range(test_config.n)) - {0} - {stale_node})

            await self._run_state_transfer_while_crashing_non_primary(
                bft_network=network,
                primary=0,
                stale=stale_node,
                unstable_replicas=unstable_replicas)

            network.force_quorum_including_replica(stale_node)

            # Reading back the first value confirms that state transfer
            # worked when the log went away.
            kvpairs = await client.read([known_key])
            self.assertDictEqual(dict(known_kv), kvpairs)
async def _test_wreak_havoc(self):
    """Run concurrent operations while crashing the primary, then verify.

    For every configuration from ``bft.interesting_configs()`` the method
    stores a fresh protocol object, history tracker, network and status on
    ``self``, starts all replicas, and runs ``run_concurrent_ops`` (with
    500 operations) concurrently with ``crash_primary`` inside a nursery.
    Once both tasks have finished, ``verify`` is awaited. The method then
    pauses for two seconds before moving on to the next configuration.
    """
    num_ops = 500
    for c in bft.interesting_configs():
        print(f"\n\nStarting test with configuration={c}", flush=True)
        config = bft.TestConfig(c['n'],
                                c['f'],
                                c['c'],
                                c['num_clients'],
                                key_file_prefix=KEY_FILE_PREFIX,
                                start_replica_cmd=start_replica_cmd)

        with bft.BftTestNetwork(config) as bft_network:
            # These attributes are set on self so the nursery tasks below
            # can use them (presumably that is how they are consumed).
            self.skvbc = kvbc.SimpleKVBCProtocol(bft_network)
            init_state = self.skvbc.initial_state()
            self.tracker = skvbc_history_tracker.SkvbcTracker(init_state)
            self.bft_network = bft_network
            self.status = Status(c)

            await bft_network.init()
            bft_network.start_all_replicas()

            # The nursery block exits only after both tasks are done.
            async with trio.open_nursery() as nursery:
                nursery.start_soon(self.run_concurrent_ops, num_ops)
                nursery.start_soon(self.crash_primary)

            await self.verify()

        # Use trio's sleep rather than time.sleep: a blocking sleep inside a
        # coroutine stalls the whole trio run loop.
        # NOTE(review): presumably this pause lets the torn-down network
        # release its resources before the next configuration -- confirm.
        await trio.sleep(2)
async def _test_fast_to_slow_path_transition(self):
    """Check the switch from fast to slow path after one replica is stopped.

    Per configuration: perform ten known writes with all replicas up and
    assert that the fast path is prevalent; stop one randomly chosen
    replica other than 0; perform ten more known writes and assert that
    the slow path is prevalent as of sequence number 10. Finally assert
    that the last write was executed.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            await network.init()
            network.start_all_replicas()
            protocol = kvbc.SimpleKVBCProtocol(network)

            # Phase 1: all replicas are running.
            for _ in range(10):
                await protocol.write_known_kv()
            await network.assert_fast_path_prevalent()

            # Phase 2: one replica other than 0 is down.
            unstable_replicas = list(set(range(test_config.n)) - {0})
            victim = random.choice(unstable_replicas)
            network.stop_replica(replica=victim)

            for _ in range(10):
                key, val = await protocol.write_known_kv()
            await network.assert_slow_path_prevalent(as_of_seq_num=10)

            await protocol.assert_kv_write_executed(key, val)
async def _test_fast_path_resilience_to_crashes(self):
    """Check that the fast path is restored after ``c`` replicas are stopped.

    Uses configurations from ``bft.interesting_configs(c_min=1)``. After
    starting all replicas, ``config.c`` distinct replicas other than 0 are
    stopped. ``self.evaluation_period_seq_num`` known writes are performed
    and the slow path is asserted to be prevalent; a further batch of
    writes follows, after which the fast path is asserted to be prevalent
    (as of ``evaluation_period_seq_num + 1``, with
    ``evaluation_period_seq_num`` slow paths so far). Finally the last
    write is asserted to have been executed.
    """
    for bft_config in bft.interesting_configs(c_min=1):
        config = bft.TestConfig(n=bft_config['n'],
                                f=bft_config['f'],
                                c=bft_config['c'],
                                num_clients=bft_config['num_clients'],
                                key_file_prefix=KEY_FILE_PREFIX,
                                start_replica_cmd=start_replica_cmd)

        with bft.BftTestNetwork(config) as bft_network:
            await bft_network.init()
            bft_network.start_all_replicas()
            skvbc = kvbc.SimpleKVBCProtocol(bft_network)

            unstable_replicas = list(set(range(0, config.n)) - {0})

            # Sample WITHOUT replacement: picking with random.choice in a
            # loop could select the same replica twice when c >= 2, leaving
            # fewer than c replicas actually stopped.
            for replica_to_stop in random.sample(unstable_replicas, config.c):
                bft_network.stop_replica(replica_to_stop)

            # make sure we first downgrade to the slow path...
            for _ in range(self.evaluation_period_seq_num):
                await skvbc.write_known_kv()
            await bft_network.assert_slow_path_prevalent()

            # ...but eventually (after the evaluation period), the fast
            # path is restored!
            for _ in range(self.evaluation_period_seq_num + 1,
                           self.evaluation_period_seq_num * 2):
                key, val = await skvbc.write_known_kv()
            await bft_network.assert_fast_path_prevalent(
                as_of_seq_num=self.evaluation_period_seq_num + 1,
                nb_slow_paths_so_far=self.evaluation_period_seq_num)

            await skvbc.assert_kv_write_executed(key, val)
async def _test_conflicting_write(self):
    """Check that a write based on an outdated read version is rejected.

    Per configuration: two unconditional writes to the same key are sent
    (both with ``block_id=0``). A third write is then issued whose readset
    contains that key but whose ``block_id`` is one less than the last
    block id reported by the second write. The test asserts that this
    third write reports ``success`` as false.
    """
    for bft_config in bft.interesting_configs():
        config = bft.TestConfig(n=bft_config['n'],
                                f=bft_config['f'],
                                c=bft_config['c'],
                                num_clients=bft_config['num_clients'],
                                key_file_prefix=KEY_FILE_PREFIX,
                                start_replica_cmd=start_replica_cmd)

        with bft.BftTestNetwork(config) as bft_network:
            await bft_network.init()
            bft_network.start_all_replicas()
            skvbc = kvbc.SimpleKVBCProtocol(bft_network)

            key = skvbc.random_key()

            write_1 = skvbc.write_req(
                readset=[],
                writeset=[(key, skvbc.random_value())],
                block_id=0)

            write_2 = skvbc.write_req(
                readset=[],
                writeset=[(key, skvbc.random_value())],
                block_id=0)

            client = bft_network.random_client()

            await client.write(write_1)
            last_write_reply = \
                skvbc.parse_reply(await client.write(write_2))

            last_block_id = last_write_reply.last_block_id

            key_prime = skvbc.random_key()

            # this write is conflicting because the writeset (key_prime) is
            # based on an outdated version of the readset (key)
            conflicting_write = skvbc.write_req(
                readset=[key],
                writeset=[(key_prime, skvbc.random_value())],
                block_id=last_block_id - 1)

            write_result = \
                skvbc.parse_reply(await client.write(conflicting_write))
            successful_write = write_result.success

            # assertFalse states the intent directly and gives a clearer
            # failure message than assertTrue(not ...).
            self.assertFalse(successful_write)
async def _test_view_change_transitions_safely_without_quorum(self):
    """Exercise two consecutive view changes with a partially started network.

    Uses configurations from ``bft.interesting_configs(f_min=2)``. Only
    replicas ``1 .. n-2`` are started at first (replica 0 and the last
    replica stay down). While write requests are sent in the background,
    the test waits until replica 1 reports view 1, then stops replica 1,
    starts replica 0, and polls replica 2's ``lastAgreedView`` gauge until
    it equals 2. The whole sequence fails if it takes longer than 60
    seconds.
    """
    for bft_config in bft.interesting_configs(f_min=2):
        config = bft.TestConfig(n=bft_config['n'],
                                f=bft_config['f'],
                                c=bft_config['c'],
                                num_clients=bft_config['num_clients'],
                                key_file_prefix=KEY_FILE_PREFIX,
                                start_replica_cmd=start_replica_cmd)

        with bft.BftTestNetwork(config) as bft_network:
            await bft_network.init()
            skvbc = kvbc.SimpleKVBCProtocol(bft_network)

            # A plain loop: start_replica is called for its side effect
            # only, so there is no list worth building.
            for i in range(1, config.n - 1):
                bft_network.start_replica(i)

            with trio.fail_after(60):  # seconds
                async with trio.open_nursery() as nursery:
                    nursery.start_soon(
                        skvbc.send_indefinite_write_requests)

                    # See if replica 1 has become the new primary
                    # Check every .5 seconds
                    await self._get_view_number(
                        bft_network=bft_network,
                        replica_id=1,
                        expected=lambda v: v == 1)

                    # At this point a view change has successfully completed
                    # with node 1 as the primary. The faulty assertion should
                    # have crashed old nodes.
                    #
                    # In case the nodes didn't crash, stop node 1 to trigger
                    # another view change. Starting node 0 should allow the
                    # view change to succeed. If there is a timeout then the
                    # other nodes have likely crashed due to the faulty
                    # assertion. The crash will show in the logs when running
                    # the test verbosely:
                    #
                    # 21: INFO 2019-08-30skvbc_replica:
                    # /home/andrewstone/concord-bft.py/bftengine/src/bftengine/PersistentStorageImp.cpp:881:
                    # void
                    # bftEngine::impl::PersistentStorageImp::verifySetDescriptorOfLastExitFromView(const
                    # bftEngine::impl::DescriptorOfLastExitFromView &):
                    # Assertion `false' failed.
                    bft_network.stop_replica(1)
                    bft_network.start_replica(0)

                    # Loop-invariant metric coordinates, built once.
                    key = ['replica', 'Gauges', 'lastAgreedView']
                    replica_id = 2

                    while True:
                        # Give each metrics query at most half a second.
                        with trio.move_on_after(.5):  # seconds
                            view = await bft_network.metrics.get(
                                replica_id, *key)
                            if view == 2:
                                # success! Cancelling the nursery scope
                                # stops the background writer and ends
                                # this loop at its next checkpoint.
                                nursery.cancel_scope.cancel()
async def _test_st_while_crashing_primary(self,
                                          trigger_view_change,
                                          crash_repeatedly):
    """Run state transfer into the last replica while crashing replica 0.

    Args:
        trigger_view_change: forwarded to
            ``_run_state_transfer_while_crashing_primary_once``; ignored
            when ``crash_repeatedly`` is true.
        crash_repeatedly: when true, delegate to
            ``_run_state_transfer_while_crashing_primary_repeatedly``;
            otherwise to the ``..._once`` variant.

    After the chosen scenario finishes, a quorum including the stale
    replica is forced and the initially written key is read back and
    compared with the known value.
    """
    # f >= 2 is required so that two replicas (the primary and the stale
    # node) can be down at the same time.
    for params in bft.interesting_configs(f_min=2):
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            protocol = kvbc.SimpleKVBCProtocol(network)

            stale_replica = test_config.n - 1

            client, known_key, known_kv = \
                await protocol.prime_for_state_transfer(
                    checkpoints_num=4,
                    stale_nodes={stale_replica}
                )

            view = await self._get_view_number(
                bft_network=network,
                replica_id=0,
                expected=lambda v: v == 0)

            self.assertEqual(view, 0, "Make sure we are in the initial view.")

            print(f'Initial view number is {view}, as expected.')

            if not crash_repeatedly:
                await self._run_state_transfer_while_crashing_primary_once(
                    skvbc=protocol,
                    bft_network=network,
                    n=test_config.n,
                    primary=0,
                    stale=stale_replica,
                    trigger_view_change=trigger_view_change)
            else:
                await self._run_state_transfer_while_crashing_primary_repeatedly(
                    skvbc=protocol,
                    bft_network=network,
                    n=test_config.n,
                    primary=0,
                    stale=stale_replica)

            network.force_quorum_including_replica(stale_replica)

            # The value written before state transfer must still be readable.
            kvpairs = await client.read([known_key])
            self.assertDictEqual(dict(known_kv), kvpairs)
async def _test_multiple_vc_slow_path(self):
    """Trigger three successive view changes, then wait for the slow path.

    Only configurations with ``c < f`` are used. In each of three rounds
    the test checks that all replicas are up, crashes ``c + 1`` replicas
    including the current primary, checks that at least ``2f + 2c + 1``
    replicas remain, sends random writes, waits on a still-running replica
    for a view greater than the current one, and restarts the crashed
    replicas. After the third round it waits for the slow path to be
    prevalent on the replica whose id equals the last observed view.
    """
    # Here, we require that c < f because:
    # A) for view change we need at least n-f = 2f+2c+1 replicas
    # B) to ensure transition to the slow path, we need to crash at least
    #    c+1 replicas.
    # Combining A) and B) yields n-(c+1) >= 2f+2c+1, equivalent to c < f
    for bft_config in bft.interesting_configs(lambda n, f, c: c < f):
        config = bft.TestConfig(n=bft_config['n'],
                                f=bft_config['f'],
                                c=bft_config['c'],
                                num_clients=bft_config['num_clients'],
                                key_file_prefix=KEY_FILE_PREFIX,
                                start_replica_cmd=start_replica_cmd)

        with bft.BftTestNetwork(config) as bft_network:
            await bft_network.init()
            bft_network.start_all_replicas()
            skvbc = kvbc.SimpleKVBCProtocol(bft_network)

            current_primary = 0
            for _ in range(3):
                self.assertEqual(len(bft_network.procs), config.n,
                                 "Make sure all replicas are up initially.")

                crashed_replicas = await self._crash_replicas_including_primary(
                    bft_network=bft_network,
                    nb_crashing=config.c+1,
                    primary=current_primary
                )
                self.assertGreaterEqual(
                    len(bft_network.procs), 2 * config.f + 2 * config.c + 1,
                    "Make sure enough replicas are up to allow a successful view change")

                await self._send_random_writes(skvbc)

                stable_replica = random.choice(
                    bft_network.all_replicas(without=crashed_replicas))

                view = await bft_network.wait_for_view_change(
                    replica_id=stable_replica,
                    expected=lambda v: v > current_primary,
                    err_msg="Make sure a view change has been triggered."
                )
                # NOTE(review): the view number is reused as a replica id;
                # this assumes the view stays below n -- confirm.
                current_primary = view

                # A plain loop: start_replica is called for its side effect
                # only, so there is no list worth building.
                for i in crashed_replicas:
                    bft_network.start_replica(i)

            await bft_network.wait_for_slow_path_to_be_prevalent(
                replica_id=current_primary)
async def _test_get_block_data(self):
    """Check that the data of a written block stays readable and unchanged.

    Per configuration: read the last block id, write two key/value pairs
    and assert that the write succeeded and advanced the block id by one,
    then fetch that block's data and compare it with what was written.
    A second write to the same keys follows (again expected to advance
    the block id by one), after which the first block's data is fetched
    once more and must still match the first write.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            await network.init()
            network.start_all_replicas()
            protocol = kvbc.SimpleKVBCProtocol(network)
            client = network.random_client()

            last_block_req = protocol.get_last_block_req()
            last_block = protocol.parse_reply(
                await client.read(last_block_req))

            # Unconditional put of two pairs; distinct entries of
            # protocol.keys are used so the keys aren't identical.
            first_kv = [(protocol.keys[0], protocol.random_value()),
                        (protocol.keys[1], protocol.random_value())]

            raw = await client.write(protocol.write_req([], first_kv, 0))
            write_reply = protocol.parse_reply(raw)
            self.assertTrue(write_reply.success)
            self.assertEqual(last_block + 1, write_reply.last_block_id)

            last_block = write_reply.last_block_id

            # Fetch the pairs stored in the block just written.
            raw = await client.read(protocol.get_block_data_req(last_block))
            stored = protocol.parse_reply(raw)
            self.assertDictEqual(stored, dict(first_kv))

            # Another block with the same keys but (probabilistically)
            # different data.
            second_kv = [(protocol.keys[0], protocol.random_value()),
                         (protocol.keys[1], protocol.random_value())]

            raw = await client.write(protocol.write_req([], second_kv, 0))
            write_reply = protocol.parse_reply(raw)
            self.assertTrue(write_reply.success)
            self.assertEqual(last_block + 1, write_reply.last_block_id)

            # The previously written block must still hold the first pairs.
            raw = await client.read(protocol.get_block_data_req(last_block))
            stored = protocol.parse_reply(raw)
            self.assertDictEqual(stored, dict(first_kv))
async def _test_fast_path_read_your_write(self):
    """Perform ten known writes on the fast path and check the last one.

    Per configuration: start all replicas, perform ten known writes,
    assert that the fast path is prevalent, and assert that the final
    write was executed.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            await network.init()
            network.start_all_replicas()
            protocol = kvbc.SimpleKVBCProtocol(network)

            # Only the key/value of the final write is kept for the check.
            for _ in range(10):
                key, val = await protocol.write_known_kv()

            await network.assert_fast_path_prevalent()
            await protocol.assert_kv_write_executed(key, val)
async def _test_st_when_fetcher_crashes(self):
    """Run state transfer while the fetching replica is repeatedly crashed.

    Per configuration: choose a stale node other than replica 0 and prime
    the network for state transfer. The stale node is started, stopped as
    soon as it is fetching state, and then handed to
    ``_fetch_or_finish_state_transfer_while_crashing``. It is restarted a
    final time and the test waits for state transfer to stop. A quorum
    including the stale node is forced, the initially written key is read
    back and compared with the known value, and a put/get round trip is
    asserted.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            protocol = kvbc.SimpleKVBCProtocol(network)

            candidates = network.all_replicas(without={0})
            stale_node = random.choice(candidates)

            client, known_key, known_kv = \
                await protocol.prime_for_state_transfer(
                    stale_nodes={stale_node})

            # Start the empty replica, let it begin fetching, then stop it.
            network.start_replica(stale_node)
            await network.wait_for_fetching_state(stale_node)
            network.stop_replica(stale_node)

            # Repeatedly start and kill the destination replica after state
            # transfer has started. On each restart, ensure the node is
            # still fetching or that it has received all the data.
            await self._fetch_or_finish_state_transfer_while_crashing(
                network, 0, stale_node)

            # Final restart: wait for state transfer to stop.
            network.start_replica(stale_node)
            await network.wait_for_state_transfer_to_stop(0, stale_node)

            network.force_quorum_including_replica(stale_node)

            # Reading back the first value confirms that state transfer
            # worked when the log went away.
            kvpairs = await client.read([known_key])
            self.assertDictEqual(dict(known_kv), kvpairs)

            # Newly written data must also be readable after state transfer.
            await protocol.assert_successful_put_get(self)
async def _test_healthy(self):
    """Run concurrent operations on a fault-free network, then verify.

    For every configuration from ``bft.interesting_configs()`` the method
    stores a fresh protocol object, history tracker, network and status on
    ``self``, starts all replicas, runs ``run_concurrent_ops`` with 500
    operations inside a nursery, and awaits ``verify`` once it is done.
    """
    num_ops = 500
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            params['n'],
            params['f'],
            params['c'],
            params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            # These attributes are set on self so the nursery task below
            # can use them (presumably that is how they are consumed).
            self.skvbc = kvbc.SimpleKVBCProtocol(network)
            initial_state = self.skvbc.initial_state()
            self.tracker = skvbc_history_tracker.SkvbcTracker(initial_state)
            self.bft_network = network
            self.status = Status(params)

            await network.init()
            network.start_all_replicas()

            # The nursery block exits only after the task has finished.
            async with trio.open_nursery() as workers:
                workers.start_soon(self.run_concurrent_ops, num_ops)

            await self.verify()
async def _test_persistent_slow_path(self):
    """Check that the slow path stays prevalent while one replica is down.

    Only configurations with ``c == 0`` are used. After starting all
    replicas, one randomly chosen replica other than 0 is stopped and
    ``2 * self.evaluation_period_seq_num`` known writes are performed.
    The slow path is asserted to be prevalent as of sequence number 1,
    and the last write is asserted to have been executed.
    """
    for params in bft.interesting_configs(lambda n, f, c: c == 0):
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            await network.init()
            network.start_all_replicas()
            protocol = kvbc.SimpleKVBCProtocol(network)

            unstable_replicas = network.all_replicas(without={0})
            victim = random.choice(unstable_replicas)
            network.stop_replica(replica=victim)

            total_writes = self.evaluation_period_seq_num * 2
            for _ in range(total_writes):
                key, val = await protocol.write_known_kv()

            await network.assert_slow_path_prevalent(as_of_seq_num=1)
            await protocol.assert_kv_write_executed(key, val)
async def _test_single_vc_with_f_replicas_down(self):
    """Crash ``f`` replicas including replica 0 and expect view 1.

    Per configuration: start all replicas and check that ``n`` processes
    are up, crash ``config.f`` replicas including replica 0 through
    ``_crash_replicas_including_primary``, check that at least
    ``2f + 2c + 1`` processes remain, send random writes, and wait until
    a randomly chosen replica that was not crashed reports view 1.
    """
    for params in bft.interesting_configs():
        test_config = bft.TestConfig(
            n=params['n'],
            f=params['f'],
            c=params['c'],
            num_clients=params['num_clients'],
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
        )

        with bft.BftTestNetwork(test_config) as network:
            await network.init()
            network.start_all_replicas()
            protocol = kvbc.SimpleKVBCProtocol(network)

            self.assertEqual(len(network.procs), test_config.n,
                             "Make sure all replicas are up initially.")

            initial_primary = 0
            expected_next_primary = 1

            crashed_replicas = await self._crash_replicas_including_primary(
                bft_network=network,
                nb_crashing=test_config.f,
                primary=initial_primary
            )

            min_required = 2 * test_config.f + 2 * test_config.c + 1
            self.assertGreaterEqual(
                len(network.procs), min_required,
                "Make sure enough replicas are up to allow a successful view change")

            await self._send_random_writes(protocol)

            # Query a replica that is still running.
            survivors = network.all_replicas(without=crashed_replicas)
            observer = random.choice(survivors)
            await network.wait_for_view_change(
                replica_id=observer,
                expected=lambda v: v == expected_next_primary,
                err_msg="Make sure view change has been triggered."
            )