async def _testMofNQuorum(self):
    config = self.config._replace(retry_timeout_milli=500)
    with bft_client.UdpClient(config, self.replicas, None) as udp_client:
        await udp_client.sendSync(self.writeRequest(5), False)
        # A 1-of-4 read quorum: accept the value as soon as any single
        # replica out of {0, 1, 2, 3} replies
        single_read_q = bft_client.MofNQuorum([0, 1, 2, 3], 1)
        read = await udp_client.sendSync(self.readRequest(), True, m_of_n_quorum=single_read_q)
        self.assertEqual(5, self.read_val(read))
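# The following is a hypothetical, self-contained sketch (not part of bft_client)
# of the counting rule behind an m-of-n quorum: a read completes once `required`
# replicas out of the configured set return matching data. The names here
# (reached_quorum, the (replica_id, data) reply shape) are illustrative only.
from collections import Counter

def reached_quorum(replies, required):
    # Count identical reply payloads; any payload seen `required` times wins
    counts = Counter(data for _replica_id, data in replies)
    return any(count >= required for count in counts.values())

# With a quorum like MofNQuorum([0, 1, 2, 3], 1) above, one reply suffices:
assert reached_quorum([(2, b"value=5")], required=1)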
async def test_remove_nodes_with_f_failures(self, bft_network):
    """
    In this test we show how a system operator can remove nodes (and thus shrink the cluster)
    from a 7-node cluster to a 4-node cluster, even when f nodes are not responding.
    For that, the operator performs the following steps:
    1. Stop 2 nodes (f=2)
    2. Send a remove_node command - this command also wedges the system
    3. Verify that all live nodes (including the removal candidates) have stopped
    4. Load a new configuration to the bft network
    5. Rerun the cluster with only 4 nodes and make sure they succeed to perform transactions
       on the fast path
    """
    bft_network.start_all_replicas()
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    client = bft_network.random_client()
    for i in range(100):
        await skvbc.write_known_kv()

    # Choose two replicas to crash and crash them.
    # For simplicity, we crash the last two replicas.
    crashed_replicas = {5, 6}
    bft_network.stop_replicas(crashed_replicas)

    # All subsequent requests should go through the slow path
    for i in range(100):
        await skvbc.write_known_kv()

    key, val = await skvbc.write_known_kv()
    live_replicas = bft_network.all_replicas(without=crashed_replicas)
    await client.write(skvbc.write_req([], [], block_id=0, add_remove_node_command=True))

    # Poll all live replicas until every one of them reports that it has stopped
    with trio.fail_after(seconds=90):
        done = False
        while done is False:
            msg = skvbc.get_have_you_stopped_req(n_of_n=0)
            rep = await client.read(msg, m_of_n_quorum=bft_client.MofNQuorum(
                live_replicas, len(live_replicas)))
            rsi_rep = client.get_rsi_replies()
            done = True
            for r in rsi_rep.values():
                if skvbc.parse_rsi_reply(rep, r) == 0:
                    done = False
                    break

    checkpoint_to_stop_at = 300
    for r in live_replicas:
        last_stable_checkpoint = await bft_network.get_metric(
            r, bft_network, "Gauges", "lastStableSeqNum")
        self.assertGreaterEqual(last_stable_checkpoint, checkpoint_to_stop_at)

    bft_network.stop_all_replicas()
    # We now expect the replicas to start with a fresh configuration, which means that we
    # need to see in the logs that isNewStorage() == true. We also expect to see
    # lastStableSeqNum == 0.
    conf = TestConfig(n=4, f=1, c=0,
                      num_clients=30,
                      key_file_prefix=KEY_FILE_PREFIX,
                      start_replica_cmd=start_replica_cmd,
                      stop_replica_cmd=None,
                      num_ro_replicas=0)
    bft_network.change_configuration(conf)
    bft_network.start_all_replicas()
    for r in bft_network.all_replicas():
        last_stable_checkpoint = await bft_network.get_metric(
            r, bft_network, "Gauges", "lastStableSeqNum")
        self.assertEqual(last_stable_checkpoint, 0)

    await self.validate_state_consistency(skvbc, key, val)
    for i in range(100):
        await skvbc.write_known_kv()

    # Only replicas 0-3 should remain, and each of them should have served
    # requests on the fast path
    for r in bft_network.all_replicas():
        assert r < 4
        num_of_fast_path = await bft_network.get_metric(
            r, bft_network, "Counters", "totalFastPaths")
        self.assertGreater(num_of_fast_path, 0)
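# The "have you stopped" polling loop above is a recurring shape in these
# reconfiguration tests. A hedged sketch of factoring it into a helper, reusing
# only the calls that appear in the test above and the module's existing imports
# (trio, bft_client); the name wait_until_all_stopped is hypothetical:
async def wait_until_all_stopped(client, skvbc, live_replicas, timeout_secs=90):
    with trio.fail_after(seconds=timeout_secs):
        while True:
            msg = skvbc.get_have_you_stopped_req(n_of_n=0)
            rep = await client.read(msg, m_of_n_quorum=bft_client.MofNQuorum(
                live_replicas, len(live_replicas)))
            # Inspect the replica-specific (RSI) part of every reply; a zero
            # value means that replica has not yet stopped
            if all(skvbc.parse_rsi_reply(rep, r) != 0
                   for r in client.get_rsi_replies().values()):
                return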
async def test_wedge_command_with_f_failures(self, bft_network):
    """
    This test checks that even a replica that received the super stable checkpoint
    via the state transfer mechanism is able to stop at the super stable checkpoint.
    The test does the following:
    1. Start all replicas but 2
    2. A client sends a wedge command
    3. Validate that all started replicas have reached the wedge point
    4. Restart the live replicas and validate the system is able to make progress
    5. Start the late replicas
    6. Validate that the late replicas completed the state transfer
    7. Join the late replicas to the quorum and make sure the system is able to make progress
    """
    initial_prim = 0
    late_replicas = bft_network.random_set_of_replicas(2, {initial_prim})
    on_time_replicas = bft_network.all_replicas(without=late_replicas)
    bft_network.start_replicas(on_time_replicas)
    skvbc = kvbc.SimpleKVBCProtocol(bft_network)
    await skvbc.wait_for_liveness()
    checkpoint_before = await bft_network.wait_for_checkpoint(replica_id=0)
    client = bft_network.random_client()
    # We increase the default request timeout because we need around 300 consensus
    # rounds, which may occasionally take more than 5 seconds.
    # Note: _replace returns a new namedtuple, so the result must be assigned back.
    client.config = client.config._replace(req_timeout_milli=10000)
    with log.start_action(action_type="send_wedge_cmd",
                          checkpoint_before=checkpoint_before,
                          late_replicas=list(late_replicas)):
        op = operator.Operator(bft_network.config, client, bft_network.builddir)
        await op.wedge()

    # Poll the on-time replicas until all of them report that they have stopped
    with trio.fail_after(seconds=60):
        done = False
        while done is False:
            await op.wedge_status(quorum=bft_client.MofNQuorum(
                on_time_replicas, len(on_time_replicas)), fullWedge=False)
            rsi_rep = client.get_rsi_replies()
            done = True
            for r in rsi_rep.values():
                res = cmf_msgs.ReconfigurationResponse.deserialize(r)
                status = res[0].response.stopped
                if status is False:
                    done = False
                    break

    # Make sure the system is able to make progress
    bft_network.stop_replicas(on_time_replicas)
    bft_network.start_replicas(on_time_replicas)
    for i in range(100):
        await skvbc.write_known_kv()

    # Start late replicas and wait for state transfer to stop
    bft_network.start_replicas(late_replicas)
    await bft_network.wait_for_state_transfer_to_start()
    for r in late_replicas:
        await bft_network.wait_for_state_transfer_to_stop(
            initial_prim, r, stop_on_stable_seq_num=True)

    # Stop 2 replicas (excluding the late ones and the initial primary) so that
    # the late replicas must take part in the quorum
    replicas_to_stop = bft_network.random_set_of_replicas(
        2, late_replicas | {initial_prim})
    bft_network.stop_replicas(replicas_to_stop)

    # Make sure the system is able to make progress
    for i in range(100):
        await skvbc.write_known_kv()
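# The wedge-status poll above has the same polling shape, this time via the
# operator API. A hedged sketch of the equivalent helper, built only from calls
# used in the test above and the module's existing imports (trio, bft_client,
# cmf_msgs); the name wait_until_wedged is hypothetical:
async def wait_until_wedged(op, client, replicas, timeout_secs=60):
    with trio.fail_after(seconds=timeout_secs):
        while True:
            await op.wedge_status(quorum=bft_client.MofNQuorum(
                replicas, len(replicas)), fullWedge=False)
            # Each RSI reply carries a ReconfigurationResponse whose `stopped`
            # flag reports whether that replica has reached the wedge point
            if all(cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response.stopped
                   for r in client.get_rsi_replies().values()):
                return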