Exemple #1
0
 async def _testMofNQuorum(self):
     """Write the value 5, then read it back using a 1-of-4 MofNQuorum and check the result."""
     # Derive a client configuration whose retry timeout is 500 ms.
     client_config = self.config._replace(retry_timeout_milli=500)
     with bft_client.UdpClient(client_config, self.replicas, None) as client:
         write_msg = self.writeRequest(5)
         await client.sendSync(write_msg, False)
         # Quorum over replicas 0..3 with a required count of 1
         # (presumably: a single matching reply is sufficient -- confirm against MofNQuorum).
         one_of_four = bft_client.MofNQuorum([0, 1, 2, 3], 1)
         read_msg = self.readRequest()
         reply = await client.sendSync(read_msg, True, m_of_n_quorum=one_of_four)
         self.assertEqual(5, self.read_val(reply))
    async def test_remove_nodes_with_f_failures(self, bft_network):
        """
        Show how an operator can shrink a 7-node cluster to a 4-node cluster
        even while f nodes are not responding.

        The scenario is:
        1. Start every replica and write some known key/values.
        2. Stop two replicas (f=2) and keep writing.
        3. Send a write request carrying the remove-node command (this command also wedges the system).
        4. Poll the live replicas (removal candidates included) until none of them
           reports that it has not stopped yet.
        5. Check each live replica's lastStableSeqNum, then stop everything.
        6. Load a new 4-replica configuration into the bft network and restart.
        7. Verify state consistency and that the smaller cluster commits
           transactions on the fast path.
        """
        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        client = bft_network.random_client()

        for _ in range(100):
            await skvbc.write_known_kv()

        # For simplicity, the two replicas taken down are always the last two.
        crashed_replicas = {5, 6}
        bft_network.stop_replicas(crashed_replicas)

        # From here on, requests are expected to go through the slow path.
        for _ in range(100):
            await skvbc.write_known_kv()

        # Remember one key/value pair so that state can be validated after the reconfiguration.
        key, val = await skvbc.write_known_kv()

        live_replicas = bft_network.all_replicas(without=crashed_replicas)

        remove_node_req = skvbc.write_req([], [], block_id=0, add_remove_node_command=True)
        await client.write(remove_node_req)

        # Keep asking the live replicas whether they have stopped; give up after 90 seconds.
        with trio.fail_after(seconds=90):
            everyone_stopped = False
            while not everyone_stopped:
                status_req = skvbc.get_have_you_stopped_req(n_of_n=0)
                all_live_quorum = bft_client.MofNQuorum(live_replicas, len(live_replicas))
                reply = await client.read(status_req, m_of_n_quorum=all_live_quorum)
                rsi_replies = client.get_rsi_replies()
                # A parsed value of 0 from any replica means we must poll again.
                everyone_stopped = not any(
                    skvbc.parse_rsi_reply(reply, rsi) == 0
                    for rsi in rsi_replies.values()
                )

        checkpoint_to_stop_at = 300
        for replica_id in live_replicas:
            stable_seq_num = await bft_network.get_metric(replica_id, bft_network, "Gauges", "lastStableSeqNum")
            self.assertGreaterEqual(stable_seq_num, checkpoint_to_stop_at)

        bft_network.stop_all_replicas()

        # The replicas are now expected to come up with a fresh configuration, i.e. the logs
        # should show isNewStorage() = true and lastStableSeqNum should be back at 0.
        reduced_conf = TestConfig(
            n=4,
            f=1,
            c=0,
            num_clients=30,
            key_file_prefix=KEY_FILE_PREFIX,
            start_replica_cmd=start_replica_cmd,
            stop_replica_cmd=None,
            num_ro_replicas=0,
        )
        bft_network.change_configuration(reduced_conf)

        bft_network.start_all_replicas()
        for replica_id in bft_network.all_replicas():
            stable_seq_num = await bft_network.get_metric(replica_id, bft_network, "Gauges", "lastStableSeqNum")
            self.assertEqual(stable_seq_num, 0)

        await self.validate_state_consistency(skvbc, key, val)

        for _ in range(100):
            await skvbc.write_known_kv()

        for replica_id in bft_network.all_replicas():
            # Only replica ids 0..3 may exist after the reconfiguration.
            assert replica_id < 4
            fast_path_count = await bft_network.get_metric(replica_id, bft_network, "Counters", "totalFastPaths")
            self.assertGreater(fast_path_count, 0)
    async def test_wedge_command_with_f_failures(self, bft_network):
        """
        Check that even a replica that received the super stable checkpoint via the
        state transfer mechanism is able to stop at the super stable checkpoint.

        The test does the following:
        1. Start all replicas but 2
        2. A client sends a wedge command
        3. Validate that all started replicas have reached the wedge point
        4. Restart the live replicas and validate the system is able to make progress
        5. Start the late replicas
        6. Validate that the late replicas completed the state transfer
        7. Join the late replicas to the quorum and make sure the system is able to make progress
        """
        initial_prim = 0
        # The late replicas are picked at random, never including the initial primary.
        late_replicas = bft_network.random_set_of_replicas(2, {initial_prim})
        on_time_replicas = bft_network.all_replicas(without=late_replicas)
        bft_network.start_replicas(on_time_replicas)

        skvbc = kvbc.SimpleKVBCProtocol(bft_network)
        await skvbc.wait_for_liveness()

        checkpoint_before = await bft_network.wait_for_checkpoint(replica_id=0)

        client = bft_network.random_client()
        # We increase the default request timeout because we need to have around 300 consensuses,
        # which occasionally may take more than 5 seconds.
        # `_replace` returns a modified copy (namedtuple semantics) instead of mutating in place,
        # so the result has to be stored back on the client for the new timeout to take effect.
        client.config = client.config._replace(req_timeout_milli=10000)
        with log.start_action(action_type="send_wedge_cmd",
                              checkpoint_before=checkpoint_before,
                              late_replicas=list(late_replicas)):
            op = operator.Operator(bft_network.config, client,
                                   bft_network.builddir)
            await op.wedge()

        # Poll the wedge status of the on-time replicas; give up after 60 seconds.
        with trio.fail_after(seconds=60):
            done = False
            while not done:
                await op.wedge_status(quorum=bft_client.MofNQuorum(
                    on_time_replicas, len(on_time_replicas)),
                                      fullWedge=False)
                rsi_rep = client.get_rsi_replies()
                # Done only when no replica-specific reply reports `stopped` as False.
                done = not any(
                    cmf_msgs.ReconfigurationResponse.deserialize(r)[0].response.stopped is False
                    for r in rsi_rep.values()
                )

        # Make sure the system is able to make progress
        bft_network.stop_replicas(on_time_replicas)
        bft_network.start_replicas(on_time_replicas)
        for _ in range(100):
            await skvbc.write_known_kv()

        # Start late replicas and wait for state transfer to stop
        bft_network.start_replicas(late_replicas)

        await bft_network.wait_for_state_transfer_to_start()
        for r in late_replicas:
            await bft_network.wait_for_state_transfer_to_stop(
                initial_prim, r, stop_on_stable_seq_num=True)

        # NOTE(review): `replicas_to_stop` is computed but never used in the visible code.
        # Step 7 of the docstring suggests these replicas were meant to be stopped so that the
        # late replicas are required for the quorum -- confirm the intent before changing it.
        # The call is kept because it may consume random state.
        replicas_to_stop = bft_network.random_set_of_replicas(
            2, late_replicas | {initial_prim})

        # Make sure the system is able to make progress
        for _ in range(100):
            await skvbc.write_known_kv()