Example #1
0
    async def test_inactive_window(self, bft_network):
        """
        The goal of this test is to verify full catch up of a Replica only from the Inactive Window.
        1) Start all Replicas without Replica 1, which will later catch up from the Primary's Inactive Window.
        2) Advance all Replicas to 1 sequence number beyond the first stable and verify they have all collected
           Stable Checkpoints.
        3) Start and isolate the late Replica 1 from all others except the Primary. This way it will not be able
           to start State Transfer and will only be able to catch up from the Primary's Inactive Window.
        4) Verify that Replica 1 has managed to catch up.
        """

        late_replica = 1
        # NOTE(review): replica 0 is assumed to be the Primary of the initial view - confirm.
        primary = 0

        bft_network.start_replicas(
            bft_network.all_replicas(without={late_replica}))
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)

        stable_checkpoint_to_reach = 1
        # One sequence number beyond the first stable checkpoint (step 2 of the docstring).
        num_reqs_to_catch_up = 151

        async def write_req(num_req=1):
            """Send `num_req` sequential write requests through the client."""
            for _ in range(num_req):
                await skvbc.write_known_kv()

        # create checkpoint and wait for checkpoint propagation
        await skvbc.fill_and_wait_for_checkpoint(
            initial_nodes=bft_network.get_live_replicas(),
            num_of_checkpoints_to_add=stable_checkpoint_to_reach,
            verify_checkpoint_persistency=False)

        await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
            bft_network.get_live_replicas(), stable_checkpoint_to_reach)

        with trio.fail_after(seconds=30):
            # The late Replica must not receive anything from its peers other than the Primary,
            # so the blocked senders are derived from the replica ids instead of being hard-coded.
            with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(
                    bft_network, {late_replica},
                    bft_network.all_replicas(without={primary, late_replica})) as adversary:
                adversary.interfere()

                bft_network.start_replica(late_replica)

                while True:
                    late_replica_catch_up = False
                    # Poll (and log) every replica so the progress of the whole system is visible.
                    for replica_id in bft_network.all_replicas():
                        last_stable = await bft_network.get_metric(
                            replica_id, bft_network, 'Gauges',
                            "lastStableSeqNum")
                        last_exec = await bft_network.get_metric(
                            replica_id, bft_network, 'Gauges',
                            "lastExecutedSeqNum")
                        log.log_message(
                            message_type=
                            f"replica = {replica_id}; last_stable = {last_stable}; last_exec = {last_exec}"
                        )
                        if replica_id == late_replica and last_exec >= num_reqs_to_catch_up:
                            late_replica_catch_up = True

                    # Leave as soon as the catch up is observed: an extra write and a 3 second
                    # sleep here would only eat into the fail_after budget after success.
                    if late_replica_catch_up:
                        break

                    await write_req()
                    await trio.sleep(seconds=3)
    async def test_view_change_with_isolated_replicas(self, bft_network, tracker):
        """
        test View Changes with multiple View increments, where the
        isolated F-1 expected next primaries will not be able to step in as
        primaries, but will activate the corresponding view for which it is
        their turn to become Primary.

        Step by step scenario:
        1. Use a one way isolating adversary to isolate the F-1 replicas after the current primary in such a way that they cannot send messages to the peers, but can receive messages from them.
        2. Stop the current primary.
        3. Send Client requests to trigger a View Change.
        4. Wait for the system to finish View Change. Note that multiple View increments will happen.
        5. Drop the network adversary and verify Fast Commit Path is recovered in the system by introducing client requests.

        We can perform this test in a loop multiple times.
        """
        # NOTE(review): `loops` and `timeouts` are not defined in this method; presumably they
        # are integer settings defined elsewhere in the module - confirm.

        # start replicas
        for replica_id in bft_network.all_replicas():
            bft_network.start_replica(replica_id)

        skvbc = kvbc.SimpleKVBCProtocol(bft_network, tracker)
        for _ in range(loops):
            primary = await bft_network.get_current_primary()

            # The F-1 replicas that follow the current primary, wrapping around at N.
            replicas_to_isolate = [
                i % bft_network.config.n
                for i in range(primary + 1, primary + bft_network.config.f)
            ]
            other_replicas = bft_network.all_replicas(without=set(replicas_to_isolate))

            view = await bft_network.get_current_view()

            with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(bft_network, other_replicas, replicas_to_isolate) as adversary:
                adversary.interfere()

                bft_network.stop_replica(primary)
                # Client traffic is what triggers the View Change once the primary is down.
                await skvbc.run_concurrent_ops(10)

                # The F-1 isolated candidates are skipped as well, hence the jump of F views.
                await bft_network.wait_for_replicas_to_reach_at_least_view(other_replicas, expected_view=view + bft_network.config.f, timeout=15 + timeouts)

            # The adversary is dropped at this point; bring the old primary back.
            bft_network.start_replica(primary)

            await bft_network.wait_for_fast_path_to_be_prevalent(
                run_ops=lambda: skvbc.run_concurrent_ops(num_ops=20, write_weight=1), threshold=20)
Example #3
0
    async def test_inactive_window_catchup_up_to_gap(self, bft_network):
        """
        In this test we check the catchup from Inactive Window when we have a gap related to the Peers.
        The situation can happen if the catching up Replica's last Stable SeqNo is 3 Checkpoints behind its Peers, but
        its Last Executed is only 2 Checkpoints behind.
        Steps to recreate:
        1) Start all replicas.
        2) Isolate 1 Replica from all but the Primary. We will call it Late Replica.
        3) Advance all replicas beyond the first Stable Checkpoint. The Late Replica won't be able to collect a
           Stable Checkpoint.
        4) Stop the Late Replica and advance all others 2 more Checkpoints.
        5) Start the late Replica and verify it catches up to the end of its Working Window from the Inactive Windows of
           its Peers.
        """

        late_replica = 1
        primary = 0

        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)

        first_stable_checkpoint_to_reach = 1
        checkpoints_to_advance_after_first = 2
        seq_nums_per_checkpoint = 150
        num_reqs_after_first_checkpoint = 4

        async def write_req(num_req=1):
            """Send `num_req` sequential write requests through the client."""
            for _ in range(num_req):
                await skvbc.send_write_kv_set()

        with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(
                bft_network, {late_replica},
                bft_network.all_replicas(without={primary, late_replica})) as adversary:
            adversary.interfere()

            # create checkpoint and wait for checkpoint propagation
            await skvbc.fill_and_wait_for_checkpoint(
                initial_nodes=bft_network.all_replicas(without={late_replica}),
                num_of_checkpoints_to_add=first_stable_checkpoint_to_reach,
                verify_checkpoint_persistency=False
            )

            await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
                bft_network.all_replicas(without={late_replica}),
                first_stable_checkpoint_to_reach)

            await write_req(num_reqs_after_first_checkpoint)

            # Wait for late_replica to reach num_reqs_after_first_checkpoint past the 1-st Checkpoint
            with trio.fail_after(seconds=30):
                while True:
                    last_exec = await bft_network.get_metric(late_replica, bft_network, 'Gauges', "lastExecutedSeqNum")
                    log.log_message(message_type=f"replica = {late_replica}; last_exec = {last_exec}")
                    # Exact match on purpose: no further requests are sent while waiting here.
                    if last_exec == seq_nums_per_checkpoint + num_reqs_after_first_checkpoint:
                        break
                    await trio.sleep(seconds=0.3)

            bft_network.stop_replica(late_replica)

            # create 2 checkpoints and wait for checkpoint propagation
            await skvbc.fill_and_wait_for_checkpoint(
                initial_nodes=bft_network.all_replicas(without={late_replica}),
                num_of_checkpoints_to_add=checkpoints_to_advance_after_first,
                verify_checkpoint_persistency=False
            )

            await bft_network.wait_for_replicas_to_collect_stable_checkpoint(
                bft_network.all_replicas(without={late_replica}),
                first_stable_checkpoint_to_reach + checkpoints_to_advance_after_first)

            bft_network.start_replica(late_replica)
            with trio.fail_after(seconds=30):
                while True:
                    late_replica_catch_up = False
                    # Poll (and log) every live replica so the progress of the whole system is visible.
                    for replica_id in bft_network.get_live_replicas():
                        last_stable = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastStableSeqNum")
                        last_exec = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
                        log.log_message(message_type=f"replica = {replica_id}; last_stable = {last_stable}; last_exec = {last_exec}")
                        # Exact match on purpose: per step 5 the late Replica is expected to stop
                        # at the end of its Working Window (2 checkpoints' worth of sequence numbers).
                        if replica_id == late_replica and last_exec == 2*seq_nums_per_checkpoint:
                            late_replica_catch_up = True

                    # Leave as soon as the catch up is observed: an extra write and a 3 second
                    # sleep here would only eat into the fail_after budget after success.
                    if late_replica_catch_up:
                        break

                    await write_req()
                    await trio.sleep(seconds=3)
Example #4
0
    async def test_view_change_with_f_replicas_collected_stable_checkpoint(self, bft_network):
        """
        The goal of this test is to leave the system with F Replicas that have collected a Stable Checkpoint and to
        cause a View Change. In this way we get a misalignment in the Restrictions of the previous View and we get in an
        indefinite View Change scenario.
        1) Start all Replicas.
        2) Move all Replicas to 1 SeqNo prior to the stable Checkpoint.
        3) Stop Replicas 1 and 2.
        4) Isolate Replica 3 from 6, 5 and 4 only in one direction - 3 will be able to send messages to all, but won't
           receive from 6, 5 and 4. This way 3 won't be able to collect a Stable Checkpoint.
           Do the same for 6, isolating in the same manner from 3, 4 and 5
           Do the same for 4, isolating in the same manner from 3, 5 and 6
           This way only 0 and 5 will collect a Stable Checkpoint for SeqNo 150.
        5) With the isolation scenario, send Client Requests until F replicas collect a Stable Checkpoint.
           Only Replicas 0 and 5 will collect.
        6) We stop Replicas 0, 5 and 6 and start 1 and 2. This way we will cause View Change and we will have only 2
           Replicas with a Stable Checkpoint (5 and 0).
        7) Start Replicas 5 and 0. Within this state the system must be able to finalize a View Change,
           because we have (N - 1) live Replicas, but we have only F that have collected a Stable Checkpoint
           that are live.
        """

        # step 1
        bft_network.start_all_replicas()
        skvbc = kvbc.SimpleKVBCProtocol(bft_network)

        num_reqs_before_first_stable = 149

        async def write_req(num_req=1):
            """Send `num_req` sequential write requests through the client."""
            for _ in range(num_req):
                await skvbc.send_write_kv_set()

        await write_req(num_reqs_before_first_stable)

        # step 2
        # Poll until every one of the N replicas reports exactly the expected last executed SeqNo.
        while True:
            last_exec_seqs = []
            for replica_id in bft_network.all_replicas():
                last_stable = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastStableSeqNum")
                last_exec = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
                log.log_message(message_type=f"replica = {replica_id}; last_stable = {last_stable}; last_exec = {last_exec}")
                last_exec_seqs.append(last_exec)
            if last_exec_seqs.count(num_reqs_before_first_stable) == bft_network.config.n:
                break

        # step 3
        bft_network.stop_replica(1)
        bft_network.stop_replica(2)

        # step 4
        with net.ReplicaOneWayTwoSubsetsIsolatingAdversary(bft_network, {3}, {6, 5, 4}) as adversary:
            adversary.add_rule({6}, {3, 4, 5})
            adversary.add_rule({4}, {3, 5, 6})
            adversary.interfere()

            # Keep sending requests until exactly F live replicas report the first stable SeqNo.
            while True:
                last_stable_seqs = []
                for replica_id in bft_network.get_live_replicas():
                    last_stable = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastStableSeqNum")
                    last_exec = await bft_network.get_metric(replica_id, bft_network, 'Gauges', "lastExecutedSeqNum")
                    log.log_message(message_type=f"replica = {replica_id}; last_stable = {last_stable}; last_exec = {last_exec}")
                    last_stable_seqs.append(last_stable)
                if last_stable_seqs.count(num_reqs_before_first_stable + 1) == bft_network.config.f:
                    # step 5 completed
                    break
                await write_req()
                await trio.sleep(seconds=3)

            # step 6
            bft_network.stop_replica(0)
            bft_network.stop_replica(6)
            bft_network.stop_replica(5)
            bft_network.start_replica(1)
            bft_network.start_replica(2)

            # Send a Client Request to trigger View Change.
            # move_on_after: the write is abandoned after 3 seconds instead of failing the test.
            with trio.move_on_after(seconds=3):
                await write_req()

            bft_network.start_replica(5)
            bft_network.start_replica(0)

        # Send a Client Request to trigger View Change
        with trio.move_on_after(seconds=3):
            await write_req()

        # step 7
        await bft_network.wait_for_view(
            replica_id=3,
            expected=lambda v: v == 1,
            err_msg="Make sure a view change happens from 0 to 1"
        )

        await skvbc.wait_for_liveness()