def test_add_node_to_pool_with_large_ppseqno_diff_views(
        do_view_change, looper, txnPoolNodeSet, tconf, sdk_pool_handle,
        sdk_wallet_steward, tdir, allPluginsPath):
    """
    Adding a node to the pool while ppSeqNo is large used to make a node stash all
    requests because of incorrectly set watermark limits.
    The case of view_no == 0 is special.
    The test emulates a large ppSeqNo, adds a node and checks that all pool nodes
    remain functional. The test is run with several starting view_no values, including 0.
    """

    ensure_several_view_change(looper,
                               txnPoolNodeSet,
                               do_view_change,
                               custom_timeout=tconf.NEW_VIEW_TIMEOUT)

    cur_ppseqno = get_pp_seq_no(txnPoolNodeSet)
    big_ppseqno = cur_ppseqno + tconf.LOG_SIZE * 2 + 2300
    assert (big_ppseqno > cur_ppseqno)

    # ensure pool is working properly
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)
    assert (cur_ppseqno < get_pp_seq_no(txnPoolNodeSet))

    _set_ppseqno(txnPoolNodeSet, big_ppseqno)
    cur_ppseqno = get_pp_seq_no(txnPoolNodeSet)
    assert (big_ppseqno == cur_ppseqno)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)

    assert (cur_ppseqno < get_pp_seq_no(txnPoolNodeSet))

    # Disable the view change triggered by adding the new node, as it would not be able to finish due to the fake ppSeqNo that was set
    for n in txnPoolNodeSet:
        n._on_node_count_changed_committed = lambda: None

    new_steward_name = "testClientSteward" + randomString(4)
    new_node_name = "TestTheta" + randomString(4)
    new_steward_wallet_handle, new_node = sdk_add_new_steward_and_node(
        looper,
        sdk_pool_handle,
        sdk_wallet_steward,
        new_steward_name,
        new_node_name,
        tdir,
        tconf,
        allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               new_steward_wallet_handle, sdk_pool_handle)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
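# The helper _set_ppseqno used above is not shown here. A minimal sketch of what
# it is assumed to do (an assumption, not the original helper): overwrite the 3PC
# sequence counters on every replica of every node so that the pool behaves as if
# `ppseqno` batches had already been ordered in the current view.
def _set_ppseqno_sketch(nodes, ppseqno):
    for node in nodes:
        for replica in node.replicas.values():
            view_no = replica.last_ordered_3pc[0]
            # assumed to be settable properties on the replica
            replica.last_ordered_3pc = (view_no, ppseqno)
            replica.lastPrePrepareSeqNo = ppseqno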
def test_lag_less_then_catchup(looper,
                               txnPoolNodeSet,
                               sdk_pool_handle,
                               sdk_wallet_client):
    delayed_node = txnPoolNodeSet[-1]
    other_nodes = list(set(txnPoolNodeSet) - {delayed_node})
    current_view_no = checkViewNoForNodes(txnPoolNodeSet)
    last_ordered_before = delayed_node.master_replica.last_ordered_3pc
    with delay_rules_without_processing(delayed_node.nodeIbStasher, cDelay()):
        # Send txns for stable checkpoint
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, CHK_FREQ)
        # Check that all non-delayed nodes have a stable checkpoint
        for n in other_nodes:
            assert n.master_replica._consensus_data.stable_checkpoint == CHK_FREQ

        # Send another txn. This txn will be reordered after view_change
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
        trigger_view_change(txnPoolNodeSet)
        ensureElectionsDone(looper, txnPoolNodeSet)

        assert delayed_node.master_replica.last_ordered_3pc == last_ordered_before

    # Send txns to stabilize the checkpoint on the other nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, CHK_FREQ - 1)

    pool_pp_seq_no = get_pp_seq_no(other_nodes)
    looper.run(eventually(lambda: assertExp(delayed_node.master_replica.last_ordered_3pc[1] == pool_pp_seq_no)))
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
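# The stable-checkpoint boundary relied on above is simply the highest multiple of
# CHK_FREQ that has been fully ordered. A tiny illustrative helper, not part of the
# test suite:
def latest_stable_checkpoint_candidate(last_ordered_pp_seq_no, chk_freq):
    return (last_ordered_pp_seq_no // chk_freq) * chk_freq

# e.g. with CHK_FREQ == 5: the candidate is 5 after the 5th and 6th ordered batches,
# and reaches 10 only after the 10th one.
assert latest_stable_checkpoint_candidate(6, 5) == 5
assert latest_stable_checkpoint_candidate(10, 5) == 10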
def testNodeDiscardMessageFromUnknownView(
        txnPoolNodeSet, sdk_node_set_with_node_added_after_some_txns,
        sdk_new_node_caught_up, allPluginsPath, sdk_wallet_client):
    """
    Node discards 3-phase or ViewChangeDone messages from view nos that it does not
    know of (view nos before it joined the pool)
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    viewNo = new_node.viewNo

    pp_seq_no = get_pp_seq_no(txnPoolNodeSet)
    # Force a view change: a node discards messages whose viewNo is at least
    # two less than its own, since the current protocol implementation needs
    # to hold messages from the previous view as well as from the current one.
    for _ in range(1):
        ensure_view_change(looper, txnPoolNodeSet)
        waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
        checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
        pp_seq_no += 1

    sender = txnPoolNodeSet[1]
    rid_x_node = sender.nodestack.getRemote(new_node.name).uid
    messageTimeout = waits.expectedNodeToNodeMessageDeliveryTime()

    # 3 pc msg (PrePrepare) needs to be discarded
    _, did = sdk_wallet_client
    primaryRepl = getPrimaryReplica(txnPoolNodeSet)
    inst_id = 0
    three_pc = create_pre_prepare_no_bls(
        primaryRepl.node.db_manager.get_state_root_hash(DOMAIN_LEDGER_ID),
        viewNo,
        pp_seq_no=pp_seq_no + 1,
        inst_id=inst_id)
    sender.send(three_pc, rid_x_node)
    looper.run(
        eventually(checkDiscardMsg, [
            new_node.replicas[inst_id].stasher,
        ],
                   three_pc,
                   OLD_VIEW,
                   retryWait=1,
                   timeout=messageTimeout))
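# What the eventually-check above asserts, written as a predicate (a reading of the
# test, not the node's actual code path): a PrePrepare whose viewNo is below the
# replica's current view is rejected by the replica's stasher with the OLD_VIEW
# reason instead of being processed.
def is_rejected_as_old_view(msg_view_no, current_view_no):
    return msg_view_no < current_view_no

assert is_rejected_as_old_view(msg_view_no=0, current_view_no=1)
assert not is_rejected_as_old_view(msg_view_no=1, current_view_no=1)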
def test_unordered_request_freed_on_replica_removal(looper, txnPoolNodeSet,
                                                    sdk_pool_handle,
                                                    sdk_wallet_client,
                                                    chkFreqPatched,
                                                    view_change):
    node = txnPoolNodeSet[0]
    # Send enough requests to stabilize the checkpoint
    sdk_send_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        CHK_FREQ - get_pp_seq_no(txnPoolNodeSet) % CHK_FREQ)
    old_stable_checkpoint = node.master_replica._consensus_data.stable_checkpoint
    stashers = [n.nodeIbStasher for n in txnPoolNodeSet]

    with delay_rules(stashers, cDelay(delay=sys.maxsize),
                     msg_rep_delay(types_to_delay=[COMMIT])):
        req = sdk_send_random_requests(looper, sdk_pool_handle,
                                       sdk_wallet_client, 1)
        looper.runFor(
            waits.expectedPropagateTime(len(txnPoolNodeSet)) +
            waits.expectedPrePrepareTime(len(txnPoolNodeSet)) +
            waits.expectedPrepareTime(len(txnPoolNodeSet)) +
            waits.expectedCommittedTime(len(txnPoolNodeSet)))

        f_d, f_r = get_forwarded_to_all(node)
        assert f_d
        node.replicas.remove_replica(node.replicas.num_replicas - 1)

        assert node.requests[f_d].forwardedTo == node.replicas.num_replicas
        check_for_nodes(txnPoolNodeSet, check_stable_checkpoint,
                        old_stable_checkpoint)

    sdk_get_replies(looper, req)
    check_for_nodes(txnPoolNodeSet, check_stable_checkpoint,
                    old_stable_checkpoint)

    # Send more requests to stabilize the next checkpoint
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, CHK_FREQ - 1)

    looper.run(
        eventually(check_for_nodes, txnPoolNodeSet, check_stable_checkpoint,
                   old_stable_checkpoint + CHK_FREQ))
    assert len(node.requests) == 0
def test_view_change_gc_in_between_3pc_all_nodes_delays(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    """
    Test that the garbage collector compares the whole 3PC key (viewNo, ppSeqNo)
    and does not remove messages from a node's queues that have a higher
    viewNo than the last ordered one, even if their ppSeqNo is less than or equal to it
    """

    numNodes = len(txnPoolNodeSet)
    viewNo = checkViewNoForNodes(txnPoolNodeSet)

    # 1 send two messages one by one separately so that the node pool
    #   works with two batches
    #    -> last_ordered_3pc = (+0, 2) [+0 means relative to the initial state]
    #       (last_ordered_3pc here and further is tracked for master
    #       instances only, because non-master ones have specific
    #       management logic that this test does not care about;
    #       see Replica::_setup_for_non_master)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    batches_count = get_pp_seq_no(txnPoolNodeSet)
    last_ordered_3pc = (viewNo, batches_count)
    check_nodes_last_ordered_3pc(txnPoolNodeSet, last_ordered_3pc)
    check_nodes_requests_size(txnPoolNodeSet, 2)

    # 2 do a view change
    #    -> GC should remove the ordered batches from the nodes' queues
    #    -> viewNo = +1
    ensure_view_change_complete(looper, txnPoolNodeSet)
    batches_count += 1

    viewNo = checkViewNoForNodes(txnPoolNodeSet, viewNo + 1)
    looper.run(
        eventually(check_nodes_last_ordered_3pc, txnPoolNodeSet,
                   (viewNo, batches_count)))
    check_nodes_requests_size(txnPoolNodeSet, 0)

    # 3 slow down processing of 3PC messages for all nodes (all replica
    #   instances) and send one more message
    #    -> it is not ordered (last_ordered_3pc still equals (+0, 2)), but the
    #       primaries should at least send PRE-PREPAREs
    # TODO: waiting only until the primary has sent a PRE-PREPARE might
    # not be enough
    propagationTimeout = waits.expectedClientRequestPropagationTime(numNodes)
    delay_3pc_messages(txnPoolNodeSet, 0, delay=propagationTimeout * 2)
    delay_3pc_messages(txnPoolNodeSet, 1, delay=propagationTimeout * 2)
    requests = sdk_send_random_request(looper, sdk_pool_handle,
                                       sdk_wallet_client)

    def checkPrePrepareSentAtLeastByPrimary():
        for node in txnPoolNodeSet:
            for replica in node.replicas.values():
                if replica.isPrimary:
                    assert len(replica._ordering_service.sent_preprepares)

    looper.run(
        eventually(checkPrePrepareSentAtLeastByPrimary,
                   retryWait=0.1,
                   timeout=propagationTimeout))
    # 4 do view change
    #    -> GC shouldn't remove anything because
    #       last_ordered_3pc (+0, 1) < last message's 3pc key (+1, 1)
    #    -> viewNo = 2
    ensure_view_change_complete(looper, txnPoolNodeSet)
    batches_count += 1

    viewNoNew = checkViewNoForNodes(txnPoolNodeSet)
    # another view change could happen because of slow nodes
    assert viewNoNew - viewNo in (1, 2)
    viewNo = viewNoNew
    check_nodes_last_ordered_3pc(txnPoolNodeSet,
                                 (last_ordered_3pc[0] + 1, batches_count - 1))
    check_nodes_requests_size(txnPoolNodeSet, 1)

    # 5 reset delays and wait for replies
    #    -> new primaries should send new 3pc for last message
    #       with 3pc key (+2, 1)
    #    -> they should be ordered
    #    -> last_ordered_3pc = (+2, 1)
    reset_delays_and_process_delayeds(txnPoolNodeSet)
    sdk_get_replies(looper, [requests])
    batches_count += 1

    checkViewNoForNodes(txnPoolNodeSet, viewNo)
    last_ordered_3pc = (viewNo, batches_count)
    check_nodes_last_ordered_3pc(txnPoolNodeSet, last_ordered_3pc)
    check_nodes_requests_size(txnPoolNodeSet, 1)

    # 6 do view change
    #    -> GC should remove them
    ensure_view_change_complete(looper, txnPoolNodeSet)
    batches_count += 1

    viewNo = checkViewNoForNodes(txnPoolNodeSet, viewNo + 1)
    check_nodes_last_ordered_3pc(txnPoolNodeSet,
                                 (last_ordered_3pc[0] + 1, batches_count))
    check_nodes_requests_size(txnPoolNodeSet, 0)
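# Minimal illustration of the property described in the docstring above: garbage
# collection must compare the full (viewNo, ppSeqNo) key, not ppSeqNo alone.
# Python's tuple comparison gives exactly the required ordering.
assert (0, 2) < (1, 1)       # a key from a newer view is greater even though its ppSeqNo is smaller
assert not (1, 1) <= (0, 2)  # so a (viewNo=1, ppSeqNo=1) message must not be collected
                             # when last_ordered_3pc == (0, 2)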
def test_unstash_three_phase_msg_after_catchup(txnPoolNodeSet, looper, tconf,
                                               sdk_pool_handle,
                                               sdk_wallet_steward):
    """
    1. Delay Commit on Node4
    2. Order 1 req
    3. Delay Commit on all nodes
    4. Order 1 req
    5. Delay CatchupRep on Node4
    6. Delay Ledger Status and ViewChangeDones on Nodes1-3
    7. Start View change on all nodes
    8. Wait until Node4 has 3 stashed CatchupReps
    9. Reset delaying of Commits on all Nodes
    10. Reset Ledger Status on Nodes1-3
    11. Check that the 3 nodes finished the view change while Node4 is still syncing and has not finished it
    12. Reset CatchupRep on Node4
    13. Check that Node4 finished VC, and there was just 1 round of catch-up
    """
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = txnPoolNodeSet[:-1]
    view_no = txnPoolNodeSet[0].viewNo
    old_stashed = slow_node.master_replica.stasher.stash_size(STASH_VIEW_3PC)
    last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc
    batches_count = last_ordered[1]

    with delay_rules(
        [n.nodeIbStasher for n in txnPoolNodeSet],
            msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE, COMMIT])):

        # Delay Commit messages for slow_node.
        slow_node.nodeIbStasher.delay(cDelay(sys.maxsize))
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 1)
        batches_count += 1

        # Delay Commit messages for fast_nodes.
        for n in fast_nodes:
            n.nodeIbStasher.delay(cDelay(sys.maxsize))

        request2 = sdk_send_random_request(looper, sdk_pool_handle,
                                           sdk_wallet_steward)
        batches_count += 1

        def check_commits(commit_key):
            for n in fast_nodes:
                for r in n.replicas.values():
                    assert commit_key in r._ordering_service.commits
                    assert len(
                        r._ordering_service.commits[commit_key].voters) == 1

        looper.run(
            eventually(check_commits,
                       (view_no, last_ordered[1] + batches_count)))

        # Delay CatchupRep messages for the slow_node.
        with delay_rules([slow_node.nodeIbStasher], cr_delay()):
            with delay_rules([n.nodeIbStasher for n in fast_nodes],
                             msg_rep_delay(types_to_delay=[LEDGER_STATUS])):

                for n in txnPoolNodeSet:
                    n.start_catchup()
                looper.run(
                    eventually(
                        lambda: assertExp(slow_node.mode == Mode.discovering)))

                # Reset the Commit delays for all nodes.
                for n in txnPoolNodeSet:
                    n.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

                assert slow_node.mode == Mode.discovering
                looper.run(
                    eventually(_check_nodes_stashed, fast_nodes, old_stashed,
                               len(txnPoolNodeSet) - 1))
                looper.run(
                    eventually(_check_nodes_stashed, [slow_node], old_stashed,
                               (len(txnPoolNodeSet) - 1) * 2))

        sdk_get_and_check_replies(looper, [request2])
        _check_nodes_stashed(fast_nodes, old_stashed, 0)
        assert get_pp_seq_no(txnPoolNodeSet) == batches_count

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)
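# The helper _check_nodes_stashed is used above but not shown. A sketch of the
# behaviour it is assumed to have, given how it is called (an assumption, not the
# original helper): each node's STASH_VIEW_3PC stash should have grown by exactly
# `extra` messages compared with the size recorded before the delays were applied.
def _check_nodes_stashed_sketch(nodes, old_stashed, extra):
    for node in nodes:
        new_stashed = node.master_replica.stasher.stash_size(STASH_VIEW_3PC)
        assert new_stashed - old_stashed == extra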
def test_stashed_messages_processed_on_backup_replica_ordering_resumption(
        looper, chkFreqPatched, reqs_for_checkpoint,
        one_replica_and_others_in_backup_instance,
        sdk_pool_handle, sdk_wallet_client, view_change_done,
        txnPoolNodeSet):
    """
    Verifies resumption of ordering 3PC-batches on a backup replica
    on detection of a lag in checkpoints, in the case when the lag is detected
    after some 3PC-messages related to the next checkpoint have already been
    stashed as lying outside of the watermarks.
    Please note that to verify this case the config is set up so that
    LOG_SIZE == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    """
    global first_run
    batches_count = get_pp_seq_no(txnPoolNodeSet)

    slow_replica, other_replicas = one_replica_and_others_in_backup_instance
    view_no = slow_replica.viewNo
    low_watermark = slow_replica.h

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc == (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Don't receive Commits from two replicas
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[0].node.name))
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[1].node.name))
    slow_replica.node.nodeIbStasher.delay(
        msg_rep_delay(types_to_delay=[COMMIT])
    )

    # Send a request for which the replica will not be able to order the batch
    # due to an insufficient count of Commits
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Receive further Commits from now on
    slow_replica.node.nodeIbStasher.drop_delayeds()
    slow_replica.node.nodeIbStasher.resetDelays()

    # Send requests, but not enough to reach the number of checkpoints
    # that triggers a catch-up
    reqs_until_checkpoints = reqs_for_checkpoint - get_pp_seq_no([r.node for r in other_replicas]) % reqs_for_checkpoint
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP *
                             reqs_until_checkpoints)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Don't receive Checkpoints
    slow_replica.node.nodeIbStasher.delay(chk_delay(instId=1))

    # Send more requests to reach catch-up number of checkpoints
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             reqs_for_checkpoint)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that there are no 3PC-messages stashed
    # as lying outside of the watermarks
    assert slow_replica.stasher.stash_size(STASH_WATERMARKS) == 0

    # Send a request for which the batch will be outside of the watermarks
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that the replica has not ordered any batches
    # after the very first one
    assert slow_replica.last_ordered_3pc == (view_no, batches_count)

    # Ensure that the watermarks have not been shifted since the view start
    assert slow_replica.h == low_watermark
    assert slow_replica.H == (sys.maxsize if first_run else low_watermark + LOG_SIZE)

    # Ensure that there are some quorumed stashed checkpoints
    check_num_quorumed_received_checkpoints(slow_replica, 1)

    # Ensure that now there are 3PC-messages stashed
    # as lying outside of the watermarks
    if not first_run:
        assert slow_replica.stasher.stash_size(STASH_WATERMARKS) == incoming_3pc_msgs_count(len(txnPoolNodeSet))

    # Receive belated Checkpoints
    slow_replica.node.nodeIbStasher.reset_delays_and_process_delayeds()
    batches_count = get_pp_seq_no([r.node for r in other_replicas])

    # Ensure that the replica has ordered the batch for the last sent request
    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                     (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Ensure that the watermarks have been shifted so that the lower watermark
    # now equals the end of the last stable checkpoint in the instance
    assert slow_replica.h == low_watermark + (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    assert slow_replica.H == low_watermark + (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ + LOG_SIZE

    # Ensure that now there are no quorumed stashed checkpoints
    check_num_quorumed_received_checkpoints(slow_replica, 0)

    # Ensure that now there are no 3PC-messages stashed
    # as lying outside of the watermarks
    assert slow_replica.stasher.stash_size(STASH_WATERMARKS) == 0

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))
    first_run = False
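# Worked example (with hypothetical numbers) of the watermark arithmetic asserted
# above. The docstring's config constraint is
# LOG_SIZE == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ,
# so once the checkpoint lag is resolved the lower watermark jumps by exactly
# LOG_SIZE and the upper watermark follows at h + LOG_SIZE.
_STASHED_BEFORE_CATCHUP = 4                             # hypothetical value
_CHK_FREQ = 5                                           # hypothetical value
_LOG_SIZE = (_STASHED_BEFORE_CATCHUP + 1) * _CHK_FREQ   # == 25
_low_watermark = 0
_new_h = _low_watermark + (_STASHED_BEFORE_CATCHUP + 1) * _CHK_FREQ
_new_H = _new_h + _LOG_SIZE
assert _new_h == _low_watermark + _LOG_SIZE             # == 25
assert _new_H == _low_watermark + 2 * _LOG_SIZE         # == 50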
def test_backup_primary_restores_pp_seq_no_if_view_is_same(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, tconf,
        tdir, allPluginsPath, chkFreqPatched, view_no):
    # Get a node with a backup primary replica
    replica = getPrimaryReplica(txnPoolNodeSet, instId=backup_inst_id)
    batches_count = get_pp_seq_no(txnPoolNodeSet)
    node = replica.node
    # Send some 3PC-batches and wait until the replica orders the 3PC-batches
    sdk_send_batches_of_random(looper,
                               txnPoolNodeSet,
                               sdk_pool_handle,
                               sdk_wallet_client,
                               num_reqs=7,
                               num_batches=num_batches,
                               timeout=tconf.Max3PCBatchWait)
    batches_count += num_batches

    looper.run(
        eventually(lambda r: assertExp(r.last_ordered_3pc ==
                                       (view_no, batches_count)),
                   replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Check view no of the node and lastPrePrepareSeqNo of the replica
    assert node.viewNo == view_no
    assert replica.lastPrePrepareSeqNo == batches_count

    # Ensure that the node has stored the last sent PrePrepare key
    assert LAST_SENT_PRE_PREPARE in node.nodeStatusDB
    last_sent_pre_prepare_key = \
        node_status_db_serializer.deserialize(
            node.nodeStatusDB.get(LAST_SENT_PRE_PREPARE))
    assert last_sent_pre_prepare_key == {
        str(backup_inst_id): [view_no, batches_count]
    }

    # Restart the node containing the replica
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            node.name,
                                            stopNode=True)
    looper.removeProdable(node)
    txnPoolNodeSet.remove(node)

    node = start_stopped_node(node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(node)

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper, txnPoolNodeSet)

    replica = node.replicas[backup_inst_id]

    # Verify that after the successful propagate primary procedure the replica
    # (which must still be the primary in its instance) has restored
    # lastPrePrepareSeqNo and adjusted last_ordered_3pc and shifted
    # the watermarks correspondingly
    assert node.viewNo == view_no
    assert replica.isPrimary
    assert replica.lastPrePrepareSeqNo == batches_count
    assert replica.last_ordered_3pc == (view_no, batches_count)
    assert replica.h == batches_count
    assert replica.H == batches_count + LOG_SIZE

    # Verify also that the stored last sent PrePrepare key has not been erased
    assert LAST_SENT_PRE_PREPARE in node.nodeStatusDB

    # Send a 3PC-batch and ensure that the replica orders it
    sdk_send_batches_of_random(looper,
                               txnPoolNodeSet,
                               sdk_pool_handle,
                               sdk_wallet_client,
                               num_reqs=1,
                               num_batches=1,
                               timeout=tconf.Max3PCBatchWait)
    batches_count += 1
    looper.run(
        eventually(lambda: assertExp(replica.last_ordered_3pc ==
                                     (view_no, batches_count)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))
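# A sketch (assumption, not the node's code) of the restore decision the test above
# relies on: the stored LAST_SENT_PRE_PREPARE value maps a backup instance id to
# [view_no, pp_seq_no], and the restarted backup primary adopts the stored
# pp_seq_no only if the view has not changed in the meantime.
def restored_pp_seq_no_sketch(stored, inst_id, current_view_no):
    view_no, pp_seq_no = stored[str(inst_id)]
    return pp_seq_no if view_no == current_view_no else None

assert restored_pp_seq_no_sketch({"1": [0, 7]}, 1, current_view_no=0) == 7
assert restored_pp_seq_no_sketch({"1": [0, 7]}, 1, current_view_no=1) is None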
def test_backup_replica_resumes_ordering_on_lag_in_checkpoints(
        looper, chkFreqPatched, reqs_for_checkpoint,
        one_replica_and_others_in_backup_instance, sdk_pool_handle,
        sdk_wallet_client, view_change_done, txnPoolNodeSet):
    """
    Verifies resumption of ordering 3PC-batches on a backup replica
    on detection of a lag in checkpoints
    """
    slow_replica, other_replicas = one_replica_and_others_in_backup_instance
    view_no = slow_replica.viewNo
    batches_count = slow_replica.last_ordered_3pc[1]

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1
    low_watermark = slow_replica.h

    looper.run(
        eventually(lambda: assert_eq(slow_replica.last_ordered_3pc,
                                     (view_no, batches_count)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Don't receive Commits from two replicas
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[0].node.name))
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[1].node.name))

    # Send a request for which the replica will not be able to order the batch
    # due to an insufficient count of Commits
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Recover reception of Commits
    slow_replica.node.nodeIbStasher.drop_delayeds()
    slow_replica.node.nodeIbStasher.resetDelays()

    # Send requests, but not enough to reach the number of checkpoints
    # that triggers a catch-up
    reqs_until_checkpoints = reqs_for_checkpoint - other_replicas[
        0].last_ordered_3pc[1]
    sdk_send_random_requests(
        looper, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_until_checkpoints)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that the replica has not ordered any batches
    # after the very first one
    assert slow_replica.last_ordered_3pc == (view_no, batches_count)

    # Ensure that the watermarks have not been shifted since the view start
    assert slow_replica.h == low_watermark
    assert slow_replica.H == low_watermark + LOG_SIZE

    # Ensure that the collections related to requests, batches and
    # own checkpoints are not empty.
    # (Note that a primary replica removes requests from requestQueues
    # when creating a batch with them.)
    if slow_replica.isPrimary:
        assert slow_replica._ordering_service.sent_preprepares
    else:
        assert slow_replica._ordering_service.requestQueues[DOMAIN_LEDGER_ID]
        assert slow_replica._ordering_service.prePrepares
    assert slow_replica._ordering_service.prepares
    assert slow_replica._ordering_service.commits
    assert slow_replica._ordering_service.batches

    check_num_unstable_checkpoints(slow_replica, 0)
    check_num_quorumed_received_checkpoints(slow_replica, 1)

    # Send more requests to reach catch-up number of checkpoints
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqs_for_checkpoint)
    batches_count += 1
    batches_count += reqs_until_checkpoints
    batches_count += reqs_for_checkpoint
    # Ensure that the replica has adjusted last_ordered_3pc to the end
    # of the last checkpoint
    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc == \
                        (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Ensure that the watermarks have been shifted so that the lower watermark
    # has the same value as last_ordered_3pc
    assert slow_replica.h == low_watermark + (
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    assert slow_replica.H == low_watermark + (
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ + LOG_SIZE

    # Ensure that the collections related to requests, batches and
    # own checkpoints have been cleared
    assert not slow_replica._ordering_service.requestQueues[DOMAIN_LEDGER_ID]
    assert not slow_replica._ordering_service.sent_preprepares
    assert not slow_replica._ordering_service.prePrepares
    assert not slow_replica._ordering_service.prepares
    assert not slow_replica._ordering_service.commits
    assert not slow_replica._ordering_service.batches

    check_num_unstable_checkpoints(slow_replica, 0)
    check_num_quorumed_received_checkpoints(slow_replica, 0)

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))
    slow_replica._checkpointer._received_checkpoints.clear()
    batches_count = get_pp_seq_no(txnPoolNodeSet)
def test_3pc_while_catchup_with_chkpoints(tdir, tconf, looper, chkFreqPatched,
                                          reqs_for_checkpoint, testNodeClass,
                                          txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client, allPluginsPath):
    '''
    Tests that 3PC messages and Checkpoints received while catch-up is in progress
    are stashed and re-applied when catch-up is finished.
    Checks that catch-up is not started again even if a quorum of stashed checkpoints
    is received.
    '''

    batches_count = get_pp_seq_no(txnPoolNodeSet)
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed successfully
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    batches_count += 1
    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    batches_count += 1
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    with delay_rules(lagging_node.nodeIbStasher,
                     cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
        looper.add(lagging_node)
        txnPoolNodeSet[-1] = lagging_node
        looper.run(checkNodesConnected(txnPoolNodeSet))

        # wait till we get catchup replies for the messages missed while the node was offline,
        # so that we can now order more messages; they will not be caught up, but stashed
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.nodeIbStasher.num_of_stashed(CatchupRep) >= 3),
                       retryWait=1,
                       timeout=60))

        assert lagging_node.mode == Mode.syncing

        # make sure that more requests are ordered while catch-up is in progress
        # and stash enough stable checkpoints to start a catch-up
        num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
        num_reqs = reqs_for_checkpoint * num_checkpoints + 1
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, num_reqs)
        batches_count += num_reqs
        looper.run(
            eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes,
                       (0, batches_count)))

        # all good nodes stabilized checkpoint
        looper.run(
            eventually(check_for_nodes, rest_nodes, check_stable_checkpoint,
                       10))

        # lagging node is catching up and stashing all checkpoints
        assert lagging_node.mode == Mode.syncing
        looper.run(
            eventually(lambda: assertExp(
                get_stashed_checkpoints(lagging_node) == num_checkpoints * len(
                    rest_nodes)),
                       timeout=waits.expectedPoolCatchupTime(
                           len(txnPoolNodeSet))))

    # check that last_ordered is set
    looper.run(
        eventually(check_last_ordered_3pc_on_all_replicas, [lagging_node],
                   (0, batches_count)))

    # check that checkpoint is stabilized for master
    looper.run(
        eventually(check_for_instance, [lagging_node], 0,
                   check_stable_checkpoint, 10))

    # check that the catch-up is finished
    looper.run(
        eventually(lambda: assertExp(lagging_node.mode == Mode.participating),
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # check that catch-up was started only once
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
            initial_all_ledgers_caught_up + 1)))
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.start_catchup) == 1)))

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
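# The stash-count expectation used above, spelled out with hypothetical numbers:
# while the lagging node is catching up, each of the other nodes sends it one
# Checkpoint per stabilized checkpoint, so the lagging node accumulates
# num_checkpoints * len(rest_nodes) stashed Checkpoint messages.
_num_checkpoints = 5   # hypothetical: Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
_rest_nodes = 3        # hypothetical: a 4-node pool minus the lagging node
assert _num_checkpoints * _rest_nodes == 15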
def test_backup_replica_resumes_ordering_on_lag_if_checkpoints_belate(
        looper, chkFreqPatched, reqs_for_checkpoint,
        one_replica_and_others_in_backup_instance, sdk_pool_handle,
        sdk_wallet_client, view_change_done, txnPoolNodeSet):
    """
    Verifies resumption of ordering 3PC-batches on a backup replica
    on detection of a lag in checkpoints in case it is detected after
    some batch in the next checkpoint has already been committed but cannot
    be ordered out of turn
    """
    def check_last_ordered(replica, lo):
        assert replica.last_ordered_3pc == lo

    slow_replica, other_replicas = one_replica_and_others_in_backup_instance
    view_no = slow_replica.viewNo
    check_last_ordered_3pc_backup(slow_replica.node, other_replicas[0].node)
    batches_count = slow_replica.last_ordered_3pc[1]
    low_watermark = slow_replica.h

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Don't receive Commits from two replicas
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[0].node.name))
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[1].node.name))
    slow_replica.node.nodeIbStasher.delay(
        msg_rep_delay(types_to_delay=[COMMIT]))

    # Send a request for which the replica will not be able to order the batch
    # due to an insufficient count of Commits
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Receive further Commits from now on
    slow_replica.node.nodeIbStasher.drop_delayeds()
    slow_replica.node.nodeIbStasher.resetDelays()
    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, batches_count)),
                   slow_replica,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Send requests, but not enough to reach the number of checkpoints
    # that triggers a catch-up
    reqs_until_checkpoints = reqs_for_checkpoint - get_pp_seq_no(
        [r.node for r in other_replicas]) % reqs_for_checkpoint
    sdk_send_random_requests(
        looper, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_until_checkpoints)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Don't receive Checkpoints
    slow_replica.node.nodeIbStasher.delay(chk_delay(instId=1))

    # Send more requests to reach catch-up number of checkpoints
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             reqs_for_checkpoint)
    # Send a request that starts a new checkpoint
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that the replica has not ordered any batches
    # after the very first one
    assert slow_replica.last_ordered_3pc == (view_no, batches_count)

    # Ensure that the watermarks have not been shifted since the view start
    assert slow_replica.h == low_watermark
    assert slow_replica.H == low_watermark + LOG_SIZE

    # Ensure that there are some quorumed stashed checkpoints
    check_num_quorumed_received_checkpoints(slow_replica, 1)

    # Receive belated Checkpoints
    slow_replica.node.nodeIbStasher.reset_delays_and_process_delayeds()

    batches_count += 1
    batches_count += reqs_until_checkpoints
    batches_count += reqs_for_checkpoint
    batches_count += 1
    # Ensure that the replica has ordered the batch for the last sent request
    looper.run(
        eventually(check_last_ordered,
                   slow_replica, (view_no, batches_count),
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Ensure that the watermarks have been shifted so that the lower watermark
    # now equals the end of the last stable checkpoint in the instance
    assert slow_replica.h == low_watermark + (
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    assert slow_replica.H == low_watermark + (
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ + LOG_SIZE

    # Ensure that now there are no quorumed stashed checkpoints
    check_num_quorumed_received_checkpoints(slow_replica, 0)

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    looper.run(
        eventually(lambda: assertExp(slow_replica.last_ordered_3pc ==
                                     (view_no, batches_count)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))