Example #1
0
def test_node_load_after_add_then_disconnect(sdk_new_node_caught_up, txnPoolNodeSet,
                                             tconf, looper, sdk_pool_handle,
                                             sdk_wallet_client,
                                             tdirWithPoolTxns, allPluginsPath,
                                             capsys):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    :return:
    """
    new_node = sdk_new_node_caught_up
    with capsys.disabled():
        print("Stopping node {} with pool ledger size {}".
              format(new_node, new_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node)
    looper.removeProdable(new_node)

    client_batches = 80
    txns_per_batch = 10
    for i in range(client_batches):
        s = perf_counter()
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, txns_per_batch)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.
                  format(i + 1, txns_per_batch, perf_counter() - s))

    with capsys.disabled():
        print("Starting the stopped node, {}".format(new_node))
    nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha)
    new_node = TestNode(
        new_node.name,
        basedirpath=tdirWithPoolTxns,
        base_data_dir=tdirWithPoolTxns,
        config=tconf,
        ha=nodeHa,
        cliha=nodeCHa,
        pluginPaths=allPluginsPath)
    looper.add(new_node)
    txnPoolNodeSet[-1] = new_node

    # Delay catchup reply processing so LedgerState does not change
    delay_catchup_reply = 5
    new_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(eventually(check_ledger_state, new_node, DOMAIN_LEDGER_ID,
                          LedgerState.syncing, retryWait=.5, timeout=5))

    # Not accurate timeout but a conservative one
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
              2 * delay_catchup_reply
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4],
                         customTimeout=timeout)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4])
def test_view_change_during_unstash(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client, tconf):
    slow_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]

    slow_stasher = slow_node.nodeIbStasher
    other_stashers = [n.nodeIbStasher for n in other_nodes]
    all_stashers = [n.nodeIbStasher for n in txnPoolNodeSet]

    # Preload nodes with some transactions
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    for node in txnPoolNodeSet:
        assert node.master_replica.last_ordered_3pc == (0, 1)

    # Prevent ordering of some requests
    start_delaying(all_stashers, delay_3pc(after=7, msgs=(Prepare, Commit)))

    # Stop ordering on slow node and send requests
    slow_node_after_5 = start_delaying(slow_stasher,
                                       delay_3pc(after=5, msgs=Commit))
    slow_node_until_5 = start_delaying(slow_stasher, delay_3pc(after=0))
    reqs_view_0 = sdk_send_random_requests(looper, sdk_pool_handle,
                                           sdk_wallet_client, 8)

    # Make pool order first 2 batches and pause
    pool_after_3 = start_delaying(other_stashers, delay_3pc(after=3))
    looper.run(eventually(check_nodes_ordered_till, other_nodes, 0, 3))

    # Start catchup, continue ordering everywhere (except two last batches on slow node)
    with delay_rules(slow_stasher, cr_delay()):
        slow_node._do_start_catchup(just_started=False)
        looper.run(eventually(check_catchup_is_started, slow_node))
        stop_delaying_and_process(pool_after_3)
        looper.run(eventually(check_nodes_ordered_till, other_nodes, 0, 7))

    # Finish catchup and continue processing on slow node
    looper.run(eventually(check_catchup_is_finished, slow_node))
    stop_delaying_and_process(slow_node_until_5)
    looper.run(eventually(check_nodes_ordered_till, [slow_node], 0, 5))

    # Start view change and allow slow node to get remaining commits
    with delay_rules(all_stashers, icDelay()):
        for node in txnPoolNodeSet:
            node.view_changer.on_master_degradation()
        looper.runFor(0.1)
    stop_delaying_and_process(slow_node_after_5)

    # Ensure that expected number of requests was ordered
    replies = sdk_get_replies(looper, reqs_view_0)
    for rep in replies[:6]:
        sdk_check_reply(rep)

    # Ensure that everything is ok
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
def setup(request, looper, txnPoolNodeSet):
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    # Delay catchup reply so that the test gets time to make the check,
    # this delay is reset after the check
    slow_node.nodeIbStasher.delay(cr_delay(100))
    slow_node.nodeIbStasher.delay(pDelay(100, 0))
    slow_node.nodeIbStasher.delay(cDelay(100, 0))
    if request.param == 'all':
        slow_node.nodeIbStasher.delay(ppDelay(100, 0))
    return slow_node, fast_nodes
def setup(request, looper, txnPoolNodeSet):
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    # Delay catchup reply so that the test gets time to make the check,
    # this delay is reset after the check
    slow_node.nodeIbStasher.delay(cr_delay(100))
    slow_node.nodeIbStasher.delay(pDelay(100, 0))
    slow_node.nodeIbStasher.delay(cDelay(100, 0))
    if request.param == 'all':
        slow_node.nodeIbStasher.delay(ppDelay(100, 0))
    return slow_node, fast_nodes
Example #5
0
def scenario_txns_during_catchup(
        looper, tconf, tdir, allPluginsPath, do_post_node_creation,
        nodes,
        send_txns
):
    lagging_node = nodes[-1]
    rest_nodes = nodes[:-1]

    # Stop NodeX
    lagging_node.cleanupOnStopping = False
    disconnect_node_and_ensure_disconnected(looper,
                                            nodes,
                                            lagging_node.name,
                                            stopNode=True)
    looper.removeProdable(name=lagging_node.name)

    # Send transactions
    send_txns()
    ensure_all_nodes_have_same_data(looper, rest_nodes)

    # Start NodeX
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )
    do_post_node_creation(lagging_node)
    HelperNode.fill_auth_map_for_node(lagging_node, XFER_PUBLIC)
    HelperNode.fill_auth_map_for_node(lagging_node, NYM)
    nodes[-1] = lagging_node

    # Delay CathupRep for DOMAIN ledger for NodeX
    with delay_rules(
        lagging_node.nodeIbStasher, cr_delay(ledger_filter=DOMAIN_LEDGER_ID)
    ):
        # allow started node to receive looper events
        looper.add(lagging_node)
        # ensure it connected to others
        looper.run(checkNodesConnected(nodes))
        # Send transactions
        send_txns()
        ensure_all_nodes_have_same_data(looper, rest_nodes)

    # Reset delays
    # Make sure that all nodes have equal state
    ensure_all_nodes_have_same_data(looper, nodes)

    # Send transactions
    send_txns()
    ensure_all_nodes_have_same_data(looper, rest_nodes)
def test_freshness_in_catchup(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_steward,
                              tconf, ):
    """
    Check that InstanceChange with reason "State signatures are not updated for too long"
    are not sends in catchup without view change.
    """
    view_no = txnPoolNodeSet[0].viewNo

    lagging_node = txnPoolNodeSet[-1]

    with delay_rules(lagging_node.nodeIbStasher, cr_delay(), cDelay()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 1)
        lagging_node.start_catchup()
        looper.runFor(tconf.ACCEPTABLE_FRESHNESS_INTERVALS_COUNT * tconf.STATE_FRESHNESS_UPDATE_INTERVAL + 5)

    print(lagging_node.view_changer.instance_changes._cache)
    assert not lagging_node.view_changer.instance_changes.has_inst_chng_from(view_no + 1, lagging_node.name)
def test_3pc_while_catchup_with_chkpoints(tdir, tconf,
                                          looper,
                                          chkFreqPatched,
                                          reqs_for_checkpoint,
                                          testNodeClass,
                                          txnPoolNodeSet,
                                          sdk_pool_handle,
                                          sdk_wallet_client,
                                          allPluginsPath):
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(lagging_node,
                                      looper,
                                      tconf,
                                      tdir,
                                      allPluginsPath,
                                      start=False,
                                      )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(Node.allLedgersCaughtUp)
    # delay CurrentState to avoid Primary Propagation (since it will lead to more catch-ups not needed in this test).
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher, cr_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # wait till we got catchup replies for messages missed while the node was offline,
            # so that now qwe can order more messages, and they will not be caught up, but stashed
            looper.run(
                eventually(lambda: assertExp(len(lagging_node.nodeIbStasher.delayeds) >= 3), retryWait=1,
                           timeout=60))

            assert lagging_node.mode == Mode.syncing

            # make sure that more requests are being ordered while catch-up is in progress
            # stash enough stable checkpoints for starting a catch-up
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client,
                                      reqs_for_checkpoint * (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1))

            assert lagging_node.mode == Mode.syncing

        # check that the catch-up is finished
        looper.run(
            eventually(
                lambda: assertExp(lagging_node.mode == Mode.participating), retryWait=1,
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))
            )
        )
        looper.run(
            eventually(
                lambda: assertExp(
                    lagging_node.spylog.count(Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1)
            )
        )
        # check that catch-up was started only once
        looper.run(
            eventually(
                lambda: assertExp(
                    lagging_node.spylog.count(Node.start_catchup) == 1)
            )
        )
        # check that the node was able to order requests stashed during catch-up
        waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_slow_node_reverts_unordered_state_during_catchup(
        looper, txnPoolNodeSet, client1, wallet1, client1Connected):
    """
    Delay COMMITs to a node such that when it needs to catchup, it needs to
    revert some unordered state. Also till this time the node should have
    receive all COMMITs such that it will apply some of the COMMITs (
    for which it has not received txns from catchup).
    For this delay COMMITs by long, do catchup for a little older than the
    state received in LedgerStatus, once catchup completes, reset delays and
    try to process delayed COMMITs, some COMMITs will be rejected but some will
    be processed since catchup was done for older ledger.
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 15

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_twice(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                              delay_batches * Max3PCBatchSize)

    def is_catchup_needed_count():
        return len(
            getAllReturnVals(slow_node,
                             slow_node.is_catchup_needed,
                             compare_val_to=True))

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)
    old_cn_count = is_catchup_needed_count()

    # `slow_node` is slow to receive CatchupRep, so that it
    # gets a chance to order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        lst_3pc = check_last_ordered_3pc(n1, n2)

    def chk1():
        # `slow_node` has prepared all 3PC messages which
        # `other_nodes` have ordered
        assertEquality(slow_master_replica.last_prepared_before_view_change,
                       lst_3pc)

    looper.run(eventually(chk1, retryWait=1))

    old_pc_count = slow_master_replica.spylog.count(
        slow_master_replica.can_process_since_view_change_in_progress)

    # Repair the network so COMMITs are received and processed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica.spylog.count(
            slow_master_replica.can_process_since_view_change_in_progress
        ) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # Some COMMITs were ordered but stashed and they were processed
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert delay_batches in rv

    looper.run(eventually(chk3, retryWait=1, timeout=catchup_rep_delay + 5))

    def chk4():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(chk4,
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    def chk5():
        # Once catchup was done, need of other catchup was not found
        assertEquality(is_catchup_needed_count(), old_cn_count)

    looper.run(eventually(chk5, retryWait=1, timeout=5))

    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_3pc_while_catchup_with_chkpoints_only(tdir, tconf,
                                               looper,
                                               chkFreqPatched,
                                               reqs_for_checkpoint,
                                               testNodeClass,
                                               txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               allPluginsPath):
    '''
    Check that catch-up is not started again even if a quorum of stashed checkpoints
    is received during catch-up.
    Assume that only checkpoints and no 3PC messages are received.
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(lagging_node,
                                      looper,
                                      tconf,
                                      tdir,
                                      allPluginsPath,
                                      start=False,
                                      )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(Node.allLedgersCaughtUp)

    # delay all 3PC messages on the lagged node so that it
    # receives only Checkpoints and catch-up messages

    lagging_node.nodeIbStasher.delay(ppDelay())
    lagging_node.nodeIbStasher.delay(pDelay())
    lagging_node.nodeIbStasher.delay(cDelay())

    with delay_rules(lagging_node.nodeIbStasher, lsDelay(), cr_delay()):
        looper.add(lagging_node)
        txnPoolNodeSet[-1] = lagging_node
        looper.run(checkNodesConnected(txnPoolNodeSet))

        # wait till we got ledger statuses for messages missed while the node was offline,
        # so that now we can order more messages, and they will not be caught up, but stashed
        looper.run(
            eventually(lambda: assertExp(lagging_node.nodeIbStasher.num_of_stashed(LedgerStatus) >= 3), retryWait=1,
                       timeout=60))

        assert lagging_node.mode != Mode.participating

        # make sure that more requests are being ordered while catch-up is in progress
        # stash enough stable checkpoints for starting a catch-up
        num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
        num_reqs = reqs_for_checkpoint * num_checkpoints + 1
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client,
                                  num_reqs)
        looper.run(
            eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes,
                       (0, num_reqs + 2))
        )

        # all good nodes stabilized checkpoint
        looper.run(eventually(chkChkpoints, rest_nodes, 2, 0))

        assert lagging_node.mode != Mode.participating
        # lagging node is catching up and stashing all checkpoints
        looper.run(
            eventually(
                lambda: assertExp(get_stashed_checkpoints(lagging_node) == num_checkpoints * len(rest_nodes)),
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))
            )
        )

    # check that last_ordered is set
    looper.run(
        eventually(check_last_ordered_3pc_on_master, [lagging_node],
                   (0, num_reqs + 2))
    )

    # check that the catch-up is finished
    looper.run(
        eventually(
            lambda: assertExp(lagging_node.mode == Mode.participating), retryWait=1,
            timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))
        )
    )

    # check that catch-up was started twice, since we were able to catch-up till audit ledger only
    # for the first time, and after this the node sees a quorum of stashed checkpoints
    assert lagging_node.spylog.count(Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1
    assert lagging_node.spylog.count(Node.start_catchup) == 1

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_unstash_three_phase_msg_after_catchup(txnPoolNodeSet, looper, tconf,
                                               sdk_pool_handle,
                                               sdk_wallet_steward):
    """
    1. Delay Commit on Node4
    2. Order 1 req
    3. Delay Commit on all nodes
    4. Order 1 req
    5. Delay CatchupRep on Node4
    6. Delay Ledger Status and ViewChangeDones on Nodes1-3
    7. Start View change on all nodes
    8. Wait until Node4 got 3 stashed CatchupReps
    9. Reset delaying of Commits on all Nodes
    10. Reset Ledger Status on Nodes1-3
    11. Check that 3 nodes finished VC while Node4 is syncing and not finished
    12. Reset CatchupRep on Node4
    13. Check that Node4 finished VC, and there was just 1 round of catch-up
    """
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = txnPoolNodeSet[:-1]
    view_no = txnPoolNodeSet[0].viewNo
    old_stashed = slow_node.master_replica.stasher.stash_size(STASH_VIEW_3PC)
    last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc
    batches_count = last_ordered[1]

    with delay_rules(
        [n.nodeIbStasher for n in txnPoolNodeSet],
            msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE, COMMIT])):

        # Delay Commit messages for slow_node.
        slow_node.nodeIbStasher.delay(cDelay(sys.maxsize))
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 1)
        batches_count += 1

        # Delay Commit messages for fast_nodes.
        for n in fast_nodes:
            n.nodeIbStasher.delay(cDelay(sys.maxsize))

        request2 = sdk_send_random_request(looper, sdk_pool_handle,
                                           sdk_wallet_steward)
        batches_count += 1

        def check_commits(commit_key):
            for n in fast_nodes:
                for r in n.replicas.values():
                    assert commit_key in r._ordering_service.commits
                    assert len(
                        r._ordering_service.commits[commit_key].voters) == 1

        looper.run(
            eventually(check_commits,
                       (view_no, last_ordered[1] + batches_count)))

        # Delay CatchupRep messages for the slow_node.
        with delay_rules([slow_node.nodeIbStasher], cr_delay()):
            with delay_rules([n.nodeIbStasher for n in fast_nodes],
                             msg_rep_delay(types_to_delay=[LEDGER_STATUS])):

                for n in txnPoolNodeSet:
                    n.start_catchup()
                looper.run(
                    eventually(
                        lambda: assertExp(slow_node.mode == Mode.discovering)))

                # Reset delay Commit messages for all nodes.
                for n in txnPoolNodeSet:
                    n.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

                assert slow_node.mode == Mode.discovering
                looper.run(
                    eventually(_check_nodes_stashed, fast_nodes, old_stashed,
                               len(txnPoolNodeSet) - 1))
                looper.run(
                    eventually(_check_nodes_stashed, [slow_node], old_stashed,
                               (len(txnPoolNodeSet) - 1) * 2))

        sdk_get_and_check_replies(looper, [request2])
        _check_nodes_stashed(fast_nodes, old_stashed, 0)
        assert get_pp_seq_no(txnPoolNodeSet) == batches_count

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)
Example #11
0
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3pc messages and checkpoints during some period but later it
    stashes some amount of checkpoints and starts catchup. When the node is
    performing the catchup, it receives more checkpoints enough to start a new
    catchup but it does not start it because the first catchup is in progress.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)

        looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.discovering),
                              timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                                      waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))

        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = \
            repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint)

        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(
                repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint) -
                process_checkpoint_times_before ==
                (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                (len(txnPoolNodeSet) - 1)),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))

        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.discovering

    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == other_nodes[0].domainLedger.size
Example #12
0
def test_3pc_while_catchup_with_chkpoints(tdir, tconf, looper, chkFreqPatched,
                                          reqs_for_checkpoint, testNodeClass,
                                          txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client, allPluginsPath):
    '''
    Tests that 3PC messages and Checkpoints being ordered during catch-up are stashed and re-applied
    when catch-up is finished.
    Check that catch-up is not started again even if a quorum of stashed checkpoints
    is received.
    '''

    batches_count = get_pp_seq_no(txnPoolNodeSet)
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    batches_count += 1
    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    batches_count += 1
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    with delay_rules(lagging_node.nodeIbStasher,
                     cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
        looper.add(lagging_node)
        txnPoolNodeSet[-1] = lagging_node
        looper.run(checkNodesConnected(txnPoolNodeSet))

        # wait till we got catchup replies for messages missed while the node was offline,
        # so that now qwe can order more messages, and they will not be caught up, but stashed
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.nodeIbStasher.num_of_stashed(CatchupRep) >= 3),
                       retryWait=1,
                       timeout=60))

        assert lagging_node.mode == Mode.syncing

        # make sure that more requests are being ordered while catch-up is in progress
        # stash enough stable checkpoints for starting a catch-up
        num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
        num_reqs = reqs_for_checkpoint * num_checkpoints + 1
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, num_reqs)
        batches_count += num_reqs
        looper.run(
            eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes,
                       (0, batches_count)))

        # all good nodes stabilized checkpoint
        looper.run(
            eventually(check_for_nodes, rest_nodes, check_stable_checkpoint,
                       10))

        # lagging node is catching up and stashing all checkpoints
        assert lagging_node.mode == Mode.syncing
        looper.run(
            eventually(lambda: assertExp(
                get_stashed_checkpoints(lagging_node) == num_checkpoints * len(
                    rest_nodes)),
                       timeout=waits.expectedPoolCatchupTime(
                           len(txnPoolNodeSet))))

    # check that last_ordered is set
    looper.run(
        eventually(check_last_ordered_3pc_on_all_replicas, [lagging_node],
                   (0, batches_count)))

    # check that checkpoint is stabilized for master
    looper.run(
        eventually(check_for_instance, [lagging_node], 0,
                   check_stable_checkpoint, 10))

    # check that the catch-up is finished
    looper.run(
        eventually(lambda: assertExp(lagging_node.mode == Mode.participating),
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # check that catch-up was started only once
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
            initial_all_ledgers_caught_up + 1)))
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.start_catchup) == 1)))

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_3pc_while_catchup(tdir, tconf, looper, testNodeClass, txnPoolNodeSet,
                           sdk_pool_handle, sdk_wallet_client, allPluginsPath):
    '''
    Tests that requests being ordered during catch-up are stashed and re-applied
    when catch-up is finished
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)
    assert all(replica.stasher.num_stashed_catchup == 0
               for inst_id, replica in lagging_node.replicas.items())

    with delay_rules(lagging_node.nodeIbStasher,
                     cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
        looper.add(lagging_node)
        txnPoolNodeSet[-1] = lagging_node
        looper.run(checkNodesConnected(txnPoolNodeSet))

        # wait till we got catchup replies for messages missed while the node was offline,
        # so that now we can order more messages, and they will not be caught up, but stashed
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.nodeIbStasher.num_of_stashed(CatchupRep) >= 3),
                       retryWait=1,
                       timeout=60))

        # make sure that more requests are being ordered while catch-up is in progress
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, 10)

        assert lagging_node.mode == Mode.syncing
        assert all(replica.stasher.num_stashed_catchup > 0
                   for inst_id, replica in lagging_node.replicas.items())

    # check that the catch-up is finished
    looper.run(
        eventually(lambda: assertExp(lagging_node.mode == Mode.participating),
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
            initial_all_ledgers_caught_up + 1)))

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
    assert all(replica.stasher.num_stashed_catchup == 0
               for inst_id, replica in lagging_node.replicas.items())
def test_node_catchup_after_restart_with_txns(
        sdk_new_node_caught_up,
        txnPoolNodeSet,
        tdir,
        tconf,
        sdk_node_set_with_node_added_after_some_txns,
        allPluginsPath):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    logger.debug("Stopping node {} with pool ledger size {}".
                 format(new_node, new_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node)
    looper.removeProdable(new_node)
    # for n in txnPoolNodeSet[:4]:
    #     for r in n.nodestack.remotes.values():
    #         if r.name == newNode.name:
    #             r.removeStaleCorrespondents()
    # looper.run(eventually(checkNodeDisconnectedFrom, newNode.name,
    #                       txnPoolNodeSet[:4], retryWait=1, timeout=5))
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    more_requests = 5
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, more_requests)
    logger.debug("Starting the stopped node, {}".format(new_node))
    nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha)
    config_helper = PNodeConfigHelper(new_node.name, tconf, chroot=tdir)
    newNode = TestNode(
        new_node.name,
        config_helper=config_helper,
        config=tconf,
        ha=nodeHa,
        cliha=nodeCHa,
        pluginPaths=allPluginsPath)
    looper.add(newNode)
    txnPoolNodeSet[-1] = newNode

    # Make sure ledger is not synced initially
    check_ledger_state(newNode, DOMAIN_LEDGER_ID, LedgerState.not_synced)

    # Delay catchup reply processing so LedgerState does not change
    # TODO fix delay, sometimes it's not enough and lower 'check_ledger_state'
    # fails because newNode's domain ledger state is 'synced'
    delay_catchup_reply = 10
    newNode.nodeIbStasher.delay(cr_delay(delay_catchup_reply))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(eventually(check_ledger_state, newNode, DOMAIN_LEDGER_ID,
                          LedgerState.syncing, retryWait=.5, timeout=5))

    confused_node = txnPoolNodeSet[0]
    new_node_ledger = newNode.ledgerManager.ledgerRegistry[DOMAIN_LEDGER_ID]
    cp = new_node_ledger.catchUpTill
    start, end = cp.seqNoStart, cp.seqNoEnd
    cons_proof = confused_node.ledgerManager._buildConsistencyProof(
        DOMAIN_LEDGER_ID, start, end)

    bad_send_time = None

    def chk():
        nonlocal bad_send_time
        entries = newNode.ledgerManager.spylog.getAll(
            newNode.ledgerManager.canProcessConsistencyProof.__name__)
        for entry in entries:
            # `canProcessConsistencyProof` should return False after `syncing_time`
            if entry.result == False and entry.starttime > bad_send_time:
                return
        assert False

    def send_and_chk(ledger_state):
        nonlocal bad_send_time, cons_proof
        bad_send_time = perf_counter()
        confused_node.ledgerManager.sendTo(cons_proof, newNode.name)
        # Check that the ConsistencyProof messages rejected
        looper.run(eventually(chk, retryWait=.5, timeout=5))
        check_ledger_state(newNode, DOMAIN_LEDGER_ID, ledger_state)

    send_and_chk(LedgerState.syncing)

    # Not accurate timeout but a conservative one
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
              2 * delay_catchup_reply
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1],
                         customTimeout=timeout)

    send_and_chk(LedgerState.synced)
def delay_catchup(ledger_id: int):
    _delayer = cr_delay(ledger_filter=ledger_id)
    _delayer.__name__ = "delay_catchup({})".format(ledger_id)
    return _delayer
def test_unstash_three_phase_msg_after_catchup_in_view_change(
        txnPoolNodeSet, looper, tconf, sdk_pool_handle, sdk_wallet_steward):
    """
    1. Delay Commit on Node4
    2. Order 1 req
    3. Delay Commit on all nodes
    4. Order 1 req
    5. Delay CatchupRep on Node4
    6. Delay Ledger Status and ViewChangeDones on Nodes1-3
    7. Start View change on all nodes
    8. Wait until Node4 got 3 stashed CatchupReps
    9. Reset delaying of Commits on all Nodes
    10. Reset Ledger Status on Nodes1-3
    11. Check that 3 nodes finished VC while Node4 is syncing and not finished
    12. Reset CatchupRep on Node4
    13. Check that Node4 finished VC, and there was just 1 round of cacth-up (edited)
    """
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = txnPoolNodeSet[:-1]
    view_no = txnPoolNodeSet[0].viewNo
    old_stashed = slow_node.master_replica.stasher.num_stashed_future_view
    last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc

    with delay_rules(
        [n.nodeIbStasher for n in txnPoolNodeSet],
            msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE, COMMIT])):

        # Delay Commit messages for slow_node.
        slow_node.nodeIbStasher.delay(cDelay(sys.maxsize))
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 1)

        # Delay Commit messages for fast_nodes.
        for n in fast_nodes:
            n.nodeIbStasher.delay(cDelay(sys.maxsize))

        request2 = sdk_send_random_request(looper, sdk_pool_handle,
                                           sdk_wallet_steward)

        def check_commits(commit_key):
            for n in fast_nodes:
                for r in n.replicas.values():
                    assert commit_key in r.commits
                    assert len(r.commits[commit_key].voters) == 1

        looper.run(eventually(check_commits, (view_no, last_ordered[1] + 2)))

        # Delay CatchupRep messages for the slow_node.
        with delay_rules([slow_node.nodeIbStasher], cr_delay()):
            with delay_rules([n.nodeIbStasher for n in fast_nodes],
                             vcd_delay()):
                with delay_rules(
                    [n.nodeIbStasher for n in fast_nodes],
                        msg_rep_delay(types_to_delay=[LEDGER_STATUS])):

                    for n in txnPoolNodeSet:
                        n.view_changer.on_master_degradation()
                    looper.run(
                        eventually(
                            lambda: assertExp(slow_node.mode == Mode.syncing)))

                    # Reset delay Commit messages for all nodes.
                    for n in txnPoolNodeSet:
                        n.nodeIbStasher.reset_delays_and_process_delayeds(
                            COMMIT)

                    assert slow_node.view_change_in_progress
                    assert slow_node.mode == Mode.syncing
                    looper.run(
                        eventually(_check_nodes_stashed, fast_nodes,
                                   old_stashed,
                                   len(txnPoolNodeSet) - 1))
                    looper.run(
                        eventually(_check_nodes_stashed, [slow_node],
                                   old_stashed, (len(txnPoolNodeSet) - 1) * 2))

            waitForViewChange(looper,
                              fast_nodes,
                              expectedViewNo=view_no + 1,
                              customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
            ensureElectionsDone(looper=looper,
                                nodes=fast_nodes,
                                instances_list=range(
                                    fast_nodes[0].requiredNumberOfInstances),
                                customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
        sdk_get_and_check_replies(looper, [request2])
        waitForViewChange(looper, [slow_node],
                          expectedViewNo=view_no + 1,
                          customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
        _check_nodes_stashed(fast_nodes, old_stashed, 0)
        assert all(n.master_replica.last_ordered_3pc == (last_ordered[0],
                                                         last_ordered[1] + 2)
                   for n in txnPoolNodeSet)
        assert slow_node.catchup_rounds_without_txns == 1
def test_slow_node_reverts_unordered_state_during_catchup(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    """
    Delay COMMITs to a node such that when it needs to catchup, it needs to
    revert some unordered state. Also till this time the node should have
    receive all COMMITs such that it will apply some of the COMMITs (
    for which it has not received txns from catchup).
    For this delay COMMITs by long, do catchup for a little older than the
    state received in LedgerStatus, once catchup completes, reset delays and
    try to process delayed COMMITs, some COMMITs will be rejected but some will
    be processed since catchup was done for older ledger.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 25

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))
    # Delay LEDGER_STAUS on slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS) is sent, and the
    # node catch-ups 2 times.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from slow node, and return Consistency proof for all
    # missing txns, so no stashed ones are applied
    slow_node.nodeIbStasher.delay(lsDelay(1000))

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                             delay_batches * Max3PCBatchSize)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)

    # `slow_node` is slow to receive CatchupRep, so that it
    # gets a chance to order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    old_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc

    # start view change (and hence catchup)
    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        check_last_ordered_3pc(n1, n2)

    assert slow_master_replica.last_prepared_before_view_change == old_last_ordered

    old_pc_count = slow_master_replica._ordering_service.spylog.count(
        slow_master_replica._ordering_service._validate)

    assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0

    # Repair the network so COMMITs are received, processed and stashed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica._ordering_service.spylog.count(
            slow_master_replica._ordering_service._validate) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # (delay_batches * Max3PCBatchSize * commits_count_in_phase) COMMITs are stashed
        assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == \
               delay_batches * Max3PCBatchSize * (len(txnPoolNodeSet) - 1)

    looper.run(eventually(chk3, retryWait=1, timeout=15))

    # fix catchup, so the node gets a chance to be caught-up
    repair_node_catchup_less(other_nodes)

    def chk4():
        # Some COMMITs were received but stashed and
        # they will processed after catchup
        assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0

    looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 50))

    def chk5():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(chk5,
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # make sure that the pool is functional
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
Example #18
0
 def delay_catchup(catchup_state: NodeLeecherService.State):
     ledger_id = NodeLeecherService.state_to_ledger[catchup_state]
     _delayer = cr_delay(ledger_filter=ledger_id)
     _delayer.__name__ = "delay_catchup({})".format(ledger_id)
     return _delayer
Example #19
0
def test_slow_nodes_catchup_before_selecting_primary_in_new_view(
        looper, txnPoolNodeSet, steward1, stewardWallet, tconf, slow_node):
    """
    Delay 3PC to 1 node and then cause view change so by the time the view
    change happens(each node gets >n-f `INSTANCE_CHANGE`s), the slow node is
    behind other nodes. The should initiate catchup to come to the same state
    as other nodes.
    """

    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = tconf.PerfCheckFreq

    # Bad network introduced
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(2 * delay, 0))
    slow_node.nodeIbStasher.delay(cDelay(3 * delay, 0))
    for i in range(2):
        sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1,
                                            20)
        waitNodeDataInequality(looper, slow_node, *fast_nodes)

    catchup_reply_counts = {
        n.name: n.ledgerManager.spylog.count(n.ledgerManager.processCatchupRep)
        for n in txnPoolNodeSet
    }
    catchup_done_counts = {
        n.name: n.spylog.count(n.allLedgersCaughtUp)
        for n in txnPoolNodeSet
    }

    def slow_node_processed_some():
        assert slow_node.master_replica.batches

    # The slow node has received some PRE-PREPAREs
    looper.run(eventually(slow_node_processed_some, retryWait=1,
                          timeout=delay))

    # No reverts have been called by the slow node
    rv = getAllReturnVals(slow_node.replicas[0],
                          TestReplica.revert_unordered_batches)
    assert not rv or max(rv) == 0

    # Delay reception of catchup replies so ViewChangeDone can be received
    # before catchup completes
    delay_catchup_reply = 2
    slow_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply))

    ensure_view_change(looper, txnPoolNodeSet)
    # `slow_node` will not have elections done but others will.
    checkProtocolInstanceSetup(looper,
                               fast_nodes,
                               numInstances=len(slow_node.replicas),
                               retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # `slow_node` does catchup, `fast_nodes` don't
    for n in txnPoolNodeSet:
        assert n.spylog.count(
            n.allLedgersCaughtUp) > catchup_done_counts[n.name]
        if n == slow_node:
            assert n.ledgerManager.spylog.count(
                n.ledgerManager.processCatchupRep) > catchup_reply_counts[
                    n.name]
        else:
            assert n.ledgerManager.spylog.count(
                n.ledgerManager.processCatchupRep) == catchup_reply_counts[
                    n.name]

    # Greater than 0 batches were reverted by the slow node
    assert max(
        getAllReturnVals(
            slow_node.master_replica,
            slow_node.master_replica.revert_unordered_batches)) > 0

    # Bad network repaired
    slow_node.reset_delays_and_process_delayeds()

    # Make sure pool is functional
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 5)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3pc messages and checkpoints during some period but later it
    stashes some amount of checkpoints and starts catchup. When the node is
    performing the catchup, it receives more checkpoints enough to start a new
    catchup but it does not start it because the first catchup is in progress.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)
        target_ledger_size = other_nodes[0].domainLedger.size

        looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.syncing),
                              timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                                      waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))
    
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = repaired_node.master_replica.spylog.count(Replica.processCheckpoint)

        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(repaired_node.master_replica.spylog.count(Replica.processCheckpoint) -
                                         process_checkpoint_times_before ==
                                         (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                         (len(txnPoolNodeSet) - 1)),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))
    
        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.syncing

    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == target_ledger_size
Example #21
0
def test_reverted_unordered(txnPoolNodeSet, looper, sdk_pool_handle,
                            sdk_wallet_client):
    """
    Before starting catchup, revert any uncommitted changes to state and
    ledger. This is to avoid any re-application of requests that were
    ordered but stashed
    Example scenario
    prepared (1, 4)
    start_view_change
    start_catchup
    ...
    ....
    ...
    committed and send Ordered (1, 2)
    ...
    ....
    preLedgerCatchUp
    force_process_ordered, take out (1,2) and stash (1, 2)
    now process stashed Ordered(1,2), its requests will be applied again

    Simulation: Delay COMMITs to a node so that it can not order requests
    but has prepared them. Then trigger a view change and make sure the slow
    node has not ordered same number of requests as others but has prepared
    so it can order when it receives COMMITs while view change is in progress.
    The slow node should revert unordered batches and but it should eventually
    process the ordered requests, so delay LEDGER_STATUS too so catchup
    is delayed
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_node.nodeIbStasher.delay(cDelay(120, 0))
    sent_batches = 5
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * sent_batches, sent_batches)

    # Fast nodes have same last ordered and same data
    last_ordered = [n.master_last_ordered_3PC for n in fast_nodes]
    assert check_if_all_equal_in_list(last_ordered)
    ensure_all_nodes_have_same_data(looper, fast_nodes)

    # Slow nodes have different last ordered than fast nodes
    assert last_ordered[0] != slow_node.master_last_ordered_3PC

    # Delay LEDGER_STATUS so catchup starts late
    slow_node.nodeIbStasher.delay(lsDelay(100))
    slow_node.nodeIbStasher.delay(msg_rep_delay(100))
    slow_node.nodeIbStasher.delay(cr_delay(100))

    # slow_node has not reverted batches
    assert sent_batches not in getAllReturnVals(
        slow_node.master_replica,
        slow_node.master_replica.revert_unordered_batches)

    ensure_view_change(looper, txnPoolNodeSet)

    def chk1():
        # slow_node reverted all batches
        rv = getAllReturnVals(
            slow_node.master_replica,
            slow_node.master_replica.revert_unordered_batches)
        assert sent_batches in rv

    looper.run(eventually(chk1, retryWait=1))

    # After the view change slow_node has prepared same requests as the fast
    # nodes have ordered
    assert last_ordered[
        0] == slow_node.master_replica.last_prepared_before_view_change

    # Deliver COMMITs
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # slow_node stashed commits
        assert slow_node.master_replica.stasher.num_stashed_catchup == \
               sent_batches * (len(txnPoolNodeSet) - 1)

    looper.run(eventually(chk2, retryWait=1))

    # Deliver LEDGER_STATUS so catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(LEDGER_STATUS)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(MESSAGE_RESPONSE)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(CATCHUP_REP)

    # Ensure all nodes have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    def chk3():
        # slow_node processed stashed messages successfully
        assert slow_node.master_replica.stasher.num_stashed_catchup == 0

    looper.run(eventually(chk3, retryWait=1))

    # Ensure pool is functional
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         10, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
Example #22
0
def test_limited_stash_3pc_while_catchup(tdir, tconf, looper, testNodeClass,
                                         txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, allPluginsPath,
                                         chkFreqPatched):
    '''
    Test that the lagging_node can process messages from catchup stash after catchup
     and request lost messages from other nodes.
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Order 2 checkpoints on rest_nodes (2 txns in 2 batches)
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * CHK_FREQ, 2)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    with delay_rules(lagging_node.nodeIbStasher, cs_delay(),
                     msg_rep_delay(types_to_delay=[PREPARE, PREPREPARE])):
        with delay_rules(lagging_node.nodeIbStasher,
                         cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # Order 2 checkpoints in the first lagging node catchup (2 txns in 2 batches)
            sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client,
                                                 2 * CHK_FREQ, 2)

        # Check that firs txn was ordered from stash after first catchup
        looper.run(
            eventually(lambda: assertExp(lagging_node.master_last_ordered_3PC[
                1] == txnPoolNodeSet[0].master_last_ordered_3PC[1] - 1),
                       retryWait=1,
                       timeout=waits.expectedPoolCatchupTime(
                           len(txnPoolNodeSet))))

        # Order 2 checkpoints in the second lagging node catchup (2 txns in 2 batches)
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client, 2 * CHK_FREQ,
                                             2)

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    looper.run(
        eventually(lambda: assertExp(lagging_node.master_last_ordered_3PC == n.
                                     master_last_ordered_3PC
                                     for n in txnPoolNodeSet)))

    # check that catch-up was started only twice
    assert lagging_node.spylog.count(
        Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 2
def test_node_catchup_after_restart_with_txns(
        sdk_new_node_caught_up, txnPoolNodeSet, tdir, tconf,
        sdk_node_set_with_node_added_after_some_txns, allPluginsPath):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    logger.debug("Stopping node {} with pool ledger size {}".format(
        new_node, new_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node)
    looper.removeProdable(new_node)
    # for n in txnPoolNodeSet[:4]:
    #     for r in n.nodestack.remotes.values():
    #         if r.name == newNode.name:
    #             r.removeStaleCorrespondents()
    # looper.run(eventually(checkNodeDisconnectedFrom, newNode.name,
    #                       txnPoolNodeSet[:4], retryWait=1, timeout=5))
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    more_requests = 5
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, more_requests)
    logger.debug("Starting the stopped node, {}".format(new_node))
    nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha)
    config_helper = PNodeConfigHelper(new_node.name, tconf, chroot=tdir)
    newNode = TestNode(new_node.name,
                       config_helper=config_helper,
                       config=tconf,
                       ha=nodeHa,
                       cliha=nodeCHa,
                       pluginPaths=allPluginsPath)
    looper.add(newNode)
    txnPoolNodeSet[-1] = newNode

    # Make sure ledger is not synced initially
    check_ledger_state(newNode, DOMAIN_LEDGER_ID, LedgerState.not_synced)

    # Delay catchup reply processing so LedgerState does not change
    # TODO fix delay, sometimes it's not enough and lower 'check_ledger_state'
    # fails because newNode's domain ledger state is 'synced'
    delay_catchup_reply = 10
    newNode.nodeIbStasher.delay(cr_delay(delay_catchup_reply))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(
        eventually(check_ledger_state,
                   newNode,
                   DOMAIN_LEDGER_ID,
                   LedgerState.syncing,
                   retryWait=.5,
                   timeout=5))

    confused_node = txnPoolNodeSet[0]
    new_node_ledger = newNode.ledgerManager.ledgerRegistry[DOMAIN_LEDGER_ID]
    cp = new_node_ledger.catchUpTill
    start, end = cp.seqNoStart, cp.seqNoEnd
    cons_proof = confused_node.ledgerManager._buildConsistencyProof(
        DOMAIN_LEDGER_ID, start, end)

    bad_send_time = None

    def chk():
        nonlocal bad_send_time
        entries = newNode.ledgerManager.spylog.getAll(
            newNode.ledgerManager.canProcessConsistencyProof.__name__)
        for entry in entries:
            # `canProcessConsistencyProof` should return False after `syncing_time`
            if entry.result == False and entry.starttime > bad_send_time:
                return
        assert False

    def send_and_chk(ledger_state):
        nonlocal bad_send_time, cons_proof
        bad_send_time = perf_counter()
        confused_node.ledgerManager.sendTo(cons_proof, newNode.name)
        # Check that the ConsistencyProof messages rejected
        looper.run(eventually(chk, retryWait=.5, timeout=5))
        check_ledger_state(newNode, DOMAIN_LEDGER_ID, ledger_state)

    send_and_chk(LedgerState.syncing)

    # Not accurate timeout but a conservative one
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
              2 * delay_catchup_reply
    waitNodeDataEquality(looper,
                         newNode,
                         *txnPoolNodeSet[:-1],
                         customTimeout=timeout)
    assert new_node_ledger.num_txns_caught_up == more_requests
    send_and_chk(LedgerState.synced)
def test_slow_node_reverts_unordered_state_during_catchup(looper,
                                                          txnPoolNodeSet,
                                                          sdk_pool_handle,
                                                          sdk_wallet_client):
    """
    Delay COMMITs to a node such that when it needs to catchup, it needs to
    revert some unordered state. Also till this time the node should have
    receive all COMMITs such that it will apply some of the COMMITs (
    for which it has not received txns from catchup).
    For this delay COMMITs by long, do catchup for a little older than the
    state received in LedgerStatus, once catchup completes, reset delays and
    try to process delayed COMMITs, some COMMITs will be rejected but some will
    be processed since catchup was done for older ledger.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 25

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))
    # Delay LEDGER_STAUS on slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS) is sent, and the
    # node catch-ups 2 times.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from slow node, and return Consistency proof for all
    # missing txns, so no stashed ones are applied
    slow_node.nodeIbStasher.delay(lsDelay(1000))

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                             delay_batches * Max3PCBatchSize)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)

    # `slow_node` is slow to receive CatchupRep, so that it
    # gets a chance to order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    # start view change (and hence catchup)
    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        lst_3pc = check_last_ordered_3pc(n1, n2)

    def chk1():
        # `slow_node` has prepared all 3PC messages which
        # `other_nodes` have ordered
        assertEquality(slow_master_replica.last_prepared_before_view_change, lst_3pc)

    looper.run(eventually(chk1, retryWait=1))

    old_pc_count = slow_master_replica.spylog.count(
        slow_master_replica.can_process_since_view_change_in_progress)

    assert len(slow_node.stashedOrderedReqs) == 0

    # Repair the network so COMMITs are received, processed and stashed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica.spylog.count(
            slow_master_replica.can_process_since_view_change_in_progress) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # COMMITs are stashed
        assert len(slow_node.stashedOrderedReqs) == delay_batches * Max3PCBatchSize

    looper.run(eventually(chk3, retryWait=1, timeout=15))

    # fix catchup, so the node gets a chance to be caught-up
    repair_node_catchup_less(other_nodes)

    def chk4():
        # Some COMMITs were ordered but stashed and they were processed
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert delay_batches in rv

    looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 5))

    def chk5():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(
            chk5,
            retryWait=1,
            timeout=waits.expectedPoolCatchupTime(
                len(txnPoolNodeSet))))

    # make sure that the pool is functional
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)