def test_handle_delayed_preprepares(looper, txnPoolNodeSet, wallet1,
                                    client1Connected, teardown):
    """
    Make a node send PREPREPARE again after the slow node has ordered
    """
    slow_node, other_nodes, primary_node, other_non_primary_nodes = \
        split_nodes(txnPoolNodeSet)
    # This node will send PRE-PREPARE again
    confused_node = other_non_primary_nodes[0]
    orig_method = confused_node.handlers[PREPREPARE].serve

    last_pp = None

    def patched_method(self, msg):
        nonlocal last_pp
        last_pp = orig_method(msg)
        return last_pp

    confused_node.handlers[PREPREPARE].serve = types.MethodType(
        patched_method, confused_node.handlers[PREPREPARE])

    # Delay PRE-PREPAREs by large amount simulating loss
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1Connected,
                                           10, 5)
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    slow_master_replica = slow_node.master_replica
    count_pr_req = get_count(slow_master_replica,
                             slow_master_replica.process_requested_pre_prepare)

    count_pr_tpc = get_count(slow_master_replica,
                             slow_master_replica.processThreePhaseMsg)

    confused_node.sendToNodes(MessageRep(
        **{
            f.MSG_TYPE.nm: PREPREPARE,
            f.PARAMS.nm: {
                f.INST_ID.nm: last_pp.instId,
                f.VIEW_NO.nm: last_pp.viewNo,
                f.PP_SEQ_NO.nm: last_pp.ppSeqNo
            },
            f.MSG.nm: last_pp
        }),
                              names=[
                                  slow_node.name,
                              ])

    def chk():
        # `process_requested_pre_prepare` is called but
        # `processThreePhaseMsg` is not called
        assert get_count(
            slow_master_replica,
            slow_master_replica.process_requested_pre_prepare) > count_pr_req
        assert get_count(
            slow_master_replica,
            slow_master_replica.processThreePhaseMsg) == count_pr_tpc

    looper.run(eventually(chk, retryWait=1))
Ejemplo n.º 2
0
def test_node_catchup_after_checkpoints(looper, txnPoolNodeSet, chk_freq_patched,
                                        wallet1, client1, client1Connected,
                                        slow_node_and_others):
    """
    For some reason a node misses 3pc messages but eventually the node stashes
    some amount checkpoints and decides to catchup.
    """
    slow_node, other_nodes = slow_node_and_others
    completed_catchups_before = get_number_of_completed_catchups(slow_node)

    logger.info("Step 1: Send less than required for start the catchup procedure"
                "and check the slow node falls behind")
    batches_num = 2 * chk_freq_patched - 1
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           num_reqs=batches_num,
                                           num_batches=batches_num,
                                           )
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    logger.info("Step 2: Send remaining requests in order to trigger the catchup"
                "procedure for the slow node, then check data equality")
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           num_reqs=1
                                           )
    waitNodeDataEquality(looper, slow_node, *other_nodes)
    # check if there was a catchup
    completed_catchups_after = get_number_of_completed_catchups(slow_node)
    assert completed_catchups_after == completed_catchups_before + 1
def test_state_regenerated_from_ledger(looper, txnPoolNodeSet, client1,
                                       wallet1, client1Connected, tconf,
                                       tdirWithPoolTxns, allPluginsPath):
    """
    Node loses its state database but recreates it from ledger after start
    """
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           5 * sent_batches, sent_batches)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    node_to_stop = txnPoolNodeSet[-1]
    node_state = node_to_stop.states[DOMAIN_LEDGER_ID]
    assert not node_state.isEmpty
    state_db_path = node_state._kv.db_path
    nodeHa, nodeCHa = HA(*node_to_stop.nodestack.ha), HA(
        *node_to_stop.clientstack.ha)

    node_to_stop.stop()
    looper.removeProdable(node_to_stop)

    shutil.rmtree(state_db_path)

    restarted_node = TestNode(node_to_stop.name,
                              basedirpath=tdirWithPoolTxns,
                              config=tconf,
                              ha=nodeHa,
                              cliha=nodeCHa,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)
    txnPoolNodeSet[-1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, restarted_node, *txnPoolNodeSet[:-1])
Ejemplo n.º 4
0
def test_no_preprepare_requested(looper, txnPoolNodeSet, client1,
                                 wallet1, client1Connected, teardown):
    """
    Node missing Propagates hence request not finalised, hence stashes
    PRE-PREPARE but does not request PRE-PREPARE on receiving PREPARE
    """
    slow_node, other_nodes, _, _ = split_nodes(txnPoolNodeSet)
    slow_node.nodeIbStasher.delay(ppgDelay(20))
    slow_node.nodeIbStasher.delay(msg_rep_delay(20, [PROPAGATE, ]))

    old_count_resp = count_requested_preprepare_resp(slow_node)
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 4, 2)

    # The slow node is behind
    checkNodeDataForInequality(slow_node, *other_nodes)

    # PRE-PREPARE were not requested
    assert count_requested_preprepare_resp(slow_node) == old_count_resp

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds()

    # The slow node has processed all requests
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # PRE-PREPARE were not requested
    assert count_requested_preprepare_resp(slow_node) == old_count_resp
Ejemplo n.º 5
0
def test_view_change_after_max_catchup_rounds(txnPoolNodeSet, looper, wallet1,
                                              client1, client1Connected):
    """
    The node should do only a fixed rounds of catchup. For this delay Prepares
    and Commits for 2 non-primary nodes by a large amount which is equivalent
    to loss of Prepares and Commits. Make sure 2 nodes have a different last
    prepared certificate from other two. Then do a view change, make sure view
    change completes and the pool does not process the request that were
    prepared by only a subset of the nodes
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ledger_summary = txnPoolNodeSet[0].elector.ledger_summary

    slow_nodes = [
        r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)[-2:]
    ]
    fast_nodes = [n for n in txnPoolNodeSet if n not in slow_nodes]

    # Make node slow to process Prepares and Commits
    for node in slow_nodes:
        node.nodeIbStasher.delay(pDelay(120, 0))
        node.nodeIbStasher.delay(cDelay(120, 0))

    sendRandomRequests(wallet1, client1, 5)
    looper.runFor(3)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    def last_prepared(nodes):
        lst = [
            n.master_replica.last_prepared_certificate_in_view() for n in nodes
        ]
        # All nodes have same last prepared
        assert check_if_all_equal_in_list(lst)
        return lst[0]

    last_prepared_slow = last_prepared(slow_nodes)
    last_prepared_fast = last_prepared(fast_nodes)

    # Check `slow_nodes` and `fast_nodes` set different last_prepared
    assert last_prepared_fast != last_prepared_slow

    # View change complete
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # The requests which were prepared by only a subset of the nodes were
    # not ordered
    assert txnPoolNodeSet[0].elector.ledger_summary == ledger_summary

    for node in slow_nodes:
        node.nodeIbStasher.reset_delays_and_process_delayeds()

    # Make sure pool is functional
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    last_prepared(txnPoolNodeSet)
Ejemplo n.º 6
0
def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client):
    """
    A node is slow to receive COMMIT messages so after a view change it
    starts catchup. But before it can start requesting txns, the COMMITs messages
    are received and are ordered. The node should not request any transactions.
    :return:
    """
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    delay_cm = 30
    delat_cp = 100
    slow_node.nodeIbStasher.delay(cDelay(delay_cm))
    # The slow node receives consistency proofs after some delay, this delay
    # gives the opportunity to deliver all 3PC messages
    slow_node.nodeIbStasher.delay(cpDelay(delat_cp))

    # Count of `getCatchupReqs` which is called to construct the `CatchupReq`
    # to be sent
    def domain_cr_count():
        return sum(1 for entry in slow_node.ledgerManager.spylog.getAll(
            slow_node.ledgerManager.getCatchupReqs)
                   if entry.params['consProof'].ledgerId == DOMAIN_LEDGER_ID)

    old_count = domain_cr_count()
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           2 * sent_batches, sent_batches)
    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the `slow_node` is behind
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Unstash only COMMIT messages
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(Commit.typename)

    looper.runFor(2)

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        ConsistencyProof.typename)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # No `CatchupReq`s constructed, hence no `CatchupReq`s could have
    # been sent
    assert domain_cr_count() == old_count
    # Some stashed ordered requests have been processed
    rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
    assert sent_batches in rv

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
def ensure_pool_functional(looper,
                           nodes,
                           wallet,
                           client,
                           num_reqs=10,
                           num_batches=2):
    send_reqs_batches_and_get_suff_replies(looper, wallet, client, num_reqs,
                                           num_batches)
    ensure_all_nodes_have_same_data(looper, nodes)
Ejemplo n.º 8
0
def test_node_request_preprepare(looper, txnPoolNodeSet, client1,
                                 wallet1, client1Connected, teardown):
    """
    Node requests PRE-PREPARE only once after getting PREPAREs.
    """
    slow_node, other_nodes, primary_node, \
        other_primary_nodes = split_nodes(txnPoolNodeSet)
    # Drop PrePrepares and Prepares
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    slow_node.nodeIbStasher.delay(pDelay(300, 0))

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 10, 5)
    slow_node.nodeIbStasher.drop_delayeds()
    slow_node.nodeIbStasher.resetDelays()

    old_count_req = count_requested_preprepare_req(slow_node)
    old_count_resp = count_requested_preprepare_resp(slow_node)

    def chk(increase=True):
        # Method is called
        assert count_requested_preprepare_req(slow_node) > old_count_req
        # Requesting Preprepare
        assert count_requested_preprepare_resp(
            slow_node) - old_count_resp == (1 if increase else 0)

    for pp in primary_node.master_replica.sentPrePrepares.values():
        for rep in [n.master_replica for n in other_primary_nodes]:
            prepare = Prepare(rep.instId,
                              pp.viewNo,
                              pp.ppSeqNo,
                              pp.ppTime,
                              pp.digest,
                              pp.stateRootHash,
                              pp.txnRootHash
                              )
            rep.send(prepare)

        looper.run(eventually(chk, True, retryWait=1))

        old_count_resp = count_requested_preprepare_resp(slow_node)

        prepare = Prepare(rep.instId,
                          pp.viewNo,
                          pp.ppSeqNo,
                          pp.ppTime,
                          pp.digest,
                          pp.stateRootHash,
                          pp.txnRootHash
                          )
        rep.send(prepare)

        looper.run(eventually(chk, False, retryWait=1))

        old_count_req = count_requested_preprepare_req(slow_node)

        old_count_resp = count_requested_preprepare_resp(slow_node)
def test_discard_3PC_messages_for_already_ordered(looper, txnPoolNodeSet,
                                                  client1, wallet1,
                                                  client1Connected):
    """
    Nodes discard any 3PC messages for already ordered 3PC keys
    (view_no, pp_seq_no). Delay all 3PC messages to a node so it cannot respond
    to them unless the other nodes order them, now when the slow node will get
    them it will respond but other nodes will not process them and discard them
    """
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 20
    delay_3pc_messages([slow_node], 0, delay)
    delay_3pc_messages([slow_node], 1, delay)

    sent_batches = 3
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           2 * sent_batches, sent_batches)

    def chk(node, inst_id, p_count, c_count):
        # A node will still record PREPRAREs even if more than n-f-1, till the
        # request is not ordered
        assert len(node.replicas[inst_id].prepares) >= p_count
        assert len(node.replicas[inst_id].commits) == c_count

    def count_discarded(inst_id, count):
        for node in other_nodes:
            assert countDiscarded(node.replicas[inst_id],
                                  'already ordered 3 phase message') == count

    # `slow_node` did not receive any PREPAREs or COMMITs
    chk(slow_node, 0, 0, 0)

    # `other_nodes` have not discarded any 3PC message
    count_discarded(0, 0)

    # `other_nodes` have not recorded any PREPAREs or COMMITs from `slow_node`
    chk_commits_prepares_recvd(0, other_nodes, slow_node)

    slow_node.reset_delays_and_process_delayeds()
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # `slow_node` did receive correct number of PREPAREs and COMMITs
    looper.run(
        eventually(chk,
                   slow_node,
                   0,
                   sent_batches - 1,
                   sent_batches,
                   retryWait=1))

    # `other_nodes` have not recorded any PREPAREs or COMMITs from `slow_node`
    chk_commits_prepares_recvd(0, other_nodes, slow_node)

    # `other_nodes` have discarded PREPAREs and COMMITs all batches
    count_discarded(0, 2 * sent_batches)
Ejemplo n.º 10
0
def test_lag_size_for_catchup(looper, chkFreqPatched, reqs_for_checkpoint,
                              txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client):
    """
    Verifies that if the stored own checkpoints have aligned bounds then
    the master replica lag which makes the node perform catch-up is
    Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed received
    checkpoints.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    # The master replica of the slow node stops to receive 3PC-messages
    slow_node.master_replica.threePhaseRouter.extend((
        (PrePrepare, lambda *x, **y: None),
        (Prepare, lambda *x, **y: None),
        (Commit, lambda *x, **y: None),
    ))

    completed_catchups_before_reqs = get_number_of_completed_catchups(
        slow_node)

    # Send requests for the slow node's master replica to get
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP quorumed stashed checkpoints
    # from others
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint)

    # Give time for the slow node to catch up if it is going to do it
    looper.runFor(
        waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
        waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    checkNodeDataForInequality(slow_node, *other_nodes)

    # Verify that the slow node has not caught up
    assert get_number_of_completed_catchups(
        slow_node) == completed_catchups_before_reqs

    # Send more requests for the slow node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper,
                         slow_node,
                         *other_nodes,
                         exclude_from_check=['check_last_ordered_3pc_backup'])

    # Verify that the slow node has caught up
    assert get_number_of_completed_catchups(
        slow_node) > completed_catchups_before_reqs
Ejemplo n.º 11
0
def test_view_change_done_delayed(txnPoolNodeSet, looper, wallet1, client1,
                                  client1Connected):
    """
    A node is slow so is behind other nodes, after view change, it catches up
    but it also gets view change message as delayed, a node should start
    participating only when caught up and ViewChangeCone quorum received.
    """
    nprs = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)]
    slow_node = nprs[-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay_3pc = 10
    delay_vcd = 25
    delay_3pc_messages([slow_node], 0, delay_3pc)
    slow_node.nodeIbStasher.delay(vcd_delay(delay_vcd))

    def chk(node):
        assert node.elector.has_acceptable_view_change_quorum
        assert node.elector._primary_verified
        assert node.isParticipating
        assert None not in {r.isPrimary for r in node.replicas}

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 5 * 4, 4)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the slow node successfully completes catchup
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Other nodes complete view change, select primary and participate
    for node in other_nodes:
        looper.run(eventually(chk, node, retryWait=1))

    # Since `ViewChangeCone` is delayed, slow_node is not able to select primary
    # and participate
    assert not slow_node.elector.has_acceptable_view_change_quorum
    assert not slow_node.elector._primary_verified
    assert not slow_node.isParticipating
    assert {r.isPrimary for r in slow_node.replicas} == {None}

    # Send requests to make sure pool is functional
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    # Repair network
    slow_node.reset_delays_and_process_delayeds()

    # `slow_node` selects primary and participate
    looper.run(eventually(chk, slow_node, retryWait=1))

    # Processes requests received during lack of primary
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Send more requests and compare data of all nodes
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_node_requests_missing_preprepare_malicious(looper, txnPoolNodeSet,
                                                    client1, wallet1,
                                                    client1Connected,
                                                    malicious_setup, teardown):
    """
    A node has bad network with primary and thus loses PRE-PREPARE,
    it requests PRE-PREPARE from non-primaries once it has sufficient PREPAREs
    but one of the non-primary does not send the PRE-PREPARE
    """
    # primary_node = get_master_primary_node(txnPoolNodeSet)
    # slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    # other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    # bad_node = [n for n in other_nodes if n != primary_node][0]
    # good_non_primary_node = [n for n in other_nodes if n != slow_node
    #                          and n != bad_node and n != primary_node][0]
    primary_node, bad_node, good_non_primary_node, slow_node, other_nodes, \
    bad_method, orig_method = malicious_setup

    slow_node.nodeIbStasher.delay(ppDelay(300, 0))

    def get_reply_count_frm(node):
        return sum([
            1
            for entry in slow_node.spylog.getAll(slow_node.process_message_rep)
            if entry.params['msg'].msg_type == PREPREPARE
            and entry.params['frm'] == node.name
        ])

    old_reply_count_from_bad_node = get_reply_count_frm(bad_node)
    old_reply_count_from_good_node = get_reply_count_frm(good_non_primary_node)
    old_discarded = countDiscarded(slow_node.master_replica, 'does not have '
                                   'expected state')

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 10, 2)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    assert check_if_all_equal_in_list(
        [n.master_replica.ordered for n in txnPoolNodeSet])

    assert not slow_node.master_replica.requested_pre_prepares

    if bad_method.__name__ == 'do_not_send':
        assert get_reply_count_frm(bad_node) == old_reply_count_from_bad_node
    else:
        assert countDiscarded(slow_node.master_replica,
                              'does not have expected state') > old_discarded

    assert get_reply_count_frm(good_non_primary_node) > \
           old_reply_count_from_good_node

    slow_node.reset_delays_and_process_delayeds()
    bad_node.nodeMsgRouter.routes[MessageReq] = orig_method
def test_lag_size_for_catchup(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_client):
    """
    Verifies that if the stored own checkpoints have aligned bounds then
    the master replica lag which makes the node perform catch-up is
    Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed received
    checkpoints.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    # The master replica of the slow node stops to receive 3PC-messages
    slow_node.master_replica.threePhaseRouter.extend(
        (
            (PrePrepare, lambda *x, **y: None),
            (Prepare, lambda *x, **y: None),
            (Commit, lambda *x, **y: None),
        )
    )

    completed_catchups_before_reqs = get_number_of_completed_catchups(slow_node)

    # Send requests for the slow node's master replica to get
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP quorumed stashed checkpoints
    # from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP *
                                           reqs_for_checkpoint)

    # Give time for the slow node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    checkNodeDataForInequality(slow_node, *other_nodes)

    # Verify that the slow node has not caught up
    assert get_number_of_completed_catchups(slow_node) == completed_catchups_before_reqs

    # Send more requests for the slow node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Verify that the slow node has caught up
    assert get_number_of_completed_catchups(slow_node) > completed_catchups_before_reqs
Ejemplo n.º 14
0
def test_nodes_removes_request_keys_for_ordered(setup, looper, txnPoolNodeSet,
                                                client1, wallet1):
    """
    A node does not order requests since it is missing some 3PC messages,
    gets them from catchup. It then clears them from its request queues
    """
    slow_node, fast_nodes = setup

    reqs = send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 10,
                                                  5)
    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc != \
        fast_nodes[0].master_replica.last_ordered_3pc

    def chk(key, nodes, present):
        for node in nodes:
            assert (key in node.master_replica.requestQueues[DOMAIN_LEDGER_ID]
                    ) == present

    for req in reqs:
        chk(req.key, fast_nodes, False)
        chk(req.key, [slow_node], True)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    for req in reqs:
        chk(req.key, txnPoolNodeSet, False)
def test_nodes_removes_request_keys_for_ordered(setup, looper, txnPoolNodeSet,
                                                client1, wallet1):
    """
    A node does not order requests since it is missing some 3PC messages,
    gets them from catchup. It then clears them from its request queues
    """
    slow_node, fast_nodes = setup

    reqs = send_reqs_batches_and_get_suff_replies(
        looper, wallet1, client1, 10, 5)
    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc != \
        fast_nodes[0].master_replica.last_ordered_3pc

    def chk(key, nodes, present):
        for node in nodes:
            assert (
                key in node.master_replica.requestQueues[DOMAIN_LEDGER_ID]) == present

    for req in reqs:
        chk(req.key, fast_nodes, False)
        chk(req.key, [slow_node], True)

    # Reset catchup reply delay so that  catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(CatchupRep.typename)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    for req in reqs:
        chk(req.key, txnPoolNodeSet, False)

    # Needed for the next run due to the parametrised fixture
    slow_node.reset_delays_and_process_delayeds()
def test_node_requests_missing_preprepare(looper, txnPoolNodeSet, client1,
                                          wallet1, client1Connected, teardown):
    """
    A node has bad network with primary and thus loses PRE-PREPARE,
    it requests PRE-PREPARE from non-primaries once it has sufficient PREPAREs
    """
    slow_node, other_nodes, _, _ = split_nodes(txnPoolNodeSet)

    # Delay PRE-PREPAREs by large amount simulating loss
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    old_count_pp = get_count(slow_node.master_replica,
                             slow_node.master_replica.processPrePrepare)
    old_count_mrq = {
        n.name: get_count(n, n.process_message_req)
        for n in other_nodes
    }
    old_count_mrp = get_count(slow_node, slow_node.process_message_rep)

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 15, 5)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    assert not slow_node.master_replica.requested_pre_prepares

    # `slow_node` processed PRE-PREPARE
    assert get_count(slow_node.master_replica,
                     slow_node.master_replica.processPrePrepare) > old_count_pp

    # `slow_node` did receive `MessageRep`
    assert get_count(slow_node, slow_node.process_message_rep) > old_count_mrp

    # More than `f` nodes received `MessageReq`
    recv_reqs = set()
    for n in other_nodes:
        if get_count(n, n.process_message_req) > old_count_mrq[n.name]:
            recv_reqs.add(n.name)

    assert len(recv_reqs) > slow_node.f

    # All nodes including the `slow_node` ordered the same requests
    assert check_if_all_equal_in_list(
        [n.master_replica.ordered for n in txnPoolNodeSet])
def test_state_regenerated_from_ledger(
        looper,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        tdir,
        tconf,
        allPluginsPath):
    """
    Node loses its state database but recreates it from ledger after start
    """
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           5 * sent_batches,
                                           sent_batches)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    node_to_stop = txnPoolNodeSet[-1]
    node_state = node_to_stop.states[DOMAIN_LEDGER_ID]
    assert not node_state.isEmpty
    state_db_path = node_state._kv.db_path
    nodeHa, nodeCHa = HA(*
                         node_to_stop.nodestack.ha), HA(*
                                                        node_to_stop.clientstack.ha)

    node_to_stop.stop()
    looper.removeProdable(node_to_stop)

    shutil.rmtree(state_db_path)

    config_helper = PNodeConfigHelper(node_to_stop.name, tconf, chroot=tdir)
    restarted_node = TestNode(node_to_stop.name,
                              config_helper=config_helper,
                              config=tconf, ha=nodeHa, cliha=nodeCHa,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)
    txnPoolNodeSet[-1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, restarted_node, *txnPoolNodeSet[:-1])
Ejemplo n.º 18
0
def test_checkpoint_across_views(sent_batches, chkFreqPatched, looper,
                                 txnPoolNodeSet, client1, wallet1,
                                 client1Connected):
    """
    Test checkpointing across views.
    This test checks that checkpointing and garbage collection works correctly
    no matter if view change happened before a checkpoint or after a checkpoint
    """
    batch_size = 2
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           batch_size*sent_batches,
                                           sent_batches)

    # Check that correct garbage collection happens
    non_gced_batch_count = (sent_batches - CHK_FREQ) if sent_batches >= CHK_FREQ else sent_batches
    looper.run(eventually(checkRequestCounts, txnPoolNodeSet,
                          batch_size*non_gced_batch_count, non_gced_batch_count,
                          non_gced_batch_count,
                          retryWait=1))

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # Check that after view change, proper clean up is done
    for node in txnPoolNodeSet:
        for r in node.replicas:
            assert not r.checkpoints
            # No stashed checkpoint for previous view
            assert not [view_no for view_no in r.stashedRecvdCheckpoints if view_no < r.viewNo]
            assert r._h == 0
            assert r._lastPrePrepareSeqNo == 0
            assert r.h == 0
            assert r.H == r._h + chkFreqPatched.LOG_SIZE

    checkRequestCounts(txnPoolNodeSet, 0, 0, 0)

    # Even after view change, chekpointing works
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                                  batch_size*sent_batches,
                                                  sent_batches)

    looper.run(eventually(checkRequestCounts, txnPoolNodeSet,
                          batch_size * non_gced_batch_count,
                          non_gced_batch_count,
                          non_gced_batch_count,
                          retryWait=1))

    # Send more batches so one more checkpoint happens. This is done so that
    # when this test finishes, all requests are garbage collected and the
    # next run of this test (with next param) has the calculations correct
    more = CHK_FREQ - non_gced_batch_count
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           batch_size * more,
                                           more)
    looper.run(eventually(checkRequestCounts, txnPoolNodeSet, 0, 0, 0,
                          retryWait=1))
def test_node_catchup_after_checkpoints(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3pc messages and checkpoints during some period but later it
    stashes some amount of checkpoints and decides to catchup.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of requests")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node gets requests but cannot process them because of "
        "missed ones. But the nodes eventually stashes some amount checkpoints "
        "after that the node starts catch up")

    repaired_node = repair_broken_node(broken_node)
    completed_catchups_before = get_number_of_completed_catchups(broken_node)

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                           reqs_for_checkpoint - max_batch_size)

    waitNodeDataEquality(looper, repaired_node, *other_nodes)
    # Note that the repaired node might not fill the gap of missed 3PC-messages
    # by requesting them from other nodes because these 3PC-batches start from
    # an already stabilized checkpoint, so a part of these 3PC-messages are
    # already unavailable

    # Verify that a catch-up was done
    completed_catchups_after = get_number_of_completed_catchups(repaired_node)
    assert completed_catchups_after > completed_catchups_before

    logger.info("Step 3: Check if the node is able to process requests")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataEquality(looper, repaired_node, *other_nodes)
Ejemplo n.º 20
0
def test_commits_recvd_first(looper, txnPoolNodeSet, client1, wallet1,
                             client1Connected):
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 50
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(delay, 0))

    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 20, 4)

    assert not slow_node.master_replica.prePrepares
    assert not slow_node.master_replica.prepares
    assert not slow_node.master_replica.commits
    assert len(slow_node.master_replica.commitsWaitingForPrepare) > 0

    slow_node.reset_delays_and_process_delayeds()
    waitNodeDataEquality(looper, slow_node, *other_nodes)
    assert check_if_all_equal_in_list(
        [n.master_replica.ordered for n in txnPoolNodeSet])

    assert slow_node.master_replica.prePrepares
    assert slow_node.master_replica.prepares
    assert slow_node.master_replica.commits
    assert not slow_node.master_replica.commitsWaitingForPrepare
def test_nodes_removes_request_keys_for_ordered(setup, looper, txnPoolNodeSet,
                                                sdk_pool_handle,
                                                sdk_wallet_client):
    """
    A node does not order requests since it is missing some 3PC messages,
    gets them from catchup. It then clears them from its request queues
    """
    slow_node, fast_nodes = setup

    reqs = sdk_json_couples_to_request_list(
        send_reqs_batches_and_get_suff_replies(
            looper, txnPoolNodeSet,
            sdk_pool_handle,
            sdk_wallet_client,
            10,
            5))
    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc != \
           fast_nodes[0].master_replica.last_ordered_3pc

    def chk(key, nodes, present):
        for node in nodes:
            assert (
                           key in node.master_replica.requestQueues[DOMAIN_LEDGER_ID]) == present

    for req in reqs:
        chk(req.digest, fast_nodes, False)
        chk(req.digest, [slow_node], True)

    # Reset catchup reply delay so that  catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(CatchupRep.typename)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    for req in reqs:
        chk(req.digest, txnPoolNodeSet, False)

    # Needed for the next run due to the parametrised fixture
    slow_node.reset_delays_and_process_delayeds()
def test_nodes_removes_request_keys_for_ordered(setup, looper, txnPoolNodeSet,
                                                sdk_pool_handle,
                                                sdk_wallet_client):
    """
    A node does not order requests since it is missing some 3PC messages,
    gets them from catchup. It then clears them from its request queues
    """
    slow_node, fast_nodes = setup

    reqs = sdk_json_couples_to_request_list(
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client, 10, 5))
    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc != \
           fast_nodes[0].master_replica.last_ordered_3pc

    def chk(key, nodes, present):
        for node in nodes:
            assert (key in node.master_replica._ordering_service.
                    requestQueues[DOMAIN_LEDGER_ID]) == present

    for req in reqs:
        chk(req.digest, fast_nodes, False)
        chk(req.digest, [slow_node], True)

    # Reset catchup reply delay so that  catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        CatchupRep.typename)

    slow_node.start_catchup()

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    for req in reqs:
        chk(req.digest, txnPoolNodeSet, False)

    # Needed for the next run due to the parametrised fixture
    slow_node.reset_delays_and_process_delayeds()
def test_node_catchup_after_checkpoints(looper, chk_freq_patched,
                                        txnPoolNodeSet, wallet1, client1,
                                        client1Connected,
                                        broken_node_and_others):
    """
    For some reason a node misses 3pc messages but eventually the node stashes
    some amount checkpoints and decides to catchup.
    """
    broken_node, other_nodes = broken_node_and_others
    completed_catchups_before = get_number_of_completed_catchups(broken_node)

    logger.info("Step 1: The node misses quite a lot requests")
    send_reqs_batches_and_get_suff_replies(
        looper,
        wallet1,
        client1,
        num_reqs=chk_freq_patched + 1,
        num_batches=chk_freq_patched + 1,
    )
    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node gets requests but cannot process them because of "
        "missed ones. But the nodes eventually stashes some amount checkpoints "
        "after that the node starts catch up")
    repaired_node = repair_broken_node(broken_node)
    send_reqs_batches_and_get_suff_replies(looper,
                                           wallet1,
                                           client1,
                                           num_reqs=2 * chk_freq_patched,
                                           num_batches=2 * chk_freq_patched)
    waitNodeDataEquality(looper, repaired_node, *other_nodes)

    # check if there was at least 1 catchup
    completed_catchups_after = get_number_of_completed_catchups(repaired_node)
    assert completed_catchups_after >= completed_catchups_before + 1

    logger.info("Step 3: Check if the node is able to process requests")
    send_reqs_batches_and_get_suff_replies(looper,
                                           wallet1,
                                           client1,
                                           num_reqs=chk_freq_patched + 2,
                                           num_batches=chk_freq_patched + 2)
    waitNodeDataEquality(looper, repaired_node, *other_nodes)
def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet,
                                    sdk_pool_handle,
                                    sdk_wallet_client):
    """
    A node is slow to receive COMMIT messages so after a view change it
    starts catchup. But before it can start requesting txns, the COMMITs messages
    are received and are ordered. The node should not request any transactions.
    :return:
    """
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           2 * 3,
                                           3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    delay_cm = 30
    delat_cp = 100
    slow_node.nodeIbStasher.delay(cDelay(delay_cm))
    # The slow node receives consistency proofs after some delay, this delay
    # gives the opportunity to deliver all 3PC messages
    slow_node.nodeIbStasher.delay(cpDelay(delat_cp))

    # Count of `getCatchupReqs` which is called to construct the `CatchupReq`
    # to be sent
    def domain_cr_count(): return sum(1 for entry in
                                      slow_node.ledgerManager.spylog.getAll(
                                          slow_node.ledgerManager.getCatchupReqs) if
                                      entry.params['consProof'].ledgerId == DOMAIN_LEDGER_ID)

    old_count = domain_cr_count()
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           2 * sent_batches,
                                           sent_batches)
    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the `slow_node` is behind
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Unstash only COMMIT messages
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(Commit.typename)

    looper.runFor(2)

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        ConsistencyProof.typename)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # No `CatchupReq`s constructed, hence no `CatchupReq`s could have
    # been sent
    assert domain_cr_count() == old_count
    # Some stashed ordered requests have been processed
    rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
    assert sent_batches in rv

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client,
                               sdk_pool_handle)
def test_complete_short_checkpoint_not_included_in_lag_for_catchup(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    """
    Verifies that if the first stored own checkpoint has a not aligned lower
    bound (this means that it was started after a catch-up), is complete
    and there is a quorumed stashed checkpoint from other replicas with
    the same end then this stashed checkpoint is not included into the lag
    for a catch-up, i.e. in such a case the lag which makes the node perform
    catch-up is Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed
    received checkpoints.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize

    _, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        'EpsilonSteward', 'Epsilon', tdir, tconf,
        allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
    # Epsilon did not participate in ordering of the batch with EpsilonSteward
    # NYM transaction and the batch with Epsilon NODE transaction.
    # Epsilon got these transactions via catch-up.

    # To complete the first checkpoint send requests for 1 checkpoint minus
    # 2 3PC-batches (since there are already 2 3PC-batches in the first
    # checkpoint : with EpsilonSteward NYM transaction and with Epsilon NODE
    # transaction). This checkpoint has a not aligned lower bound
    # on the new node replicas so it will not be stabilized on them.
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint - 2 * max_batch_size)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

    # The master replica of the new node stops to receive 3PC-messages
    new_node.master_replica.threePhaseRouter.extend(
        (
            (PrePrepare, lambda *x, **y: None),
            (Prepare, lambda *x, **y: None),
            (Commit, lambda *x, **y: None),
        )
    )

    completed_catchups_before_reqs = get_number_of_completed_catchups(new_node)

    # Send requests for the new node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP *
                                           reqs_for_checkpoint)

    # Give time for the new node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1])

    # Verify that the new node has not caught up
    assert get_number_of_completed_catchups(new_node) == completed_catchups_before_reqs

    # Send more requests for the new node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

    # Verify that the new node has caught up
    assert get_number_of_completed_catchups(new_node) > completed_catchups_before_reqs
def test_complete_short_checkpoint_not_included_in_lag_for_catchup(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    """
    Verifies that if the first stored own checkpoint has a not aligned lower
    bound (this means that it was started after a catch-up), is complete
    and there is a quorumed stashed checkpoint from other replicas with
    the same end then this stashed checkpoint is not included into the lag
    for a catch-up, i.e. in such a case the lag which makes the node perform
    catch-up is Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed
    received checkpoints.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize

    _, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        'EpsilonSteward', 'Epsilon', tdir, tconf,
        allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup'])
    # Epsilon did not participate in ordering of the batch with EpsilonSteward
    # NYM transaction and the batch with Epsilon NODE transaction.
    # Epsilon got these transactions via catch-up.

    # To complete the first checkpoint send requests for 1 checkpoint minus
    # 2 3PC-batches (since there are already 2 3PC-batches in the first
    # checkpoint : with EpsilonSteward NYM transaction and with Epsilon NODE
    # transaction). This checkpoint has a not aligned lower bound
    # on the new node replicas so it will not be stabilized on them.
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint - 2 * max_batch_size)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup'])

    # The master replica of the new node stops to receive 3PC-messages
    new_node.master_replica.threePhaseRouter.extend(
        (
            (PrePrepare, lambda *x, **y: None),
            (Prepare, lambda *x, **y: None),
            (Commit, lambda *x, **y: None),
        )
    )

    completed_catchups_before_reqs = get_number_of_completed_catchups(new_node)

    # Send requests for the new node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP *
                                           reqs_for_checkpoint)

    # Give time for the new node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup'])

    # Verify that the new node has not caught up
    assert get_number_of_completed_catchups(new_node) == completed_catchups_before_reqs

    # Send more requests for the new node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup'])

    # Verify that the new node has caught up
    assert get_number_of_completed_catchups(new_node) > completed_catchups_before_reqs
Ejemplo n.º 27
0
def test_reverted_unordered(txnPoolNodeSet, looper, wallet1, client1,
                            client1Connected):
    """
    Before starting catchup, revert any uncommitted changes to state and
    ledger. This is to avoid any re-application of requests that were
    ordered but stashed
    Example scenario
    prepared (1, 4)
    startViewChange
    start_catchup
    ...
    ....
    ...
    committed and send Ordered (1, 2)
    ...
    ....
    preLedgerCatchUp
    force_process_ordered, take out (1,2) and stash (1, 2)
    now process stashed Ordered(1,2), its requests will be applied again

    Simulation: Delay COMMITs to a node so that it can not order requests
    but has prepared them. Then trigger a view change and make sure the slow
    node has not ordered same number of requests as others but has prepared
    so it can order when it receives COMMITs while view change is in progress.
    The slow node should revert unordered batches and but it should eventually
    process the ordered requests, so delay LEDGER_STATUS too so catchup
    is delayed
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_node.nodeIbStasher.delay(cDelay(120, 0))
    sent_batches = 5
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           2 * sent_batches, sent_batches)

    # Fast nodes have same last ordered and same data
    last_ordered = [n.master_last_ordered_3PC for n in fast_nodes]
    assert check_if_all_equal_in_list(last_ordered)
    ensure_all_nodes_have_same_data(looper, fast_nodes)

    # Slow nodes have different last ordered than fast nodes
    assert last_ordered[0] != slow_node.master_last_ordered_3PC

    # Delay LEDGER_STATUS so catchup starts late
    slow_node.nodeIbStasher.delay(lsDelay(100))
    slow_node.nodeIbStasher.delay(msg_rep_delay(100))

    # slow_node has not reverted batches
    assert sent_batches not in getAllReturnVals(
        slow_node.master_replica,
        slow_node.master_replica.revert_unordered_batches)

    ensure_view_change(looper, txnPoolNodeSet)

    def chk1():
        # slow_node reverted all batches
        rv = getAllReturnVals(
            slow_node.master_replica,
            slow_node.master_replica.revert_unordered_batches)
        assert sent_batches in rv

    looper.run(eventually(chk1, retryWait=1))

    # After the view change slow_node has prepared same requests as the fast
    # nodes have ordered
    assert last_ordered[
        0] == slow_node.master_replica.last_prepared_before_view_change

    # Deliver COMMITs
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # slow_node orders all requests as others have
        assert last_ordered[0] == slow_node.master_last_ordered_3PC

    looper.run(eventually(chk2, retryWait=1))

    # Deliver LEDGER_STATUS so catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(LEDGER_STATUS)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(MESSAGE_RESPONSE)

    # Ensure all nodes have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    def chk3():
        # slow_node processed stashed Ordered requests successfully
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert sent_batches in rv

    looper.run(eventually(chk3, retryWait=1))

    # Ensure pool is functional
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
def test_node_detecting_lag_from_view_change_done_messages(
        txnPoolNodeSet, looper, wallet1, client1, client1Connected, tconf):
    """
    A node is slow and after view change starts, it marks it's `last_prepared`
    to less than others, after catchup it does not get any txns from others
    and finds it has already ordered it's `last_prepared`, but when
    it gets ViewChangeDone messages, it starts catchup again and this
    time gets the txns. To achieve this delay all 3PC messages to a node so
    before view change it has different last_prepared from others.
    Also delay processing of COMMITs and INSTANCE_CHANGEs by other nodes
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica
    fast_master_replicas = [n.master_replica for n in fast_nodes]

    delay_3pc = 50
    delay_ic = tconf.PerfCheckFreq + 5
    delay_commit = delay_ic + 10
    delay_3pc_messages([slow_node], 0, delay_3pc)
    for n in fast_nodes:
        n.nodeIbStasher.delay(icDelay(delay_ic))
        n.nodeIbStasher.delay(cDelay(delay_commit))

    reqs = []
    for i in range(10):
        reqs = reqs + sendRandomRequests(wallet1, client1, 2)
        looper.runFor(.2)

    def chk1():
        for rep in fast_master_replicas:
            assert compare_3PC_keys(
                slow_master_replica.last_prepared_certificate_in_view(),
                rep.last_prepared_certificate_in_view()) > 0
            assert slow_master_replica.last_ordered_3pc == rep.last_ordered_3pc

    looper.run(eventually(chk1))

    no_more_catchup_count = get_count(slow_node,
                                      slow_node.no_more_catchups_needed)

    # Track last prepared for master replica of each node
    prepareds = {}
    orig_methods = {}
    for node in txnPoolNodeSet:
        orig_methods[node.name] = node.master_replica.on_view_change_start

        def patched_on_view_change_start(self):
            orig_methods[self.node.name]()
            prepareds[self.node.name] = self.last_prepared_before_view_change

        node.master_replica.on_view_change_start = types.MethodType(
            patched_on_view_change_start, node.master_replica)

    ensure_view_change(looper, txnPoolNodeSet, exclude_from_check=fast_nodes)

    def chk2():
        # last_prepared of slow_node is less than fast_nodes
        for rep in fast_master_replicas:
            assert compare_3PC_keys(prepareds[slow_master_replica.node.name],
                                    prepareds[rep.node.name]) > 0

    looper.run(eventually(chk2, timeout=delay_ic + 5))

    last_start_catchup_call_at = None
    no_more_catchup_call_at = None

    def chk3():
        # no_more_catchups_needed was called since node found no need of catchup
        nonlocal last_start_catchup_call_at, no_more_catchup_call_at
        assert (get_count(slow_node, slow_node.no_more_catchups_needed) -
                no_more_catchup_count) > 0

        no_more_catchup_call_at = slow_node.spylog.getLast(
            slow_node.no_more_catchups_needed).starttime
        last_start_catchup_call_at = slow_node.spylog.getLast(
            slow_node.start_catchup).starttime

    looper.run(eventually(chk3, timeout=delay_commit))

    for n in fast_nodes:
        n.nodeIbStasher.reset_delays_and_process_delayeds()
        n.nodeIbStasher.reset_delays_and_process_delayeds()

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > no_more_catchup_call_at
    assert slow_node.spylog.getLast(
        slow_node.start_catchup).starttime > last_start_catchup_call_at
Ejemplo n.º 29
0
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3pc messages and checkpoints during some period but later it
    stashes some amount of checkpoints and starts catchup. When the node is
    performing the catchup, it receives more checkpoints enough to start a new
    catchup but it does not start it because the first catchup is in progress.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)

        looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.discovering),
                              timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                                      waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))

        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = \
            repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint)

        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(
                repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint) -
                process_checkpoint_times_before ==
                (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                (len(txnPoolNodeSet) - 1)),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))

        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.discovering

    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == other_nodes[0].domainLedger.size
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3pc messages and checkpoints during some period but later it
    stashes some amount of checkpoints and starts catchup. When the node is
    performing the catchup, it receives more checkpoints enough to start a new
    catchup but it does not start it because the first catchup is in progress.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)
        target_ledger_size = other_nodes[0].domainLedger.size

        looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.syncing),
                              timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                                      waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))
    
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = repaired_node.master_replica.spylog.count(Replica.processCheckpoint)

        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(repaired_node.master_replica.spylog.count(Replica.processCheckpoint) -
                                         process_checkpoint_times_before ==
                                         (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                         (len(txnPoolNodeSet) - 1)),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))
    
        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.syncing

    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == target_ledger_size