def testNodeDoesNotParticipateUntilCaughtUp(txnPoolNodeSet,
                                            nodes_slow_to_process_catchup_reqs,
                                            sdk_node_created_after_some_txns):
    """
    A new node that joins after some transactions should stash new transactions
    until it has caught up
    :return:
    """

    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_created_after_some_txns
    txnPoolNodeSet.append(new_node)
    old_nodes = txnPoolNodeSet[:-1]
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, 5)
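    # The old nodes should not have received any Commits or Prepares from the
    # new node yet, since it does not participate in ordering until caught up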
    chk_commits_prepares_recvd(0, old_nodes, new_node)

    for node in old_nodes:
        node.reset_delays_and_process_delayeds()

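    # Allow time for catch-up, the artificially added catchup delay and a
    # possible election round before checking that the pool has converged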
    timeout = waits.expectedPoolCatchupTime(len(txnPoolNodeSet)) + \
              catchup_delay + \
              waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))
    ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=timeout)
    waitNodeDataEquality(looper, new_node, *old_nodes)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, 2)

    # The old nodes have now received Commits and Prepares from the new node,
    # so the zero-received check below must fail
    with pytest.raises(AssertionError):
        # Since nodes discard 3PC messages for already ordered requests.
        chk_commits_prepares_recvd(0, old_nodes, new_node)
    waitNodeDataEquality(looper, new_node, *old_nodes)
def testNodeRejectingInvalidTxns(tconf, txnPoolNodeSet, patched_node,
                                 nodeCreatedAfterSomeTxns):
    """
    A newly joined node is catching up and sends catchup requests to other
    nodes but one of the nodes replies with incorrect transactions. The newly
    joined node detects that and rejects the transactions and thus blacklists
    the node. It thus cannot complete the process until the timeout expires and
    then requests the missing transactions.
    """
    looper, newNode, client, wallet, _, _ = nodeCreatedAfterSomeTxns
    bad_node = patched_node

    do_not_tell_clients_about_newly_joined_node(txnPoolNodeSet)

    logger.debug('Catchup request processor of {} patched'.format(bad_node))

    looper.run(checkNodesConnected(txnPoolNodeSet))

    # catchup #1 -> CatchupTransactionsTimeout -> catchup #2
    catchup_timeout = waits.expectedPoolCatchupTime(len(txnPoolNodeSet) + 1)
    timeout = 2 * catchup_timeout + tconf.CatchupTransactionsTimeout

    # have to skip seqno_db check because the txns are not executed
    # on the new node
    waitNodeDataEquality(looper,
                         newNode,
                         *txnPoolNodeSet[:-1],
                         customTimeout=timeout)

    assert newNode.isNodeBlacklisted(bad_node.name)
def add_new_node(looper,
                 nodes,
                 sdk_pool_handle,
                 sdk_wallet_steward,
                 tdir,
                 client_tdir,
                 tconf,
                 all_plugins_path,
                 name=None):
    node_name = name or "Psi"
    new_steward_name = "testClientSteward" + randomString(3)
    _, new_node = sdk_add_new_steward_and_node(looper,
                                               sdk_pool_handle,
                                               sdk_wallet_steward,
                                               new_steward_name,
                                               node_name,
                                               tdir,
                                               tconf,
                                               allPluginsPath=all_plugins_path)
    nodes.append(new_node)
    looper.run(checkNodesConnected(nodes))
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(nodes))
    waitNodeDataEquality(looper, new_node, *nodes[:-1], customTimeout=timeout)
    sdk_pool_refresh(looper, sdk_pool_handle)
    return new_node
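# A minimal usage sketch of the helper above (hypothetical test; assumes the
# standard indy-plenum pytest fixtures looper, txnPoolNodeSet, sdk_pool_handle,
# sdk_wallet_steward, tdir, client_tdir, tconf and allPluginsPath):
def test_add_single_node_sketch(looper, txnPoolNodeSet, sdk_pool_handle,
                                sdk_wallet_steward, tdir, client_tdir, tconf,
                                allPluginsPath):
    # Add one node named "Omega" and make sure it ends up in the pool list
    new_node = add_new_node(looper, txnPoolNodeSet, sdk_pool_handle,
                            sdk_wallet_steward, tdir, client_tdir, tconf,
                            allPluginsPath, name="Omega")
    assert new_node in txnPoolNodeSet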
def testNodeDoesNotParticipateUntilCaughtUp(txnPoolNodeSet,
                                            nodes_slow_to_process_catchup_reqs,
                                            nodeCreatedAfterSomeTxns):
    """
    A new node that joins after some transactions should stash new transactions
    until it has caught up
    :return:
    """

    looper, new_node, client, wallet, newStewardClient, newStewardWallet = \
        nodeCreatedAfterSomeTxns
    txnPoolNodeSet.append(new_node)
    old_nodes = txnPoolNodeSet[:-1]
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    chk_commits_prepares_recvd(0, old_nodes, new_node)

    for node in old_nodes:
        node.reset_delays_and_process_delayeds()

    timeout = waits.expectedPoolCatchupTime(len(txnPoolNodeSet)) + \
              catchup_delay + \
              waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))
    ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=timeout)
    waitNodeDataEquality(looper, new_node, *old_nodes)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 2)

    # The old nodes have now received Commits and Prepares from the new node,
    # so the zero-received check below must fail
    with pytest.raises(AssertionError):
        # Since nodes discard 3PC messages for already ordered requests.
        chk_commits_prepares_recvd(0, old_nodes, new_node)
    waitNodeDataEquality(looper, new_node, *old_nodes)
def add_new_node(looper,
                 nodes,
                 sdk_pool_handle,
                 sdk_wallet_steward,
                 tdir,
                 tconf,
                 all_plugins_path,
                 name=None,
                 wait_till_added=True):
    node_name = name or "Psi"
    new_steward_name = "testClientSteward" + randomString(3)
    _, new_node = sdk_add_new_steward_and_node(looper,
                                               sdk_pool_handle,
                                               sdk_wallet_steward,
                                               new_steward_name,
                                               node_name,
                                               tdir,
                                               tconf,
                                               allPluginsPath=all_plugins_path,
                                               wait_till_added=wait_till_added)
    if wait_till_added:
        nodes.append(new_node)
        looper.run(checkNodesConnected(nodes))
        timeout = waits.expectedPoolCatchupTime(nodeCount=len(nodes))
        waitNodeDataEquality(
            looper,
            new_node,
            *nodes[:-1],
            customTimeout=timeout,
            exclude_from_check=['check_last_ordered_3pc_backup'])
        sdk_pool_refresh(looper, sdk_pool_handle)
    return new_node
def add_new_node(looper,
                 nodes,
                 sdk_pool_handle,
                 sdk_wallet_steward,
                 tdir,
                 tconf,
                 all_plugins_path,
                 name=None):
    node_name = name or randomString(5)
    new_steward_name = "testClientSteward" + randomString(3)
    new_steward_wallet_handle, new_node = \
        sdk_add_new_steward_and_node(looper,
                                     sdk_pool_handle,
                                     sdk_wallet_steward,
                                     new_steward_name,
                                     node_name,
                                     tdir,
                                     tconf,
                                     all_plugins_path)
    nodes.append(new_node)
    looper.run(checkNodesConnected(nodes, customTimeout=60))
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(nodes))
    waitNodeDataEquality(looper,
                         new_node,
                         *nodes[:-1],
                         customTimeout=timeout,
                         exclude_from_check=['check_last_ordered_3pc_backup'])
    return new_node
def testNodeDoesNotParticipateUntilCaughtUp(txnPoolNodeSet,
                                            nodes_slow_to_process_catchup_reqs,
                                            sdk_node_created_after_some_txns):
    """
    A new node that joins after some transactions should stash new transactions
    until it has caught up
    :return:
    """

    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_created_after_some_txns
    txnPoolNodeSet.append(new_node)
    old_nodes = txnPoolNodeSet[:-1]
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, 4)
    chk_commits_prepares_recvd(0, old_nodes, new_node)

    for node in old_nodes:
        node.reset_delays_and_process_delayeds()

    timeout = waits.expectedPoolCatchupTime(len(txnPoolNodeSet)) + \
              catchup_delay + \
              waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))
    ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=timeout)
    waitNodeDataEquality(looper, new_node, *old_nodes,
                         exclude_from_check=['check_last_ordered_3pc_backup'])

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, 2)

    # The old nodes have now received Commits and Prepares from the new node,
    # so the zero-received check below must fail
    with pytest.raises(AssertionError):
        # Since nodes discard 3PC messages for already ordered requests.
        chk_commits_prepares_recvd(0, old_nodes, new_node)
    waitNodeDataEquality(looper, new_node, *old_nodes)
def testCatchupDelayedNodes(txnPoolNodeSet, nodeSetWithNodeAddedAfterSomeTxns,
                            txnPoolCliNodeReg, tdirWithPoolTxns, tconf,
                            allPluginsPath):
    """
    Node sends catchup requests to other nodes only for those sequence numbers
    that the other nodes have. Start with a pool of connected nodes that has
    some transactions, then two more nodes, say X and Y, join, where node Y
    starts its catchup process after some time. The node starting late, i.e. Y,
    should not receive any catchup requests
    :return:
    """
    looper, _, _, _, client, wallet = nodeSetWithNodeAddedAfterSomeTxns
    stewardXName = "testClientStewardX"
    nodeXName = "Zeta"
    stewardYName = "testClientStewardY"
    nodeYName = "Eta"
    stewardZName = "testClientStewardZ"
    nodeZName = "Theta"
    delayX = 45
    delayY = 2
    stewardX, nodeX = addNewStewardAndNode(looper,
                                           client,
                                           stewardXName,
                                           nodeXName,
                                           tdirWithPoolTxns,
                                           tconf,
                                           allPluginsPath,
                                           autoStart=False)
    stewardY, nodeY = addNewStewardAndNode(looper,
                                           client,
                                           stewardYName,
                                           nodeYName,
                                           tdirWithPoolTxns,
                                           tconf,
                                           allPluginsPath,
                                           autoStart=False)
    nodeX.nodeIbStasher.delay(cpDelay(delayX))
    nodeY.nodeIbStasher.delay(cpDelay(delayY))
    looper.add(nodeX)
    looper.add(nodeY)
    txnPoolNodeSet.append(nodeX)
    txnPoolNodeSet.append(nodeY)

    timeout = waits.expectedPoolCatchupTime(
        len(txnPoolNodeSet)) + delayX + delayY
    looper.run(checkNodesConnected(txnPoolNodeSet, customTimeout=timeout))
    logger.debug("Stopping 2 newest nodes, {} and {}".format(
        nodeX.name, nodeY.name))
    nodeX.stop()
    nodeY.stop()
    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 50)
    logger.debug("Starting the 2 stopped nodes, {} and {}".format(
        nodeX.name, nodeY.name))
    nodeX.start(looper.loop)
    nodeY.start(looper.loop)
    waitNodeDataEquality(looper, nodeX, *txnPoolNodeSet[:5])
    waitNodeDataEquality(looper, nodeY, *txnPoolNodeSet[:5])
    def start_stop_one_node(node_to_restart, pool_of_nodes):
        """

        :param node_to_restart: node, which would be restarted
        :param pool_of_nodes: current pool
        :return: new pool with restarted node
        Node restart procedure consist of:
        1. Calling stop()
        2. Remove from looper and pool
        3. Create new instance of node with the same ha, cliha and node_name
        (also all path to data, keys and etc would be exactly as for stopped node)
        4. Add new instance into looper and pool
        5. Check, that other nodes accepted new instance and all pool has the same data
        """

        remaining_nodes = list(set(pool_of_nodes) - {node_to_restart})
        disconnect_node_and_ensure_disconnected(looper,
                                                pool_of_nodes,
                                                node_to_restart,
                                                stopNode=True)
        looper.removeProdable(node_to_restart)
        ensure_all_nodes_have_same_data(looper,
                                        remaining_nodes,
                                        custom_timeout=tconf.NEW_VIEW_TIMEOUT)
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 1)
        node_to_restart = start_stopped_node(node_to_restart,
                                             looper,
                                             tconf,
                                             tdir,
                                             allPluginsPath,
                                             delay_instance_change_msgs=True)
        pool_of_nodes = remaining_nodes + [node_to_restart]
        looper.run(checkNodesConnected(pool_of_nodes))
        ensure_all_nodes_have_same_data(
            looper,
            pool_of_nodes,
            custom_timeout=tconf.NEW_VIEW_TIMEOUT,
            exclude_from_check=['check_last_ordered_3pc_backup'])
        timeout = waits.expectedPoolCatchupTime(nodeCount=len(pool_of_nodes))
        looper.run(
            eventually(check_ledger_state,
                       node_to_restart,
                       DOMAIN_LEDGER_ID,
                       LedgerState.synced,
                       retryWait=.5,
                       timeout=timeout))
        looper.run(
            eventually(check_ledger_state,
                       node_to_restart,
                       POOL_LEDGER_ID,
                       LedgerState.synced,
                       retryWait=.5,
                       timeout=timeout))
        looper.run(eventually(catchuped, node_to_restart, timeout=2 * timeout))
        return pool_of_nodes
def test_lag_size_for_catchup(looper, chkFreqPatched, reqs_for_checkpoint,
                              txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client):
    """
    Verifies that if the stored own checkpoints have aligned bounds then
    the master replica lag which makes the node perform catch-up is
    Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed received
    checkpoints.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    # The master replica of the slow node stops processing 3PC messages
    slow_node.master_replica.threePhaseRouter.extend((
        (PrePrepare, lambda *x, **y: None),
        (Prepare, lambda *x, **y: None),
        (Commit, lambda *x, **y: None),
    ))

    completed_catchups_before_reqs = get_number_of_completed_catchups(
        slow_node)

    # Send requests for the slow node's master replica to get
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP quorumed stashed checkpoints
    # from others
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint)

    # Give time for the slow node to catch up if it is going to do it
    looper.runFor(
        waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
        waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    checkNodeDataForInequality(slow_node, *other_nodes)

    # Verify that the slow node has not caught up
    assert get_number_of_completed_catchups(
        slow_node) == completed_catchups_before_reqs

    # Send more requests for the slow node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper,
                         slow_node,
                         *other_nodes,
                         exclude_from_check=['check_last_ordered_3pc_backup'])

    # Verify that the slow node has caught up
    assert get_number_of_completed_catchups(
        slow_node) > completed_catchups_before_reqs
def test_lag_size_for_catchup(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_client):
    """
    Verifies that if the stored own checkpoints have aligned bounds then
    the master replica lag which makes the node perform catch-up is
    Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed received
    checkpoints.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    # The master replica of the slow node stops processing 3PC messages
    slow_node.master_replica.threePhaseRouter.extend(
        (
            (PrePrepare, lambda *x, **y: None),
            (Prepare, lambda *x, **y: None),
            (Commit, lambda *x, **y: None),
        )
    )

    completed_catchups_before_reqs = get_number_of_completed_catchups(slow_node)

    # Send requests for the slow node's master replica to get
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP quorumed stashed checkpoints
    # from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP *
                                           reqs_for_checkpoint)

    # Give time for the slow node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    checkNodeDataForInequality(slow_node, *other_nodes)

    # Verify that the slow node has not caught up
    assert get_number_of_completed_catchups(slow_node) == completed_catchups_before_reqs

    # Send more requests for the slow node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Verify that the slow node has caught up
    assert get_number_of_completed_catchups(slow_node) > completed_catchups_before_reqs
def testNodeRejectingInvalidTxns(looper, sdk_pool_handle, sdk_wallet_client,
                                 tconf, tdir, txnPoolNodeSet, patched_node,
                                 request, sdk_wallet_steward, testNodeClass,
                                 allPluginsPath, do_post_node_creation):
    """
    A newly joined node is catching up and sends catchup requests to other
    nodes but one of the nodes replies with incorrect transactions. The newly
    joined node detects that and rejects the transactions and thus blacklists
    the node. It thus cannot complete the process until the timeout expires and
    then requests the missing transactions.
    """
    txnCount = getValueFromModule(request, "txnCount", 5)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, txnCount)
    new_steward_name = randomString()
    new_node_name = "Epsilon"
    new_steward_wallet_handle, new_node = sdk_add_new_steward_and_node(
        looper,
        sdk_pool_handle,
        sdk_wallet_steward,
        new_steward_name,
        new_node_name,
        tdir,
        tconf,
        nodeClass=testNodeClass,
        allPluginsPath=allPluginsPath,
        autoStart=True,
        do_post_node_creation=do_post_node_creation)
    sdk_pool_refresh(looper, sdk_pool_handle)

    bad_node = patched_node

    do_not_tell_clients_about_newly_joined_node(txnPoolNodeSet)

    logger.debug('Catchup request processor of {} patched'.format(bad_node))

    looper.run(checkNodesConnected(txnPoolNodeSet))

    # catchup #1 -> CatchupTransactionsTimeout -> catchup #2
    catchup_timeout = waits.expectedPoolCatchupTime(len(txnPoolNodeSet) + 1)
    timeout = 2 * catchup_timeout + tconf.CatchupTransactionsTimeout

    # have to skip seqno_db check because the txns are not executed
    # on the new node
    waitNodeDataEquality(looper,
                         new_node,
                         *txnPoolNodeSet[:-1],
                         customTimeout=timeout)

    assert new_node.isNodeBlacklisted(bad_node.name)
def testNodeRejectingInvalidTxns(looper, sdk_pool_handle, sdk_wallet_client,
                                 tconf, tdir, txnPoolNodeSet, patched_node,
                                 request,
                                 sdk_wallet_steward, testNodeClass, allPluginsPath,
                                 do_post_node_creation):
    """
    A newly joined node is catching up and sends catchup requests to other
    nodes but one of the nodes replies with incorrect transactions. The newly
    joined node detects that and rejects the transactions and thus blacklists
    the node. It thus cannot complete the process until the timeout expires and
    then requests the missing transactions.
    """
    txnCount = getValueFromModule(request, "txnCount", 5)
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              txnCount)
    new_steward_name = randomString()
    new_node_name = "Epsilon"
    new_steward_wallet_handle, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        new_steward_name, new_node_name, tdir, tconf, nodeClass=testNodeClass,
        allPluginsPath=allPluginsPath, autoStart=True,
        do_post_node_creation=do_post_node_creation)
    sdk_pool_refresh(looper, sdk_pool_handle)

    bad_node = patched_node

    do_not_tell_clients_about_newly_joined_node(txnPoolNodeSet)

    logger.debug('Catchup request processor of {} patched'.format(bad_node))

    looper.run(checkNodesConnected(txnPoolNodeSet))

    # catchup #1 -> CatchupTransactionsTimeout -> catchup #2
    catchup_timeout = waits.expectedPoolCatchupTime(len(txnPoolNodeSet) + 1)
    timeout = 2 * catchup_timeout + tconf.CatchupTransactionsTimeout

    # have to skip seqno_db check because the txns are not executed
    # on the new node
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1],
                         customTimeout=timeout)

    assert new_node.isNodeBlacklisted(bad_node.name)
def add_new_node(looper, nodes, steward, steward_wallet,
                 tdir, client_tdir, tconf, all_plugins_path, name=None):
    node_name = name or randomString(5)
    new_steward_name = "testClientSteward" + randomString(3)
    new_steward, new_steward_wallet, new_node = addNewStewardAndNode(looper,
                                                                     steward,
                                                                     steward_wallet,
                                                                     new_steward_name,
                                                                     node_name,
                                                                     tdir,
                                                                     client_tdir,
                                                                     tconf,
                                                                     all_plugins_path)
    nodes.append(new_node)
    looper.run(checkNodesConnected(nodes, customTimeout=60))
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(nodes))
    waitNodeDataEquality(looper, new_node, *nodes[:-1],
                         customTimeout=timeout)
    return new_node
def add_new_node(looper, nodes, sdk_pool_handle, sdk_wallet_steward,
                 tdir, tconf, all_plugins_path, name=None):
    node_name = name or randomString(5)
    new_steward_name = "testClientSteward" + randomString(3)
    new_steward_wallet_handle, new_node = \
        sdk_add_new_steward_and_node(looper,
                                     sdk_pool_handle,
                                     sdk_wallet_steward,
                                     new_steward_name,
                                     node_name,
                                     tdir,
                                     tconf,
                                     all_plugins_path)
    nodes.append(new_node)
    looper.run(checkNodesConnected(nodes, customTimeout=60))
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(nodes))
    waitNodeDataEquality(looper, new_node, *nodes[:-1],
                         customTimeout=timeout)
    return new_node
def testPrimaryElectionCase5(case5Setup, looper, txnPoolNodeSet):
    """
    Case 5 - A node making primary declarations for multiple other nodes.
    Consider 4 nodes A, B, C and D. Let's say node B is malicious and
    declares node C as primary to all nodes.
    Then node B also declares node D as primary to all nodes.
    """
    A, B, C, D = txnPoolNodeSet

    looper.run(checkNodesConnected(txnPoolNodeSet))

    BRep = Replica.generateName(B.name, 0)
    CRep = Replica.generateName(C.name, 0)
    DRep = Replica.generateName(D.name, 0)

    # Node B first sends PRIMARY msgs for Node C to all nodes
    # B.send(Primary(CRep, 0, B.viewNo))
    B.send(primaryByNode(CRep, B, 0))
    # Node B sends PRIMARY msgs for Node D to all nodes
    # B.send(Primary(DRep, 0, B.viewNo))
    B.send(primaryByNode(DRep, B, 0))

    # Ensure elections are done
    # also have to take into account the catchup procedure
    timeout = waits.expectedPoolElectionTimeout(len(txnPoolNodeSet)) + \
              waits.expectedPoolCatchupTime(len(txnPoolNodeSet)) + \
              delayOfElectionDone
    ensureElectionsDone(looper=looper,
                        nodes=txnPoolNodeSet,
                        customTimeout=timeout)

    # Each of nodes A, C and D (node B is malicious anyway, so it is not
    # considered) should have a primary declaration for node C from node B,
    # since node B first nominated node C
    for node in [A, C, D]:
        logger.debug(
            "node {} should have primary declaration for C from node B".format(
                node))
        assert node.elector.primaryDeclarations[0][BRep][0] == CRep
def test_catchup_during_3pc(tconf, looper, txnPoolNodeSet, sdk_wallet_client,
                            sdk_pool_handle):
    '''
    1) Send 1 3PC batch + 2 reqs
    2) Delay commits on one node
    3) Make sure the batch is ordered on all nodes except the lagged one
    4) start catchup of the lagged node
    5) Make sure that all nodes are equal
    6) Send more requests so that we have 3 batches in total
    7) Make sure that all nodes are equal
    '''

    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    with delay_rules(lagging_node.nodeIbStasher, cDelay()):
        sdk_reqs = sdk_send_random_requests(looper, sdk_pool_handle,
                                            sdk_wallet_client,
                                            tconf.Max3PCBatchSize + 2)

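        # The rest of the pool orders the first 3PC batch (view 0, ppSeqNo 1)
        # while COMMITs are delayed on the lagging node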
        looper.run(
            eventually(check_last_ordered_3pc_on_master, rest_nodes, (0, 1)))

        lagging_node.start_catchup()

        looper.run(
            eventually(
                lambda: assertExp(lagging_node.mode == Mode.participating),
                retryWait=1,
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

        waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)

    sdk_get_replies(looper, sdk_reqs)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * tconf.Max3PCBatchSize - 2)

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def testPrimaryElectionCase4(case4Setup, looper):
    """
    Case 4 - A node making multiple primary declarations for a particular node.
    Consider 4 nodes A, B, C and D. Let's say node B is malicious and is
    repeatedly declaring node D as primary
    """
    allNodes = case4Setup
    A, B, C, D = allNodes

    looper.run(checkNodesConnected(allNodes))

    # Node B sends multiple declarations of node D's 0th protocol instance as
    # primary to all nodes
    for i in range(5):
        # B.send(Primary(D.name, 0, B.viewNo))
        B.send(primaryByNode(D.name, B, 0))

    # None of nodes A, C and D (node B is malicious anyway, so it is not
    # considered) should have more than one primary declaration for node D,
    # since node D is slow. The one primary declaration for node D that
    # nodes A, C and D might have would be because of node B
    def x():
        primDecs = [p[0] for p in node.elector.primaryDeclarations[0].values()]
        assert primDecs.count(D.name) <= 1

    # also have to take into account the catchup procedure
    timeout = waits.expectedPoolNominationTimeout(len(allNodes)) + \
              waits.expectedPoolCatchupTime(len(allNodes))

    for node in (A, C, D):
        looper.run(eventually(x, retryWait=.5, timeout=timeout))

    timeout = waits.expectedPoolElectionTimeout(
        len(allNodes)) + delaySelfNomination
    ensureElectionsDone(looper=looper, nodes=allNodes, customTimeout=timeout)

    # Node D should not have any primary replica
    assert not D.hasPrimary
def testPrimaryElectionCase5(case5Setup, looper, txnPoolNodeSet):
    """
    Case 5 - A node making primary declarations for multiple other nodes.
    Consider 4 nodes A, B, C and D. Let's say node B is malicious and
    declares node C as primary to all nodes.
    Then node B also declares node D as primary to all nodes.
    """
    A, B, C, D = txnPoolNodeSet

    looper.run(checkNodesConnected(txnPoolNodeSet))

    BRep = Replica.generateName(B.name, 0)
    CRep = Replica.generateName(C.name, 0)
    DRep = Replica.generateName(D.name, 0)

    # Node B first sends PRIMARY msgs for Node C to all nodes
    # B.send(Primary(CRep, 0, B.viewNo))
    B.send(primaryByNode(CRep, B, 0))
    # Node B sends PRIMARY msgs for Node D to all nodes
    # B.send(Primary(DRep, 0, B.viewNo))
    B.send(primaryByNode(DRep, B, 0))

    # Ensure elections are done
    # also have to take into account the catchup procedure
    timeout = waits.expectedPoolElectionTimeout(len(txnPoolNodeSet)) + \
              waits.expectedPoolCatchupTime(len(txnPoolNodeSet)) + \
              delayOfElectionDone
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet, customTimeout=timeout)

    # Each of nodes A, C and D (node B is malicious anyway, so it is not
    # considered) should have a primary declaration for node C from node B,
    # since node B first nominated node C
    for node in [A, C, D]:
        logger.debug(
            "node {} should have primary declaration for C from node B"
                .format(node))
        assert node.elector.primaryDeclarations[0][BRep][0] == CRep
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3PC messages and checkpoints during some period, but later it
    stashes some amount of checkpoints and starts catchup. While the node is
    performing the catchup, it receives enough additional checkpoints to start
    a new catchup, but does not start one because the first is still in progress.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
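        # CatchupRep messages to the repaired node are delayed, so its catchup
        # stays in progress for the whole duration of this block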
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)

        looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.discovering),
                              timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                                      waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))

        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = \
            repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint)

        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(
                repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint) -
                process_checkpoint_times_before ==
                (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                (len(txnPoolNodeSet) - 1)),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))

        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.discovering

    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == other_nodes[0].domainLedger.size
def test_3pc_while_catchup(tdir, tconf, looper, testNodeClass, txnPoolNodeSet,
                           sdk_pool_handle, sdk_wallet_client, allPluginsPath):
    '''
    Tests that requests being ordered during catch-up are stashed and re-applied
    when catch-up is finished
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)
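    # No catch-up related 3PC messages should be stashed on any replica yet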
    assert all(replica.stasher.num_stashed_catchup == 0
               for inst_id, replica in lagging_node.replicas.items())

    with delay_rules(lagging_node.nodeIbStasher,
                     cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
        looper.add(lagging_node)
        txnPoolNodeSet[-1] = lagging_node
        looper.run(checkNodesConnected(txnPoolNodeSet))

        # Wait until we have received catchup replies for the messages missed
        # while the node was offline, so that messages ordered from now on are
        # not caught up but stashed
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.nodeIbStasher.num_of_stashed(CatchupRep) >= 3),
                       retryWait=1,
                       timeout=60))

        # make sure that more requests are being ordered while catch-up is in progress
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, 10)

        assert lagging_node.mode == Mode.syncing
        assert all(replica.stasher.num_stashed_catchup > 0
                   for inst_id, replica in lagging_node.replicas.items())

    # check that the catch-up is finished
    looper.run(
        eventually(lambda: assertExp(lagging_node.mode == Mode.participating),
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
            initial_all_ledgers_caught_up + 1)))

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
    assert all(replica.stasher.num_stashed_catchup == 0
               for inst_id, replica in lagging_node.replicas.items())
def test_checkpoints_after_view_change(tconf, looper, chkFreqPatched,
                                       reqs_for_checkpoint, txnPoolNodeSet,
                                       sdk_pool_handle, sdk_wallet_client):
    '''
    Tests that there are no infinite catchups if there is
    a quorum of stashed checkpoints received during the view change
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)
    initial_start_catchup = lagging_node.spylog.count(Node.start_catchup)

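    # Delay LedgerStatus and ViewChangeDone messages on the lagging node so
    # that it stays in view change while the other nodes complete it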
    with delay_rules(lagging_node.nodeIbStasher, lsDelay()):
        with delay_rules(lagging_node.nodeIbStasher, vcd_delay()):
            ensure_view_change(looper, txnPoolNodeSet)
            looper.run(
                eventually(lambda: assertExp(lagging_node.
                                             view_change_in_progress is True),
                           timeout=waits.expectedPoolCatchupTime(
                               len(txnPoolNodeSet))))
            ensureElectionsDone(looper=looper,
                                nodes=rest_nodes,
                                instances_list=range(2))

            assert all(n.view_change_in_progress is False for n in rest_nodes)
            assert lagging_node.view_change_in_progress is True

            # make sure that more requests are being ordered while catch-up is in progress on the lagging node
            # stash enough stable checkpoints for starting a catch-up
            num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
            num_reqs = reqs_for_checkpoint * num_checkpoints + 1
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, num_reqs)
            looper.run(
                eventually(check_last_ordered_3pc_on_master, rest_nodes,
                           (1, num_reqs + 1)))
            looper.run(
                eventually(check_last_ordered_3pc_on_backup, rest_nodes,
                           (1, num_reqs + 1)))

            # all good nodes stabilized checkpoint
            looper.run(eventually(chkChkpoints, rest_nodes, 2, 0))

            assert get_stashed_checkpoints(
                lagging_node) == num_checkpoints * len(rest_nodes)
            # lagging node is doing the view change and stashing all checkpoints
            assert lagging_node.view_change_in_progress is True
            looper.run(
                eventually(lambda: assertExp(
                    get_stashed_checkpoints(lagging_node) == 2 * len(rest_nodes
                                                                     )),
                           timeout=waits.expectedPoolCatchupTime(
                               len(txnPoolNodeSet))))

    # check that view change is finished
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    assert lagging_node.view_change_in_progress is False

    # check that last_ordered is set
    looper.run(
        eventually(check_last_ordered_3pc_on_master, [lagging_node],
                   (1, num_reqs + 1)))
    looper.run(
        eventually(check_last_ordered_3pc_on_backup, [lagging_node],
                   (1, num_reqs + 1)))

    # check that checkpoint is stabilized for master
    looper.run(eventually(chk_chkpoints_for_instance, [lagging_node], 0, 2, 0))

    # check that the catch-up is finished
    assert lagging_node.mode == Mode.participating
    assert lagging_node.spylog.count(
        Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1
    assert lagging_node.spylog.count(
        Node.start_catchup) == initial_start_catchup + 1

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_primary_send_incorrect_pp(looper, txnPoolNodeSet, tconf,
                                   allPluginsPath, sdk_pool_handle,
                                   sdk_wallet_steward, monkeypatch):
    """
    Test steps:
    Delay message requests with PrePrepares on `slow_node`
    Patch sending for PrePrepare on the `malicious_primary` to send an invalid PrePrepare to slow_node
    Order a new request
    Start a view change
    Make sure it's finished on all nodes
    Make sure that the lagging node has the same data as the other nodes
    """
    start_view_no = txnPoolNodeSet[0].viewNo
    slow_node = txnPoolNodeSet[-1]
    malicious_primary = txnPoolNodeSet[0]
    other_nodes = [
        n for n in txnPoolNodeSet if n not in [slow_node, malicious_primary]
    ]
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(txnPoolNodeSet))
    ensure_all_nodes_have_same_data(looper,
                                    txnPoolNodeSet,
                                    custom_timeout=timeout)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 1)
    old_sender = malicious_primary.master_replica._ordering_service._send

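    # Send the genuine PrePrepare to the other nodes, but send a copy with a
    # shifted ppTime to the slow node, then restore the original sender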
    def patched_sender(msg, dst=None, stat=None):
        if isinstance(msg, PrePrepare) and msg:
            old_sender(msg, [n.name for n in other_nodes], stat)
            pp_dict = msg._asdict()
            pp_dict["ppTime"] += 1
            pp = PrePrepare(**pp_dict)
            old_sender(pp, [slow_node.name], stat)
            monkeypatch.undo()

    monkeypatch.setattr(malicious_primary.master_replica._ordering_service,
                        '_send', patched_sender)
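    # Disable PrePrepare validation on the slow node so that it accepts the
    # tampered PrePrepare from the malicious primary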
    monkeypatch.setattr(slow_node.master_replica._ordering_service,
                        '_validate_applied_pre_prepare', lambda a, b, c: None)
    with delay_rules(slow_node.nodeIbStasher,
                     msg_rep_delay(types_to_delay=[PREPREPARE])):
        preprepare_process_num = slow_node.master_replica._ordering_service.spylog.count(
            OrderingService.process_preprepare)
        resp_task = sdk_send_random_request(looper, sdk_pool_handle,
                                            sdk_wallet_steward)

        def chk():
            assert preprepare_process_num + 1 == slow_node.master_replica._ordering_service.spylog.count(
                OrderingService.process_preprepare)

        looper.run(eventually(chk))

        _, j_resp = sdk_get_and_check_replies(looper, [resp_task])[0]
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 1)

        trigger_view_change(txnPoolNodeSet)
        ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
        waitForViewChange(looper,
                          txnPoolNodeSet,
                          expectedViewNo=start_view_no + 1)

        ensureElectionsDone(looper=looper,
                            nodes=txnPoolNodeSet,
                            instances_list=[0, 1])
        ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
        sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                                   sdk_pool_handle)
def test_slow_node_reverts_unordered_state_during_catchup(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    """
    Delay COMMITs to a node such that when it needs to catch up, it has to
    revert some unordered state. By that time the node should have received
    all COMMITs, so it will apply some of them (those for which it has not
    received txns from catchup).
    To achieve this, delay COMMITs for a long time and do catchup for a state
    a little older than the one received in the LedgerStatus; once catchup
    completes, reset the delays and try to process the delayed COMMITs: some
    will be rejected but some will be processed, since catchup was done for an
    older ledger.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 25

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))
    # Delay LEDGER_STATUS on the slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS) is sent, and the
    # node catches up 2 times.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from the slow node, and return consistency proofs for all
    # missing txns, so no stashed ones would be applied
    slow_node.nodeIbStasher.delay(lsDelay(1000))

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                             delay_batches * Max3PCBatchSize)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)

    # `slow_node` is slow to receive CatchupRep, so that it
    # gets a chance to order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    old_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc

    # start view change (and hence catchup)
    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        check_last_ordered_3pc(n1, n2)

    assert slow_master_replica.last_prepared_before_view_change == old_last_ordered

    old_pc_count = slow_master_replica._ordering_service.spylog.count(
        slow_master_replica._ordering_service._validate)

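    # Nothing is stashed for catch-up yet, since the delayed COMMITs have not
    # been delivered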
    assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0

    # Repair the network so COMMITs are received, processed and stashed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica._ordering_service.spylog.count(
            slow_master_replica._ordering_service._validate) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # (delay_batches * Max3PCBatchSize * commits_count_in_phase) COMMITs are stashed
        assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == \
               delay_batches * Max3PCBatchSize * (len(txnPoolNodeSet) - 1)

    looper.run(eventually(chk3, retryWait=1, timeout=15))

    # fix catchup, so the node gets a chance to be caught-up
    repair_node_catchup_less(other_nodes)

    def chk4():
        # Some COMMITs were received but stashed, and
        # they will be processed after catchup
        assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0

    looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 50))

    def chk5():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(chk5,
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # make sure that the pool is functional
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_catchup_not_triggered_if_another_in_progress(
        looper,
        chkFreqPatched,
        reqs_for_checkpoint,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client,
        broken_node_and_others):
    """
    A node misses 3PC messages and checkpoints during some period, but later it
    stashes some amount of checkpoints and starts catchup. While the node is
    performing the catchup, it receives enough additional checkpoints to start
    a new catchup, but does not start one because the first is still in progress.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize
    broken_node, other_nodes = broken_node_and_others

    logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints")

    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint + max_batch_size)

    waitNodeDataInequality(looper, broken_node, *other_nodes)

    logger.info(
        "Step 2: The node receives 3PC-messages but cannot process them because of "
        "missed ones. But the node eventually stashes some amount of checkpoints "
        "and after that starts catchup")

    repaired_node = repair_broken_node(broken_node)

    initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup)
    initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp)

    with delay_rules(repaired_node.nodeIbStasher, cr_delay()):
        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint - max_batch_size)

        ensure_all_nodes_have_same_data(looper, other_nodes)
        target_ledger_size = other_nodes[0].domainLedger.size

        looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.syncing),
                              timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) +
                                      waits.expectedPoolConsistencyProof(len(txnPoolNodeSet))))
    
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1

        logger.info(
            "Step 3: While doing the catchup, the node receives new checkpoints "
            "enough to start a new catchup but the node does not start it because "
            "the former is in progress")

        process_checkpoint_times_before = repaired_node.master_replica.spylog.count(Replica.processCheckpoint)

        send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                               reqs_for_checkpoint)

        # Wait until the node receives the new checkpoints from all the other nodes
        looper.run(
            eventually(lambda: assertExp(repaired_node.master_replica.spylog.count(Replica.processCheckpoint) -
                                         process_checkpoint_times_before ==
                                         (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                                         (len(txnPoolNodeSet) - 1)),
                       timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet))))
    
        # New catchup is not started when another one is in progress
        assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
        assert repaired_node.mode == Mode.syncing

    logger.info("Step 4: The node completes the catchup. The ledger has been "
                "updated to the level determined on its start")

    looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating),
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
    assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1
    assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1
    assert repaired_node.domainLedger.size == target_ledger_size
def test_3pc_while_catchup_with_chkpoints(tdir, tconf, looper, chkFreqPatched,
                                          reqs_for_checkpoint, testNodeClass,
                                          txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client, allPluginsPath):
    '''
    Tests that 3PC messages and checkpoints received while a catch-up is in
    progress are stashed and re-applied once the catch-up is finished.
    Also checks that catch-up is not started again even if a quorum of stashed
    checkpoints is received.
    '''

    batches_count = get_pp_seq_no(txnPoolNodeSet)
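    # batches_count tracks the expected master pp_seq_no; it is compared against
    # last_ordered_3pc on the replicas further down in this test.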
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    batches_count += 1
    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    batches_count += 1
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

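    # Hold back CatchupRep messages for the domain ledger only
    # (ledger_filter=DOMAIN_LEDGER_ID), so the lagging node stays in catchup while
    # the 3PC batches and checkpoints ordered below arrive and get stashed.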
    with delay_rules(lagging_node.nodeIbStasher,
                     cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
        looper.add(lagging_node)
        txnPoolNodeSet[-1] = lagging_node
        looper.run(checkNodesConnected(txnPoolNodeSet))

        # Wait until we have got catchup replies for the messages missed while the node
        # was offline, so that the messages ordered from now on are not caught up but stashed.
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.nodeIbStasher.num_of_stashed(CatchupRep) >= 3),
                       retryWait=1,
                       timeout=60))

        assert lagging_node.mode == Mode.syncing

        # make sure that more requests are being ordered while catch-up is in progress
        # stash enough stable checkpoints for starting a catch-up
        num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
        num_reqs = reqs_for_checkpoint * num_checkpoints + 1
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, num_reqs)
        batches_count += num_reqs
        looper.run(
            eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes,
                       (0, batches_count)))

        # all good nodes stabilized checkpoint
        looper.run(
            eventually(check_for_nodes, rest_nodes, check_stable_checkpoint,
                       10))

        # lagging node is catching up and stashing all checkpoints
        assert lagging_node.mode == Mode.syncing
        looper.run(
            eventually(lambda: assertExp(
                get_stashed_checkpoints(lagging_node) == num_checkpoints * len(
                    rest_nodes)),
                       timeout=waits.expectedPoolCatchupTime(
                           len(txnPoolNodeSet))))

    # check that last_ordered is set
    looper.run(
        eventually(check_last_ordered_3pc_on_all_replicas, [lagging_node],
                   (0, batches_count)))

    # check that checkpoint is stabilized for master
    looper.run(
        eventually(check_for_instance, [lagging_node], 0,
                   check_stable_checkpoint, 10))

    # check that the catch-up is finished
    looper.run(
        eventually(lambda: assertExp(lagging_node.mode == Mode.participating),
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # check that catch-up was started only once
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
            initial_all_ledgers_caught_up + 1)))
    looper.run(
        eventually(lambda: assertExp(
            lagging_node.spylog.count(Node.start_catchup) == 1)))

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_complete_short_checkpoint_not_included_in_lag_for_catchup(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    """
    Verifies that if the first stored own checkpoint has a non-aligned lower
    bound (meaning it was started right after a catch-up), is complete, and
    there is a quorumed stashed checkpoint from other replicas with the same
    upper bound, then this stashed checkpoint is not counted towards the lag
    that triggers a catch-up. In such a case the node performs a catch-up only
    after Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed
    checkpoints have been received.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize

    _, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        'EpsilonSteward', 'Epsilon', tdir, tconf,
        allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
    # Epsilon did not participate in ordering of the batch with EpsilonSteward
    # NYM transaction and the batch with Epsilon NODE transaction.
    # Epsilon got these transactions via catch-up.

    # To complete the first checkpoint, send requests for 1 checkpoint minus
    # 2 3PC-batches (since there are already 2 3PC-batches in the first
    # checkpoint: the one with the EpsilonSteward NYM transaction and the one
    # with the Epsilon NODE transaction). This checkpoint has a non-aligned
    # lower bound on the new node's replicas, so it will not be stabilized on them.
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint - 2 * max_batch_size)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

    # The master replica of the new node stops receiving 3PC messages
    new_node.master_replica.threePhaseRouter.extend(
        (
            (PrePrepare, lambda *x, **y: None),
            (Prepare, lambda *x, **y: None),
            (Commit, lambda *x, **y: None),
        )
    )
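    # With these no-op handlers the master replica silently drops incoming
    # PrePrepares, Prepares and Commits, so from here on it can only advance
    # through stashed checkpoints or a catch-up.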

    completed_catchups_before_reqs = get_number_of_completed_catchups(new_node)

    # Send requests for the new node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP *
                                           reqs_for_checkpoint)

    # Give time for the new node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1])

    # Verify that the new node has not caught up
    assert get_number_of_completed_catchups(new_node) == completed_catchups_before_reqs

    # Send more requests for the new node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

    # Verify that the new node has caught up
    assert get_number_of_completed_catchups(new_node) > completed_catchups_before_reqs
def test_6th_node_join_after_view_change_by_master_restart(
        looper, txnPoolNodeSet, tdir, tconf, allPluginsPath, sdk_pool_handle,
        sdk_wallet_steward, limitTestRunningTime):
    """
    Test steps:
    1. start a pool of 4 nodes
    2. force 4 view changes by restarting the primary node
    3. the primary node must now be Alpha; then add a new node named Epsilon
    4. ensure that Epsilon was added and its catch-up is done
    5. send some txns
    6. force 4 view changes; now the primary node is the newly added Epsilon
    7. add a 6th node and ensure that the new node has caught up
    """
    for __ in range(4):
        ensure_view_change(looper,
                           txnPoolNodeSet,
                           custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)

    ensureElectionsDone(looper,
                        txnPoolNodeSet,
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(txnPoolNodeSet))
    for node in txnPoolNodeSet:
        looper.run(eventually(catchuped, node, timeout=2 * timeout))
    ensure_all_nodes_have_same_data(looper,
                                    txnPoolNodeSet,
                                    custom_timeout=timeout)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    new_epsilon_node = add_new_node(looper,
                                    txnPoolNodeSet,
                                    sdk_pool_handle,
                                    sdk_wallet_steward,
                                    tdir,
                                    tconf,
                                    allPluginsPath,
                                    name='Epsilon')
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)
    """
    check that the pool and domain ledgers are in the synced state on all nodes,
    including the new one
    """
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(txnPoolNodeSet))
    for node in txnPoolNodeSet:
        looper.run(
            eventually(check_ledger_state,
                       node,
                       DOMAIN_LEDGER_ID,
                       LedgerState.synced,
                       retryWait=.5,
                       timeout=timeout))
        looper.run(
            eventually(check_ledger_state,
                       node,
                       POOL_LEDGER_ID,
                       LedgerState.synced,
                       retryWait=.5,
                       timeout=timeout))
    for __ in range(4):
        ensure_view_change(looper,
                           txnPoolNodeSet,
                           custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)

    ensureElectionsDone(looper,
                        txnPoolNodeSet,
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(txnPoolNodeSet))
    for node in txnPoolNodeSet:
        looper.run(eventually(catchuped, node, timeout=3 * timeout))
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2)
    new_psi_node = add_new_node(looper,
                                txnPoolNodeSet,
                                sdk_pool_handle,
                                sdk_wallet_steward,
                                tdir,
                                tconf,
                                allPluginsPath,
                                name='Psi')
    looper.run(
        eventually(check_ledger_state,
                   new_psi_node,
                   DOMAIN_LEDGER_ID,
                   LedgerState.synced,
                   retryWait=.5,
                   timeout=5))
    looper.run(
        eventually(check_ledger_state,
                   new_psi_node,
                   POOL_LEDGER_ID,
                   LedgerState.synced,
                   retryWait=.5,
                   timeout=5))
def test_node_requests_missing_preprepares_prepares_and_commits(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    1 of 4 nodes goes down. A new request comes in and is ordered by
    the 3 remaining nodes. After a while the previously disconnected node
    comes back alive. Another request comes in. Check that the previously
    disconnected node requests missing PREPREPARES, PREPARES and COMMITS,
    orders the previous request, and all the nodes successfully handle
    the last request.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_node = txnPoolNodeSet[3]
    alive_nodes = txnPoolNodeSet[:3]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            disconnected_node,
                                            stopNode=False)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, MISSING_REQS_CNT)

    looper.run(
        eventually(check_pp_out_of_sync,
                   alive_nodes, [disconnected_node],
                   retryWait=1,
                   timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet,
                                        disconnected_node)
    # Give time for the reconnected node to catch up if it is going to do it
    looper.runFor(
        waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
        waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    for node in alive_nodes:
        assert node.domainLedger.size == init_ledger_size + MISSING_REQS_CNT
    # Ensure that the reconnected node has not caught up though
    assert disconnected_node.domainLedger.size == init_ledger_size

    assert disconnected_node.master_replica.spylog.count(
        Replica._request_pre_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica._request_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica._request_commit) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_pre_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_commit) == 0
    doOrderTimesBefore = disconnected_node.master_replica.spylog.count(
        Replica.doOrder)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_node, *alive_nodes)

    assert disconnected_node.master_replica.spylog.count(
        Replica._request_pre_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica._request_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica._request_commit) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_pre_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_commit) > 0
    doOrderTimesAfter = disconnected_node.master_replica.spylog.count(
        Replica.doOrder)
    # Ensure that the reconnected node has ordered both the missed 3PC-batch and the new 3PC-batch
    assert doOrderTimesAfter - doOrderTimesBefore == 2

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size + MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)
def test_complete_short_checkpoint_not_included_in_lag_for_catchup(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    """
    Verifies that if the first stored own checkpoint has a non-aligned lower
    bound (meaning it was started right after a catch-up), is complete, and
    there is a quorumed stashed checkpoint from other replicas with the same
    upper bound, then this stashed checkpoint is not counted towards the lag
    that triggers a catch-up. In such a case the node performs a catch-up only
    after Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed
    checkpoints have been received.
    """
    max_batch_size = chkFreqPatched.Max3PCBatchSize

    _, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        'EpsilonSteward', 'Epsilon', tdir, tconf,
        allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup'])
    # Epsilon did not participate in ordering of the batch with EpsilonSteward
    # NYM transaction and the batch with Epsilon NODE transaction.
    # Epsilon got these transactions via catch-up.

    # To complete the first checkpoint, send requests for 1 checkpoint minus
    # 2 3PC-batches (since there are already 2 3PC-batches in the first
    # checkpoint: the one with the EpsilonSteward NYM transaction and the one
    # with the Epsilon NODE transaction). This checkpoint has a non-aligned
    # lower bound on the new node's replicas, so it will not be stabilized on them.
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint - 2 * max_batch_size)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup'])

    # The master replica of the new node stops receiving 3PC messages
    new_node.master_replica.threePhaseRouter.extend(
        (
            (PrePrepare, lambda *x, **y: None),
            (Prepare, lambda *x, **y: None),
            (Commit, lambda *x, **y: None),
        )
    )

    completed_catchups_before_reqs = get_number_of_completed_catchups(new_node)

    # Send requests for the new node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP *
                                           reqs_for_checkpoint)

    # Give time for the new node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup'])

    # Verify that the new node has not caught up
    assert get_number_of_completed_catchups(new_node) == completed_catchups_before_reqs

    # Send more requests for the new node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 2 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1], exclude_from_check=['check_last_ordered_3pc_backup'])

    # Verify that the new node has caught up
    assert get_number_of_completed_catchups(new_node) > completed_catchups_before_reqs
def test_slow_node_reverts_unordered_state_during_catchup(looper,
                                                          txnPoolNodeSet,
                                                          sdk_pool_handle,
                                                          sdk_wallet_client):
    """
    Delay COMMITs to a node such that when it needs to catch up, it has to
    revert some unordered state. By that time the node should also have
    received all COMMITs, so it will apply some of them (those for which it
    has not received txns via catchup).
    To achieve this, delay COMMITs for a long time and make the catchup cover
    a slightly older state than the one received in LedgerStatus. Once the
    catchup completes, reset the delays and try to process the delayed COMMITs:
    some will be rejected but some will be processed, since the catchup was
    done for an older ledger.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 25

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))
    # Delay LEDGER_STATUS on the slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS)
    # is sent and the node catches up 2 times.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from the slow node and
    # return a ConsistencyProof for all the missing txns, so no stashed ones are applied.
    slow_node.nodeIbStasher.delay(lsDelay(1000))

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                             delay_batches * Max3PCBatchSize)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)

    # `slow_node` is slow to receive CatchupRep, so that it
    # gets a chance to order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    # start view change (and hence catchup)
    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        lst_3pc = check_last_ordered_3pc(n1, n2)

    def chk1():
        # `slow_node` has prepared all 3PC messages which
        # `other_nodes` have ordered
        assertEquality(slow_master_replica.last_prepared_before_view_change, lst_3pc)

    looper.run(eventually(chk1, retryWait=1))

    old_pc_count = slow_master_replica.spylog.count(
        slow_master_replica.can_process_since_view_change_in_progress)

    assert len(slow_node.stashedOrderedReqs) == 0

    # Repair the network so COMMITs are received, processed and stashed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica.spylog.count(
            slow_master_replica.can_process_since_view_change_in_progress) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # COMMITs are stashed
        assert len(slow_node.stashedOrderedReqs) == delay_batches * Max3PCBatchSize

    looper.run(eventually(chk3, retryWait=1, timeout=15))

    # fix catchup, so the node gets a chance to be caught-up
    repair_node_catchup_less(other_nodes)

    def chk4():
        # Some COMMITs were ordered but stashed and they were processed
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert delay_batches in rv

    looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 5))

    def chk5():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(
            chk5,
            retryWait=1,
            timeout=waits.expectedPoolCatchupTime(
                len(txnPoolNodeSet))))

    # make sure that the pool is functional
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_3pc_while_catchup_with_chkpoints_only(
        tdir, tconf, looper, chkFreqPatched, reqs_for_checkpoint,
        testNodeClass, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        allPluginsPath):
    '''
    Check that catch-up is not started again even if a quorum of stashed checkpoints
    is received during catch-up.
    Assume that only checkpoints and no 3PC messages are received.
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    # delay all 3PC messages on the lagged node so that it
    # receives only Checkpoints and catch-up messages

    lagging_node.nodeIbStasher.delay(ppDelay())
    lagging_node.nodeIbStasher.delay(pDelay())
    lagging_node.nodeIbStasher.delay(cDelay())

    # delay CurrentState to avoid Primary Propagation (since it would lead to extra catch-ups not needed in this test).
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher, lsDelay(),
                         msg_rep_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # Wait until we have got ledger statuses for the messages missed while the node
            # was offline, so that the messages ordered from now on are not caught up but stashed.
            looper.run(
                eventually(lambda: assertExp(
                    lagging_node.nodeIbStasher.num_of_stashed(LedgerStatus) >=
                    3),
                           retryWait=1,
                           timeout=60))

            assert lagging_node.mode != Mode.participating

            # make sure that more requests are being ordered while catch-up is in progress
            # stash enough stable checkpoints for starting a catch-up
            num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
            num_reqs = reqs_for_checkpoint * num_checkpoints + 1
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, num_reqs)
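            # The expected last ordered 3PC key is (view_no=0, pp_seq_no=num_reqs + 2):
            # presumably num_reqs new batches (one batch per request under
            # chkFreqPatched) on top of the 2 batches ordered before the lagging
            # node was stopped.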
            looper.run(
                eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes,
                           (0, num_reqs + 2)))

            # all good nodes stabilized checkpoint
            looper.run(eventually(chkChkpoints, rest_nodes, 2, 0))

            assert lagging_node.mode != Mode.participating
            # lagging node is catching up and stashing all checkpoints
            looper.run(
                eventually(lambda: assertExp(
                    get_stashed_checkpoints(lagging_node) == num_checkpoints *
                    len(rest_nodes)),
                           timeout=waits.expectedPoolCatchupTime(
                               len(txnPoolNodeSet))))

        # check that last_ordered is set
        looper.run(
            eventually(check_last_ordered_3pc_on_master, [lagging_node],
                       (0, num_reqs + 2)))

        # check that the catch-up is finished
        looper.run(
            eventually(
                lambda: assertExp(lagging_node.mode == Mode.participating),
                retryWait=1,
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

        # check that catch-up was started only once
        assert lagging_node.spylog.count(
            Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
                initial_all_ledgers_caught_up + 1)))
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.spylog.count(Node.start_catchup) == 1)))

        # do not check the audit ledger since we didn't catch up the audit ledger while txns were being ordered during catch-up
        waitNodeDataEquality(looper,
                             *txnPoolNodeSet,
                             exclude_from_check='check_audit',
                             customTimeout=5)
def test_3pc_while_catchup_with_chkpoints(tdir, tconf,
                                          looper,
                                          chkFreqPatched,
                                          reqs_for_checkpoint,
                                          testNodeClass,
                                          txnPoolNodeSet,
                                          sdk_pool_handle,
                                          sdk_wallet_client,
                                          allPluginsPath):
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(lagging_node,
                                      looper,
                                      tconf,
                                      tdir,
                                      allPluginsPath,
                                      start=False,
                                      )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(Node.allLedgersCaughtUp)
    # delay CurrentState to avoid Primary Propagation (since it would lead to extra catch-ups not needed in this test).
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher, cr_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # Wait until we have got catchup replies for the messages missed while the node
            # was offline, so that the messages ordered from now on are not caught up but stashed.
            looper.run(
                eventually(lambda: assertExp(len(lagging_node.nodeIbStasher.delayeds) >= 3), retryWait=1,
                           timeout=60))

            assert lagging_node.mode == Mode.syncing

            # make sure that more requests are being ordered while catch-up is in progress
            # stash enough stable checkpoints for starting a catch-up
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client,
                                      reqs_for_checkpoint * (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1))

            assert lagging_node.mode == Mode.syncing

        # check that the catch-up is finished
        looper.run(
            eventually(
                lambda: assertExp(lagging_node.mode == Mode.participating), retryWait=1,
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))
            )
        )
        looper.run(
            eventually(
                lambda: assertExp(
                    lagging_node.spylog.count(Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1)
            )
        )
        # check that catch-up was started only once
        looper.run(
            eventually(
                lambda: assertExp(
                    lagging_node.spylog.count(Node.start_catchup) == 1)
            )
        )
        # check that the node was able to order requests stashed during catch-up
        waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def testCatchupDelayedNodes(txnPoolNodeSet,
                            sdk_node_set_with_node_added_after_some_txns,
                            sdk_wallet_steward, txnPoolCliNodeReg,
                            tdirWithPoolTxns, tconf, tdir, allPluginsPath):
    """
    A node sends catchup requests to other nodes only for the sequence numbers
    that those nodes actually have. Start with a pool of connected nodes that
    has processed some transactions, then have two more nodes, X and Y, join,
    where Y starts its catchup process after some time. The node starting late,
    i.e. Y, should not receive any catchup requests.
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    stewardXName = "testClientStewardX"
    nodeXName = "Zeta"
    stewardYName = "testClientStewardY"
    nodeYName = "Eta"
    stewardZName = "testClientStewardZ"
    nodeZName = "Theta"
    delayX = 45
    delayY = 2
    stewardX, nodeX = sdk_add_new_steward_and_node(
        looper,
        sdk_pool_handle,
        sdk_wallet_steward,
        stewardXName,
        nodeXName,
        tdir,
        tconf,
        autoStart=False,
        allPluginsPath=allPluginsPath)

    stewardY, nodeY = sdk_add_new_steward_and_node(
        looper,
        sdk_pool_handle,
        sdk_wallet_steward,
        stewardYName,
        nodeYName,
        tdir,
        tconf,
        autoStart=False,
        allPluginsPath=allPluginsPath)
    nodeX.nodeIbStasher.delay(cpDelay(delayX))
    nodeY.nodeIbStasher.delay(cpDelay(delayY))
    looper.add(nodeX)
    looper.add(nodeY)
    txnPoolNodeSet.append(nodeX)
    txnPoolNodeSet.append(nodeY)

    timeout = waits.expectedPoolCatchupTime(
        len(txnPoolNodeSet)) + delayX + delayY
    looper.run(checkNodesConnected(txnPoolNodeSet, customTimeout=timeout))
    logger.debug("Stopping 2 newest nodes, {} and {}".format(
        nodeX.name, nodeY.name))
    nodeX.stop()
    nodeY.stop()
    logger.debug("Sending requests")
    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 50)
    logger.debug("Starting the 2 stopped nodes, {} and {}".format(
        nodeX.name, nodeY.name))
    nodeX.start(looper.loop)
    nodeY.start(looper.loop)
    waitNodeDataEquality(looper, nodeX, *txnPoolNodeSet[:5])
    waitNodeDataEquality(looper, nodeY, *txnPoolNodeSet[:5])
def test_old_view_pre_prepare_reply_processing(looper, txnPoolNodeSet, tconf,
                                               allPluginsPath, sdk_pool_handle,
                                               sdk_wallet_steward,
                                               monkeypatch):
    """
    Test steps:
    Delay PrePrepares on `slow_node` (without processing)
    Delay receiving of OldViewPrePrepareRequest on all nodes but `malicious_node`
    Patch sending for OldViewPrePrepareReply on the `malicious_node` to send an invalid PrePrepare
    Start a view change
    Make sure it's finished on all nodes excluding `slow_node`
    Make sure that the lagging node received OldViewPrePrepareReply from the malicious node
    Reset the delay for OldViewPrePrepareRequest on the other nodes
    Make sure the pool is functional and all nodes have the same data
    """
    start_view_no = txnPoolNodeSet[0].viewNo
    slow_node = txnPoolNodeSet[-2]
    malicious_node = txnPoolNodeSet[-1]
    other_nodes = [
        n for n in txnPoolNodeSet if n not in [slow_node, malicious_node]
    ]
    ensureElectionsDone(looper,
                        txnPoolNodeSet,
                        customTimeout=tconf.NEW_VIEW_TIMEOUT)
    timeout = waits.expectedPoolCatchupTime(nodeCount=len(txnPoolNodeSet))
    ensure_all_nodes_have_same_data(looper,
                                    txnPoolNodeSet,
                                    custom_timeout=timeout)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 1)

    with delay_rules_without_processing(
            slow_node.nodeIbStasher, ppDelay(),
            msg_rep_delay(types_to_delay=[PREPREPARE])):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 1)
    with delay_rules([n.nodeIbStasher for n in other_nodes],
                     old_view_pp_request_delay()):
        old_sender = malicious_node.master_replica._ordering_service._send

        def patched_sender(msg, dst=None, stat=None):
            if isinstance(msg, OldViewPrePrepareReply) and msg.preprepares:
                pp_dict = msg.preprepares[0]._asdict()
                pp_dict["digest"] = "incorrect_digest"
                pp = PrePrepare(**pp_dict)
                msg.preprepares[0] = pp
                monkeypatch.undo()
            old_sender(msg, dst, stat)

        monkeypatch.setattr(malicious_node.master_replica._ordering_service,
                            '_send', patched_sender)
        monkeypatch.setattr(slow_node.master_replica._ordering_service,
                            '_validate_applied_pre_prepare',
                            lambda a, b, c: None)
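        # Validation of applied PrePrepares is disabled on the slow node here,
        # presumably so that the deliberately corrupted PrePrepare sent by the
        # malicious node is not rejected before process_old_view_preprepare_reply
        # gets counted below.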
        process_old_pp_num = slow_node.master_replica._ordering_service.spylog.count(
            OrderingService.process_old_view_preprepare_reply)

        for n in txnPoolNodeSet:
            n.view_changer.on_master_degradation()

        waitForViewChange(looper,
                          other_nodes + [malicious_node],
                          expectedViewNo=start_view_no + 1)

        ensureElectionsDone(looper=looper,
                            nodes=other_nodes + [malicious_node],
                            instances_list=[0, 1, 2])
        ensure_all_nodes_have_same_data(looper,
                                        nodes=other_nodes + [malicious_node])

        def chk():
            assert process_old_pp_num + 1 == slow_node.master_replica._ordering_service.spylog.count(
                OrderingService.process_old_view_preprepare_reply)

        looper.run(eventually(chk))

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)
def testCatchupDelayedNodes(txnPoolNodeSet,
                            sdk_node_set_with_node_added_after_some_txns,
                            sdk_wallet_steward,
                            txnPoolCliNodeReg, tdirWithPoolTxns,
                            tconf, tdir,
                            allPluginsPath):
    """
    A node sends catchup requests to other nodes only for the sequence numbers
    that those nodes actually have. Start with a pool of connected nodes that
    has processed some transactions, then have two more nodes, X and Y, join,
    where Y starts its catchup process after some time. The node starting late,
    i.e. Y, should not receive any catchup requests.
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    stewardXName = "testClientStewardX"
    nodeXName = "Zeta"
    stewardYName = "testClientStewardY"
    nodeYName = "Eta"
    stewardZName = "testClientStewardZ"
    nodeZName = "Theta"
    delayX = 45
    delayY = 2
    stewardX, nodeX = sdk_add_new_steward_and_node(looper,
                                                   sdk_pool_handle,
                                                   sdk_wallet_steward,
                                                   stewardXName,
                                                   nodeXName,
                                                   tdir,
                                                   tconf,
                                                   autoStart=False,
                                                   allPluginsPath=allPluginsPath)

    stewardY, nodeY = sdk_add_new_steward_and_node(looper,
                                                   sdk_pool_handle,
                                                   sdk_wallet_steward,
                                                   stewardYName,
                                                   nodeYName,
                                                   tdir,
                                                   tconf,
                                                   autoStart=False,
                                                   allPluginsPath=allPluginsPath)
    nodeX.nodeIbStasher.delay(cpDelay(delayX))
    nodeY.nodeIbStasher.delay(cpDelay(delayY))
    looper.add(nodeX)
    looper.add(nodeY)
    txnPoolNodeSet.append(nodeX)
    txnPoolNodeSet.append(nodeY)

    timeout = waits.expectedPoolCatchupTime(
        len(txnPoolNodeSet)) + delayX + delayY
    looper.run(checkNodesConnected(txnPoolNodeSet, customTimeout=timeout))
    logger.debug("Stopping 2 newest nodes, {} and {}".format(nodeX.name,
                                                             nodeY.name))
    nodeX.stop()
    nodeY.stop()
    logger.debug("Sending requests")
    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 50)
    logger.debug("Starting the 2 stopped nodes, {} and {}".format(nodeX.name,
                                                                  nodeY.name))
    nodeX.start(looper.loop)
    nodeY.start(looper.loop)
    waitNodeDataEquality(looper, nodeX, *txnPoolNodeSet[:5])
    waitNodeDataEquality(looper, nodeY, *txnPoolNodeSet[:5])
def test_slow_node_reverts_unordered_state_during_catchup(
        looper, txnPoolNodeSet, client1, wallet1, client1Connected):
    """
    Delay COMMITs to a node such that when it needs to catch up, it has to
    revert some unordered state. By that time the node should also have
    received all COMMITs, so it will apply some of them (those for which it
    has not received txns via catchup).
    To achieve this, delay COMMITs for a long time and make the catchup cover
    a slightly older state than the one received in LedgerStatus. Once the
    catchup completes, reset the delays and try to process the delayed COMMITs:
    some will be rejected but some will be processed, since the catchup was
    done for an older ledger.
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 15

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_twice(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                              delay_batches * Max3PCBatchSize)

    def is_catchup_needed_count():
        return len(
            getAllReturnVals(slow_node,
                             slow_node.is_catchup_needed,
                             compare_val_to=True))

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)
    old_cn_count = is_catchup_needed_count()

    # `slow_node` is slow to receive CatchupRep, so that it
    # gets a chance to order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        lst_3pc = check_last_ordered_3pc(n1, n2)

    def chk1():
        # `slow_node` has prepared all 3PC messages which
        # `other_nodes` have ordered
        assertEquality(slow_master_replica.last_prepared_before_view_change,
                       lst_3pc)

    looper.run(eventually(chk1, retryWait=1))

    old_pc_count = slow_master_replica.spylog.count(
        slow_master_replica.can_process_since_view_change_in_progress)

    # Repair the network so COMMITs are received and processed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica.spylog.count(
            slow_master_replica.can_process_since_view_change_in_progress
        ) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # Some COMMITs were ordered but stashed and they were processed
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert delay_batches in rv

    looper.run(eventually(chk3, retryWait=1, timeout=catchup_rep_delay + 5))

    def chk4():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(chk4,
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    def chk5():
        # Once catchup was done, need of other catchup was not found
        assertEquality(is_catchup_needed_count(), old_cn_count)

    looper.run(eventually(chk5, retryWait=1, timeout=5))

    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_limited_stash_3pc_while_catchup(tdir, tconf, looper, testNodeClass,
                                         txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, allPluginsPath,
                                         chkFreqPatched):
    '''
    Test that the lagging_node can process messages from the catchup stash after
    catchup and request lost messages from other nodes.
    '''

    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Order 2 checkpoints on rest_nodes (2 txns in 2 batches)
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * CHK_FREQ, 2)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

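    # cs_delay() holds back CurrentState (as in the other tests, to avoid extra
    # catch-ups caused by primary propagation), while msg_rep_delay for
    # PREPARE/PREPREPARE presumably keeps the lagging node from re-requesting the
    # delayed 3PC messages via MessageRep while it is catching up.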
    with delay_rules(lagging_node.nodeIbStasher, cs_delay(),
                     msg_rep_delay(types_to_delay=[PREPARE, PREPREPARE])):
        with delay_rules(lagging_node.nodeIbStasher,
                         cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # Order 2 checkpoints in the first lagging node catchup (2 txns in 2 batches)
            sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client,
                                                 2 * CHK_FREQ, 2)

        # Check that the first txn was ordered from the stash after the first catchup
        looper.run(
            eventually(lambda: assertExp(lagging_node.master_last_ordered_3PC[
                1] == txnPoolNodeSet[0].master_last_ordered_3PC[1] - 1),
                       retryWait=1,
                       timeout=waits.expectedPoolCatchupTime(
                           len(txnPoolNodeSet))))

        # Order 2 checkpoints in the second lagging node catchup (2 txns in 2 batches)
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client, 2 * CHK_FREQ,
                                             2)

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    looper.run(
        eventually(lambda: assertExp(
            all(lagging_node.master_last_ordered_3PC == n.master_last_ordered_3PC
                for n in txnPoolNodeSet))))

    # check that catch-up was started only twice
    assert lagging_node.spylog.count(
        Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 2
def test_node_requests_missing_preprepares_prepares_and_commits(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    1 of 4 nodes goes down. A new request comes in and is ordered by
    the 3 remaining nodes. After a while the previously disconnected node
    comes back alive. Another request comes in. Check that the previously
    disconnected node requests missing PREPREPARES, PREPARES and COMMITS,
    orders the previous request, and all the nodes successfully handle
    the last request.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_node = txnPoolNodeSet[3]
    alive_nodes = txnPoolNodeSet[:3]

    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            disconnected_node, stopNode=False)

    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              MISSING_REQS_CNT)

    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes,
                          [disconnected_node],
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, disconnected_node)
    # Give time for the reconnected node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    for node in alive_nodes:
        assert node.domainLedger.size == init_ledger_size + MISSING_REQS_CNT
    # Ensure that the reconnected node has not caught up though
    assert disconnected_node.domainLedger.size == init_ledger_size

    assert disconnected_node.master_replica.spylog.count(Replica._request_pre_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(Replica._request_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(Replica._request_commit) == 0
    assert disconnected_node.master_replica.spylog.count(Replica.process_requested_pre_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(Replica.process_requested_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(Replica.process_requested_commit) == 0
    doOrderTimesBefore = disconnected_node.master_replica.spylog.count(Replica.doOrder)

    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_node, *alive_nodes)

    assert disconnected_node.master_replica.spylog.count(Replica._request_pre_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(Replica._request_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(Replica._request_commit) > 0
    assert disconnected_node.master_replica.spylog.count(Replica.process_requested_pre_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(Replica.process_requested_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(Replica.process_requested_commit) > 0
    doOrderTimesAfter = disconnected_node.master_replica.spylog.count(Replica.doOrder)
    # Ensure that the reconnected node has ordered both the missed 3PC-batch and the new 3PC-batch
    assert doOrderTimesAfter - doOrderTimesBefore == 2

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)