def test_watermarks_after_view_change(tdir, tconf,
                                      looper,
                                      txnPoolNodeSet,
                                      sdk_pool_handle,
                                      sdk_wallet_client):
    """
    Delay commit, checkpoint, InstanceChange and ViewChangeDone messages for lagging_node.
    Start ViewChange.
    Check that ViewChange finished.
    Reset delays.
    Check that lagging_node can order transactions and has the same data as the other nodes.
    """
    lagging_node = txnPoolNodeSet[-1]
    lagging_node.master_replica.config.LOG_SIZE = LOG_SIZE
    start_view_no = lagging_node.viewNo
    with delay_rules(lagging_node.nodeIbStasher, cDelay(), chk_delay(), icDelay(), nv_delay()):
        trigger_view_change(txnPoolNodeSet)
        waitForViewChange(looper,
                          txnPoolNodeSet[:-1],
                          expectedViewNo=start_view_no + 1,
                          customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(txnPoolNodeSet)))
        ensure_all_nodes_have_same_data(looper, txnPoolNodeSet[:-1])
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, 6)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
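
# The trigger_view_change helper used above is not shown in this listing. A minimal,
# hypothetical sketch consistent with the other examples here (which send
# NodeNeedViewChange on each node's internal bus, see check_view_change_one_slow_node
# further down) could look like this; the real helper may differ.
def trigger_view_change_sketch(nodes):
    for node in nodes:
        # ask the master replica to move to the next view
        node.master_replica.internal_bus.send(NodeNeedViewChange(node.viewNo + 1))
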
def testSuspendNode(looper, sdk_pool_handle, sdk_wallet_trustee, nodeSet, tdir,
                    tconf, allPluginsPath):
    """
    Suspend a node and then cancel the suspension. Suspend while suspended
    to test that there is no error.
    """
    start_view_no = nodeSet[0].viewNo
    new_steward_wallet, new_node = sdk_node_theta_added(
        looper,
        nodeSet,
        tdir,
        tconf,
        sdk_pool_handle,
        sdk_wallet_trustee,
        allPluginsPath,
        node_config_helper_class=NodeConfigHelper,
        testNodeClass=TestNode,
        name="Node-" + randomString(5))
    waitForViewChange(looper=looper,
                      txnPoolNodeSet=nodeSet,
                      expectedViewNo=start_view_no + 1)
    ensureElectionsDone(looper=looper, nodes=nodeSet)

    demote_node(looper, sdk_wallet_trustee, sdk_pool_handle, new_node)
    _wait_view_change_finish(looper, nodeSet[:-1], start_view_no + 1)

    demote_node(looper, sdk_wallet_trustee, sdk_pool_handle, new_node)

    promote_node(looper, sdk_wallet_trustee, sdk_pool_handle, new_node)
    _wait_view_change_finish(looper, nodeSet[:-1], start_view_no + 2)

    promote_node(looper, sdk_wallet_trustee, sdk_pool_handle, new_node)
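
# Hypothetical sketch of the _wait_view_change_finish helper used above, assuming it
# simply bundles the three checks applied after demotions and promotions elsewhere in
# these examples (wait for the view, finish elections, compare node data).
def _wait_view_change_finish_sketch(looper, nodes, expected_view_no):
    waitForViewChange(looper, nodes, expectedViewNo=expected_view_no)
    ensureElectionsDone(looper=looper, nodes=nodes)
    ensure_all_nodes_have_same_data(looper, nodes)
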
def testDoNotSendInstChngMsgIfMasterDoesntSeePerformanceProblem(
        txnPoolNodeSet, looper, ensureView):
    """
    A node that received an INSTANCE_CHANGE message must not send an
    INSTANCE_CHANGE message if it doesn't observe too much difference in
    performance between its replicas.
    """

    curViewNo = ensureView

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = ViewChanger.sendInstanceChange.__name__
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = n.view_changer.spylog.count(
            instChngMethodName)

    # Send an instance change message to all nodes
    icMsg = txnPoolNodeSet[0].view_changer._create_instance_change_msg(
        curViewNo, 0)
    txnPoolNodeSet[0].send(icMsg)

    # Check that the message is discarded.
    waitForViewChange(looper, txnPoolNodeSet)
    # No node should have sent a view change and thus must not have called
    # `sendInstanceChange`
    for n in txnPoolNodeSet:
        assert n.view_changer.spylog.count(instChngMethodName) == \
               sentInstChanges.get(n.name, 0)
def test_no_view_change_until_synced(txnPoolNodeSet, looper, mode):
    # emulate catchup by setting non-synced status
    for node in txnPoolNodeSet:
        node.mode = mode

    check_instance_change_count(txnPoolNodeSet, 0)

    # start View Change
    old_view_no = checkViewNoForNodes(txnPoolNodeSet)
    old_meths = do_view_change(txnPoolNodeSet)
    for node in txnPoolNodeSet:
        node.view_changer.sendInstanceChange(old_view_no + 1)

    # make sure View Change is not started
    check_no_view_change(looper, txnPoolNodeSet)
    assert old_view_no == checkViewNoForNodes(txnPoolNodeSet)

    # emulate finishing of catchup by setting Participating status
    revert_do_view_change(txnPoolNodeSet, old_meths)
    for node in txnPoolNodeSet:
        node.mode = Mode.participating

    # make sure that View Change happened
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=old_view_no + 1)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
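
# Hypothetical sketch of the check_no_view_change helper used above, assuming it just
# asserts that waitForViewChange times out within a short window, the same pattern
# test_disable_view_change uses further down in this listing.
def check_no_view_change_sketch(looper, nodes, timeout=10):
    current_view_no = checkViewNoForNodes(nodes)
    with pytest.raises(AssertionError):
        waitForViewChange(looper, nodes,
                          expectedViewNo=current_view_no + 1,
                          customTimeout=timeout)
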
def test_replica_removing_before_vc_with_primary_disconnected(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, tconf,
        tdir, allPluginsPath, chkFreqPatched, view_change):
    """
    1. Remove a replica
    2. Reconnect the master primary
    3. Check that nodes and replicas are correctly added
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    node = txnPoolNodeSet[0]
    start_replicas_count = node.replicas.num_replicas
    instance_id = start_replicas_count - 1
    node.replicas.remove_replica(instance_id)
    _check_replica_removed(node, start_replicas_count, instance_id)
    assert not node.monitor.isMasterDegraded()
    assert len(node.requests) == 0
    # trigger view change on all nodes
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node)
    txnPoolNodeSet.remove(node)
    looper.removeProdable(node)
    node = start_stopped_node(node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitForViewChange(looper,
                      txnPoolNodeSet,
                      expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    assert start_replicas_count == node.replicas.num_replicas
def testDiscardInstChngMsgFrmPastView(txnPoolNodeSet, looper, ensureView):
    """
    Once a view change is done, any further INSTANCE_CHANGE messages for that
    view must be discarded by the node.
    """

    curViewNo = ensureView

    # Send an instance change for an old instance message to all nodes
    icMsg = txnPoolNodeSet[0].view_changer._create_instance_change_msg(
        curViewNo, 0)
    txnPoolNodeSet[0].send(icMsg)

    # ensure every node but Alpha discards the invalid instance change request
    timeout = waits.expectedPoolViewChangeStartedTimeout(len(txnPoolNodeSet))

    # Check that the message is discarded.
    looper.run(
        eventually(checkDiscardMsg,
                   txnPoolNodeSet,
                   icMsg,
                   'which is not more than its view no',
                   txnPoolNodeSet[0],
                   timeout=timeout))

    waitForViewChange(looper, txnPoolNodeSet)
def test_nodes_make_view_change_only_on_master_suspicious(
        looper, txnPoolNodeSet):
    old_view = txnPoolNodeSet[0].viewNo

    master_primary = txnPoolNodeSet[0].replicas[0]
    backup_primary = txnPoolNodeSet[1].replicas[1]
    assert master_primary.isPrimary is True
    assert backup_primary.isPrimary is True

    master_primary._ordering_service.replica_batch_digest = lambda reqs: 'asd'
    backup_primary._ordering_service.replica_batch_digest = lambda reqs: 'asd'

    non_primary_backup = txnPoolNodeSet[0].replicas[1]
    old_pp = non_primary_backup.spylog.count(non_primary_backup._ordering_service.process_preprepare)

    def pp_processed(replica, old_pp):
        assert replica._ordering_service.spylog.count(replica._ordering_service.process_preprepare) == old_pp + 1

    backup_primary._ordering_service._do_send_3pc_batch(DOMAIN_LEDGER_ID)
    looper.run(eventually(pp_processed, non_primary_backup, old_pp))
    assert all(node.view_changer.spylog.count(ViewChanger.sendInstanceChange) == 0
               for node in txnPoolNodeSet)
    waitForViewChange(looper, txnPoolNodeSet, old_view)

    non_primary_master = txnPoolNodeSet[1].replicas[0]
    old_pp = non_primary_master.spylog.count(non_primary_master._ordering_service.process_preprepare)

    master_primary._ordering_service._do_send_3pc_batch(DOMAIN_LEDGER_ID)
    looper.run(eventually(pp_processed, non_primary_master, old_pp))
    waitForViewChange(looper, txnPoolNodeSet, old_view + 1)
def test_removed_replica_restored_on_view_change(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        tconf, tdir, allPluginsPath, chkFreqPatched, view_change):
    """
    1. Remove a replica on some node which is not the master primary
    2. Reconnect the node which was the master primary so far
    3. Check that nodes and replicas are correctly added
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    node = get_last_master_non_primary_node(txnPoolNodeSet)
    start_replicas_count = node.replicas.num_replicas
    instance_id = start_replicas_count - 1

    node.replicas.remove_replica(instance_id)
    check_replica_removed(node, start_replicas_count, instance_id)

    # trigger view change on all nodes
    master_primary = get_master_primary_node(txnPoolNodeSet)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, master_primary)
    txnPoolNodeSet.remove(master_primary)
    looper.removeProdable(master_primary)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    restarted_node = start_stopped_node(master_primary, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(restarted_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.NEW_VIEW_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    assert start_replicas_count == node.replicas.num_replicas
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1,
                                               client1, client1Connected,
                                               tconf):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    viewNoBefore = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    stopNodes([old_pr_node], looper)
    looper.removeProdable(old_pr_node)
    remainingNodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remainingNodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that the view has
    # changed and a new primary has been elected
    waitForViewChange(looper, remainingNodes, viewNoBefore + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remainingNodes)
    new_pr_node = get_master_primary_node(remainingNodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
def test_no_view_change_until_synced(txnPoolNodeSet, looper, mode):
    # emulate catchup by setting non-synced status
    for node in txnPoolNodeSet:
        node.mode = mode

    check_stashed_instance_changes(txnPoolNodeSet, 0)

    # start View Change
    old_view_no = checkViewNoForNodes(txnPoolNodeSet)
    old_meths = do_view_change(txnPoolNodeSet)
    for node in txnPoolNodeSet:
        vct_service = node.master_replica._view_change_trigger_service
        vct_service._send_instance_change(old_view_no + 1, Suspicions.PRIMARY_DEGRADED)

    # make sure View Change is not started
    check_no_view_change(looper, txnPoolNodeSet)
    assert old_view_no == checkViewNoForNodes(txnPoolNodeSet)

    # emulate finishing of catchup by setting Participating status
    revert_do_view_change(txnPoolNodeSet, old_meths)
    for node in txnPoolNodeSet:
        node.mode = Mode.participating
        node.master_replica.stasher.process_all_stashed(STASH_CATCH_UP)

    # make sure that View Change happened
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=old_view_no + 1)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_removed_replica_restored_on_view_change(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        tconf, tdir, allPluginsPath, chkFreqPatched, view_change):
    """
    1. Remove a replica on some node which is not the master primary
    2. Reconnect the node which was the master primary so far
    3. Check that nodes and replicas are correctly added
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    node = get_last_master_non_primary_node(txnPoolNodeSet)
    start_replicas_count = node.replicas.num_replicas
    instance_id = start_replicas_count - 1

    node.replicas.remove_replica(instance_id)
    check_replica_removed(node, start_replicas_count, instance_id)

    # trigger view change on all nodes
    master_primary = get_master_primary_node(txnPoolNodeSet)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, master_primary)
    txnPoolNodeSet.remove(master_primary)
    looper.removeProdable(master_primary)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    restarted_node = start_stopped_node(master_primary, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(restarted_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    assert start_replicas_count == node.replicas.num_replicas
def test_demote_promote_restart_after_promotion_7_nodes(txnPoolNodeSet,
                                                        looper,
                                                        sdk_pool_handle,
                                                        sdk_wallet_steward,
                                                        tdir,
                                                        tconf,
                                                        allPluginsPath):
    demoted_node = txnPoolNodeSet[-1]
    rest_nodes = [n for n in txnPoolNodeSet if n != demoted_node]

    starting_view_no = checkViewNoForNodes(txnPoolNodeSet)

    demote_node(looper, sdk_wallet_steward, sdk_pool_handle, demoted_node)

    waitForViewChange(looper, rest_nodes, expectedViewNo=starting_view_no + 1)
    ensureElectionsDone(looper, rest_nodes)
    ensure_all_nodes_have_same_data(looper, rest_nodes)

    sdk_send_random_and_check(looper, rest_nodes, sdk_pool_handle, sdk_wallet_steward, 5)

    starting_view_no = checkViewNoForNodes(rest_nodes)
    promote_node(looper, sdk_wallet_steward, sdk_pool_handle, demoted_node)

    waitForViewChange(looper, rest_nodes, expectedViewNo=starting_view_no + 1)
    ensureElectionsDone(looper, rest_nodes, instances_list=[0, 1, 2])
    ensure_all_nodes_have_same_data(looper, rest_nodes)

    restart_node(looper, txnPoolNodeSet, demoted_node, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, txnPoolNodeSet)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)
def test_no_propagated_future_view_change_until_synced(txnPoolNodeSet, looper, mode):
    # the last node is a lagging one, which will receive ViewChangeDone messages for future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node_index = (viewNo + 3) % len(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[lagged_node_index]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate catchup by setting non-synced status
    lagged_node.mode = mode
    old_view_no = checkViewNoForNodes([lagged_node])

    check_future_vcd_count(lagged_node, 0)

    # delay INSTANCE CHANGE on the lagging node, so all nodes except the lagging one finish View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes, instances=range(2))
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        check_no_view_change(looper, lagged_node)
        assert old_view_no == checkViewNoForNodes([lagged_node])

        # emulate finishing of catchup by setting Participating status
        lagged_node.mode = Mode.participating

        # make sure that View Change happened on lagging node
        waitForViewChange(looper, [lagged_node], expectedViewNo=old_view_no + 1,
                          customTimeout=10)
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def testDoNotSendInstChngMsgIfMasterDoesntSeePerformanceProblem(
        txnPoolNodeSet, looper, ensureView):
    """
    A node that received an INSTANCE_CHANGE message must not send an
    INSTANCE_CHANGE message if it doesn't observe too much difference in
    performance between its replicas.
    """

    curViewNo = ensureView

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = ViewChanger.sendInstanceChange.__name__
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = n.view_changer.spylog.count(instChngMethodName)

    # Send an instance change message to all nodes
    icMsg = txnPoolNodeSet[0].view_changer._create_instance_change_msg(curViewNo, 0)
    txnPoolNodeSet[0].send(icMsg)

    # Check that the message is discarded.
    waitForViewChange(looper, txnPoolNodeSet)
    # No node should have sent a view change and thus must not have called
    # `sendInstanceChange`
    for n in txnPoolNodeSet:
        assert n.view_changer.spylog.count(instChngMethodName) == \
               sentInstChanges.get(n.name, 0)
def test_no_propagated_future_view_change_until_synced(txnPoolNodeSet, looper,
                                                       mode):
    # the last node is a lagging one, which will receive ViewChangeDone messages for future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node_index = (viewNo + 3) % len(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[lagged_node_index]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate catchup by setting non-synced status
    lagged_node.mode = mode
    old_view_no = checkViewNoForNodes([lagged_node])

    check_future_vcd_count(lagged_node, 0)

    # delay INSTANCE CHANGE on the lagging node, so all nodes except the lagging one finish View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper,
                                   nodes=other_nodes,
                                   numInstances=2)
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        check_no_view_change(looper, lagged_node)
        assert old_view_no == checkViewNoForNodes([lagged_node])

        # emulate finishing of catchup by setting Participating status
        lagged_node.mode = Mode.participating

        # make sure that View Change happened on lagging node
        waitForViewChange(looper, [lagged_node],
                          expectedViewNo=old_view_no + 1,
                          customTimeout=10)
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_recover_stop_primaries(looper, checkpoint_size, txnPoolNodeSet,
                                allPluginsPath, tdir, tconf, client1, wallet1,
                                client1Connected):
    """
    Test that we can recover after having more than f nodes disconnected:
    - stop current master primary (Alpha)
    - send txns
    - restart current master primary (Beta)
    - send txns
    """

    active_nodes = list(txnPoolNodeSet)
    assert 4 == len(active_nodes)
    initial_view_no = active_nodes[0].viewNo

    logger.info("Stop first node (current Primary)")
    _, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Make sure view changed")
    expected_view_no = initial_view_no + 1
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensureElectionsDone(looper=looper, nodes=active_nodes, numInstances=2)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("send at least one checkpoint")
    assert nodes_do_not_have_checkpoints(*active_nodes)
    sendReqsToNodesAndVerifySuffReplies(looper,
                                        wallet1,
                                        client1,
                                        numReqs=2 * checkpoint_size)
    assert nodes_have_checkpoints(*active_nodes)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info(
        "Stop second node (current Primary) so the primary looses his state")
    stopped_node, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Restart the primary node")
    restarted_node = start_stopped_node(stopped_node, looper, tconf, tdir,
                                        allPluginsPath)
    assert nodes_do_not_have_checkpoints(restarted_node)
    assert nodes_have_checkpoints(*active_nodes)
    active_nodes = active_nodes + [restarted_node]

    logger.info("Check that primary selected")
    ensureElectionsDone(looper=looper,
                        nodes=active_nodes,
                        numInstances=2,
                        customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Check if the pool is able to process requests")
    sendReqsToNodesAndVerifySuffReplies(looper,
                                        wallet1,
                                        client1,
                                        numReqs=10 * checkpoint_size)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)
    assert nodes_have_checkpoints(*active_nodes)
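
# Hypothetical sketch of the stop_primary helper used above: stop the current master
# primary and return it together with the remaining nodes, reusing the disconnect
# pattern from the other examples in this listing.
def stop_primary_sketch(looper, active_nodes):
    stopped_node = get_master_primary_node(active_nodes)
    disconnect_node_and_ensure_disconnected(looper, active_nodes,
                                            stopped_node, stopNode=True)
    looper.removeProdable(stopped_node)
    remaining_nodes = [n for n in active_nodes if n != stopped_node]
    return stopped_node, remaining_nodes
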
def testInstChangeWithMoreReqLat(looper, setup):
    nodes = setup.nodes
    for node in nodes:
        node.checkPerformance()
        assert any(getAllReturnVals(node.monitor,
                                    node.monitor.isMasterReqLatencyTooHigh))

    waitForViewChange(looper, nodes)
def test_selection_f_plus_one_quorum(looper, txnPoolNodeSet, allPluginsPath,
                                     tdir, tconf, sdk_pool_handle,
                                     sdk_wallet_client):
    """
    Check that quorum f + 1 is used for primary selection
    when initiated by CurrentState messages.

    Assumes that the view change quorum is n - f.
    Assumes that primaries are selected in a round-robin fashion.
    """

    # Ensure that we have 4 nodes in total
    all_nodes = list(txnPoolNodeSet)
    assert 4 == len(all_nodes)
    alpha, beta, delta, gamma = all_nodes
    initial_view_no = alpha.viewNo

    # Make one node lagging by switching it off for some time
    lagging_node = gamma
    non_lagging_nodes = [alpha, beta, delta]
    disconnect_node_and_ensure_disconnected(looper,
                                            all_nodes,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Make nodes to perform view change
    ensure_view_change(looper, non_lagging_nodes)
    ensureElectionsDone(looper=looper,
                        nodes=non_lagging_nodes,
                        instances_list=range(2))
    ensure_all_nodes_have_same_data(looper, nodes=non_lagging_nodes)

    # Stop two more of the active nodes
    # (but not the primary, which is Beta because of round-robin selection)
    stopped_nodes = [alpha]  # TODO: add one more here
    for stopped_node in stopped_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                txnPoolNodeSet,
                                                stopped_node,
                                                stopNode=True)
        looper.removeProdable(stopped_node)

    # Start lagging node back
    restarted_node = start_stopped_node(lagging_node, looper, tconf, tdir,
                                        allPluginsPath)
    active_nodes = [beta, delta, restarted_node]

    # Check that primary selected
    expected_view_no = initial_view_no + 1
    ensureElectionsDone(looper=looper,
                        nodes=active_nodes,
                        instances_list=range(2),
                        customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client, tdir,
                                                     tconf, allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            old_pr_node,
                                            stopNode=True)
    looper.removeProdable(old_pr_node)

    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that the view has
    # changed and a new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir,
                                     allPluginsPath)

    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(
        eventually(checkViewNoForNodes,
                   txnPoolNodeSet,
                   old_view_no + 1,
                   timeout=tconf.VIEW_CHANGE_TIMEOUT))

    # After the node catches up it sets view_no from the audit ledger and does not need to do a view change
    assert len(
        getAllReturnVals(old_pr_node.view_changer,
                         old_pr_node.view_changer.start_view_change,
                         compare_val_to=True)) == 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert not old_pr_node.view_changer._next_view_indications
def do_view_change_with_propagate_primary_on_one_delayed_node(
        slow_node, nodes, looper, sdk_pool_handle, sdk_wallet_client):

    slow_stasher = slow_node.nodeIbStasher

    fast_nodes = [n for n in nodes if n != slow_node]

    stashers = [n.nodeIbStasher for n in nodes]

    # Get last prepared certificate in pool
    lpc = last_prepared_certificate(nodes)
    # Get pool current view no
    view_no = lpc[0]

    with delay_rules(slow_stasher, icDelay()):
        with delay_rules(slow_stasher, vcd_delay()):
            with delay_rules(stashers, cDelay()):
                # Send request
                request = sdk_send_random_request(looper, sdk_pool_handle, sdk_wallet_client)

                # Wait until this request is prepared on N-f nodes
                looper.run(eventually(check_last_prepared_certificate_on_quorum, nodes, (lpc[0], lpc[1] + 1)))

                # Trigger view change
                for n in nodes:
                    n.view_changer.on_master_degradation()

                # Wait until view change is completed on all nodes except slow one
                waitForViewChange(looper,
                                  fast_nodes,
                                  expectedViewNo=view_no + 1,
                                  customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(nodes)))
                wait_for_elections_done_on_given_nodes(looper,
                                                       fast_nodes,
                                                       getRequiredInstances(len(nodes)),
                                                       timeout=waits.expectedPoolElectionTimeout(len(nodes)))

            # Now all the nodes receive Commits
            # The slow node will accept Commits and order the 3PC-batch in the old view
            looper.runFor(waits.expectedOrderingTime(getNoInstances(len(nodes))))

        # Now slow node receives ViewChangeDones
        waitForViewChange(looper,
                          [slow_node],
                          expectedViewNo=view_no + 1,
                          customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(nodes)))
        wait_for_elections_done_on_given_nodes(looper,
                                               [slow_node],
                                               getRequiredInstances(len(nodes)),
                                               timeout=waits.expectedPoolElectionTimeout(len(nodes)))

    # Now the slow node receives InstanceChanges but discards them because it has
    # already started propagating the primary for the same view.

    # Finish request gracefully
    sdk_get_reply(looper, request)
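
# Hypothetical caller for the helper above: pick one node as the slow one and run the
# propagate-primary scenario against it (fixture names follow the rest of this listing).
def test_view_change_with_propagate_primary_one_delayed_sketch(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    slow_node = txnPoolNodeSet[-1]
    do_view_change_with_propagate_primary_on_one_delayed_node(
        slow_node, txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client)
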
def test_disable_view_change(disable_view_change_config, looper, nodeSet, up,
                             viewNo, wallet1, client1):
    assert disable_view_change_config
    assert isinstance(disable_view_change_config.unsafe, set)
    assert 'disable_view_change' in disable_view_change_config.unsafe

    simulate_slow_master(looper, nodeSet, wallet1, client1)

    with pytest.raises(AssertionError):
        waitForViewChange(looper, nodeSet, expectedViewNo=viewNo + 1)
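
# Hypothetical sketch of the disable_view_change_config fixture asserted above,
# assuming all it needs to do is add the 'disable_view_change' flag to the config's
# unsafe set and hand the config back.
@pytest.fixture(scope="module")
def disable_view_change_config_sketch(tconf):
    tconf.unsafe.add('disable_view_change')
    return tconf
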
def test_view_change_on_quorum_of_master_degraded(txnPoolNodeSet, looper,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward, viewNo):
    """
    A node will change its view even though it does not find the master to be degraded,
    when a quorum of nodes agrees that the master's performance has degraded
    """

    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    pr = getPrimaryReplica(txnPoolNodeSet, 0)
    relucatantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = node_sent_instance_changes_count(n)

    # Node reluctant to change view, never says master is degraded
    relucatantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, relucatantNode.monitor)

    backup_replica = txnPoolNodeSet[0].replicas[1]
    backup_last_ordered_before = backup_replica.last_ordered_3pc
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 4)
    # make sure that backups also ordered at least 1 batch to be able to track performance degradation
    looper.run(
        eventually(lambda: assertExp(backup_replica.last_ordered_3pc >
                                     backup_last_ordered_before)))

    for n in txnPoolNodeSet:
        n.checkPerformance()

    # Check that view change happened for all nodes
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    # All nodes except the reluctant node should have sent a view change and
    # thus must have called `sendInstanceChange`
    for n in txnPoolNodeSet:
        if n.name != relucatantNode.name:
            assert node_sent_instance_changes_count(n) > sentInstChanges.get(
                n.name, 0)
        else:
            assert node_sent_instance_changes_count(n) == sentInstChanges.get(
                n.name, 0)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1,
                                               client1, client1Connected,
                                               tconf, tdirWithPoolTxns,
                                               allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            old_pr_node,
                                            stopNode=True)
    looper.removeProdable(old_pr_node)

    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that the view has
    # changed and a new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf,
                                     tdirWithPoolTxns, allPluginsPath)

    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(
        eventually(checkViewNoForNodes,
                   txnPoolNodeSet,
                   old_view_no + 1,
                   timeout=10))
    assert len(
        getAllReturnVals(old_pr_node,
                         old_pr_node._start_view_change_if_possible,
                         compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert not old_pr_node._next_view_indications
def testInstChangeWithMoreReqLat(looper, setup):
    nodes = setup.nodes
    old_view_nos = set([n.viewNo for n in nodes])
    assert len(old_view_nos) == 1
    old_view_no = old_view_nos.pop()
    for node in nodes:
        node.checkPerformance()
        assert any(getAllReturnVals(node.monitor,
                                    node.monitor.isMasterReqLatencyTooHigh))

    waitForViewChange(looper, nodes, expectedViewNo=old_view_no + 1)
def test_quorum_after_f_plus_2_nodes_including_primary_turned_off_and_later_on(
        looper, allPluginsPath, tdir, tconf,
        txnPoolNodeSet, wallet1, client1):
    nodes = txnPoolNodeSet

    request1 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request1])

    stop_node(nodes[0], looper, nodes)
    waitForViewChange(looper, nodes[1:], expectedViewNo=1)
    ensureElectionsDone(looper, nodes[1:],
                        numInstances=getRequiredInstances(nodeCount))

    request2 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request2])

    stop_node(nodes[1], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[2:], expectedViewNo=1)

    request3 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request3, looper, client1, nodes)

    stop_node(nodes[2], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[3:], expectedViewNo=1)

    request4 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request4, looper, client1, nodes)

    nodes[2] = start_stopped_node(nodes[2], looper, tconf, tdir, allPluginsPath)
    looper.runFor(waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[3:], expectedViewNo=1)

    request5 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request5, looper, client1, nodes)

    nodes[1] = start_stopped_node(nodes[1], looper, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, nodes[1:],
                        numInstances=getRequiredInstances(nodeCount))
    checkViewNoForNodes(nodes[1:], expectedViewNo=1)

    request6 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request6])

    nodes[0] = start_stopped_node(nodes[0], looper, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, nodes,
                        numInstances=getRequiredInstances(nodeCount))
    checkViewNoForNodes(nodes, expectedViewNo=1)

    request7 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request7])
def testInstChangeWithMoreReqLat(looper, setup):
    # TODO: for now, the view_change procedure can take more than 15 minutes
    # (5 minutes for catchup and 10 minutes for primary's answer).
    # Therefore, view_change triggering by max latency now is not indicative.
    nodes = setup.nodes
    old_view_no = setup.old_view_no
    for node in nodes:
        node.checkPerformance()
        assert any(getAllReturnVals(node.monitor,
                                    node.monitor.isMasterReqLatencyTooHigh))

    waitForViewChange(looper, nodes, expectedViewNo=old_view_no + 1)
def test_disable_view_change(disable_view_change_config, looper,
                             txnPoolNodeSet, viewNo, sdk_pool_handle,
                             sdk_wallet_steward):
    assert disable_view_change_config
    assert isinstance(disable_view_change_config.unsafe, set)
    assert 'disable_view_change' in disable_view_change_config.unsafe

    simulate_slow_master(looper, txnPoolNodeSet, sdk_pool_handle,
                         sdk_wallet_steward)

    with pytest.raises(AssertionError):
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)
    def demote_another_one(rest_pool):
        demoted_node = rest_pool[-1]
        rest_pool = [n for n in rest_pool if n != demoted_node]

        starting_view_no = checkViewNoForNodes(rest_pool)

        demote_node(looper, sdk_wallet_steward, sdk_pool_handle, demoted_node)

        waitForViewChange(looper, rest_pool, expectedViewNo=starting_view_no + 1)
        ensureElectionsDone(looper, rest_pool, customTimeout=60)
        ensure_all_nodes_have_same_data(looper, rest_pool)
        return rest_pool
def test_recover_stop_primaries_no_view_change(looper, checkpoint_size,
                                               txnPoolNodeSet, allPluginsPath,
                                               tdir, tconf, sdk_pool_handle,
                                               sdk_wallet_steward):
    """
    Test that we can recover after having more than f nodes disconnected:
    - send txns
    - stop current master primary
    - restart current master primary
    - send txns
    """

    active_nodes = list(txnPoolNodeSet)
    assert 4 == len(active_nodes)
    initial_view_no = active_nodes[0].viewNo

    logger.info("send at least one checkpoint")
    assert nodes_do_not_have_checkpoints(*active_nodes)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2 * checkpoint_size)
    assert nodes_have_checkpoints(*active_nodes)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Stop first node (current Primary)")
    stopped_node, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Restart the primary node")
    restarted_node = start_stopped_node(stopped_node, looper, tconf, tdir,
                                        allPluginsPath)
    assert nodes_do_not_have_checkpoints(restarted_node)
    assert nodes_have_checkpoints(*active_nodes)
    active_nodes = active_nodes + [restarted_node]

    logger.info("Check that primary selected")
    ensureElectionsDone(looper=looper,
                        nodes=active_nodes,
                        instances_list=range(2),
                        customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=0)
    ensure_all_nodes_have_same_data(
        looper,
        nodes=active_nodes,
        exclude_from_check=['check_last_ordered_3pc_backup'])

    logger.info("Check if the pool is able to process requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 10 * checkpoint_size)
    ensure_all_nodes_have_same_data(
        looper,
        nodes=active_nodes,
        exclude_from_check=['check_last_ordered_3pc_backup'])
    assert nodes_have_checkpoints(*active_nodes)
def test_process_three_phase_msg_and_stashed_future_view(
        txnPoolNodeSet, looper, tconf, sdk_pool_handle, sdk_wallet_steward):
    """
    1. Delay ViewChangeDone messages for the slow_node.
    2. Start view change on all nodes.
    3. Order a new request.
    4. Check that slow_node could not order this request and stashed all 3PC messages,
    while other nodes ordered it.
    5. Reset delays.
    6. Check that the last request is ordered on the slow_node and the stashed messages were removed.
    """
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = txnPoolNodeSet[:-1]
    view_no = slow_node.viewNo
    old_stashed = {
        inst_id: r.stasher.stash_size(STASH_VIEW_3PC)
        for inst_id, r in slow_node.replicas.items()
    }
    with delay_rules([
            slow_node.nodeIbStasher,
    ], msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE, COMMIT])):
        with delay_rules([
                slow_node.nodeIbStasher,
        ], nv_delay()):
            for n in txnPoolNodeSet:
                n.view_changer.on_master_degradation()
            waitForViewChange(looper,
                              fast_nodes,
                              expectedViewNo=view_no + 1,
                              customTimeout=2 * tconf.NEW_VIEW_TIMEOUT)
            ensureElectionsDone(looper=looper,
                                nodes=fast_nodes,
                                instances_list=range(
                                    fast_nodes[0].requiredNumberOfInstances))
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_steward, 1)
            assert slow_node.view_change_in_progress
            # 1 - pre-prepare msg
            # (len(txnPoolNodeSet) - 2) - prepare msgs
            # (len(txnPoolNodeSet) - 1) - commit msgs
            stashed_master_messages = 2 * (1 + (len(txnPoolNodeSet) - 2) +
                                           (len(txnPoolNodeSet) - 1))
            assert slow_node.master_replica.stasher.stash_size(
                STASH_VIEW_3PC) == old_stashed[0] + stashed_master_messages

        def chk():
            for inst_id, r in slow_node.replicas.items():
                assert r.last_ordered_3pc[1] == 2
                assert r.stasher.stash_size(STASH_VIEW_3PC) == 0

        looper.run(eventually(chk))
        waitNodeDataEquality(looper, slow_node, *fast_nodes)
def test_add_node_delay_commit_on_one(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_steward, tdir, tconf, allPluginsPath):
    view_no = txnPoolNodeSet[-1].viewNo
    # Add a new node but don't allow Delta to be aware of it. We do not want it in Delta's node registry.
    with delay_rules(txnPoolNodeSet[-1].nodeIbStasher, cDelay()):
        _, new_node = sdk_add_new_steward_and_node(looper, sdk_pool_handle, sdk_wallet_steward,
                                                   'New_Steward', 'Epsilon',
                                                   tdir, tconf, allPluginsPath=allPluginsPath)
        txnPoolNodeSet.append(new_node)
        looper.run(checkNodesConnected(txnPoolNodeSet[:-2] + [new_node]))
        waitForViewChange(looper, txnPoolNodeSet, view_no + 1)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def testInstChangeWithMoreReqLat(looper, setup):
    # TODO: for now, the view_change procedure can take more than 15 minutes
    # (5 minutes for catchup and 10 minutes for primary's answer).
    # Therefore, view_change triggering by max latency now is not indicative.
    nodes = setup.nodes
    old_view_no = setup.old_view_no
    for node in nodes:
        node.checkPerformance()
        assert any(
            getAllReturnVals(node.monitor,
                             node.monitor.isMasterReqLatencyTooHigh))

    waitForViewChange(looper, nodes, expectedViewNo=old_view_no + 1)
def test_view_change_on_quorum_of_master_degraded(txnPoolNodeSet, looper,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward, viewNo):
    """
    A node will change its view even though it does not find the master to be degraded,
    when a quorum of nodes agrees that the master's performance has degraded
    """

    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    pr = getPrimaryReplica(txnPoolNodeSet, 0)
    relucatantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = ViewChanger.sendInstanceChange.__name__
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = n.view_changer.spylog.count(
            instChngMethodName)

    # Node reluctant to change view, never says master is degraded
    relucatantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, relucatantNode.monitor)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 4)

    for n in txnPoolNodeSet:
        n.checkPerformance()

    # Check that view change happened for all nodes
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    # All nodes except the reluctant node should have sent a view change and
    # thus must have called `sendInstanceChange`
    for n in txnPoolNodeSet:
        if n.name != relucatantNode.name:
            assert n.view_changer.spylog.count(instChngMethodName) > \
                   sentInstChanges.get(n.name, 0)
        else:
            assert n.view_changer.spylog.count(instChngMethodName) == \
                   sentInstChanges.get(n.name, 0)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def check_view_change_one_slow_node(looper,
                                    txnPoolNodeSet,
                                    sdk_pool_handle,
                                    sdk_wallet_client,
                                    vc_counts,
                                    slow_node_is_next_primary,
                                    delay_commit=True,
                                    delay_pre_prepare=True):
    current_view_no = checkViewNoForNodes(txnPoolNodeSet)
    expected_view_no = current_view_no + vc_counts
    next_primary = get_next_primary_name(txnPoolNodeSet, expected_view_no)
    pretenders = [
        r.node for r in getNonPrimaryReplicas(txnPoolNodeSet)
        if not r.isPrimary
    ]
    if slow_node_is_next_primary:
        delayed_node = [n for n in pretenders if n.name == next_primary][0]
    else:
        delayed_node = [n for n in pretenders if n.name != next_primary][0]
    fast_nodes = [node for node in txnPoolNodeSet if node != delayed_node]

    delayers = []
    if delay_pre_prepare:
        delayers.append(ppDelay())
        delayers.append(msg_rep_delay(types_to_delay=[PREPREPARE]))
    if delay_commit:
        delayers.append(cDelay())

    # delay OldViewPrePrepareReply so that slow node doesn't receive PrePrepares before ReOrdering phase finishes
    with delay_rules(delayed_node.nodeIbStasher, old_view_pp_reply_delay()):
        with delay_rules_without_processing(delayed_node.nodeIbStasher,
                                            *delayers):
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, 1)
            trigger_view_change(txnPoolNodeSet)
            if vc_counts == 2:
                for node in txnPoolNodeSet:
                    node.master_replica.internal_bus.send(
                        NodeNeedViewChange(current_view_no + 2))

        waitForViewChange(looper=looper,
                          txnPoolNodeSet=txnPoolNodeSet,
                          expectedViewNo=expected_view_no)
        ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=30)

        # wait till fast nodes finish re-ordering
        looper.run(eventually(check_has_commits, fast_nodes))

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
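
# Hypothetical parametrized caller for the helper above, exercising both the case where
# the slow node is the next primary and the case where it is not, with a single view change.
@pytest.mark.parametrize('slow_node_is_next_primary', [True, False])
def test_view_change_one_slow_node_sketch(looper, txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client, slow_node_is_next_primary):
    check_view_change_one_slow_node(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client, vc_counts=1,
                                    slow_node_is_next_primary=slow_node_is_next_primary)
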
def test_recover_stop_primaries(looper, checkpoint_size, txnPoolNodeSet,
                                allPluginsPath, tdir, tconf, sdk_pool_handle,
                                sdk_wallet_steward):
    """
    Test that we can recover after having more than f nodes disconnected:
    - stop current master primary (Alpha)
    - send txns
    - restart current master primary (Beta)
    - send txns
    """

    active_nodes = list(txnPoolNodeSet)
    assert 4 == len(active_nodes)
    initial_view_no = active_nodes[0].viewNo

    logger.info("Stop first node (current Primary)")
    _, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Make sure view changed")
    expected_view_no = initial_view_no + 1
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensureElectionsDone(looper=looper, nodes=active_nodes, instances_list=range(2))
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("send at least one checkpoint")
    assert nodes_do_not_have_checkpoints(*active_nodes)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2 * checkpoint_size)
    assert nodes_have_checkpoints(*active_nodes)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Stop second node (current Primary) so the primary looses his state")
    stopped_node, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Restart the primary node")
    restarted_node = start_stopped_node(stopped_node, looper, tconf, tdir, allPluginsPath)
    assert nodes_do_not_have_checkpoints(restarted_node)
    assert nodes_have_checkpoints(*active_nodes)
    active_nodes = active_nodes + [restarted_node]

    logger.info("Check that primary selected")
    ensureElectionsDone(looper=looper, nodes=active_nodes,
                        instances_list=range(2), customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Check if the pool is able to process requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 10 * checkpoint_size)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)
    assert nodes_have_checkpoints(*active_nodes)
def test_view_change_min_catchup_timeout(txnPoolNodeSet, looper,
                                         sdk_pool_handle,
                                         sdk_wallet_client,
                                         tconf,
                                         viewNo):
    """
    One of the conditions to finish catch-up during view change is to have
    MAX_CATCHUPS_DONE_DURING_VIEW_CHANGE rounds of catch-up without any new
    transactions caught up. However, catch-up must not finish too quickly,
    so it keeps being attempted until MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE
    has elapsed.

    In the test:
    - Before starting the view change, mock `has_ordered_till_last_prepared_certificate`
      so that it always returns False.
    - As a result, the only way to finish catch-up is to reach the
      MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE timeout while having more than
      MAX_CATCHUPS_DONE_DURING_VIEW_CHANGE rounds of catch-up with no new txns caught up.
    - Check that the view change is not finished before MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE.
    - Check that the view change eventually finishes after MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE.
    """

    # 1. Send some txns
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 4)

    # 2. make MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE the only condition
    # to finish catch-up
    patch_has_ordered_till_last_prepared_certificate(txnPoolNodeSet)

    # 3. start view change
    expected_view_no = viewNo + 1
    for node in txnPoolNodeSet:
        node.view_changer.startViewChange(expected_view_no)

    # 4. check that it's not finished till
    # MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE
    no_view_change_timeout = tconf.MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE - 1
    with pytest.raises(EventuallyTimeoutException):
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                            customTimeout=no_view_change_timeout)

    # 5. make sure that view change is finished eventually
    # (it should be finished quite soon after we waited for MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet, customTimeout=2)
    waitForViewChange(looper=looper, txnPoolNodeSet=txnPoolNodeSet,
                      expectedViewNo=expected_view_no)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # 6. ensure that the pool is still functional.
    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client,
                               sdk_pool_handle)
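
# patch_has_ordered_till_last_prepared_certificate is a shared helper not shown
# in this snippet. A minimal sketch (an assumption, not the helper's actual
# body) of how such a patch can look, reusing the types.MethodType monkey-patch
# idiom used elsewhere in these tests; the real predicate may live on a
# different owner object and take other arguments.
def patch_has_ordered_till_last_prepared_certificate_sketch(nodes):
    import types
    for node in nodes:
        node.has_ordered_till_last_prepared_certificate = types.MethodType(
            lambda self: False, node)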
def test_selection_f_plus_one_quorum(looper, txnPoolNodeSet, allPluginsPath,
                                     tdir, tconf, sdk_pool_handle, sdk_wallet_client):
    """
    Check that the quorum of f + 1 is used for primary selection
    when it is initiated by CurrentState messages.

    Assumes that the view change quorum is n - f.
    Assumes that primaries are selected in a round-robin fashion.
    """

    # Ensure that we have 4 nodes in total
    all_nodes = list(txnPoolNodeSet)
    assert 4 == len(all_nodes)
    alpha, beta, delta, gamma = all_nodes
    initial_view_no = alpha.viewNo

    # Make one node lagging by switching it off for some time
    lagging_node = gamma
    non_lagging_nodes = [alpha, beta, delta]
    disconnect_node_and_ensure_disconnected(looper,
                                            all_nodes,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Make the nodes perform a view change
    ensure_view_change(looper, non_lagging_nodes)
    ensureElectionsDone(looper=looper, nodes=non_lagging_nodes, instances_list=range(2))
    ensure_all_nodes_have_same_data(looper, nodes=non_lagging_nodes)

    # Stop two more of the active nodes
    # (but not the primary, which is Beta because of round-robin selection)
    stopped_nodes = [alpha]  # TODO: add one more here
    for stopped_node in stopped_nodes:
        disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                                stopped_node, stopNode=True)
        looper.removeProdable(stopped_node)

    # Start lagging node back
    restarted_node = start_stopped_node(
        lagging_node, looper, tconf, tdir, allPluginsPath)
    active_nodes = [beta, delta, restarted_node]

    # Check that primary selected
    expected_view_no = initial_view_no + 1
    ensureElectionsDone(looper=looper, nodes=active_nodes,
                        instances_list=range(2), customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
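
# The quorum arithmetic behind the docstring above (a worked example, not pool
# code): with N = 4 nodes the pool tolerates f = (N - 1) // 3 = 1 faulty node,
# so a full view change needs n - f = 3 nodes, while primary selection driven
# by CurrentState messages only needs f + 1 = 2 nodes.
N = 4
f = (N - 1) // 3
assert f == 1
assert N - f == 3   # view change quorum
assert f + 1 == 2   # CurrentState-driven selection quorum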
def test_restart_to_same_view_with_killed_primary(looper, txnPoolNodeSet, tconf, tdir, allPluginsPath,
                                                  sdk_pool_handle, sdk_wallet_client):
    restart_timeout = tconf.ToleratePrimaryDisconnection + \
                      waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))

    primary = txnPoolNodeSet[0]
    alive_nodes = txnPoolNodeSet[1:]
    minority = alive_nodes[-1:]
    majority = alive_nodes[:-1]

    # Move to higher view by killing primary
    primary.cleanupOnStopping = True
    primary.stop()
    looper.removeProdable(primary)
    ensure_node_disconnected(looper, primary, txnPoolNodeSet)
    waitForViewChange(looper, alive_nodes, 1, customTimeout=VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper, alive_nodes, instances_list=range(3))

    # Add transaction to ledger
    sdk_send_random_and_check(looper, alive_nodes, sdk_pool_handle, sdk_wallet_client, 1)

    # Restart majority group
    majority_before_restart = majority.copy()
    restart_nodes(looper, alive_nodes, majority, tconf, tdir, allPluginsPath,
                  after_restart_timeout=restart_timeout, start_one_by_one=False, wait_for_elections=False)
    waitForViewChange(looper, majority, 1, customTimeout=2.1 * VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper, majority, instances_list=range(3))

    # Check that nodes in minority group are aware that they might have inconsistent 3PC state
    for node in minority:
        assert node.spylog.count(node.on_inconsistent_3pc_state) == 1

    # Check that nodes in majority group didn't think they might have inconsistent 3PC state
    for node in majority_before_restart:
        assert node.spylog.count(node.on_inconsistent_3pc_state) == 0

    # Check that nodes in majority group don't think they might have inconsistent 3PC state
    for node in majority:
        assert node.spylog.count(node.on_inconsistent_3pc_state) == 0

    # Restart minority group
    restart_nodes(looper, alive_nodes, minority, tconf, tdir, allPluginsPath,
                  after_restart_timeout=restart_timeout, start_one_by_one=False, wait_for_elections=False)
    ensureElectionsDone(looper, alive_nodes, instances_list=range(3))

    # Check that all nodes are still functional
    sdk_ensure_pool_functional(looper, alive_nodes, sdk_wallet_client, sdk_pool_handle)
def test_restart_primaries_then_demote(
        looper, txnPoolNodeSet,
        tconf, tdir, allPluginsPath,
        sdk_pool_handle,
        sdk_wallet_stewards):
    """
    """
    sdk_wallet_steward = sdk_wallet_stewards[0]
    logger.info("1. Restart Node1")
    pool_of_nodes = ensure_view_change_by_primary_restart(looper,
                                                          txnPoolNodeSet,
                                                          tconf,
                                                          tdir,
                                                          allPluginsPath,
                                                          customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # ensure pool is working properly
    sdk_send_random_and_check(looper, pool_of_nodes, sdk_pool_handle,
                              sdk_wallet_steward, 1)

    logger.info("2. Restart Node2")
    pool_of_nodes = ensure_view_change_by_primary_restart(looper,
                                                          pool_of_nodes,
                                                          tconf,
                                                          tdir,
                                                          allPluginsPath,
                                                          customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # ensure pool is working properly
    sdk_send_random_and_check(looper, pool_of_nodes, sdk_pool_handle,
                              sdk_wallet_steward, 1)

    logger.info("3. Demote Node3")
    # demote the node
    pool_of_nodes = demote_primary_node(looper,
                                        txnPoolNodeSet,
                                        pool_of_nodes,
                                        sdk_pool_handle,
                                        sdk_wallet_stewards)

    # make sure view changed
    waitForViewChange(looper, pool_of_nodes, expectedViewNo=3)

    # ensure pool is working properly
    sdk_send_random_and_check(looper, pool_of_nodes, sdk_pool_handle,
                              sdk_wallet_steward, 10)
    ensure_all_nodes_have_same_data(looper, nodes=pool_of_nodes)
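
# demote_primary_node is a shared helper not shown in this snippet. A rough
# sketch (an assumption, not the helper's actual body) of what it does: find
# the current master primary, demote it on the pool ledger with a steward's
# wallet, and return the pool without that node. The steward lookup by index
# and the demote_node-style call are assumptions as well.
def demote_primary_node_sketch(looper, start_pool, current_pool,
                               sdk_pool_handle, sdk_wallet_stewards):
    primary = get_master_primary_node(current_pool)
    steward_wallet = sdk_wallet_stewards[start_pool.index(primary)]
    demote_node(looper, steward_wallet, sdk_pool_handle, primary)
    return [n for n in current_pool if n != primary]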
def test_view_change_on_quorum_of_master_degraded(txnPoolNodeSet, looper,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward,
                                                  viewNo):
    """
    A node changes its view even though it does not itself find the master to be
    degraded, as long as a quorum of nodes agree that the master's performance degraded.
    """

    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    # Delay processing of PRE-PREPARE on all non-primary replicas of the master
    # so that the master's performance falls and a view change happens
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    pr = getPrimaryReplica(txnPoolNodeSet, 0)
    reluctantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = ViewChanger.sendInstanceChange.__name__
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = n.view_changer.spylog.count(instChngMethodName)

    # The node reluctant to change view never says the master is degraded
    reluctantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, reluctantNode.monitor)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 4)
    # Check that view change happened for all nodes
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    # All nodes except the reluctant node should have sent an INSTANCE_CHANGE
    # and thus must have called `sendInstanceChange`
    for n in txnPoolNodeSet:
        if n.name != reluctantNode.name:
            assert n.view_changer.spylog.count(instChngMethodName) > \
                   sentInstChanges.get(n.name, 0)
        else:
            assert n.view_changer.spylog.count(instChngMethodName) == \
                   sentInstChanges.get(n.name, 0)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper, sdk_pool_handle,
                                                     sdk_wallet_client, tdir, tconf, allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)

    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf,
                                     tdir, allPluginsPath)

    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(eventually(checkViewNoForNodes,
                          txnPoolNodeSet, old_view_no + 1, timeout=tconf.VIEW_CHANGE_TIMEOUT))
    assert len(getAllReturnVals(old_pr_node.view_changer,
                                old_pr_node.view_changer._start_view_change_if_possible,
                                compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert not old_pr_node.view_changer._next_view_indications
def test_replica_removing_with_primary_disconnected(looper,
                                                    txnPoolNodeSet,
                                                    sdk_pool_handle,
                                                    sdk_wallet_client,
                                                    tconf,
                                                    tdir,
                                                    allPluginsPath):
    """
    1. Disconnect the node that is the primary of a backup instance.
    2. Check that the replicas of the instance with the disconnected primary were removed.
    3. Recover the disconnected node.
    4. Start View Change.
    5. Check that all replicas were restored.
    """
    start_replicas_count = txnPoolNodeSet[0].replicas.num_replicas
    instance_to_remove = 1
    node = txnPoolNodeSet[instance_to_remove]
    # remove backup primary node.
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node)
    txnPoolNodeSet.remove(node)
    looper.removeProdable(node)

    # check that replicas were removed
    def check_replica_removed_on_all_nodes():
        for n in txnPoolNodeSet:
            check_replica_removed(n,
                                  start_replicas_count,
                                  instance_to_remove)
    looper.run(eventually(check_replica_removed_on_all_nodes,
                          timeout=tconf.TolerateBackupPrimaryDisconnection * 4))
    assert not node.monitor.isMasterDegraded()
    assert len(node.requests) == 0

    # recover the removed node
    node = start_stopped_node(node, looper, tconf,
                              tdir, allPluginsPath)
    txnPoolNodeSet.append(node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    # start View Change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    # check that all replicas were restored
    assert start_replicas_count == node.replicas.num_replicas
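
# check_replica_removed is a shared helper not shown in this snippet. A minimal
# sketch (an assumption, not the helper's actual body) of what it is expected
# to verify: the node dropped exactly one backup instance, namely the one whose
# primary got disconnected.
def check_replica_removed_sketch(node, start_replicas_count, instance_id):
    assert node.replicas.num_replicas == start_replicas_count - 1
    # hypothetical attribute access: the real helper may inspect the replicas
    # container differently
    assert instance_id not in node.replicas.keys()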
def test_disable_view_change(
        disable_view_change_config,
        looper,
        txnPoolNodeSet,
        viewNo,
        sdk_pool_handle,
        sdk_wallet_steward):
    assert disable_view_change_config
    assert isinstance(disable_view_change_config.unsafe, set)
    assert 'disable_view_change' in disable_view_change_config.unsafe

    simulate_slow_master(looper, txnPoolNodeSet,
                         sdk_pool_handle,
                         sdk_wallet_steward)

    with pytest.raises(AssertionError):
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)
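
# disable_view_change_config is a fixture defined elsewhere (typically in a
# conftest.py). A possible shape for it, under the assumption that it simply
# adds the flag checked by the asserts above to the config's `unsafe` set:
@pytest.fixture(scope="module")
def disable_view_change_config_sketch(tconf):
    tconf.unsafe.add('disable_view_change')
    return tconf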
def test_view_change_on_performance_degraded(looper, txnPoolNodeSet, viewNo,
                                             sdk_pool_handle,
                                             sdk_wallet_steward):
    """
    Test that a view change is done when the performance of master goes down
    Send multiple requests from the client and delay some requests by master
    instance so that there is a view change. All nodes will agree that master
    performance degraded
    """
    old_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    simulate_slow_master(looper, txnPoolNodeSet, sdk_pool_handle,
                         sdk_wallet_steward)
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    new_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert old_primary_node.name != new_primary_node.name
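
# simulate_slow_master is a shared helper not shown in this snippet. A minimal
# sketch (an assumption, not the helper's actual body) of what it does in these
# tests, built from the same calls used above: delay PRE-PREPARE processing on
# the non-primary replicas of the master instance and push some load so that
# the monitor sees the master as degraded.
def simulate_slow_master_sketch(looper, nodes, sdk_pool_handle, sdk_wallet,
                                delay=10, num_reqs=4):
    delayNonPrimaries(nodes, 0, delay)  # instance 0 is the master
    sdk_send_random_and_check(looper, nodes, sdk_pool_handle, sdk_wallet,
                              num_reqs)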
Example #47
def do_test_replica_removing_with_backup_degraded(looper,
                                                  txnPoolNodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_client,
                                                  tconf):
    """
      Node will change view even though it does not find the master to be degraded
      when a quorum of nodes agree that master performance degraded
      """

    start_replicas_count = txnPoolNodeSet[0].replicas.num_replicas
    view_no = txnPoolNodeSet[0].viewNo
    instance_to_remove = 1
    stashers = [node.nodeIbStasher for node in txnPoolNodeSet]
    with delay_rules(stashers, cDelay(delay=sys.maxsize, instId=instance_to_remove)):
        sdk_send_batches_of_random_and_check(looper,
                                             txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             num_reqs=10,
                                             num_batches=5)

        # check that replicas were removed
        def check_replica_removed_on_all_nodes(inst_id=instance_to_remove):
            for n in txnPoolNodeSet:
                check_replica_removed(n,
                                      start_replicas_count,
                                      inst_id)
                assert not n.monitor.isMasterDegraded()

        looper.run(eventually(check_replica_removed_on_all_nodes, timeout=120))

    # start View Change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=view_no + 1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    # check that all replicas were restored
    assert all(start_replicas_count == node.replicas.num_replicas
               for node in txnPoolNodeSet)
def testDiscardInstChngMsgFrmPastView(txnPoolNodeSet, looper, ensureView):
    """
    Once a view change is done, any further INSTANCE_CHANGE messages for that
    view must be discarded by the node.
    """

    curViewNo = ensureView

    # Send an InstanceChange message for a view that is not newer than the current one to all nodes
    icMsg = txnPoolNodeSet[0].view_changer._create_instance_change_msg(curViewNo, 0)
    txnPoolNodeSet[0].send(icMsg)

    # ensure every node but Alpha discards the invalid instance change request
    timeout = waits.expectedPoolViewChangeStartedTimeout(len(txnPoolNodeSet))

    # Check that that message is discarded.
    looper.run(eventually(checkDiscardMsg, txnPoolNodeSet, icMsg,
                          'which is not more than its view no',
                          txnPoolNodeSet[0], timeout=timeout))

    waitForViewChange(looper, txnPoolNodeSet)
def test_view_change_timeout_reset_on_next_view(txnPoolNodeSet, looper, tconf):
    # Check that all nodes are in view 0
    assert all(n.viewNo == 0 for n in txnPoolNodeSet)

    stashers = [n.nodeIbStasher for n in txnPoolNodeSet]
    with delay_rules(stashers, vcd_delay()):
        # Start first view change
        for n in txnPoolNodeSet:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1)
        looper.runFor(0.6 * VIEW_CHANGE_TIMEOUT)

        # Start second view change
        for n in txnPoolNodeSet:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=2)
        looper.runFor(0.6 * VIEW_CHANGE_TIMEOUT)

    # Ensure only 2 view changes happened
    ensureElectionsDone(looper, txnPoolNodeSet)
    for n in txnPoolNodeSet:
        assert n.viewNo == 2
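
# Why waiting 0.6 * VIEW_CHANGE_TIMEOUT after each of the two view changes is
# enough here (a worked check, not pool code): if the view-change timer were
# NOT reset on entering view 1, the accumulated wait of 1.2 * VIEW_CHANGE_TIMEOUT
# would exceed one full timeout and a third, timeout-driven view change would
# push viewNo past 2, failing the assertion above.
accumulated_wait = 0.6 + 0.6    # in fractions of VIEW_CHANGE_TIMEOUT
assert accumulated_wait > 1.0   # enough to fire a timer that was never reset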
def test_delayed_instance_changes_after_vcd_for_next_view(looper, txnPoolNodeSet):
    '''
    A node is doing a view change to view=1, while the other nodes have already finished the view change to view=2.
    The node receives a quorum of VCD messages for view=2 before a quorum of InstanceChange messages for view=2.
    Nevertheless, the node should not start a view change to view=2 without a quorum of InstanceChanges,
    that is, it should not go into propagate-primary mode since it is already in a view change state.
    The node should eventually finish the view change to view=2 once it receives all VCD and InstanceChange msgs for view=2.
    '''
    nodes = txnPoolNodeSet
    slow_node = nodes[-1]
    fast_nodes = [n for n in nodes if n != slow_node]
    slow_stasher = slow_node.nodeIbStasher

    # 1. DO FIRST VIEW CHANGE

    # delay VCD for the first ViewChange
    with delay_rules(slow_stasher, vcd_delay()):
        # Trigger view change
        for n in nodes:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, nodes, expectedViewNo=1)

        # make sure view change is finished on all nodes except the slow one
        ensureElectionsDone(looper, fast_nodes, instances_list=range(3))

        # drop all VCD to view=1
        slow_stasher.drop_delayeds()

    # 2. DO SECOND VIEW CHANGE

    # delay InstanceChanges so that the slow node receives VCD for view=2 before
    # a quorum of InstanceChanges for that view, while still doing the view change to view=1
    with delay_rules(slow_stasher, icDelay()):

        # Trigger view change
        for n in nodes:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, fast_nodes, expectedViewNo=2)

        # make sure view change is finished on all nodes except the slow one
        ensureElectionsDone(looper, fast_nodes, instances_list=range(3))

        # slow node is still on view=1
        assert slow_node.viewNo == 1
        assert slow_node.view_change_in_progress

        # make sure that the slow node has received VCD msgs for view=2
        # and has not received IC msgs for view=2
        check_vcd_msgs(slow_node, expected_view_no=2, expected_count=len(fast_nodes), )
        check_no_ic_msgs(slow_node, expected_view_no=2)

    # 3. RESET DELAYS AND CHECK

    waitForViewChange(looper, nodes, expectedViewNo=2)
    ensureElectionsDone(looper, nodes)
    assert not slow_node.view_change_in_progress
    ensure_all_nodes_have_same_data(looper, nodes=nodes)
def setup(txnPoolNodeSet, looper):
    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    initial_view_no = waitForViewChange(looper, txnPoolNodeSet)
    timeout_callback_stats = _check_view_change_completed_stats(txnPoolNodeSet)
    return m_primary_node, initial_view_no, timeout_callback_stats
def test_replica_removing_after_view_change(looper,
                                            txnPoolNodeSet,
                                            sdk_pool_handle,
                                            sdk_wallet_client,
                                            tconf,
                                            tdir,
                                            allPluginsPath):
    """
    1. Disconnect the node that is the primary of a backup instance.
    2. Check that the replicas of the instance with the disconnected primary were removed.
    3. Start View Change.
    4. Check that the backup replica whose primary is now the disconnected node is removed
    and the other replicas work correctly.
    5. Recover the disconnected node.
    6. Start View Change.
    7. Check that all replicas were restored.
    """
    start_replicas_count = txnPoolNodeSet[0].replicas.num_replicas
    instance_to_remove = txnPoolNodeSet[0].requiredNumberOfInstances - 1
    removed_node = txnPoolNodeSet[instance_to_remove]
    # remove backup primary node.
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, removed_node)
    txnPoolNodeSet.remove(removed_node)
    looper.removeProdable(removed_node)

    # check that replicas were removed
    def check_replica_removed_on_all_nodes(inst_id=instance_to_remove):
        for n in txnPoolNodeSet:
            check_replica_removed(n,
                                  start_replicas_count,
                                  inst_id)
            assert not n.monitor.isMasterDegraded()
            assert len(n.requests) == 0

    looper.run(eventually(check_replica_removed_on_all_nodes,
                          timeout=tconf.TolerateBackupPrimaryDisconnection * 2))

    # start View Change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    instance_to_remove -= 1
    instances = list(range(txnPoolNodeSet[0].requiredNumberOfInstances))
    instances.remove(instance_to_remove)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                        instances_list=instances,
                        customTimeout=tconf.TolerateBackupPrimaryDisconnection * 4)
    # check that all replicas were restored
    looper.run(eventually(check_replica_removed_on_all_nodes,
                          instance_to_remove,
                          timeout=tconf.TolerateBackupPrimaryDisconnection * 2))

    # recover the removed node
    removed_node = start_stopped_node(removed_node, looper, tconf,
                                      tdir, allPluginsPath)
    txnPoolNodeSet.append(removed_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    # start View Change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                        instances_list=range(txnPoolNodeSet[0].requiredNumberOfInstances),
                        customTimeout=tconf.TolerateBackupPrimaryDisconnection * 2)
    assert start_replicas_count == removed_node.replicas.num_replicas
def test_quorum_after_f_plus_2_nodes_including_primary_turned_off_and_later_on(
        looper, allPluginsPath, tdir, tconf,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client):
    timeout = sdk_eval_timeout(1, len(txnPoolNodeSet))
    nodes = txnPoolNodeSet

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              1)

    stop_node(nodes[0], looper, nodes)
    waitForViewChange(looper, nodes[1:], expectedViewNo=1)
    ensureElectionsDone(looper, nodes[1:],
                        instances_list=range(getRequiredInstances(nodeCount)))

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              1)

    stop_node(nodes[1], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[2:], expectedViewNo=1)

    sdk_reqs3 = sdk_send_random_requests(looper,
                                         sdk_pool_handle,
                                         sdk_wallet_client,
                                         1)
    with pytest.raises(PoolLedgerTimeoutException):
        req_res = sdk_get_replies(looper, sdk_reqs3, timeout=timeout)
        sdk_check_reply(req_res[0])

    stop_node(nodes[2], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[3:], expectedViewNo=1)

    sdk_reqs4 = sdk_send_random_requests(looper,
                                         sdk_pool_handle,
                                         sdk_wallet_client,
                                         1)
    with pytest.raises(PoolLedgerTimeoutException):
        req_res = sdk_get_replies(looper, sdk_reqs4, timeout=timeout)
        sdk_check_reply(req_res[0])

    nodes[2] = start_stopped_node(nodes[2], looper, tconf, tdir, allPluginsPath)
    looper.runFor(waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[3:], expectedViewNo=1)

    sdk_reqs5 = sdk_send_random_requests(looper,
                                         sdk_pool_handle,
                                         sdk_wallet_client,
                                         1)
    with pytest.raises(PoolLedgerTimeoutException):
        req_res = sdk_get_replies(looper, sdk_reqs5, timeout=timeout)
        sdk_check_reply(req_res[0])

    nodes[1] = start_stopped_node(nodes[1], looper, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, nodes[1:],
                        instances_list=range(getRequiredInstances(nodeCount)),
                        customTimeout=60)
    checkViewNoForNodes(nodes[1:], expectedViewNo=1)

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              1)

    nodes[0] = start_stopped_node(nodes[0], looper, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, nodes,
                        instances_list=range(getRequiredInstances(nodeCount)),
                        customTimeout=60)
    checkViewNoForNodes(nodes, expectedViewNo=1)

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              1)
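
# The arithmetic behind the expected timeouts above (a worked example, not pool
# code): with N = 4 and f = 1 the pool needs at least N - f = 3 connected nodes
# to order requests, so once f + 2 = 3 nodes (including the primary) are off,
# writes fail until enough nodes are started again.
N, f = 4, 1
assert N - f == 3          # minimum connected nodes needed to order requests
assert N - (f + 2) == 1    # nodes left after stopping f + 2 of them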
def test_disconnected_node_with_lagged_view_pulls_up_its_view_on_reconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    Verifies that a disconnected node with a lagged view accepts
    the current view from the other nodes on re-connection.
    Steps:
    1. Provoke view change to 1.
    2. Ensure that all the nodes complete view change to 1.
    3. Disconnect one node from the rest of the nodes in the pool.
    4. Provoke view change to 2.
    5. Ensure that all the nodes except for the disconnected one complete
    view change to 2 and the disconnected node remains in view 1.
    6. Provoke view change to 3.
    7. Ensure that all the nodes except for the disconnected one complete
    view change to 3 and the disconnected node remains in view 1.
    8. Connect the disconnected node to the rest of the nodes in the pool.
    9. Ensure that the re-connected node completes view change to 3.
    10. Ensure that all the nodes participate in consensus.
    """
    checkViewNoForNodes(txnPoolNodeSet, 0)

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 1)

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    lagged_node = getNonPrimaryReplicas(txnPoolNodeSet)[-1].node
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagged_node,
                                            stopNode=False)
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        instances_list=range(getRequiredInstances(len(txnPoolNodeSet))))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 2)
    checkViewNoForNodes([lagged_node], 1)

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        instances_list=range(getRequiredInstances(len(txnPoolNodeSet))))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 3)
    checkViewNoForNodes([lagged_node], 1)

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagged_node)
    waitForViewChange(looper, [lagged_node], 3,
                      customTimeout=waits.expectedPoolElectionTimeout(
                          len(txnPoolNodeSet)))
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 3)

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_view_change_after_back_to_quorum_with_disconnected_primary(txnPoolNodeSet, looper,
                                                                    sdk_pool_handle,
                                                                    sdk_wallet_client,
                                                                    tdir, tconf, allPluginsPath):
    assert len(txnPoolNodeSet) == 4

    pr_node = get_master_primary_node(txnPoolNodeSet)
    assert pr_node.name == "Alpha"

    # 1. Initiate a view change by restarting the primary (Alpha)
    nodes = ensure_view_change_by_primary_restart(looper,
                                                  txnPoolNodeSet,
                                                  tconf,
                                                  tdir,
                                                  allPluginsPath,
                                                  customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # Now primary should be Beta
    pr_node = get_master_primary_node(nodes)
    assert pr_node.name == "Beta"

    # 2. Stop the non-primary node Delta; no view changes are expected
    non_primary_to_stop = [n for n in nodes if n.name == "Delta"][0]
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, non_primary_to_stop)
    looper.removeProdable(non_primary_to_stop)

    remaining_nodes = list(set(nodes) - {non_primary_to_stop})
    # The primary is going to be stopped; remember the count of sent InstanceChange
    # messages to ensure later that no view change happened, since the number of
    # connected nodes is less than the quorum.
    ic_cnt = {}
    for n in remaining_nodes:
        ic_cnt[n.name] = n.view_changer.spylog.count(ViewChanger.sendInstanceChange.__name__)

    # 3. Disconnect primary
    disconnect_node_and_ensure_disconnected(
        looper, remaining_nodes, pr_node)
    looper.removeProdable(pr_node)

    # Wait for more than the ToleratePrimaryDisconnection timeout and check that no new IC messages were sent.
    looper.runFor(tconf.ToleratePrimaryDisconnection + 5)
    remaining_nodes = list(set(remaining_nodes) - {pr_node})
    for n in remaining_nodes:
        assert ic_cnt[n.name] == n.view_changer.spylog.count(ViewChanger.sendInstanceChange.__name__)

    view_no = checkViewNoForNodes(remaining_nodes)

    # 4. Start Delta (non-primary), now primary (Beta) is disconnected but there is a quorum
    # to choose a new one.
    restartedNode = start_stopped_node(non_primary_to_stop, looper, tconf,
                                       tdir, allPluginsPath,
                                       delay_instance_change_msgs=False)
    remaining_nodes = remaining_nodes + [restartedNode]

    # 5. Check that view change happened.
    waitForViewChange(looper, remaining_nodes, expectedViewNo=(view_no + 1),
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # ensure pool is working properly
    sdk_send_random_and_check(looper, remaining_nodes, sdk_pool_handle,
                              sdk_wallet_client, 3)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
Example #56
def ensureView(txnPoolNodeSet, looper):
    """
    Ensure that all the nodes in the txnPoolNodeSet are in the same view.
    """
    return waitForViewChange(looper, txnPoolNodeSet)