def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1, wallet1,
                               tdirWithPoolTxns, client1Connected):
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    npr = [n for n in txnPoolNodeSet if not n.hasPrimary]
    nodeToCrash = npr[0]
    idxToCrash = txnPoolNodeSet.index(nodeToCrash)
    otherNodes = [_ for _ in txnPoolNodeSet if _ != nodeToCrash]

    def checkFlakyConnected(conn=True):
        for node in otherNodes:
            if conn:
                assert nodeToCrash.nodestack.name in node.nodestack.connecteds
            else:
                assert nodeToCrash.nodestack.name not in node.nodestack.connecteds

    checkFlakyConnected(True)
    nodeToCrash.stop()
    looper.removeProdable(nodeToCrash)
    looper.runFor(1)
    looper.run(eventually(checkFlakyConnected, False, retryWait=1, timeout=35))
    looper.runFor(1)
    node = TestNode(nodeToCrash.name,
                    basedirpath=tdirWithPoolTxns,
                    config=tconf,
                    ha=nodeToCrash.nodestack.ha,
                    cliha=nodeToCrash.clientstack.ha)
    looper.add(node)
    txnPoolNodeSet[idxToCrash] = node
    looper.run(eventually(checkFlakyConnected, True, retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2, timeout=50)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)
    checkNodesSendingCommits(txnPoolNodeSet)
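
# These tests lean heavily on `eventually`, which retries a check until it
# passes or a timeout expires. A minimal synchronous sketch of such a polling
# helper (plenum's real implementation is coroutine-based; this is only an
# illustration of the semantics):
import time


def eventually_sync(check, *args, retryWait=1.0, timeout=30.0):
    # Re-run `check` until it stops raising AssertionError, or re-raise
    # the last failure once the timeout expires.
    deadline = time.perf_counter() + timeout
    while True:
        try:
            return check(*args)
        except AssertionError:
            if time.perf_counter() >= deadline:
                raise
            time.sleep(retryWait)
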
def test_state_regenerated_from_ledger(looper, txnPoolNodeSet, client1,
                                       wallet1, client1Connected, tconf,
                                       tdirWithPoolTxns, allPluginsPath):
    """
    Node loses its state database but recreates it from ledger after start
    """
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           5 * sent_batches, sent_batches)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    node_to_stop = txnPoolNodeSet[-1]
    node_state = node_to_stop.states[DOMAIN_LEDGER_ID]
    assert not node_state.isEmpty
    state_db_path = node_state._kv.db_path
    nodeHa, nodeCHa = HA(*node_to_stop.nodestack.ha), HA(
        *node_to_stop.clientstack.ha)

    node_to_stop.stop()
    looper.removeProdable(node_to_stop)

    shutil.rmtree(state_db_path)

    restarted_node = TestNode(node_to_stop.name,
                              basedirpath=tdirWithPoolTxns,
                              config=tconf,
                              ha=nodeHa,
                              cliha=nodeCHa,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)
    txnPoolNodeSet[-1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, restarted_node, *txnPoolNodeSet[:-1])
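
# The test above relies on the node rebuilding its state by replaying the
# ledger on startup. A minimal sketch of that recovery idea, assuming a flat
# key-value state and an ordered list of (key, value) txns; plenum's real
# state is a Merkle Patricia trie fed from the same ledger:
def regenerate_state_from_ledger(ledger_txns):
    # Replay every committed txn in order; later writes win, so the
    # final dict matches the state as of the ledger's last txn.
    state = {}
    for key, value in ledger_txns:
        state[key] = value
    return state
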
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1,
                                               client1, client1Connected,
                                               tconf):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    viewNoBefore = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    stopNodes([old_pr_node], looper)
    looper.removeProdable(old_pr_node)
    remainingNodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remainingNodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give the nodes some time to detect the disconnection, then verify that
    # the view has changed and a new primary has been elected
    waitForViewChange(looper, remainingNodes, viewNoBefore + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remainingNodes)
    new_pr_node = get_master_primary_node(remainingNodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
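
# A sketch of how a helper like `waitForViewChange` can be composed from the
# primitives used above; `wait_for_view_change` and its default timeout are
# illustrative, not the library helper itself:
def wait_for_view_change(looper, nodes, expected_view_no, timeout=30):
    # Poll until every node reports the expected view number.
    def check():
        for node in nodes:
            assert node.viewNo == expected_view_no

    looper.run(eventually(check, retryWait=1, timeout=timeout))
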
def testChangeHaPersistsPostNodesRestart(looper, txnPoolNodeSet, tdir, tconf,
                                         sdk_pool_handle, sdk_wallet_client,
                                         sdk_wallet_steward):
    new_steward_wallet, new_node = \
        sdk_add_new_steward_and_node(looper,
                                     sdk_pool_handle,
                                     sdk_wallet_steward,
                                     'AnotherSteward' + randomString(4),
                                     'AnotherNode' + randomString(4),
                                     tdir,
                                     tconf)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    sdk_pool_refresh(looper, sdk_pool_handle)

    node_new_ha, client_new_ha = genHa(2)
    logger.debug("{} changing HAs to {} {}".format(new_node, node_new_ha,
                                                   client_new_ha))

    # Sending the change-HA txn and confirming it succeeded
    node_dest = hexToFriendly(new_node.nodestack.verhex)
    sdk_send_update_node(looper, new_steward_wallet, sdk_pool_handle,
                         node_dest, new_node.name, node_new_ha.host,
                         node_new_ha.port, client_new_ha.host,
                         client_new_ha.port)

    # Stopping existing nodes
    for node in txnPoolNodeSet:
        node.stop()
        looper.removeProdable(node)

    # Start the nodes again by creating new `Node` objects, since that
    # simulates what happens when a node is started from the script
    restartedNodes = []
    for node in txnPoolNodeSet[:-1]:
        config_helper = PNodeConfigHelper(node.name, tconf, chroot=tdir)
        restartedNode = TestNode(node.name,
                                 config_helper=config_helper,
                                 config=tconf,
                                 ha=node.nodestack.ha,
                                 cliha=node.clientstack.ha)
        looper.add(restartedNode)
        restartedNodes.append(restartedNode)

    # Starting the node whose HA was changed
    config_helper = PNodeConfigHelper(new_node.name, tconf, chroot=tdir)
    node = TestNode(new_node.name,
                    config_helper=config_helper,
                    config=tconf,
                    ha=node_new_ha,
                    cliha=client_new_ha)
    looper.add(node)
    restartedNodes.append(node)

    looper.run(checkNodesConnected(restartedNodes))
    waitNodeDataEquality(looper, node, *restartedNodes[:-1])
    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_ensure_pool_functional(looper, restartedNodes, sdk_wallet_client,
                               sdk_pool_handle)
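
# `hexToFriendly` above converts the node's hex verkey into the base58 "dest"
# used in node txns. A sketch of that conversion, assuming the standard
# base58 package (the function name here is illustrative):
import base58
from binascii import unhexlify


def hex_to_friendly(verhex):
    # Decode the hex verkey to raw bytes, then base58-encode them.
    return base58.b58encode(unhexlify(verhex)).decode()
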
def test_node_load_after_add_then_disconnect(newNodeCaughtUp, txnPoolNodeSet,
                                             tconf, looper, client1, wallet1,
                                             client1Connected,
                                             tdirWithPoolTxns, allPluginsPath,
                                             poolTxnStewardData, capsys):
    """
    A node that restarts after some transactions should eventually receive
    the transactions that happened while it was down
    """
    new_node = newNodeCaughtUp
    with capsys.disabled():
        print("Stopping node {} with pool ledger size {}".
              format(new_node, new_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node)
    looper.removeProdable(new_node)

    client_batches = 80
    txns_per_batch = 10
    for i in range(client_batches):
        s = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                            txns_per_batch,
                                            override_timeout_limit=True)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.
                  format(i + 1, txns_per_batch, perf_counter() - s))

    with capsys.disabled():
        print("Starting the stopped node, {}".format(new_node))
    nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha)
    new_node = TestNode(
        new_node.name,
        basedirpath=tdirWithPoolTxns,
        config=tconf,
        ha=nodeHa,
        cliha=nodeCHa,
        pluginPaths=allPluginsPath)
    looper.add(new_node)
    txnPoolNodeSet[-1] = new_node

    # Delay catchup reply processing so LedgerState does not change
    delay_catchup_reply = 5
    new_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(eventually(check_ledger_state, new_node, DOMAIN_LEDGER_ID,
                          LedgerState.syncing, retryWait=.5, timeout=5))

    # Not an accurate timeout, but a conservative one
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
        2 * delay_catchup_reply
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4],
                         customTimeout=timeout)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4])
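
# `nodeIbStasher.delay(cr_delay(...))` above installs a delay rule for
# incoming CatchupRep messages. A sketch of such a delayer, assuming the
# stasher calls the rule with a (message, sender) pair and treats a numeric
# return value as seconds to hold the message back:
def catchup_rep_delay(seconds):
    def delayer(msg_with_frm):
        msg, _frm = msg_with_frm
        if type(msg).__name__ == 'CatchupRep':
            return seconds

    return delayer
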
def stop_primary(looper, active_nodes):
    stopped_node = active_nodes[0]
    disconnect_node_and_ensure_disconnected(looper,
                                            active_nodes,
                                            stopped_node,
                                            stopNode=True)
    looper.removeProdable(stopped_node)
    active_nodes = active_nodes[1:]
    return stopped_node, active_nodes
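
# An illustrative composition of the helper above with primitives used
# elsewhere in this file (`ensure_view_change`, `ensureElectionsDone`):
def demote_current_primary(looper, pool_nodes):
    # Stop the current primary, then let the remaining nodes elect a new one.
    stopped, active = stop_primary(looper, list(pool_nodes))
    ensure_view_change(looper, active)
    ensureElectionsDone(looper=looper, nodes=active)
    return stopped, active
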
def test_selection_f_plus_one_quorum(looper, txnPoolNodeSet, allPluginsPath,
                                     tconf, client1, wallet1,
                                     client1Connected):
    """
    Check that quorum f + 1 is used for primary selection
    when initiated by CurrentState messages.

    Assumes that the view change quorum is n - f.
    Assumes that primaries are selected in round-robin fashion.
    """

    # Ensure that we have 4 nodes in total
    all_nodes = list(txnPoolNodeSet)
    assert 4 == len(all_nodes)
    alpha, beta, delta, gamma = all_nodes
    initial_view_no = alpha.viewNo

    # Make one node lag by switching it off for some time
    lagging_node = gamma
    non_lagging_nodes = [alpha, beta, delta]
    disconnect_node_and_ensure_disconnected(looper,
                                            all_nodes,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Make the nodes perform a view change
    ensure_view_change(looper, non_lagging_nodes)
    ensureElectionsDone(looper=looper, nodes=non_lagging_nodes, numInstances=2)
    ensure_all_nodes_have_same_data(looper, nodes=non_lagging_nodes)

    # Stop two more of the active nodes
    # (but not the primary, which is Beta because of round-robin selection)
    stopped_nodes = [alpha]  # TODO: add one more here
    for stopped_node in stopped_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                txnPoolNodeSet,
                                                stopped_node,
                                                stopNode=True)
        looper.removeProdable(stopped_node)

    # Start the lagging node back up
    restarted_node = start_stopped_node(lagging_node, looper, tconf,
                                        lagging_node.basedirpath,
                                        allPluginsPath)
    active_nodes = [beta, delta, restarted_node]

    # Check that a primary has been selected
    expected_view_no = initial_view_no + 1
    ensureElectionsDone(looper=looper,
                        nodes=active_nodes,
                        numInstances=2,
                        customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, numReqs=1)
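
# The quorums this test exercises follow from BFT sizing: a pool of
# n = 3f + 1 nodes tolerates f faulty ones. A sketch of the arithmetic the
# docstring above assumes:
def quorums(n):
    f = (n - 1) // 3
    return {
        'f': f,
        'view_change': n - f,        # regular view change quorum
        'primary_selection': f + 1,  # quorum when driven by CurrentState
    }


# For the 4-node pool in this test: f = 1, so a view change needs 3 nodes
# but CurrentState-driven primary selection needs only 2.
assert quorums(4) == {'f': 1, 'view_change': 3, 'primary_selection': 2}
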
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client, tdir,
                                                     tconf, allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            old_pr_node,
                                            stopNode=True)
    looper.removeProdable(old_pr_node)

    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give the nodes some time to detect the disconnection, then verify that
    # the view has changed and a new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Check that the old primary can rejoin the pool and still function
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir,
                                     allPluginsPath)

    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(
        eventually(checkViewNoForNodes,
                   txnPoolNodeSet,
                   old_view_no + 1,
                   timeout=10))
    assert len(
        getAllReturnVals(
            old_pr_node.view_changer,
            old_pr_node.view_changer._start_view_change_if_possible,
            compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert not old_pr_node.view_changer._next_view_indications
def test_primary_selection_after_primary_demotion_and_pool_restart(
        looper, txnPoolNodeSet, stewardAndWalletForMasterNode,
        txnPoolMasterNodes, tdir, tconf):
    """
    Demote primary and restart the pool.
    Pool should select new primary and have viewNo=0 after restart.
    """

    logger.info(
        "1. Turn off the node that has the primary replica for the master instance")
    master_node = txnPoolMasterNodes[0]
    client, wallet = stewardAndWalletForMasterNode

    node_data = {ALIAS: master_node.name, SERVICES: []}
    updateNodeData(looper, client, wallet, master_node, node_data)

    restNodes = [
        node for node in txnPoolNodeSet if node.name != master_node.name
    ]
    ensureElectionsDone(looper, restNodes)

    # ensure pool is working properly
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, numReqs=3)

    logger.info("2. restart pool")
    # Stopping existing nodes
    for node in txnPoolNodeSet:
        node.stop()
        looper.removeProdable(node)

    # Start the nodes again by creating new `Node` objects, since that
    # simulates what happens when a node is started from the script
    restartedNodes = []
    for node in txnPoolNodeSet:
        config_helper = PNodeConfigHelper(node.name, tconf, chroot=tdir)
        restartedNode = TestNode(node.name,
                                 config_helper=config_helper,
                                 config=tconf,
                                 ha=node.nodestack.ha,
                                 cliha=node.clientstack.ha)
        looper.add(restartedNode)
        restartedNodes.append(restartedNode)

    restNodes = [
        node for node in restartedNodes if node.name != master_node.name
    ]

    looper.run(checkNodesConnected(restNodes))
    ensureElectionsDone(looper, restNodes)
    checkViewNoForNodes(restNodes, 0)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, numReqs=3)

    primariesIdxs = getPrimaryNodesIdxs(restNodes)
    assert restNodes[primariesIdxs[0]].name != master_node.name
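
# The assertion above relies on primaries being picked round-robin over the
# node order. A sketch of that rule (the exact convention is an assumption
# here): instance i in view v takes the node at a fixed offset.
def primary_index(view_no, instance_id, node_count):
    return (view_no + instance_id) % node_count


# In view 0 the master instance (id 0) of a 4-node pool sits on node 0;
# after one view change it moves to node 1.
assert primary_index(0, 0, 4) == 0
assert primary_index(1, 0, 4) == 1
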
    def start_stop_one_node(node_to_restart, pool_of_nodes):
        """

        :param node_to_restart: node, which would be restarted
        :param pool_of_nodes: current pool
        :return: new pool with restarted node
        Node restart procedure consist of:
        1. Calling stop()
        2. Remove from looper and pool
        3. Create new instance of node with the same ha, cliha and node_name
        (also all path to data, keys and etc would be exactly as for stopped node)
        4. Add new instance into looper and pool
        5. Check, that other nodes accepted new instance and all pool has the same data
        """

        remaining_nodes = list(set(pool_of_nodes) - {node_to_restart})
        disconnect_node_and_ensure_disconnected(looper,
                                                pool_of_nodes,
                                                node_to_restart,
                                                stopNode=True)
        looper.removeProdable(node_to_restart)
        ensure_all_nodes_have_same_data(
            looper, remaining_nodes, custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)
        sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 1)
        node_to_restart = start_stopped_node(node_to_restart,
                                             looper,
                                             tconf,
                                             tdir,
                                             allPluginsPath,
                                             delay_instance_change_msgs=True)
        pool_of_nodes = remaining_nodes + [node_to_restart]
        looper.run(checkNodesConnected(pool_of_nodes))
        ensure_all_nodes_have_same_data(
            looper, pool_of_nodes, custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)
        timeout = waits.expectedPoolCatchupTime(nodeCount=len(pool_of_nodes))
        looper.run(
            eventually(check_ledger_state,
                       node_to_restart,
                       DOMAIN_LEDGER_ID,
                       LedgerState.synced,
                       retryWait=.5,
                       timeout=timeout))
        looper.run(
            eventually(check_ledger_state,
                       node_to_restart,
                       POOL_LEDGER_ID,
                       LedgerState.synced,
                       retryWait=.5,
                       timeout=timeout))
        looper.run(eventually(catchuped, node_to_restart, timeout=2 * timeout))
        return pool_of_nodes
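
    # Illustrative usage of the nested helper above, restarting each node of
    # the pool in turn (runs in the enclosing test's scope):
    # pool_of_nodes = txnPoolNodeSet
    # for node in list(pool_of_nodes):
    #     pool_of_nodes = start_stop_one_node(node, pool_of_nodes)
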
def test_old_non_primary_restart_after_view_change(new_node_in_correct_view,
                                                   looper, txnPoolNodeSet,
                                                   tdir, allPluginsPath, tconf,
                                                   wallet1, client1):
    """
    An existing non-primary node crashes, then a view change happens, and
    the crashed node comes back up after the view change
    """
    node_to_stop = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    old_view_no = node_to_stop.viewNo

    # Stop non-primary
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            node_to_stop,
                                            stopNode=True)
    looper.removeProdable(node_to_stop)
    remaining_nodes = list(set(txnPoolNodeSet) - {node_to_stop})

    # Send some requests before view change
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
    ensure_view_change(looper,
                       remaining_nodes,
                       custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper, remaining_nodes)
    # Send some requests after view change
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    restarted_node = start_stopped_node(node_to_stop, looper, tconf, tdir,
                                        allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [restarted_node]
    looper.run(
        eventually(checkViewNoForNodes,
                   txnPoolNodeSet,
                   old_view_no + 1,
                   timeout=10))
    assert len(
        getAllReturnVals(
            restarted_node.view_changer,
            restarted_node.view_changer._start_view_change_if_possible,
            compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    assert not restarted_node.view_changer._next_view_indications
def test_node_load_after_disconnect(looper, txnPoolNodeSet, tconf,
                                    tdirWithPoolTxns, allPluginsPath,
                                    poolTxnStewardData, capsys):

    client, wallet = buildPoolClientAndWallet(poolTxnStewardData,
                                              tdirWithPoolTxns,
                                              clientClass=TestClient)
    looper.add(client)
    looper.run(client.ensureConnectedToNodes())

    nodes = txnPoolNodeSet
    x = nodes[-1]

    with capsys.disabled():
        print("Stopping node {} with pool ledger size {}".format(
            x, x.poolManager.txnSeqNo))

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, x)
    looper.removeProdable(x)

    client_batches = 80
    txns_per_batch = 10
    for i in range(client_batches):
        s = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper,
                                            wallet,
                                            client,
                                            txns_per_batch,
                                            override_timeout_limit=True)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.format(
                i + 1, txns_per_batch,
                perf_counter() - s))

    nodeHa, nodeCHa = HA(*x.nodestack.ha), HA(*x.clientstack.ha)
    newNode = TestNode(x.name,
                       basedirpath=tdirWithPoolTxns,
                       base_data_dir=tdirWithPoolTxns,
                       config=tconf,
                       ha=nodeHa,
                       cliha=nodeCHa,
                       pluginPaths=allPluginsPath)
    looper.add(newNode)
    txnPoolNodeSet[-1] = newNode
    looper.run(checkNodesConnected(txnPoolNodeSet))
def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1, wallet1,
                               tdir, client1Connected):
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    npr = [n for n in txnPoolNodeSet if not n.hasPrimary]
    nodeToCrash = npr[0]
    idxToCrash = txnPoolNodeSet.index(nodeToCrash)
    otherNodes = [_ for _ in txnPoolNodeSet if _ != nodeToCrash]

    def checkFlakyConnected(conn=True):
        for node in otherNodes:
            if conn:
                assert nodeToCrash.nodestack.name in node.nodestack.connecteds
            else:
                assert nodeToCrash.nodestack.name not in node.nodestack.connecteds

    checkFlakyConnected(True)
    nodeToCrash.stop()
    logger.debug('Stopped node {}'.format(nodeToCrash))
    looper.removeProdable(nodeToCrash)
    looper.runFor(1)
    stopNodes([nodeToCrash], looper)
    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, False, retryWait=1, timeout=60))

    looper.runFor(1)
    config_helper = PNodeConfigHelper(nodeToCrash.name, tconf, chroot=tdir)
    node = TestNode(nodeToCrash.name,
                    ledger_dir=config_helper.ledger_dir,
                    keys_dir=config_helper.keys_dir,
                    genesis_dir=config_helper.genesis_dir,
                    plugins_dir=config_helper.plugins_dir,
                    config=tconf,
                    ha=nodeToCrash.nodestack.ha,
                    cliha=nodeToCrash.clientstack.ha)
    looper.add(node)
    txnPoolNodeSet[idxToCrash] = node

    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, True, retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 10)
def test_node_catchup_causes_no_desync(looper, txnPoolNodeSet, client1,
                                       wallet1, client1Connected, monkeypatch):
    """
    Checks that transactions received by catchup do not
    break performance monitoring
    """

    client, wallet = client1, wallet1
    lagging_node = get_any_non_primary_node(txnPoolNodeSet)
    rest_nodes = set(txnPoolNodeSet).difference({lagging_node})

    # Make the master replica lag by hiding all messages sent to it
    make_master_replica_lag(lagging_node)
    monkeypatch.setattr(lagging_node.master_replica,
                        '_request_missing_three_phase_messages',
                        lambda *x, **y: None)

    # Send some requests and check that all replicas except master executed it
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    waitNodeDataInequality(looper, lagging_node, *rest_nodes)
    looper.run(eventually(backup_replicas_run_forward, lagging_node))

    # Disconnect the lagging node, send some more requests, and start it back.
    # After restart it should be in such a state that it needs to catch up
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=False)
    looper.removeProdable(lagging_node)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    looper.add(lagging_node)
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging_node)

    # Check that catchup is done
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)

    # Send some more requests to ensure that backup and master replicas
    # are in the same state
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    looper.run(eventually(replicas_synced, lagging_node))

    # Check that master is not considered to be degraded
    assert not lagging_node.monitor.isMasterDegraded()
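
# `isMasterDegraded` compares the master instance's performance with the
# backups'. A loose throughput-only sketch of that idea; plenum's Monitor
# also tracks latencies, and the threshold name here is illustrative:
def is_master_degraded(master_throughput, backup_throughputs, delta=0.8):
    # Degraded if the master falls below a configured fraction (delta)
    # of the best backup instance's throughput.
    if not backup_throughputs:
        return False
    return master_throughput < delta * max(backup_throughputs)
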
def restart_nodes(looper,
                  nodeSet,
                  restart_set,
                  tconf,
                  tdir,
                  allPluginsPath,
                  after_restart_timeout=None,
                  per_add_timeout=None):
    for node_to_stop in restart_set:
        node_to_stop.cleanupOnStopping = True
        node_to_stop.stop()
        looper.removeProdable(node_to_stop)

    rest_nodes = [n for n in nodeSet if n not in restart_set]
    for node_to_stop in restart_set:
        ensure_node_disconnected(looper, node_to_stop, nodeSet, timeout=2)

    if after_restart_timeout:
        looper.runFor(after_restart_timeout)

    for node_to_restart in restart_set:
        config_helper = PNodeConfigHelper(node_to_restart.name,
                                          tconf,
                                          chroot=tdir)
        restarted_node = TestNode(node_to_restart.name,
                                  config_helper=config_helper,
                                  config=tconf,
                                  pluginPaths=allPluginsPath,
                                  ha=node_to_restart.nodestack.ha,
                                  cliha=node_to_restart.clientstack.ha)
        looper.add(restarted_node)
        idx = nodeSet.index(node_to_restart)
        nodeSet[idx] = restarted_node
        if per_add_timeout:
            looper.run(
                checkNodesConnected(rest_nodes + [restarted_node],
                                    customTimeout=per_add_timeout))
        rest_nodes += [restarted_node]

    if not per_add_timeout:
        looper.run(
            checkNodesConnected(nodeSet, customTimeout=after_restart_timeout))
def testProtocolInstanceCannotBecomeActiveWithLessThanFourServers(
        tconf_for_func, tdir_for_func):
    """
    A protocol instance needs a minimum number of nodes (n - f) to come up.
    The status of the nodes changes from 'starting' to 'started' only after
    enough nodes have been added to the system.
    """
    nodeCount = 13
    f = 4
    minimumNodesToBeUp = nodeCount - f

    nodeNames = genNodeNames(nodeCount)
    with TestNodeSet(tconf_for_func, names=nodeNames,
                     tmpdir=tdir_for_func) as nodeSet:
        with Looper(nodeSet) as looper:

            # helpers

            def genExpectedStates(connecteds: Iterable[str]):
                return {
                    nn: CONNECTED if nn in connecteds else JOINED_NOT_ALLOWED
                    for nn in nodeNames
                }

            def checkNodeStatusRemotesAndF(expectedStatus: Status,
                                           nodeIdx: int):
                for node in nodeSet.nodes.values():
                    checkNodeRemotes(
                        node, genExpectedStates(nodeNames[:nodeIdx + 1]))
                    assert node.status == expectedStatus

            def addNodeBackAndCheck(nodeIdx: int, expectedStatus: Status):
                logger.info("Add back the {} node and see status of {}".format(
                    ordinal(nodeIdx + 1), expectedStatus))
                addNodeBack(nodeSet, looper, nodeNames[nodeIdx])

                timeout = waits.expectedNodeStartUpTimeout() + \
                          waits.expectedPoolInterconnectionTime(len(nodeSet))
                # TODO: Probably it's better to modify waits.* functions
                timeout *= 1.5
                looper.run(
                    eventually(checkNodeStatusRemotesAndF,
                               expectedStatus,
                               nodeIdx,
                               retryWait=1,
                               timeout=timeout))

            logger.debug("Sharing keys")
            looper.run(checkNodesConnected(nodeSet))

            logger.debug("Remove all the nodes")
            for n in nodeNames:
                looper.removeProdable(nodeSet.nodes[n])
                nodeSet.removeNode(n)

            looper.runFor(10)

            logger.debug("Add nodes back one at a time")
            for i in range(nodeCount):
                nodes = i + 1
                if nodes < minimumNodesToBeUp:
                    expectedStatus = Status.starting
                elif nodes < nodeCount:
                    expectedStatus = Status.started_hungry
                else:
                    expectedStatus = Status.started
                addNodeBackAndCheck(i, expectedStatus)
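
# The loop above encodes the expected status purely as a function of how many
# nodes are up; the same rule as a standalone sketch:
def expected_pool_status(nodes_up, node_count, f):
    minimum = node_count - f
    if nodes_up < minimum:
        return Status.starting
    if nodes_up < node_count:
        return Status.started_hungry
    return Status.started
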
def stop_node(node_to_stop, looper, pool_nodes):
    disconnect_node_and_ensure_disconnected(looper, pool_nodes, node_to_stop)
    looper.removeProdable(node_to_stop)
def test_view_change_after_back_to_quorum_with_disconnected_primary(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client, tdir,
        tconf, allPluginsPath):
    assert len(txnPoolNodeSet) == 4

    pr_node = get_master_primary_node(txnPoolNodeSet)
    assert pr_node.name == "Alpha"

    # 1. Initiate a view change by restarting the primary (Alpha)
    nodes = ensure_view_change_by_primary_restart(looper,
                                                  txnPoolNodeSet,
                                                  tconf,
                                                  tdir,
                                                  allPluginsPath,
                                                  customTimeout=2 *
                                                  tconf.VIEW_CHANGE_TIMEOUT)

    # Now primary should be Beta
    pr_node = get_master_primary_node(nodes)
    assert pr_node.name == "Beta"

    # 2. Stop non-primary node Delta; no view changes are expected
    non_primary_to_stop = [n for n in nodes if n.name == "Delta"][0]
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            non_primary_to_stop)
    looper.removeProdable(non_primary_to_stop)

    remaining_nodes = list(set(nodes) - {non_primary_to_stop})
    # The primary is going to be stopped; remember the instance change message
    # count to ensure that no view change happens while the number of
    # connected nodes is below quorum.
    ic_cnt = {}
    for n in remaining_nodes:
        ic_cnt[n.name] = n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)

    # 3. Disconnect primary
    disconnect_node_and_ensure_disconnected(looper, remaining_nodes, pr_node)
    looper.removeProdable(pr_node)

    # Wait for more than the ToleratePrimaryDisconnection timeout and check
    # that no IC messages were sent.
    looper.runFor(tconf.ToleratePrimaryDisconnection + 5)
    remaining_nodes = list(set(remaining_nodes) - {pr_node})
    for n in remaining_nodes:
        assert ic_cnt[n.name] == n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)

    view_no = checkViewNoForNodes(remaining_nodes)

    # 4. Start Delta (non-primary) again; the primary (Beta) is now
    # disconnected but there is a quorum to choose a new one.
    restartedNode = start_stopped_node(non_primary_to_stop,
                                       looper,
                                       tconf,
                                       tdir,
                                       allPluginsPath,
                                       delay_instance_change_msgs=False)
    remaining_nodes = remaining_nodes + [restartedNode]

    # 5. Check that view change happened.
    waitForViewChange(looper,
                      remaining_nodes,
                      expectedViewNo=(view_no + 1),
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # ensure pool is working properly
    sdk_send_random_and_check(looper, remaining_nodes, sdk_pool_handle,
                              sdk_wallet_client, 3)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
def test_node_requests_missing_three_phase_messages_after_long_disconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdirWithPoolTxns, allPluginsPath):
    """
    2 of 4 nodes go down, so the pool cannot process any more incoming
    requests. A new request comes in. The test then waits long enough for the
    created PrePrepare to become stale and be dropped by the time checker.
    The two stopped nodes come back alive. Another request comes in.
    Check that the previously disconnected nodes request the missing PREPAREs
    and PREPREPAREs and that the pool successfully handles both transactions.
    """
    INIT_REQS_CNT = 10
    MISSING_REQS_CNT = 1
    REQS_AFTER_RECONNECT_CNT = 1
    alive_nodes = []
    disconnected_nodes = []

    for node in txnPoolNodeSet:
        if node.hasPrimary:
            alive_nodes.append(node)
        else:
            disconnected_nodes.append(node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)

    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                txnPoolNodeSet,
                                                node,
                                                stopNode=False)
        looper.removeProdable(node)

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)

    def check_pp_out_of_sync(alive_nodes, disconnected_nodes):
        def get_last_pp(node):
            return node.replicas._master_replica.lastPrePrepare

        last_3pc_key_alive = get_last_pp(alive_nodes[0])
        for node in alive_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_alive

        last_3pc_key_disconnected = get_last_pp(disconnected_nodes[0])
        assert last_3pc_key_disconnected != last_3pc_key_alive
        for node in disconnected_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_disconnected

    looper.run(
        eventually(check_pp_out_of_sync,
                   alive_nodes,
                   disconnected_nodes,
                   retryWait=1,
                   timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    preprepare_deviation = 4
    tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = preprepare_deviation
    time.sleep(preprepare_deviation * 2)

    for node in disconnected_nodes:
        looper.add(node)
    for node in disconnected_nodes:
        reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)

    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size + MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)
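
# The ACCEPTABLE_DEVIATION_PREPREPARE_SECS logic above drops PrePrepares whose
# timestamp is too far from the receiver's clock. A sketch of that
# acceptability check, assuming the config value bounds the absolute clock
# difference in seconds:
import time


def is_pp_time_acceptable(pp_time, acceptable_deviation_secs):
    # A PrePrepare is stale once its creation time deviates from the local
    # clock by more than the configured number of seconds.
    return abs(time.time() - pp_time) <= acceptable_deviation_secs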