def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1, wallet1,
                               tdirWithPoolTxns, client1Connected):
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    npr = [n for n in txnPoolNodeSet if not n.hasPrimary]
    nodeToCrash = npr[0]
    idxToCrash = txnPoolNodeSet.index(nodeToCrash)
    otherNodes = [_ for _ in txnPoolNodeSet if _ != nodeToCrash]

    def checkFlakyConnected(conn=True):
        for node in otherNodes:
            if conn:
                assert nodeToCrash.nodestack.name in node.nodestack.connecteds
            else:
                assert nodeToCrash.nodestack.name not in node.nodestack.connecteds

    checkFlakyConnected(True)
    nodeToCrash.stop()
    looper.removeProdable(nodeToCrash)
    looper.runFor(1)
    looper.run(eventually(checkFlakyConnected, False, retryWait=1, timeout=35))
    looper.runFor(1)

    node = TestNode(nodeToCrash.name,
                    basedirpath=tdirWithPoolTxns,
                    config=tconf,
                    ha=nodeToCrash.nodestack.ha,
                    cliha=nodeToCrash.clientstack.ha)
    looper.add(node)
    txnPoolNodeSet[idxToCrash] = node

    looper.run(eventually(checkFlakyConnected, True, retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2, timeout=50)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)
    checkNodesSendingCommits(txnPoolNodeSet)
def test_state_regenerated_from_ledger(looper, txnPoolNodeSet,
                                       client1, wallet1, client1Connected,
                                       tconf, tdirWithPoolTxns, allPluginsPath):
    """
    Node loses its state database but recreates it from ledger after start
    """
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1,
                                           5 * sent_batches, sent_batches)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    node_to_stop = txnPoolNodeSet[-1]
    node_state = node_to_stop.states[DOMAIN_LEDGER_ID]
    assert not node_state.isEmpty
    state_db_path = node_state._kv.db_path
    nodeHa, nodeCHa = HA(*node_to_stop.nodestack.ha), HA(
        *node_to_stop.clientstack.ha)

    node_to_stop.stop()
    looper.removeProdable(node_to_stop)

    shutil.rmtree(state_db_path)

    restarted_node = TestNode(node_to_stop.name,
                              basedirpath=tdirWithPoolTxns,
                              config=tconf,
                              ha=nodeHa,
                              cliha=nodeCHa,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)
    txnPoolNodeSet[-1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, restarted_node, *txnPoolNodeSet[:-1])
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1,
                                               client1, client1Connected, tconf):
    """
    View change occurs when master's primary is disconnected
    """
    # Setup
    nodes = txnPoolNodeSet
    viewNoBefore = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    stopNodes([old_pr_node], looper)
    looper.removeProdable(old_pr_node)
    remainingNodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remainingNodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remainingNodes, viewNoBefore + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remainingNodes)
    new_pr_node = get_master_primary_node(remainingNodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
def testChangeHaPersistsPostNodesRestart(looper, txnPoolNodeSet,
                                         tdir, tconf,
                                         sdk_pool_handle,
                                         sdk_wallet_client,
                                         sdk_wallet_steward):
    new_steward_wallet, new_node = \
        sdk_add_new_steward_and_node(looper,
                                     sdk_pool_handle,
                                     sdk_wallet_steward,
                                     'AnotherSteward' + randomString(4),
                                     'AnotherNode' + randomString(4),
                                     tdir,
                                     tconf)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    sdk_pool_refresh(looper, sdk_pool_handle)

    node_new_ha, client_new_ha = genHa(2)
    logger.debug("{} changing HAs to {} {}".format(new_node, node_new_ha,
                                                   client_new_ha))

    # Send the change-HA txn and confirm it succeeded
    node_dest = hexToFriendly(new_node.nodestack.verhex)
    sdk_send_update_node(looper, new_steward_wallet, sdk_pool_handle,
                         node_dest, new_node.name,
                         node_new_ha.host, node_new_ha.port,
                         client_new_ha.host, client_new_ha.port)

    # Stop the existing nodes
    for node in txnPoolNodeSet:
        node.stop()
        looper.removeProdable(node)

    # Start the nodes again by creating `Node` objects, since that simulates
    # what happens when the node is started with the script
    restartedNodes = []
    for node in txnPoolNodeSet[:-1]:
        config_helper = PNodeConfigHelper(node.name, tconf, chroot=tdir)
        restartedNode = TestNode(node.name,
                                 config_helper=config_helper,
                                 config=tconf,
                                 ha=node.nodestack.ha,
                                 cliha=node.clientstack.ha)
        looper.add(restartedNode)
        restartedNodes.append(restartedNode)

    # Start the node whose HA was changed
    config_helper = PNodeConfigHelper(new_node.name, tconf, chroot=tdir)
    node = TestNode(new_node.name,
                    config_helper=config_helper,
                    config=tconf,
                    ha=node_new_ha,
                    cliha=client_new_ha)
    looper.add(node)
    restartedNodes.append(node)

    looper.run(checkNodesConnected(restartedNodes))
    waitNodeDataEquality(looper, node, *restartedNodes[:-1])
    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_ensure_pool_functional(looper, restartedNodes,
                               sdk_wallet_client, sdk_pool_handle)
def test_node_load_after_add_then_disconnect(newNodeCaughtUp, txnPoolNodeSet,
                                             tconf, looper, client1, wallet1,
                                             client1Connected, tdirWithPoolTxns,
                                             allPluginsPath, poolTxnStewardData,
                                             capsys):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    """
    new_node = newNodeCaughtUp
    with capsys.disabled():
        print("Stopping node {} with pool ledger size {}".
              format(new_node, new_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node)
    looper.removeProdable(new_node)

    client_batches = 80
    txns_per_batch = 10
    for i in range(client_batches):
        s = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                            txns_per_batch,
                                            override_timeout_limit=True)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.
                  format(i + 1, txns_per_batch, perf_counter() - s))

    with capsys.disabled():
        print("Starting the stopped node, {}".format(new_node))
    nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha)
    new_node = TestNode(
        new_node.name,
        basedirpath=tdirWithPoolTxns,
        config=tconf,
        ha=nodeHa,
        cliha=nodeCHa,
        pluginPaths=allPluginsPath)
    looper.add(new_node)
    txnPoolNodeSet[-1] = new_node

    # Delay catchup reply processing so LedgerState does not change
    delay_catchup_reply = 5
    new_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(eventually(check_ledger_state, new_node, DOMAIN_LEDGER_ID,
                          LedgerState.syncing, retryWait=.5, timeout=5))

    # Not an accurate timeout, but a conservative one
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
        2 * delay_catchup_reply

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4],
                         customTimeout=timeout)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4])
def stop_primary(looper, active_nodes):
    stopped_node = active_nodes[0]
    disconnect_node_and_ensure_disconnected(looper,
                                            active_nodes,
                                            stopped_node,
                                            stopNode=True)
    looper.removeProdable(stopped_node)

    active_nodes = active_nodes[1:]
    return stopped_node, active_nodes
def test_selection_f_plus_one_quorum(looper, txnPoolNodeSet, allPluginsPath,
                                     tconf, client1, wallet1, client1Connected):
    """
    Check that the quorum f + 1 is used for primary selection
    when initiated by CurrentState messages.

    Assumes that the view change quorum is n - f.
    Assumes that primaries are selected in a round-robin fashion.
    """
    # Ensure that we have 4 nodes in total
    all_nodes = list(txnPoolNodeSet)
    assert 4 == len(all_nodes)
    alpha, beta, delta, gamma = all_nodes
    initial_view_no = alpha.viewNo

    # Make one node lag by switching it off for some time
    lagging_node = gamma
    non_lagging_nodes = [alpha, beta, delta]
    disconnect_node_and_ensure_disconnected(looper,
                                            all_nodes,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Make the remaining nodes perform a view change
    ensure_view_change(looper, non_lagging_nodes)
    ensureElectionsDone(looper=looper,
                        nodes=non_lagging_nodes,
                        numInstances=2)
    ensure_all_nodes_have_same_data(looper, nodes=non_lagging_nodes)

    # Stop two more of the active nodes
    # (but not the primary, which is Beta because of round-robin selection)
    stopped_nodes = [alpha]  # TODO: add one more here
    for stopped_node in stopped_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                txnPoolNodeSet,
                                                stopped_node,
                                                stopNode=True)
        looper.removeProdable(stopped_node)

    # Start the lagging node back
    restarted_node = start_stopped_node(lagging_node, looper, tconf,
                                        lagging_node.basedirpath,
                                        allPluginsPath)
    active_nodes = [beta, delta, restarted_node]

    # Check that a primary is selected
    expected_view_no = initial_view_no + 1
    ensureElectionsDone(looper=looper, nodes=active_nodes,
                        numInstances=2, customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, numReqs=1)
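# A minimal sketch (not part of the test above) of the quorum arithmetic the
# docstring relies on, assuming the usual BFT relation f = (n - 1) // 3 for
# the 4-node pool used here: primary selection via CurrentState needs f + 1
# matching messages, while a view change needs n - f participating nodes.
# The helper name `quorum_sizes` is hypothetical and used only for illustration.
def quorum_sizes(n: int) -> tuple:
    f = (n - 1) // 3
    return f + 1, n - f


# For the 4-node pool above: selection quorum is 2, view change quorum is 3.
assert quorum_sizes(4) == (2, 3)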
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client,
                                                     tdir, tconf,
                                                     allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """
    # Setup
    nodes = txnPoolNodeSet
    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)
    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Check that the old primary can rejoin the pool and still function
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf,
                                     tdir, allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(
        eventually(checkViewNoForNodes,
                   txnPoolNodeSet, old_view_no + 1, timeout=10))
    assert len(
        getAllReturnVals(
            old_pr_node.view_changer,
            old_pr_node.view_changer._start_view_change_if_possible,
            compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert not old_pr_node.view_changer._next_view_indications
def test_primary_selection_after_primary_demotion_and_pool_restart(
        looper, txnPoolNodeSet,
        stewardAndWalletForMasterNode, txnPoolMasterNodes,
        tdir, tconf):
    """
    Demote the primary and restart the pool.
    The pool should select a new primary and have viewNo=0 after restart.
    """

    logger.info("1. turn off the node which has the primary replica for the "
                "master instance")
    master_node = txnPoolMasterNodes[0]
    client, wallet = stewardAndWalletForMasterNode

    node_data = {
        ALIAS: master_node.name,
        SERVICES: []
    }
    updateNodeData(looper, client, wallet, master_node, node_data)

    restNodes = [node for node in txnPoolNodeSet
                 if node.name != master_node.name]
    ensureElectionsDone(looper, restNodes)

    # Ensure the pool is working properly
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, numReqs=3)

    logger.info("2. restart pool")
    # Stop the existing nodes
    for node in txnPoolNodeSet:
        node.stop()
        looper.removeProdable(node)

    # Start the nodes again by creating `Node` objects, since that simulates
    # what happens when the node is started with the script
    restartedNodes = []
    for node in txnPoolNodeSet:
        config_helper = PNodeConfigHelper(node.name, tconf, chroot=tdir)
        restartedNode = TestNode(node.name,
                                 config_helper=config_helper,
                                 config=tconf,
                                 ha=node.nodestack.ha,
                                 cliha=node.clientstack.ha)
        looper.add(restartedNode)
        restartedNodes.append(restartedNode)

    restNodes = [node for node in restartedNodes
                 if node.name != master_node.name]

    looper.run(checkNodesConnected(restNodes))
    ensureElectionsDone(looper, restNodes)
    checkViewNoForNodes(restNodes, 0)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, numReqs=3)

    primariesIdxs = getPrimaryNodesIdxs(restNodes)
    assert restNodes[primariesIdxs[0]].name != master_node.name
def start_stop_one_node(node_to_restart, pool_of_nodes):
    """
    :param node_to_restart: node which will be restarted
    :param pool_of_nodes: current pool
    :return: new pool with the restarted node

    The node restart procedure consists of:
    1. Calling stop()
    2. Removing the node from the looper and the pool
    3. Creating a new node instance with the same ha, cliha and node name
       (all paths to data, keys, etc. are exactly the same as for the
       stopped node)
    4. Adding the new instance to the looper and the pool
    5. Checking that the other nodes accepted the new instance and the whole
       pool has the same data
    """
    remaining_nodes = list(set(pool_of_nodes) - {node_to_restart})
    disconnect_node_and_ensure_disconnected(looper,
                                            pool_of_nodes,
                                            node_to_restart,
                                            stopNode=True)
    looper.removeProdable(node_to_restart)
    ensure_all_nodes_have_same_data(looper, remaining_nodes,
                                    custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 1)
    node_to_restart = start_stopped_node(node_to_restart, looper, tconf, tdir,
                                         allPluginsPath,
                                         delay_instance_change_msgs=True)
    pool_of_nodes = remaining_nodes + [node_to_restart]
    looper.run(checkNodesConnected(pool_of_nodes))
    ensure_all_nodes_have_same_data(looper, pool_of_nodes,
                                    custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)

    timeout = waits.expectedPoolCatchupTime(nodeCount=len(pool_of_nodes))
    looper.run(eventually(check_ledger_state, node_to_restart,
                          DOMAIN_LEDGER_ID, LedgerState.synced,
                          retryWait=.5, timeout=timeout))
    looper.run(eventually(check_ledger_state, node_to_restart,
                          POOL_LEDGER_ID, LedgerState.synced,
                          retryWait=.5, timeout=timeout))
    looper.run(eventually(catchuped, node_to_restart, timeout=2 * timeout))
    return pool_of_nodes
def test_old_non_primary_restart_after_view_change(new_node_in_correct_view,
                                                   looper, txnPoolNodeSet,
                                                   tdir, allPluginsPath,
                                                   tconf, wallet1, client1):
    """
    An existing non-primary node crashes, then a view change happens, and the
    crashed node comes back up after the view change
    """
    node_to_stop = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    old_view_no = node_to_stop.viewNo

    # Stop non-primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_stop, stopNode=True)
    looper.removeProdable(node_to_stop)
    remaining_nodes = list(set(txnPoolNodeSet) - {node_to_stop})

    # Send some requests before view change
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
    ensure_view_change(looper, remaining_nodes,
                       custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper, remaining_nodes)
    # Send some requests after view change
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    restarted_node = start_stopped_node(node_to_stop, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [restarted_node]
    looper.run(
        eventually(checkViewNoForNodes,
                   txnPoolNodeSet, old_view_no + 1, timeout=10))
    assert len(
        getAllReturnVals(
            restarted_node.view_changer,
            restarted_node.view_changer._start_view_change_if_possible,
            compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    assert not restarted_node.view_changer._next_view_indications
def test_node_load_after_disconnect(looper, txnPoolNodeSet, tconf,
                                    tdirWithPoolTxns, allPluginsPath,
                                    poolTxnStewardData, capsys):
    client, wallet = buildPoolClientAndWallet(poolTxnStewardData,
                                              tdirWithPoolTxns,
                                              clientClass=TestClient)
    looper.add(client)
    looper.run(client.ensureConnectedToNodes())

    nodes = txnPoolNodeSet
    x = nodes[-1]

    with capsys.disabled():
        print("Stopping node {} with pool ledger size {}".
              format(x, x.poolManager.txnSeqNo))

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, x)
    looper.removeProdable(x)

    client_batches = 80
    txns_per_batch = 10
    for i in range(client_batches):
        s = perf_counter()
        sendReqsToNodesAndVerifySuffReplies(looper, wallet, client,
                                            txns_per_batch,
                                            override_timeout_limit=True)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.
                  format(i + 1, txns_per_batch, perf_counter() - s))

    nodeHa, nodeCHa = HA(*x.nodestack.ha), HA(*x.clientstack.ha)
    newNode = TestNode(x.name,
                       basedirpath=tdirWithPoolTxns,
                       base_data_dir=tdirWithPoolTxns,
                       config=tconf,
                       ha=nodeHa,
                       cliha=nodeCHa,
                       pluginPaths=allPluginsPath)
    looper.add(newNode)
    txnPoolNodeSet[-1] = newNode
    looper.run(checkNodesConnected(txnPoolNodeSet))
def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1, wallet1,
                               tdir, client1Connected):
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    npr = [n for n in txnPoolNodeSet if not n.hasPrimary]
    nodeToCrash = npr[0]
    idxToCrash = txnPoolNodeSet.index(nodeToCrash)
    otherNodes = [_ for _ in txnPoolNodeSet if _ != nodeToCrash]

    def checkFlakyConnected(conn=True):
        for node in otherNodes:
            if conn:
                assert nodeToCrash.nodestack.name in node.nodestack.connecteds
            else:
                assert nodeToCrash.nodestack.name not in node.nodestack.connecteds

    checkFlakyConnected(True)
    nodeToCrash.stop()
    logger.debug('Stopped node {}'.format(nodeToCrash))
    looper.removeProdable(nodeToCrash)
    looper.runFor(1)
    stopNodes([nodeToCrash], looper)
    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, False, retryWait=1, timeout=60))
    looper.runFor(1)

    config_helper = PNodeConfigHelper(nodeToCrash.name, tconf, chroot=tdir)
    node = TestNode(nodeToCrash.name,
                    ledger_dir=config_helper.ledger_dir,
                    keys_dir=config_helper.keys_dir,
                    genesis_dir=config_helper.genesis_dir,
                    plugins_dir=config_helper.plugins_dir,
                    config=tconf,
                    ha=nodeToCrash.nodestack.ha,
                    cliha=nodeToCrash.clientstack.ha)
    looper.add(node)
    txnPoolNodeSet[idxToCrash] = node

    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, True, retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 10)
def test_node_catchup_causes_no_desync(looper, txnPoolNodeSet, client1,
                                       wallet1, client1Connected, monkeypatch):
    """
    Checks that transactions received by catchup do not break performance
    monitoring
    """

    client, wallet = client1, wallet1
    lagging_node = get_any_non_primary_node(txnPoolNodeSet)
    rest_nodes = set(txnPoolNodeSet).difference({lagging_node})

    # Make the master replica lag by hiding all messages sent to it
    make_master_replica_lag(lagging_node)
    monkeypatch.setattr(lagging_node.master_replica,
                        '_request_missing_three_phase_messages',
                        lambda *x, **y: None)

    # Send some requests and check that all replicas except master executed them
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    waitNodeDataInequality(looper, lagging_node, *rest_nodes)
    looper.run(eventually(backup_replicas_run_forward, lagging_node))

    # Disconnect the lagging node, send some more requests and bring it back.
    # After start it should end up in a state where it needs to catch up.
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=False)
    looper.removeProdable(lagging_node)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    looper.add(lagging_node)
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging_node)

    # Check that catchup is done
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)

    # Send some more requests to ensure that backup and master replicas
    # are in the same state
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    looper.run(eventually(replicas_synced, lagging_node))

    # Check that master is not considered to be degraded
    assert not lagging_node.monitor.isMasterDegraded()
def restart_nodes(looper, nodeSet, restart_set, tconf, tdir, allPluginsPath,
                  after_restart_timeout=None, per_add_timeout=None):
    for node_to_stop in restart_set:
        node_to_stop.cleanupOnStopping = True
        node_to_stop.stop()
        looper.removeProdable(node_to_stop)

    rest_nodes = [n for n in nodeSet if n not in restart_set]
    for node_to_stop in restart_set:
        ensure_node_disconnected(looper, node_to_stop, nodeSet, timeout=2)

    if after_restart_timeout:
        looper.runFor(after_restart_timeout)

    for node_to_restart in restart_set:
        config_helper = PNodeConfigHelper(node_to_restart.name, tconf,
                                          chroot=tdir)
        restarted_node = TestNode(node_to_restart.name,
                                  config_helper=config_helper,
                                  config=tconf,
                                  pluginPaths=allPluginsPath,
                                  ha=node_to_restart.nodestack.ha,
                                  cliha=node_to_restart.clientstack.ha)
        looper.add(restarted_node)
        idx = nodeSet.index(node_to_restart)
        nodeSet[idx] = restarted_node
        if per_add_timeout:
            looper.run(checkNodesConnected(rest_nodes + [restarted_node],
                                           customTimeout=per_add_timeout))
        rest_nodes += [restarted_node]

    if not per_add_timeout:
        looper.run(checkNodesConnected(nodeSet,
                                       customTimeout=after_restart_timeout))
def testProtocolInstanceCannotBecomeActiveWithLessThanFourServers(
        tconf_for_func, tdir_for_func):
    """
    A protocol instance must have at least 4 nodes to come up.
    The status of the nodes will change from starting to started only after
    the addition of the fourth node to the system.
    """
    nodeCount = 13
    f = 4
    minimumNodesToBeUp = nodeCount - f

    nodeNames = genNodeNames(nodeCount)
    with TestNodeSet(tconf_for_func, names=nodeNames,
                     tmpdir=tdir_for_func) as nodeSet:
        with Looper(nodeSet) as looper:

            # helpers

            def genExpectedStates(connecteds: Iterable[str]):
                return {
                    nn: CONNECTED if nn in connecteds else JOINED_NOT_ALLOWED
                    for nn in nodeNames}

            def checkNodeStatusRemotesAndF(expectedStatus: Status,
                                           nodeIdx: int):
                for node in nodeSet.nodes.values():
                    checkNodeRemotes(
                        node, genExpectedStates(nodeNames[:nodeIdx + 1]))
                    assert node.status == expectedStatus

            def addNodeBackAndCheck(nodeIdx: int, expectedStatus: Status):
                logger.info("Add back the {} node and see status of {}".
                            format(ordinal(nodeIdx + 1), expectedStatus))
                addNodeBack(nodeSet, looper, nodeNames[nodeIdx])

                timeout = waits.expectedNodeStartUpTimeout() + \
                    waits.expectedPoolInterconnectionTime(len(nodeSet))
                # TODO: Probably it's better to modify waits.* functions
                timeout *= 1.5
                looper.run(eventually(checkNodeStatusRemotesAndF,
                                      expectedStatus,
                                      nodeIdx,
                                      retryWait=1, timeout=timeout))

            logger.debug("Sharing keys")
            looper.run(checkNodesConnected(nodeSet))

            logger.debug("Remove all the nodes")
            for n in nodeNames:
                looper.removeProdable(nodeSet.nodes[n])
                nodeSet.removeNode(n)

            looper.runFor(10)

            logger.debug("Add nodes back one at a time")
            for i in range(nodeCount):
                nodes = i + 1
                if nodes < minimumNodesToBeUp:
                    expectedStatus = Status.starting
                elif nodes < nodeCount:
                    expectedStatus = Status.started_hungry
                else:
                    expectedStatus = Status.started
                addNodeBackAndCheck(i, expectedStatus)
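# A minimal illustration (not part of the test above) of the status thresholds
# the final loop exercises with nodeCount = 13 and f = 4, so
# minimumNodesToBeUp = 13 - 4 = 9:
#   1..8 nodes up  -> Status.starting
#   9..12 nodes up -> Status.started_hungry
#   13 nodes up    -> Status.started
# The helper name `expected_status` is hypothetical; it only restates the
# branching used in the test.
def expected_status(nodes_up: int, node_count: int = 13, f: int = 4):
    minimum_to_be_up = node_count - f
    if nodes_up < minimum_to_be_up:
        return Status.starting
    if nodes_up < node_count:
        return Status.started_hungry
    return Status.started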
def stop_node(node_to_stop, looper, pool_nodes):
    disconnect_node_and_ensure_disconnected(looper, pool_nodes, node_to_stop)
    looper.removeProdable(node_to_stop)
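# A possible counterpart to stop_node, sketched from the restart pattern used
# in the tests above (start_stopped_node followed by checkNodesConnected).
# This helper itself is hypothetical and not part of the original suite.
def restart_node(stopped_node, looper, pool_nodes, tconf, tdir, allPluginsPath):
    # Recreate the node instance from its on-disk state and keys
    restarted_node = start_stopped_node(stopped_node, looper, tconf,
                                        tdir, allPluginsPath)
    # Replace the old instance in the pool and wait for full connectivity
    pool_nodes = [n for n in pool_nodes if n.name != stopped_node.name]
    pool_nodes.append(restarted_node)
    looper.run(checkNodesConnected(pool_nodes))
    return restarted_node, pool_nodes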
def test_view_change_after_back_to_quorum_with_disconnected_primary(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    assert len(txnPoolNodeSet) == 4

    pr_node = get_master_primary_node(txnPoolNodeSet)
    assert pr_node.name == "Alpha"

    # 1. Initiate a view change by restarting the primary (Alpha)
    nodes = ensure_view_change_by_primary_restart(looper,
                                                  txnPoolNodeSet,
                                                  tconf,
                                                  tdir,
                                                  allPluginsPath,
                                                  customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # Now the primary should be Beta
    pr_node = get_master_primary_node(nodes)
    assert pr_node.name == "Beta"

    # 2. Stop the non-primary node Delta; no view changes are expected
    non_primary_to_stop = [n for n in nodes if n.name == "Delta"][0]
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, non_primary_to_stop)
    looper.removeProdable(non_primary_to_stop)
    remaining_nodes = list(set(nodes) - {non_primary_to_stop})

    # The primary is going to be stopped; remember the instance change message
    # counts to ensure that no view change happens while the number of
    # connected nodes is less than the quorum.
    ic_cnt = {}
    for n in remaining_nodes:
        ic_cnt[n.name] = n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)

    # 3. Disconnect the primary
    disconnect_node_and_ensure_disconnected(
        looper, remaining_nodes, pr_node)
    looper.removeProdable(pr_node)

    # Wait for more than the ToleratePrimaryDisconnection timeout and check
    # that no instance change messages appeared.
    looper.runFor(tconf.ToleratePrimaryDisconnection + 5)
    remaining_nodes = list(set(remaining_nodes) - {pr_node})
    for n in remaining_nodes:
        assert ic_cnt[n.name] == n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)

    view_no = checkViewNoForNodes(remaining_nodes)

    # 4. Start Delta (non-primary); the primary (Beta) is still disconnected,
    # but now there is a quorum to choose a new one.
    restartedNode = start_stopped_node(non_primary_to_stop, looper, tconf,
                                       tdir, allPluginsPath,
                                       delay_instance_change_msgs=False)
    remaining_nodes = remaining_nodes + [restartedNode]

    # 5. Check that a view change happened.
    waitForViewChange(looper, remaining_nodes, expectedViewNo=(view_no + 1),
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # Ensure the pool is working properly
    sdk_send_random_and_check(looper, remaining_nodes, sdk_pool_handle,
                              sdk_wallet_client, 3)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
def test_node_requests_missing_three_phase_messages_after_long_disconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle,
        tconf, tdirWithPoolTxns, allPluginsPath):
    """
    2 of 4 nodes go down, so the pool cannot process any more incoming
    requests. A new request comes in. The test then waits for some time to
    ensure that the PrePrepare was created long enough ago to be dropped by
    the time checker. The two stopped nodes come back alive. Another request
    comes in. Check that the previously disconnected nodes request the missing
    PREPARES and PREPREPARES and the pool successfully handles both
    transactions.
    """
    INIT_REQS_CNT = 10
    MISSING_REQS_CNT = 1
    REQS_AFTER_RECONNECT_CNT = 1
    alive_nodes = []
    disconnected_nodes = []

    for node in txnPoolNodeSet:
        if node.hasPrimary is not None:
            alive_nodes.append(node)
        else:
            disconnected_nodes.append(node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                                node, stopNode=False)
        looper.removeProdable(node)

    sdk_send_random_requests(looper, sdk_pool_handle,
                             sdk_wallet_client, MISSING_REQS_CNT)

    def check_pp_out_of_sync(alive_nodes, disconnected_nodes):

        def get_last_pp(node):
            return node.replicas._master_replica.lastPrePrepare

        last_3pc_key_alive = get_last_pp(alive_nodes[0])
        for node in alive_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_alive

        last_3pc_key_disconnected = get_last_pp(disconnected_nodes[0])
        assert last_3pc_key_disconnected != last_3pc_key_alive
        for node in disconnected_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_disconnected

    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes,
                          disconnected_nodes,
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    preprepare_deviation = 4
    tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = preprepare_deviation
    time.sleep(preprepare_deviation * 2)

    for node in disconnected_nodes:
        looper.add(node)
    for node in disconnected_nodes:
        reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)

    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)