def testAdd2NewNodes(looper, txnPoolNodeSet, tdirWithPoolTxns, tconf,
                     steward1, stewardWallet, allPluginsPath):
    """
    Add 2 new nodes to trigger replica addition and primary election
    """
    for nodeName in ("Zeta", "Eta"):
        newStewardName = "testClientSteward" + randomString(3)
        newSteward, newStewardWallet, newNode = addNewStewardAndNode(
            looper, steward1, stewardWallet, newStewardName, nodeName,
            tdirWithPoolTxns, tconf, allPluginsPath)
        txnPoolNodeSet.append(newNode)
        looper.run(checkNodesConnected(txnPoolNodeSet))
        logger.debug("{} connected to the pool".format(newNode))
        looper.run(eventually(checkNodeLedgersForEquality, newNode,
                              *txnPoolNodeSet[:-1], retryWait=1, timeout=7))

    f = getMaxFailures(len(txnPoolNodeSet))

    def checkFValue():
        for node in txnPoolNodeSet:
            assert node.f == f
            assert len(node.replicas) == (f + 1)

    looper.run(eventually(checkFValue, retryWait=1, timeout=5))
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1, timeout=5)
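# Illustrative sketch (hypothetical helper, not part of the test suite): the
# f value asserted in checkFValue above follows the standard BFT bound
# f = floor((n - 1) / 3), and each node is expected to run f + 1 protocol
# instances (one master plus f backups).
def _expected_f_and_instances(n_nodes):
    f = (n_nodes - 1) // 3      # max number of faulty nodes tolerated
    return f, f + 1             # (f, protocol instances per node)

# f stays 1 for pools of 4..6 nodes; a 7th node bumps f to 2, which is what
# adds a backup replica and triggers a new primary election
assert _expected_f_and_instances(4) == (1, 2)
assert _expected_f_and_instances(7) == (2, 3)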
def testPrimarySelectionAfterViewChange(  # noqa
        looper,
        txnPoolNodeSet,
        primaryReplicas,
        catchup_complete_count):
    """
    Test that primary replica of a protocol instance shifts to a new node
    after a view change.
    """
    # TODO: This test can fail due to view change.
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    for n in txnPoolNodeSet:
        assert n.spylog.count(
            n.allLedgersCaughtUp) > catchup_complete_count[n.name]

    # Primary replicas before view change
    prBeforeVC = primaryReplicas

    # Primary replicas after view change
    instanceCount = getNoInstances(nodeCount)
    prAfterVC = [getPrimaryReplica(txnPoolNodeSet, i)
                 for i in range(instanceCount)]

    # Primary replicas have moved to the next node
    for br, ar in zip(prBeforeVC, prAfterVC):
        assert ar.node.rank - br.node.rank == 1

    check_rank_consistent_across_each_node(txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
def testNodesConnectsWhenOneNodeIsLate(allPluginsPath, tdirAndLooper,
                                       nodeReg, conf):
    tdir, looper = tdirAndLooper
    nodes = []
    names = list(nodeReg.keys())
    logger.debug("Node names: {}".format(names))

    def create(name):
        node = TestNode(name, nodeReg, basedirpath=tdir,
                        pluginPaths=allPluginsPath)
        looper.add(node)
        nodes.append(node)

    # TODO: This will be moved to a fixture
    if conf.UseZStack:
        genKeys(tdir, names + [_ + CLIENT_STACK_SUFFIX for _ in names])

    for name in names[:3]:
        create(name)

    looper.run(checkNodesConnected(nodes))

    # wait for the election to complete with the first three nodes
    looper.runFor(10)

    # create the fourth and see that it learns who the primaries are
    # from the other nodes
    create(names[3])
    checkProtocolInstanceSetup(looper, nodes, timeout=10)

    stopNodes(nodes, looper)
def testElectionsAfterViewChange(delayedPerf, looper: Looper,
                                 nodeSet: TestNodeSet, up, wallet1, client1):
    """
    Test that a primary election does happen after a view change
    """
    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's throughput falls and view changes
    nonPrimReps = getNonPrimaryReplicas(nodeSet, 0)
    for r in nonPrimReps:
        r.node.nodeIbStasher.delay(ppDelay(10, 0))
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 4)

    # Ensure view change happened for both node and its primary elector
    for node in nodeSet:
        looper.run(eventually(partial(checkViewChangeInitiatedForNode,
                                      node, 1),
                              retryWait=1, timeout=20))

    # Ensure elections are done again and pool is setup again with appropriate
    # protocol instances and each protocol instance is setup properly too
    checkProtocolInstanceSetup(looper, nodeSet, retryWait=1, timeout=30)
def test_no_propagated_future_view_change_while_view_change(
        txnPoolNodeSet, looper):
    # the last node is a lagging one, which will receive ViewChangeDone
    # messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[-1]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate view change in progress
    lagged_node.view_changer.view_change_in_progress = True
    old_view_no = checkViewNoForNodes([lagged_node])

    initial_vhdc = \
        lagged_node.view_changer.spylog.count(
            lagged_node.view_changer.process_future_view_vchd_msg.__name__)

    # delay INSTANCE_CHANGE on the lagged node, so all nodes except the
    # lagging one finish the View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                   numInstances=2)
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        # check that the lagged node received a Future VCD from each of the
        # other nodes, but didn't start a new view change
        assert len(other_nodes) + initial_vhdc == \
            lagged_node.view_changer.spylog.count(
                lagged_node.view_changer.process_future_view_vchd_msg.__name__)
        assert old_view_no == checkViewNoForNodes([lagged_node])
def elections_done(case_6_setup, looper, txnPoolNodeSet):
    # Make sure elections are done successfully
    A, B, C, D = txnPoolNodeSet
    looper.run(checkNodesConnected(txnPoolNodeSet))

    inst_ids = (0, 1)

    def chk():
        # Check that each Primary is received by A before A has sent any
        # Primary
        primary_recv_times = {
            i: [entry.starttime for entry in
                A.elector.spylog.getAll(A.elector.processPrimary)
                if entry.params['prim'].instId == i]
            for i in inst_ids
        }
        primary_send_times = {
            i: [entry.starttime for entry in
                A.elector.spylog.getAll(A.elector.sendPrimary)
                if entry.params['instId'] == i]
            for i in inst_ids
        }
        for i in inst_ids:
            assert primary_send_times[i][0] > max(primary_recv_times[i])

    looper.run(eventually(chk, retryWait=1, timeout=15))
    checkProtocolInstanceSetup(looper=looper, nodes=txnPoolNodeSet,
                               retryWait=1)

    # Make sure no Nominations or Primary are received by A from B
    for i in inst_ids:
        assert B.replicas[i].name not in A.elector.nominations[i]
        assert B.replicas[i].name not in A.elector.primaryDeclarations[i]
def test_different_ledger_request_interleave(tconf, looper, txnPoolNodeSet,
                                             client1, wallet1, one_node_added,
                                             client1Connected, tdir,
                                             client_tdir, tdirWithPoolTxns,
                                             steward1, stewardWallet,
                                             allPluginsPath):
    """
    Send pool and domain ledger requests such that they interleave, and do
    view change in between and verify the pool is functional
    """
    new_node = one_node_added
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # Send domain ledger requests but don't wait for replies
    requests = sendRandomRequests(wallet1, client1, 2)

    # Add another node by sending pool ledger request
    _, _, new_theta = nodeThetaAdded(looper, txnPoolNodeSet, tdir,
                                     client_tdir, tconf, steward1,
                                     stewardWallet, allPluginsPath,
                                     name='new_theta')

    # Send more domain ledger requests but don't wait for replies
    requests.extend(sendRandomRequests(wallet1, client1, 3))

    # Do view change without waiting for replies
    ensure_view_change(looper, nodes=txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)

    # Make sure all requests are completed
    waitForSufficientRepliesForRequests(looper, client1, requests=requests)

    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)

    new_steward, new_steward_wallet = addNewSteward(looper, client_tdir,
                                                    steward1, stewardWallet,
                                                    'another_ste')

    # Send another pool ledger request (NODE) but don't wait for completion of
    # request
    next_node_name = 'next_node'
    r = sendAddNewNode(tdir, tconf, next_node_name, new_steward,
                       new_steward_wallet)
    node_req = r[0]

    # Send more domain ledger requests but don't wait for replies
    requests = [node_req,
                *sendRandomRequests(new_steward_wallet, new_steward, 5)]

    # Make sure all requests are completed
    waitForSufficientRepliesForRequests(looper, new_steward,
                                        requests=requests)

    # Make sure pool is functional
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
def test_no_propagated_future_view_change_until_synced(txnPoolNodeSet, looper,
                                                       mode):
    # the last node is a lagging one, which will receive ViewChangeDone
    # messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node_index = (viewNo + 3) % len(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[lagged_node_index]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate catchup by setting non-synced status
    lagged_node.mode = mode
    old_view_no = checkViewNoForNodes([lagged_node])

    check_future_vcd_count(lagged_node, 0)

    # delay INSTANCE_CHANGE on the lagged node, so all nodes except the
    # lagging one finish the View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                   instances=range(2))
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        check_no_view_change(looper, lagged_node)
        assert old_view_no == checkViewNoForNodes([lagged_node])

    # emulate finishing of catchup by setting Participating status
    lagged_node.mode = Mode.participating

    # make sure that View Change happened on the lagging node
    waitForViewChange(looper, [lagged_node], expectedViewNo=old_view_no + 1,
                      customTimeout=10)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_no_propagated_future_view_change_until_synced(txnPoolNodeSet, looper,
                                                       mode):
    # the last node is a lagging one, which will receive ViewChangeDone
    # messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node_index = (viewNo + 3) % len(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[lagged_node_index]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate catchup by setting non-synced status
    lagged_node.mode = mode
    old_view_no = checkViewNoForNodes([lagged_node])

    check_future_vcd_count(lagged_node, 0)

    # delay INSTANCE_CHANGE on the lagged node, so all nodes except the
    # lagging one finish the View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                   numInstances=2)
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        check_no_view_change(looper, lagged_node)
        assert old_view_no == checkViewNoForNodes([lagged_node])

    # emulate finishing of catchup by setting Participating status
    lagged_node.mode = Mode.participating

    # make sure that View Change happened on the lagging node
    waitForViewChange(looper, [lagged_node], expectedViewNo=old_view_no + 1,
                      customTimeout=10)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def testAdd2NewNodes(looper, txnPoolNodeSet, tdirWithPoolTxns, tconf,
                     steward1, stewardWallet, allPluginsPath):
    """
    Add 2 new nodes to trigger replica addition and primary election
    """
    for nodeName in ("Zeta", "Eta"):
        newStewardName = "testClientSteward" + randomString(3)
        newSteward, newStewardWallet, newNode = addNewStewardAndNode(
            looper, steward1, stewardWallet, newStewardName, nodeName,
            tdirWithPoolTxns, tconf, allPluginsPath)
        txnPoolNodeSet.append(newNode)
        looper.run(checkNodesConnected(txnPoolNodeSet, overrideTimeout=30))
        logger.debug("{} connected to the pool".format(newNode))
        looper.run(eventually(checkNodeLedgersForEquality, newNode,
                              *txnPoolNodeSet[:-1], retryWait=1, timeout=7))

    f = getMaxFailures(len(txnPoolNodeSet))

    def checkFValue():
        for node in txnPoolNodeSet:
            assert node.f == f
            assert len(node.replicas) == (f + 1)

    looper.run(eventually(checkFValue, retryWait=1, timeout=5))
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1, timeout=5)
def testNodesConnectsWhenOneNodeIsLate(allPluginsPath, tdirAndLooper, nodeReg):
    tdir, looper = tdirAndLooper
    nodes = []
    names = list(nodeReg.keys())
    logger.debug("Node names: {}".format(names))

    def create(name):
        node = TestNode(name, nodeReg, basedirpath=tdir,
                        pluginPaths=allPluginsPath)
        looper.add(node)
        node.startKeySharing()
        nodes.append(node)

    for name in names[:3]:
        create(name)

    looper.run(checkNodesConnected(nodes))

    # wait for the election to complete with the first three nodes
    looper.runFor(10)

    # create the fourth and see that it learns who the primaries are
    # from the other nodes
    create(names[3])
    checkProtocolInstanceSetup(looper, nodes, timeout=10)

    stopNodes(nodes, looper)
def elections_done(case_6_setup, looper, keySharedNodes):
    # Make sure elections are done successfully
    nodeSet = keySharedNodes
    A, B, C, D = nodeSet.nodes.values()
    looper.run(checkNodesConnected(nodeSet))

    inst_ids = (0, 1)

    def chk():
        # Check that each Primary is received by A before A has sent any
        # Primary
        primary_recv_times = {
            i: [entry.starttime for entry in
                A.elector.spylog.getAll(A.elector.processPrimary)
                if entry.params['prim'].instId == i]
            for i in inst_ids
        }
        primary_send_times = {
            i: [entry.starttime for entry in
                A.elector.spylog.getAll(A.elector.sendPrimary)
                if entry.params['instId'] == i]
            for i in inst_ids
        }
        for i in inst_ids:
            assert primary_send_times[i][0] > max(primary_recv_times[i])

    looper.run(eventually(chk, retryWait=1, timeout=15))
    checkProtocolInstanceSetup(looper=looper, nodes=nodeSet, retryWait=1)

    # Make sure no Nominations or Primary are received by A from B
    for i in inst_ids:
        assert B.replicas[i].name not in A.elector.nominations[i]
        assert B.replicas[i].name not in A.elector.primaryDeclarations[i]
def testPrimaryElectionWithTie(electTieFixture, looper, keySharedNodes):
    """
    Primary selection (Rainy Day)
    A, B, C, D, E
    A, B, C, D startup. E is lagging.
    A sees the minimum number of nodes, and then sends Nominate(A)
    At the same exact time, B sees the minimum number of nodes, and then sends
    out Nominate(B)
    A sees B sending Nominate(B), but it has already nominated itself, so it
    does nothing
    B sees A sending Nominate(A), but it has already nominated itself, so it
    does nothing
    C sees A sending Nominate(A), and sends Nominate(A)
    D sees B sending Nominate(B), and sends Nominate(B)
    There's a split. C and A think A is the primary, B and D think B is the
    primary
    All nodes can see that there is a split. Each sends out Reelection([A,B])

    A and B both see Reelection([A,B]) from themselves as well as the other 3
    (the number from others should be at least f+1),

    1. they wait a random amount of time (between 0 and 2 seconds),
    2. they each send out a Nominate(self)

    Voting is repeated until we have a good election.
    """
    # TODO: optimize by sending messages in batches, for example, don't send
    #   messages more often than every 400 milliseconds. Once those 400 millis
    #   have passed, send the several queued messages in one batch.

    nodeSet = keySharedNodes
    A, B, C, D = nodeSet.nodes.values()

    checkPoolReady(looper, nodeSet.nodes.values())

    for node in nodeSet.nodes.values():
        for instId, replica in enumerate(node.elector.replicas):
            logger.debug("replica {} {} with votes {}".
                         format(replica.name, replica.instId,
                                node.elector.nominations.get(instId, {})))

    logger.debug("Check nomination")
    # Checking whether Node A nominated itself
    looper.run(eventually(checkNomination, A, A.name, retryWait=1,
                          timeout=10))
    # Checking whether Node B nominated itself
    looper.run(eventually(checkNomination, B, B.name, retryWait=1,
                          timeout=10))
    # Checking whether Node C nominated Node A
    looper.run(eventually(checkNomination, C, A.name, retryWait=1,
                          timeout=10))
    # Checking whether Node D nominated Node B
    looper.run(eventually(checkNomination, D, B.name, retryWait=1,
                          timeout=10))

    # No node should be primary
    for node in nodeSet.nodes.values():
        assert node.hasPrimary is False

    for node in nodeSet.nodes.values():
        node.resetDelays()

    checkProtocolInstanceSetup(looper=looper, nodes=nodeSet, retryWait=1,
                               timeout=60)
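# Illustrative sketch (hypothetical helper, not part of the election code):
# the tie described above is what happens when no candidate reaches the
# nomination quorum. With n = 4 reachable nodes and f = 1, a candidate needs
# 2f + 1 = 3 nominations to win outright.
from collections import Counter

def _election_outcome(nominations, n_nodes):
    f = (n_nodes - 1) // 3
    quorum = 2 * f + 1
    tally = Counter(nominations)
    winners = [cand for cand, votes in tally.items() if votes >= quorum]
    return winners[0] if winners else None  # None => tie, re-election needed

assert _election_outcome(['A', 'B', 'A', 'B'], 4) is None   # the split above
assert _election_outcome(['A', 'A', 'A', 'B'], 4) == 'A'    # a clear winner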
def testPrimarySelectionAfterViewChange(  # noqa
        looper,
        txnPoolNodeSet,
        primaryReplicas,
        catchup_complete_count,
        view_change_done):
    """
    Test that primary replica of a protocol instance shifts to a new node
    after a view change.
    """
    # TODO: This test can fail due to view change.
    for n in txnPoolNodeSet:
        assert n.spylog.count(
            n.allLedgersCaughtUp) > catchup_complete_count[n.name]

    # Primary replicas before view change
    prBeforeVC = primaryReplicas

    # Primary replicas after view change
    instanceCount = getNoInstances(nodeCount)
    prAfterVC = [getPrimaryReplica(txnPoolNodeSet, i)
                 for i in range(instanceCount)]

    # Primary replicas have moved to the next node
    for br, ar in zip(prBeforeVC, prAfterVC):
        assert ar.node.rank - br.node.rank == 1

    check_rank_consistent_across_each_node(txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
def test_catchup_to_next_view_during_view_change_0_to_2(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_steward):
    '''
    1) Lagging node is not a primary for new views
    2) All nodes except the lagging one go to view=1
    3) All nodes except the lagging one order txns on view=1
    4) All nodes except the lagging one go to view=2
    5) All nodes except the lagging one order txns on view=2
    6) Lagging node gets InstanceChanges for view=1 and view=2 =>
       it changes to view=2, and catches up till txns from view=2
    7) Make sure that the lagging node is up to date, and can participate in
       consensus
    '''
    lagging_node = txnPoolNodeSet[0]
    other_nodes = txnPoolNodeSet[1:]
    initial_view_no = checkViewNoForNodes(txnPoolNodeSet)
    initial_last_ordered = lagging_node.master_last_ordered_3PC

    with delay_rules(lagging_node.nodeIbStasher,
                     delay_for_view(viewNo=0),
                     delay_for_view(viewNo=1),
                     delay_for_view(viewNo=2)):
        # view change to viewNo=1
        trigger_view_change(txnPoolNodeSet)
        waitForViewChange(looper, other_nodes,
                          expectedViewNo=initial_view_no + 1)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                   instances=range(3))
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        # order some txns
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 5)

        # view change to viewNo=2
        trigger_view_change(txnPoolNodeSet)
        waitForViewChange(looper, other_nodes,
                          expectedViewNo=initial_view_no + 2)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                   instances=range(3))
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        # order some txns
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 5)

        assert initial_view_no == lagging_node.viewNo
        assert initial_last_ordered == lagging_node.master_last_ordered_3PC

    # make sure that the second View Change happened on the lagging node
    waitForViewChange(looper, [lagging_node],
                      expectedViewNo=initial_view_no + 2, customTimeout=20)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

    # make sure that the pool is functional
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)
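# Illustrative sketch (simplified, hypothetical logic): once the delayed
# messages are released, the lagging node does not replay view changes one by
# one; per the docstring above it adopts the highest view for which it holds
# a quorum of InstanceChange votes, which is why it jumps from view 0 straight
# to view 2.
def _target_view(instance_change_votes, quorum):
    # instance_change_votes: mapping view_no -> number of distinct voters
    eligible = [v for v, count in instance_change_votes.items()
                if count >= quorum]
    return max(eligible, default=None)

# e.g. quorums for both view 1 and view 2 => go directly to view 2
assert _target_view({1: 3, 2: 3}, quorum=3) == 2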
def test_caught_up_for_current_view_check(looper, txnPoolNodeSet, client1,
                                          wallet1, client1Connected):
    """
    One of the nodes experiences poor network and loses 3PC messages. It has
    to do multiple rounds of catchup to be caught up.
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        3 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    bad_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != bad_node]
    orig_method = bad_node.master_replica.dispatchThreePhaseMsg

    # Bad node does not process any 3 phase messages, equivalent to messages
    # being lost
    def bad_method(self, m, s):
        pass

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        bad_method, bad_node.master_replica)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        6 * Max3PCBatchSize)
    waitNodeDataInequality(looper, bad_node, *other_nodes)

    # Patch all nodes to return ConsistencyProof of a smaller ledger to the
    # bad node but only once, so that the bad_node needs to do catchup again.
    make_a_node_catchup_twice(bad_node, other_nodes, DOMAIN_LEDGER_ID,
                              Max3PCBatchSize)

    def is_catchup_needed_count():
        return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed,
                                    compare_val_to=True))

    def caught_up_for_current_view_count():
        return len(getAllReturnVals(bad_node,
                                    bad_node.caught_up_for_current_view,
                                    compare_val_to=True))

    old_count_1 = is_catchup_needed_count()
    old_count_2 = caught_up_for_current_view_count()
    ensure_view_change(looper, txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert is_catchup_needed_count() > old_count_1
    # The bad_node caught up due to receiving sufficient ViewChangeDone
    # messages
    assert caught_up_for_current_view_count() > old_count_2

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        orig_method, bad_node.master_replica)
def check_newly_added_nodes(looper, all_nodes, new_nodes):
    # New nodes should be given in the order they were added
    assert all(n in all_nodes for n in new_nodes)
    check_rank_consistent_across_each_node(all_nodes)
    old_nodes = [node for node in all_nodes if node not in new_nodes]
    for new_node in new_nodes:
        assert all(new_node.rank > n.rank for n in old_nodes)
        old_nodes.append(new_node)
    checkProtocolInstanceSetup(looper, all_nodes, retryWait=1)
def test_catchup_to_next_view_during_view_change_by_primary(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_steward):
    '''
    1) Lagging node is a primary for view=1
    2) All nodes except the lagging one start a view change (to view=1)
    3) The nodes can not finish it on time since the Primary for view=1 is
       lagging
    4) All nodes except the lagging one go to view=2 then
    5) All nodes except the lagging one order txns on view=2
    6) Lagging node gets InstanceChanges for view=1 => it changes to view=2,
       and catches up till txns from view=2
    7) Lagging node gets InstanceChanges for view=2 => it changes to view=2
    8) Make sure that the lagging node is up to date, and can participate in
       consensus
    '''
    lagging_node = txnPoolNodeSet[1]
    other_nodes = list(set(txnPoolNodeSet) - {lagging_node})
    initial_view_no = checkViewNoForNodes(txnPoolNodeSet)
    initial_last_ordered = lagging_node.master_last_ordered_3PC

    with delay_rules(lagging_node.nodeIbStasher, delay_for_view(viewNo=2)):
        with delay_rules(lagging_node.nodeIbStasher,
                         delay_for_view(viewNo=0), delay_for_view(viewNo=1)):
            # view change to viewNo=2 since the primary for viewNo=1 is the
            # lagging node
            for n in txnPoolNodeSet:
                n.view_changer.on_master_degradation()
            waitForViewChange(looper, other_nodes,
                              expectedViewNo=initial_view_no + 2,
                              customTimeout=30)
            checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                       instances=range(3))
            ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

            # order some txns
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_steward, 5)

            assert initial_view_no == lagging_node.viewNo
            assert initial_last_ordered == \
                lagging_node.master_last_ordered_3PC
            assert len(lagging_node.master_replica._ordering_service.
                       requestQueues[DOMAIN_LEDGER_ID]) > 0

        # make sure that the first View Change happened on the lagging node
        waitForViewChange(looper, [lagging_node],
                          expectedViewNo=initial_view_no + 1,
                          customTimeout=20)
        assert initial_view_no + 1 == lagging_node.viewNo

    # make sure that the second View Change happened on the lagging node
    waitForViewChange(looper, [lagging_node],
                      expectedViewNo=initial_view_no + 2, customTimeout=20)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

    # make sure that the pool is functional
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)
def testPrimaryElectionWithAClearWinner(electContFixture, looper,
                                        keySharedNodes):
    """
    Primary selection (Sunny Day)
    A, B, C, D, E
    A, B, C, D startup. E is lagging.
    A sees the minimum number of nodes first, and then sends out a NOMINATE(A)
    message
    B, C, D all see the NOMINATE(A) message from A, and respond with
    NOMINATE(A) message to all other nodes

    A sees three other NOMINATE(A) votes (from B, C, D)
    A sees that A is the clear winner (2f+1 total), and sends PRIMARY(A) to
    all nodes
    B sees two more NOMINATE(A) votes (from C and D)
    B sees that A is the clear winner (2f+1 total), and sends PRIMARY(A) to
    all nodes
    C sees two more NOMINATE(A) votes (from B and D)
    C sees that A is the clear winner (2f+1 total), and sends PRIMARY(A) to
    all nodes
    D sees two more NOMINATE(A) votes (from B and C)
    D sees that A is the clear winner (2f+1 total), and sends PRIMARY(A) to
    all nodes

    A sees at least two other PRIMARY(A) votes (3 including its own) and
    selects A as primary
    B sees at least two other PRIMARY(A) votes (3 including its own) and
    selects A as primary
    C sees at least two other PRIMARY(A) votes (3 including its own) and
    selects A as primary
    D sees at least two other PRIMARY(A) votes (3 including its own) and
    selects A as primary
    """
    nodeSet = keySharedNodes
    A, B, C, D = nodeSet.nodes.values()
    nodesBCD = [B, C, D]
    checkPoolReady(looper, nodeSet)

    # Checking whether one of the replicas of Node A nominated itself
    timeout = waits.expectedPoolNominationTimeout(len(nodeSet))
    looper.run(eventually(checkNomination, A, A.name,
                          retryWait=1, timeout=timeout))

    timeout = waits.expectedPoolNominationTimeout(len(nodeSet))
    for n in nodesBCD:
        # Checking whether Node B, C and D nominated Node A
        looper.run(eventually(checkNomination, n, A.name,
                              retryWait=1, timeout=timeout))

    checkProtocolInstanceSetup(looper=looper, nodes=nodeSet, retryWait=1)
    assert A.hasPrimary
def test_slow_nodes_catchup_before_selecting_primary_in_new_view(
        tconf, looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        one_node_added):
    """
    Delay 3PC messages to one node and view change messages to some others
    (including primary) so the node that does not receive enough 3PC messages
    is behind but learns of the view change quickly and starts catchup.
    Other nodes learn of the view change late and thus keep on processing
    requests
    """
    new_node = one_node_added
    nprs = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)]
    primary_node = getPrimaryReplica(txnPoolNodeSet, 0).node
    slow_node = nprs[-1]
    # nodes_slow_to_inst_chg = [primary_node] + nprs[:2]
    nodes_slow_to_inst_chg = [n for n in txnPoolNodeSet if n != slow_node]
    delay_3pc = 100
    delay_ic = 5

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)

    delay_3pc_messages([slow_node], 0, delay_3pc)

    for n in nodes_slow_to_inst_chg:
        n.nodeIbStasher.delay(icDelay(delay_ic))

    def start_count():
        return sum([1 for e in slow_node.ledgerManager.spylog.getAll(
            slow_node.ledgerManager.startCatchUpProcess.__name__)
            if e.params['ledgerId'] == DOMAIN_LEDGER_ID])

    s = start_count()
    requests = sdk_send_random_requests(looper, sdk_pool_handle,
                                        sdk_wallet_client,
                                        10 * Max3PCBatchSize)

    ensure_view_change(looper, nodes=txnPoolNodeSet,
                       exclude_from_check=nodes_slow_to_inst_chg)

    sdk_get_and_check_replies(looper, requests)

    waitNodeDataEquality(looper, slow_node, *txnPoolNodeSet[:-1])

    e = start_count()
    assert e - s >= 2

    looper.run(eventually(checkViewNoForNodes, slow_node.viewNo))
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    waitNodeDataEquality(looper, new_node, *nodes_slow_to_inst_chg)
def testNodeDiscardMessageFromUnknownView(txnPoolNodeSet,
                                          nodeSetWithNodeAddedAfterSomeTxns,
                                          newNodeCaughtUp, tdirWithPoolTxns,
                                          tconf, allPluginsPath):
    """
    Node discards 3-phase and election messages from view nos that it does
    not know of (view nos before it joined the pool)
    :return:
    """
    looper, nodeX, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    viewNo = nodeX.viewNo

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 4)
    looper.run(eventually(partial(checkViewNoForNodes, txnPoolNodeSet,
                                  viewNo + 1), retryWait=1, timeout=20))

    newStewardName = "testClientSteward" + randomString(3)
    nodeName = "Theta"
    _, _, nodeTheta = addNewStewardAndNode(looper, client, wallet,
                                           newStewardName, nodeName,
                                           tdirWithPoolTxns, tconf,
                                           allPluginsPath)
    txnPoolNodeSet.append(nodeTheta)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    looper.run(client.ensureConnectedToNodes())
    looper.run(eventually(checkNodeLedgersForEquality, nodeTheta,
                          *txnPoolNodeSet[:-1], retryWait=1, timeout=5))
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1,
                               timeout=10)

    electMsg = Nomination(nodeX.name, 0, viewNo)
    threePMsg = PrePrepare(
        0,
        viewNo,
        10,
        wallet.defaultId,
        wallet._getIdData().lastReqId + 1,
        "random digest",
        time.time()
    )
    ridTheta = nodeX.nodestack.getRemote(nodeTheta.name).uid

    nodeX.send(electMsg, ridTheta)
    nodeX.send(threePMsg, ridTheta)
    nodeX.send(electMsg, ridTheta)
    looper.run(eventually(checkDiscardMsg, [nodeTheta, ], electMsg,
                          'un-acceptable viewNo', retryWait=1, timeout=5))
    nodeX.send(threePMsg, ridTheta)
    looper.run(eventually(checkDiscardMsg, [nodeTheta, ], threePMsg,
                          'un-acceptable viewNo', retryWait=1, timeout=5))
def test_view_change_without_primary(nodeSet, looper,
                                     patched_view_change_timeout):
    first, others = stop_nodes_and_remove_first(looper, nodeSet)

    start_and_connect_nodes(looper, others)

    timeout = waits.expectedPoolElectionTimeout(len(nodeSet)) + \
        patched_view_change_timeout

    checkProtocolInstanceSetup(looper=looper, nodes=others, retryWait=1,
                               customTimeout=timeout,
                               numInstances=getRequiredInstances(len(nodeSet)))
def test_view_change_without_primary(txnPoolNodeSet, looper,
                                     patched_view_change_timeout):
    first, others = stop_nodes_and_remove_first(looper, txnPoolNodeSet)

    start_and_connect_nodes(looper, others)

    timeout = waits.expectedPoolElectionTimeout(len(txnPoolNodeSet)) + \
        patched_view_change_timeout

    # looper.runFor(40)

    checkProtocolInstanceSetup(looper=looper, nodes=txnPoolNodeSet,
                               retryWait=1, customTimeout=timeout,
                               instances=range(
                                   getRequiredInstances(len(txnPoolNodeSet))))
def testPrimaryElectionWithAClearWinner(electContFixture, looper,
                                        keySharedNodes):
    """
    Primary selection (Sunny Day)
    A, B, C, D, E
    A, B, C, D startup. E is lagging.
    A sees the minimum number of nodes first, and then sends out a NOMINATE(A)
    message
    B, C, D all see the NOMINATE(A) message from A, and respond with
    NOMINATE(A) message to all other nodes

    A sees three other NOMINATE(A) votes (from B, C, D)
    A sees that A is the clear winner (2f+1 total), and sends PRIMARY(A) to
    all nodes
    B sees two more NOMINATE(A) votes (from C and D)
    B sees that A is the clear winner (2f+1 total), and sends PRIMARY(A) to
    all nodes
    C sees two more NOMINATE(A) votes (from B and D)
    C sees that A is the clear winner (2f+1 total), and sends PRIMARY(A) to
    all nodes
    D sees two more NOMINATE(A) votes (from B and C)
    D sees that A is the clear winner (2f+1 total), and sends PRIMARY(A) to
    all nodes

    A sees at least two other PRIMARY(A) votes (3 including its own) and
    selects A as primary
    B sees at least two other PRIMARY(A) votes (3 including its own) and
    selects A as primary
    C sees at least two other PRIMARY(A) votes (3 including its own) and
    selects A as primary
    D sees at least two other PRIMARY(A) votes (3 including its own) and
    selects A as primary
    """
    nodeSet = keySharedNodes
    A, B, C, D = nodeSet.nodes.values()
    nodesBCD = [B, C, D]
    checkPoolReady(looper, nodeSet)

    # Checking whether one of the replicas of Node A nominated itself
    looper.run(eventually(checkNomination, A, A.name, retryWait=1,
                          timeout=10))

    for n in nodesBCD:
        # Checking whether Node B, C and D nominated Node A
        looper.run(eventually(checkNomination, n, A.name, retryWait=1,
                              timeout=10))

    checkProtocolInstanceSetup(looper=looper, nodes=nodeSet, retryWait=1,
                               timeout=10)
    assert A.hasPrimary
def testNodeDiscardMessageFromUnknownView(txnPoolNodeSet,
                                          nodeSetWithNodeAddedAfterSomeTxns,
                                          newNodeCaughtUp, tdirWithPoolTxns,
                                          tconf, allPluginsPath):
    """
    Node discards 3-phase or ViewChangeDone messages from view nos that it
    does not know of (view nos before it joined the pool)
    :return:
    """
    looper, nodeX, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    viewNo = nodeX.viewNo

    # Force two view changes: node discards msgs which have viewNo
    # at least two less than node's. Current protocol implementation
    # needs to hold messages from the previous view as well as
    # from the current view.
    for i in range(2):
        ensure_view_change(looper, txnPoolNodeSet)
        waitNodeDataEquality(looper, nodeX, *txnPoolNodeSet[:-1])
        checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)

    sender = txnPoolNodeSet[0]
    rid_x_node = sender.nodestack.getRemote(nodeX.name).uid
    messageTimeout = waits.expectedNodeToNodeMessageDeliveryTime()

    # 3 pc msg (PrePrepare) needs to be discarded
    primaryRepl = getPrimaryReplica(txnPoolNodeSet)
    three_pc = PrePrepare(
        0,
        viewNo,
        10,
        time.time(),
        [[wallet.defaultId, wallet._getIdData().lastReqId + 1]],
        1,
        "random digest",
        DOMAIN_LEDGER_ID,
        primaryRepl.stateRootHash(DOMAIN_LEDGER_ID),
        primaryRepl.txnRootHash(DOMAIN_LEDGER_ID),
    )
    sender.send(three_pc, rid_x_node)
    looper.run(eventually(checkDiscardMsg, [nodeX, ], three_pc,
                          'un-acceptable viewNo',
                          retryWait=1, timeout=messageTimeout))
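# Illustrative sketch (simplified, hypothetical check rather than the node's
# actual implementation): per the comment in the test above, a node keeps
# messages from the current and the immediately previous view and discards
# anything older, which is the behaviour the stale PrePrepare provokes.
def _should_discard_for_view(msg_view_no, node_view_no):
    return msg_view_no < node_view_no - 1

assert _should_discard_for_view(msg_view_no=0, node_view_no=2)      # two views old
assert not _should_discard_for_view(msg_view_no=1, node_view_no=2)  # previous view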
def testPrimaryElectionContested(electContFixture, looper, txnPoolNodeSet):
    """
    Primary selection (Rainy Day)
    A, B, C, D, E
    A, B, C, D startup. E is lagging.
    A sees the minimum number of nodes, and then sends Nominate(A)
    At the same exact time, B sees the minimum number of nodes, and then sends
    out Nominate(B)
    A sees B sending Nominate(B), but it has already nominated itself, so it
    does nothing
    B sees A sending Nominate(A), but it has already nominated itself, so it
    does nothing
    C sees A sending Nominate(A), and sends Nominate(A)
    D sees A sending Nominate(A), and sends Nominate(A)
    All nodes see that B nominated B and A, C, and D all nominated A
    Because the votes for A exceed the votes for B, all send out Primary(A)
    TODO's (see below)
    All see the others have sent Primary A, and then the nodes record who is
    the Primary.
    """
    A, B, C, D = txnPoolNodeSet

    checkPoolReady(looper, txnPoolNodeSet)

    logger.debug("Check nomination")
    timeout = waits.expectedPoolNominationTimeout(nodeCount)

    # Checking whether Node A nominated itself
    looper.run(eventually(checkNomination, A, A.name,
                          retryWait=1, timeout=timeout))
    # Checking whether Node B nominated itself
    looper.run(eventually(checkNomination, B, B.name,
                          retryWait=1, timeout=timeout))

    for n in [C, D]:
        # Checking whether Node C and Node D nominated Node A
        looper.run(eventually(checkNomination, n, A.name,
                              retryWait=1, timeout=timeout))

    checkProtocolInstanceSetup(looper=looper, nodes=txnPoolNodeSet,
                               retryWait=1)

    # Node D should not be primary
    assert not D.hasPrimary
    # A should have at least one primary
    assert A.hasPrimary
def testNodeDiscardMessageFromUnknownView(
        txnPoolNodeSet, sdk_node_set_with_node_added_after_some_txns,
        sdk_new_node_caught_up, allPluginsPath, wallet1):
    """
    Node discards 3-phase or ViewChangeDone messages from view nos that it
    does not know of (view nos before it joined the pool)
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    viewNo = new_node.viewNo

    # Force two view changes: node discards msgs which have viewNo
    # at least two less than node's. Current protocol implementation
    # needs to hold messages from the previous view as well as
    # from the current view.
    for i in range(2):
        ensure_view_change(looper, txnPoolNodeSet)
        waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
        checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)

    sender = txnPoolNodeSet[0]
    rid_x_node = sender.nodestack.getRemote(new_node.name).uid
    messageTimeout = waits.expectedNodeToNodeMessageDeliveryTime()

    # 3 pc msg (PrePrepare) needs to be discarded
    primaryRepl = getPrimaryReplica(txnPoolNodeSet)
    three_pc = PrePrepare(
        0,
        viewNo,
        10,
        get_utc_epoch(),
        [[wallet1.defaultId, Request.gen_req_id()]],
        1,
        "random digest",
        DOMAIN_LEDGER_ID,
        primaryRepl.stateRootHash(DOMAIN_LEDGER_ID),
        primaryRepl.txnRootHash(DOMAIN_LEDGER_ID),
    )
    sender.send(three_pc, rid_x_node)
    looper.run(eventually(checkDiscardMsg, [new_node, ], three_pc,
                          'un-acceptable viewNo',
                          retryWait=1, timeout=messageTimeout))
def test_view_change_without_primary(txnPoolNodeSet, looper, tconf):
    first, others = stop_nodes_and_remove_first(looper, txnPoolNodeSet)

    start_and_connect_nodes(looper, others)

    timeout = waits.expectedPoolElectionTimeout(
        len(txnPoolNodeSet)) + tconf.NEW_VIEW_TIMEOUT

    # looper.runFor(40)

    checkProtocolInstanceSetup(looper=looper, nodes=txnPoolNodeSet,
                               retryWait=1, customTimeout=timeout,
                               instances=range(
                                   getRequiredInstances(len(txnPoolNodeSet))))
def testNodeDiscardMessageFromUnknownView(
        txnPoolNodeSet, sdk_node_set_with_node_added_after_some_txns,
        sdk_new_node_caught_up, allPluginsPath, sdk_wallet_client):
    """
    Node discards 3-phase or ViewChangeDone messages from view nos that it
    does not know of (view nos before it joined the pool)
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    viewNo = new_node.viewNo

    # Force two view changes: node discards msgs which have viewNo
    # at least two less than node's. Current protocol implementation
    # needs to hold messages from the previous view as well as
    # from the current view.
    for i in range(2):
        ensure_view_change(looper, txnPoolNodeSet)
        waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
        checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)

    sender = txnPoolNodeSet[0]
    rid_x_node = sender.nodestack.getRemote(new_node.name).uid
    messageTimeout = waits.expectedNodeToNodeMessageDeliveryTime()

    # 3 pc msg (PrePrepare) needs to be discarded
    _, did = sdk_wallet_client
    primaryRepl = getPrimaryReplica(txnPoolNodeSet)
    three_pc = PrePrepare(
        0,
        viewNo,
        10,
        get_utc_epoch(),
        ["random request digest"],
        init_discarded(),
        "random digest",
        DOMAIN_LEDGER_ID,
        primaryRepl.stateRootHash(DOMAIN_LEDGER_ID),
        primaryRepl.txnRootHash(DOMAIN_LEDGER_ID),
        0,
        True
    )
    sender.send(three_pc, rid_x_node)
    looper.run(eventually(checkDiscardMsg, [new_node, ], three_pc,
                          'un-acceptable viewNo',
                          retryWait=1, timeout=messageTimeout))
def testPrimarySelectionAfterPoolReady(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward):
    """
    Once the pool is ready (node has connected to at least 3 other nodes),
    appropriate primary replicas should be selected.
    """

    def checkPrimaryPlacement():
        # Node names sorted by rank
        sortedNodes = sorted(txnPoolNodeSet, key=operator.attrgetter("rank"))
        for idx, node in enumerate(sortedNodes):
            # For instance 0, the primary replica should be on the node with
            # rank 0
            if idx == 0:
                Replica.generateName(sortedNodes[idx].name, 0)
                assert node.replicas[0].isPrimary
                assert not node.replicas[1].isPrimary
                assert not node.replicas[2].isPrimary

            # For instance 1, the primary replica should be on the node with
            # rank 1
            if idx == 1:
                Replica.generateName(sortedNodes[idx].name, 1)
                assert not node.replicas[0].isPrimary
                assert node.replicas[1].isPrimary
                assert not node.replicas[2].isPrimary

            # For instance 2, the primary replica should be on the node with
            # rank 2
            if idx == 2:
                Replica.generateName(sortedNodes[idx].name, 2)
                assert not node.replicas[0].isPrimary
                assert not node.replicas[1].isPrimary
                assert node.replicas[2].isPrimary

    check_rank_consistent_across_each_node(txnPoolNodeSet)
    # Check if the primary is on the correct node
    timeout = waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))
    looper.run(eventually(checkPrimaryPlacement, retryWait=1,
                          timeout=timeout))

    # Check if every protocol instance has one and only one primary and any
    # node has no more than one primary
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)
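# Illustrative sketch (hypothetical helper, assuming the round-robin placement
# that checkPrimaryPlacement verifies above): in the initial view (v = 0) the
# primary replica of instance i is hosted by the node of rank i.
def _initial_primary_rank(inst_id, n_nodes):
    return inst_id % n_nodes    # reduces to inst_id while inst_id < n_nodes

assert [_initial_primary_rank(i, 4) for i in range(3)] == [0, 1, 2]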
def testNodeDiscardMessageFromUnknownView(
        txnPoolNodeSet, sdk_node_set_with_node_added_after_some_txns,
        sdk_new_node_caught_up, allPluginsPath, sdk_wallet_client):
    """
    Node discards 3-phase or ViewChangeDone messages from view nos that it
    does not know of (view nos before it joined the pool)
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    viewNo = new_node.viewNo
    pp_seq_no = get_pp_seq_no(txnPoolNodeSet)

    # Force a view change: the node discards msgs which have viewNo
    # at least two less than its own. The current protocol implementation
    # needs to hold messages from the previous view as well as
    # from the current view.
    for i in range(1):
        ensure_view_change(looper, txnPoolNodeSet)
        waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
        checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
        pp_seq_no += 1

    sender = txnPoolNodeSet[1]
    rid_x_node = sender.nodestack.getRemote(new_node.name).uid
    messageTimeout = waits.expectedNodeToNodeMessageDeliveryTime()

    # 3 pc msg (PrePrepare) needs to be discarded
    _, did = sdk_wallet_client
    primaryRepl = getPrimaryReplica(txnPoolNodeSet)
    inst_id = 0
    three_pc = create_pre_prepare_no_bls(
        primaryRepl.node.db_manager.get_state_root_hash(DOMAIN_LEDGER_ID),
        viewNo,
        pp_seq_no=pp_seq_no + 1,
        inst_id=inst_id)
    sender.send(three_pc, rid_x_node)
    looper.run(eventually(checkDiscardMsg,
                          [new_node.replicas[inst_id].stasher, ],
                          three_pc, OLD_VIEW,
                          retryWait=1, timeout=messageTimeout))
def testPrimarySelectionAfterViewChange(looper, nodeSet, ready,
                                        primaryReplicas, viewChangeDone):
    """
    Test that primary replica of a protocol instance shifts to a new node
    after a view change.
    """
    # Primary replicas before view change
    prBeforeVC = primaryReplicas

    # Primary replicas after view change
    instanceCount = getNoInstances(nodeCount)
    prAfterVC = [getPrimaryReplica(nodeSet, i) for i in range(instanceCount)]

    # Primary replicas have moved to the next node
    for br, ar in zip(prBeforeVC, prAfterVC):
        assert ar.node.rank - br.node.rank == 1

    checkProtocolInstanceSetup(looper, nodeSet, retryWait=1, timeout=5)
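# Illustrative sketch (hypothetical helper, assuming the round-robin primary
# selection these tests exercise): with n nodes, the primary of instance i in
# view v sits on the node of rank (v + i) % n, so incrementing the view moves
# every instance's primary to the node of the next rank. That is exactly the
# `ar.node.rank - br.node.rank == 1` assertion above.
def _primary_rank(view_no, inst_id, n_nodes):
    return (view_no + inst_id) % n_nodes

assert all(_primary_rank(1, i, 4) == (_primary_rank(0, i, 4) + 1) % 4
           for i in range(2))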
def testAdd2NewNodes(looper, txnPoolNodeSet, tdirWithPoolTxns, tconf,
                     steward1, stewardWallet, allPluginsPath):
    """
    Add 2 new nodes to trigger replica addition and primary election
    """
    new_nodes = add_2_nodes(looper, txnPoolNodeSet, steward1, stewardWallet,
                            tdirWithPoolTxns, tconf, allPluginsPath)

    for n in new_nodes:
        logger.debug("{} connected to the pool".format(n))

    f = getMaxFailures(len(txnPoolNodeSet))

    def checkFValue():
        for node in txnPoolNodeSet:
            assert node.f == f
            assert len(node.replicas) == (f + 1)

    timeout = waits.expectedClientToPoolConnectionTimeout(len(txnPoolNodeSet))
    looper.run(eventually(checkFValue, retryWait=1, timeout=timeout))
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
def testPrimarySelectionAfterPoolReady(looper, nodeSet, ready):
    """
    Once the pool is ready (node has connected to at least 3 other nodes),
    appropriate primary replicas should be selected.
    """

    def checkPrimaryPlacement():
        # Node names sorted by rank
        sortedNodeNames = sorted(nodeSet.nodes.values(),
                                 key=operator.attrgetter("rank"))
        for idx, node in enumerate(sortedNodeNames):
            # For instance 0, the primary replica should be on the node with
            # rank 0
            if idx == 0:
                Replica.generateName(sortedNodeNames[idx], 0)
                assert node.replicas[0].isPrimary
                assert not node.replicas[1].isPrimary
                assert not node.replicas[2].isPrimary

            # For instance 1, the primary replica should be on the node with
            # rank 1
            if idx == 1:
                Replica.generateName(sortedNodeNames[idx], 1)
                assert not node.replicas[0].isPrimary
                assert node.replicas[1].isPrimary
                assert not node.replicas[2].isPrimary

            # For instance 2, the primary replica should be on the node with
            # rank 2
            if idx == 2:
                Replica.generateName(sortedNodeNames[idx], 2)
                assert not node.replicas[0].isPrimary
                assert not node.replicas[1].isPrimary
                assert node.replicas[2].isPrimary

    # Check if the primary is on the correct node
    looper.run(eventually(checkPrimaryPlacement, retryWait=1, timeout=10))

    # Check if every protocol instance has one and only one primary and any
    # node has no more than one primary
    checkProtocolInstanceSetup(looper, nodeSet, retryWait=1, timeout=5)
def testAdd2NewNodes(looper, txnPoolNodeSet,
                     sdk_pool_handle, sdk_wallet_steward,
                     tdir, tconf, allPluginsPath):
    """
    Add 2 new nodes to trigger replica addition and primary election
    """
    new_nodes = sdk_add_2_nodes(looper, txnPoolNodeSet, sdk_pool_handle,
                                sdk_wallet_steward, tdir, tconf,
                                allPluginsPath)

    for n in new_nodes:
        logger.debug("{} connected to the pool".format(n))

    f = getMaxFailures(len(txnPoolNodeSet))

    def checkFValue():
        for node in txnPoolNodeSet:
            assert node.f == f
            assert len(node.replicas) == (f + 1)

    timeout = waits.expectedClientToPoolConnectionTimeout(len(txnPoolNodeSet))
    looper.run(eventually(checkFValue, retryWait=1, timeout=timeout))
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    sdk_pool_refresh(looper, sdk_pool_handle)
def test_no_propagated_future_view_change_while_view_change(txnPoolNodeSet,
                                                            looper):
    # the last node is a lagging one, which will receive ViewChangeDone
    # messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[-1]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate view change in progress
    lagged_node.view_changer.view_change_in_progress = True
    old_view_no = checkViewNoForNodes([lagged_node])

    initial_vhdc = \
        lagged_node.view_changer.spylog.count(
            lagged_node.view_changer.process_future_view_vchd_msg.__name__)

    # delay INSTANCE_CHANGE on the lagged node, so all nodes except the
    # lagging one finish the View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                   instances=range(2))
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        # check that the lagged node received a Future VCD from each of the
        # other nodes, but didn't start a new view change
        assert len(other_nodes) + initial_vhdc == \
            lagged_node.view_changer.spylog.count(
                lagged_node.view_changer.process_future_view_vchd_msg.__name__)
        assert old_view_no == checkViewNoForNodes([lagged_node])
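# Illustrative sketch (a simplified, hypothetical analogue of the delay_rules
# helper used above, not plenum's implementation): the value of the pattern is
# that message delaying is scoped, so the INSTANCE_CHANGE delay is guaranteed
# to be removed when the `with` block exits.
from contextlib import contextmanager

@contextmanager
def _delay_rules(stasher, *rules):
    for rule in rules:
        stasher.delay(rule)     # start delaying matching messages
    try:
        yield
    finally:
        # resetDelays() is an assumed clean-up hook here; the real helper also
        # re-processes the messages that were held back
        stasher.resetDelays()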
def test_different_ledger_request_interleave(tconf, looper, txnPoolNodeSet,
                                             sdk_one_node_added,
                                             tdir, tdirWithPoolTxns,
                                             allPluginsPath,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             sdk_wallet_steward):
    """
    Send pool and domain ledger requests such that they interleave, and do
    view change in between and verify the pool is functional
    """
    new_node = sdk_one_node_added
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # Send domain ledger requests but don't wait for replies
    requests = sdk_send_random_requests(looper, sdk_pool_handle,
                                        sdk_wallet_client, 2)

    # Add another node by sending pool ledger request
    _, new_theta = sdk_node_theta_added(looper, txnPoolNodeSet, tdir, tconf,
                                        sdk_pool_handle, sdk_wallet_steward,
                                        allPluginsPath, name='new_theta')

    # Send more domain ledger requests but don't wait for replies
    requests.extend(sdk_send_random_requests(looper, sdk_pool_handle,
                                             sdk_wallet_client, 3))

    # Do view change without waiting for replies
    ensure_view_change(looper, nodes=txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)

    # Make sure all requests are completed
    total_timeout = sdk_eval_timeout(len(requests), len(txnPoolNodeSet))
    sdk_get_and_check_replies(looper, requests, timeout=total_timeout)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)

    new_steward_wallet, steward_did = sdk_add_new_nym(looper,
                                                      sdk_pool_handle,
                                                      sdk_wallet_steward,
                                                      'another_ste',
                                                      role='STEWARD')

    # Send another pool ledger request (NODE) but don't wait for completion of
    # request
    next_node_name = 'next_node'

    sigseed, verkey, bls_key, nodeIp, nodePort, clientIp, clientPort, \
        key_proof = prepare_new_node_data(tconf, tdir, next_node_name)
    node_req = looper.loop.run_until_complete(
        prepare_node_request(steward_did,
                             new_node_name=next_node_name,
                             clientIp=clientIp,
                             clientPort=clientPort,
                             nodeIp=nodeIp,
                             nodePort=nodePort,
                             bls_key=bls_key,
                             sigseed=sigseed,
                             key_proof=key_proof))

    sdk_wallet = (new_steward_wallet, steward_did)
    request_couple = sdk_sign_and_send_prepared_request(looper, sdk_wallet,
                                                        sdk_pool_handle,
                                                        node_req)

    # Send more domain ledger requests but don't wait for replies
    request_couples = [request_couple,
                       *sdk_send_random_requests(looper, sdk_pool_handle,
                                                 sdk_wallet_client, 5)]

    # Make sure all requests are completed
    total_timeout = sdk_eval_timeout(len(request_couples),
                                     len(txnPoolNodeSet))
    sdk_get_and_check_replies(looper, request_couples, timeout=total_timeout)

    # Make sure pool is functional
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
def pool(looper, nodeSet):
    for n in nodeSet:  # type: TestNode
        n.startKeySharing()
    looper.run(checkNodesConnected(nodeSet))
    checkProtocolInstanceSetup(looper, nodeSet, timeout=5)
    return adict(looper=looper, nodeset=nodeSet)
def test_slow_nodes_catchup_before_selecting_primary_in_new_view(
        looper,
        txnPoolNodeSet,
        steward1,
        stewardWallet,
        tconf,
        slow_node):
    """
    Delay 3PC messages to one node and then cause a view change, so that by
    the time the view change happens (each node gets >n-f `INSTANCE_CHANGE`s)
    the slow node is behind the other nodes. It should then initiate catchup
    to come to the same state as the other nodes.
    """
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = tconf.PerfCheckFreq

    # Bad network introduced
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(2 * delay, 0))
    slow_node.nodeIbStasher.delay(cDelay(3 * delay, 0))
    for i in range(2):
        sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1,
                                            20)
    waitNodeDataInequality(looper, slow_node, *fast_nodes)

    catchup_reply_counts = {
        n.name: n.ledgerManager.spylog.count(
            n.ledgerManager.processCatchupRep)
        for n in txnPoolNodeSet
    }
    catchup_done_counts = {
        n.name: n.spylog.count(n.allLedgersCaughtUp)
        for n in txnPoolNodeSet
    }

    def slow_node_processed_some():
        # The slow node has received some PRE-PREPAREs
        assert slow_node.master_replica.batches

    looper.run(
        eventually(slow_node_processed_some, retryWait=1, timeout=delay))

    # No reverts have been called by the slow node
    rv = getAllReturnVals(slow_node.replicas[0],
                          TestReplica.revert_unordered_batches)
    assert not rv or max(rv) == 0

    # Delay reception of catchup replies so that ViewChangeDone can be
    # received before catchup completes
    delay_catchup_reply = 2
    slow_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply))

    ensure_view_change(looper, txnPoolNodeSet)
    # `slow_node` will not have elections done, but the others will
    checkProtocolInstanceSetup(looper, fast_nodes,
                               numInstances=len(slow_node.replicas),
                               retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # `slow_node` does catchup, `fast_nodes` don't
    for n in txnPoolNodeSet:
        assert n.spylog.count(
            n.allLedgersCaughtUp) > catchup_done_counts[n.name]
        if n == slow_node:
            assert n.ledgerManager.spylog.count(
                n.ledgerManager.processCatchupRep) > catchup_reply_counts[
                n.name]
        else:
            assert n.ledgerManager.spylog.count(
                n.ledgerManager.processCatchupRep) == catchup_reply_counts[
                n.name]

    # More than 0 batches were reverted by the slow node
    assert max(
        getAllReturnVals(
            slow_node.master_replica,
            slow_node.master_replica.revert_unordered_batches)) > 0

    # Bad network repaired
    slow_node.reset_delays_and_process_delayeds()

    # Make sure the pool is functional
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 5)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
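The `ppDelay`/`pDelay`/`cDelay` helpers above follow one convention: a delayer is a callable that inspects an incoming message and returns the number of seconds to hold it, or None to let it pass. A toy sketch of that shape; the message-matching details (tuple unpacking, the `instId` attribute) are assumptions for illustration, not the real helpers' internals:

def delay_msgs_sketch(seconds: float, msg_type: type, inst_id: int = 0):
    """Build a delayer that holds msg_type messages for instance inst_id."""
    def delayer(item):
        msg = item[0] if isinstance(item, tuple) else item
        if isinstance(msg, msg_type) and \
                getattr(msg, 'instId', None) == inst_id:
            return seconds  # the stasher holds this message for `seconds`
        return None         # everything else passes through immediately
    return delayer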
def test_caught_up_for_current_view_check(looper,
                                          txnPoolNodeSet,
                                          sdk_pool_handle,
                                          sdk_wallet_client):
    """
    One of the nodes experiences a poor network and loses 3PC messages. It
    has to do multiple rounds of catchup to be caught up.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client,
                              3 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    bad_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != bad_node]
    orig_method = bad_node.master_replica.dispatchThreePhaseMsg

    # The bad node does not process any 3-phase messages, equivalent to the
    # messages being lost
    def bad_method(self, m, s):
        pass

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        bad_method, bad_node.master_replica)

    # Delay LEDGER_STATUS on the slow node, so that only
    # MESSAGE_REQUEST(LEDGER_STATUS) is sent and the node catches up twice.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from the
    # slow node and return a consistency proof for all missing txns, so no
    # stashed ones are applied
    bad_node.nodeIbStasher.delay(lsDelay(1000))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client,
                              6 * Max3PCBatchSize)
    waitNodeDataInequality(looper, bad_node, *other_nodes)

    # Patch all nodes to return a ConsistencyProof of a smaller ledger to
    # the bad node, but only once, so that the bad node needs to do catchup
    # again
    make_a_node_catchup_twice(bad_node, other_nodes, DOMAIN_LEDGER_ID,
                              Max3PCBatchSize)

    def is_catchup_needed_count():
        return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed,
                                    compare_val_to=True))

    def is_catchup_not_needed_count():
        return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed,
                                    compare_val_to=False))

    def has_ordered_till_last_prepared_certificate_count():
        return len(getAllReturnVals(
            bad_node,
            bad_node.has_ordered_till_last_prepared_certificate,
            compare_val_to=True))

    old_count_1 = is_catchup_needed_count()
    old_count_2 = has_ordered_till_last_prepared_certificate_count()
    old_count_3 = is_catchup_not_needed_count()
    ensure_view_change(looper, txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert is_catchup_needed_count() > old_count_1
    assert is_catchup_not_needed_count() > old_count_3
    # The bad node caught up due to ordering till the last prepared
    # certificate
    assert has_ordered_till_last_prepared_certificate_count() > old_count_2

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        orig_method, bad_node.master_replica)
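The patch-and-restore idiom above relies on `types.MethodType`, which binds a plain function to a single instance so only that object's behaviour changes. A self-contained illustration; the `Dispatcher` class and its method are invented for the example:

import types

class Dispatcher:
    def dispatch(self, msg, sender):
        print("processing", msg, "from", sender)

d = Dispatcher()
orig = d.dispatch                   # keep a handle so we can restore it

def dropped(self, msg, sender):     # swallow everything: "lost" messages
    pass

d.dispatch = types.MethodType(dropped, d)
d.dispatch("COMMIT", "Alpha")       # silently dropped
d.dispatch = orig                   # restore the original behaviour
d.dispatch("COMMIT", "Alpha")       # processed again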
def testPrimaryElectionWithTie(electTieFixture, looper, keySharedNodes):
    """
    Primary selection (Rainy Day)
    A, B, C, D, E
    A, B, C, D startup. E is lagging.
    A sees the minimum number of nodes, and then sends out Nominate(A)
    At the same exact time, B sees the minimum number of nodes, and then
    sends out Nominate(B)
    A sees B sending Nominate(B), but it has already nominated itself, so it
    does nothing
    B sees A sending Nominate(A), but it has already nominated itself, so it
    does nothing
    C sees A sending Nominate(A), and sends Nominate(A)
    D sees B sending Nominate(B), and sends Nominate(B)
    There's a split. C and A think A is the primary, B and D think B is the
    primary
    All nodes can see that there is a split. Each sends out Reelection([A,B])

    A and B both see Reelection([A,B]) from themselves as well as from the
    other 3 (the number from others should be at least f+1), then

    1. they wait a random amount of time (between 0 and 2 seconds),
    2. they each send out a Nominate(self)

    Voting is repeated until we have a good election.
    """
    # TODO: Optimize the sending of messages in batches; for example, don't
    #  send messages more often than every 400 milliseconds. Once those 400
    #  millis have passed, send the queued messages in one batch.

    nodeSet = keySharedNodes
    A, B, C, D = nodeSet.nodes.values()

    checkPoolReady(looper, nodeSet.nodes.values())

    for node in nodeSet.nodes.values():
        for instId, replica in enumerate(node.elector.replicas):
            logger.debug("replica {} {} with votes {}".format(
                replica.name, replica.instId,
                node.elector.nominations.get(instId, {})))

    nominationTimeout = waits.expectedPoolNominationTimeout(len(nodeSet))
    logger.debug("Check nomination")
    # Checking whether Node A nominated itself
    looper.run(eventually(checkNomination, A, A.name, retryWait=1,
                          timeout=nominationTimeout))

    # Checking whether Node B nominated itself
    looper.run(eventually(checkNomination, B, B.name, retryWait=1,
                          timeout=nominationTimeout))

    # Checking whether Node C nominated Node A
    looper.run(eventually(checkNomination, C, A.name, retryWait=1,
                          timeout=nominationTimeout))

    # Checking whether Node D nominated Node B
    looper.run(eventually(checkNomination, D, B.name, retryWait=1,
                          timeout=nominationTimeout))

    # No node should be primary
    for node in nodeSet.nodes.values():
        assert node.hasPrimary is False

    for node in nodeSet.nodes.values():
        node.resetDelays()

    checkProtocolInstanceSetup(looper=looper, nodes=nodeSet, retryWait=1)
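The docstring's tie handling can be condensed into a small decision rule: when the top two candidates have equal vote counts, propose a re-election over the tied pair and re-nominate after a random 0-2 second back-off. A toy model of that rule, not the real elector:

import random
from collections import Counter

def resolve_nominations_sketch(nominations: dict):
    """nominations maps voter name -> nominated node name."""
    tally = Counter(nominations.values()).most_common(2)
    if len(tally) == 2 and tally[0][1] == tally[1][1]:
        tied = sorted(name for name, _ in tally)
        backoff = random.uniform(0, 2)      # the 0-2s wait from the docstring
        return 'REELECTION', tied, backoff  # re-nominate after the back-off
    return 'PRIMARY', tally[0][0], 0.0

With the split above, resolve_nominations_sketch({'A': 'A', 'B': 'B', 'C': 'A', 'D': 'B'}) yields a re-election over ['A', 'B'] with a random back-off, matching steps 1-2 of the docstring.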
def test_slow_node_reverts_unordered_state_during_catchup(looper,
                                                          txnPoolNodeSet,
                                                          sdk_pool_handle,
                                                          sdk_wallet_client):
    """
    Delay COMMITs to a node such that when it needs to catch up, it needs to
    revert some unordered state. Till this time the node should have
    received all COMMITs, so that it will apply some of the COMMITs (for
    which it has not received txns from catchup).
    For this, delay COMMITs for a long time, do catchup for a state a little
    older than the one received in LedgerStatus; once catchup completes,
    reset the delays and try to process the delayed COMMITs. Some COMMITs
    will be rejected but some will be processed, since catchup was done for
    an older ledger.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 25

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))
    # Delay LEDGER_STATUS on the slow node, so that only
    # MESSAGE_REQUEST(LEDGER_STATUS) is sent and the node catches up twice.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from the
    # slow node and return a consistency proof for all missing txns, so no
    # stashed ones are applied
    slow_node.nodeIbStasher.delay(lsDelay(1000))

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catch up
    delay_batches = 2
    make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                             delay_batches * Max3PCBatchSize)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)

    # `slow_node` is slow to receive CatchupRep, so that it gets a chance to
    # order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    # Start view change (and hence catchup)
    ensure_view_change(looper, txnPoolNodeSet)

    # Check that `last_ordered_3pc` of `other_nodes` is the same
    for n1, n2 in combinations(other_nodes, 2):
        lst_3pc = check_last_ordered_3pc(n1, n2)

    def chk1():
        # `slow_node` has prepared all 3PC messages which `other_nodes`
        # have ordered
        assertEquality(slow_master_replica.last_prepared_before_view_change,
                       lst_3pc)

    looper.run(eventually(chk1, retryWait=1))

    old_pc_count = slow_master_replica.spylog.count(
        slow_master_replica.can_process_since_view_change_in_progress)

    assert len(slow_node.stashedOrderedReqs) == 0

    # Repair the network so COMMITs are received, processed and stashed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica.spylog.count(
            slow_master_replica.can_process_since_view_change_in_progress
        ) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # COMMITs are stashed
        assert len(slow_node.stashedOrderedReqs) == \
            delay_batches * Max3PCBatchSize

    looper.run(eventually(chk3, retryWait=1, timeout=15))

    # Fix catchup, so the node gets a chance to be caught up
    repair_node_catchup_less(other_nodes)

    def chk4():
        # Some COMMITs were ordered but stashed, and they were processed
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert delay_batches in rv

    looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 5))

    def chk5():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(chk5,
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(
                       len(txnPoolNodeSet))))

    # Make sure that the pool is functional
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
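`eventually` is the backbone of all the `chk*` assertions above: it retries a check until it stops raising or a timeout expires. A rough synchronous sketch of that loop; the real helper is coroutine-based and driven by the looper, so this stands in only for the retry semantics:

import time

def eventually_sketch(check, retryWait: float = 1, timeout: float = 15):
    deadline = time.monotonic() + timeout
    while True:
        try:
            return check()        # success: return the check's result
        except Exception:
            if time.monotonic() >= deadline:
                raise             # give up: surface the last failure
            time.sleep(retryWait)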
def test_slow_node_reverts_unordered_state_during_catchup(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    """
    Delay COMMITs to a node such that when it needs to catch up, it needs to
    revert some unordered state. Till this time the node should have
    received all COMMITs, so that it will apply some of the COMMITs (for
    which it has not received txns from catchup).
    For this, delay COMMITs for a long time, do catchup for a state a little
    older than the one received in LedgerStatus; once catchup completes,
    reset the delays and try to process the delayed COMMITs. Some COMMITs
    will be rejected but some will be processed, since catchup was done for
    an older ledger.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 25

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))
    # Delay LEDGER_STATUS on the slow node, so that only
    # MESSAGE_REQUEST(LEDGER_STATUS) is sent and the node catches up twice.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from the
    # slow node and return a consistency proof for all missing txns, so no
    # stashed ones are applied
    slow_node.nodeIbStasher.delay(lsDelay(1000))

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catch up
    delay_batches = 2
    make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                             delay_batches * Max3PCBatchSize)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)

    # `slow_node` is slow to receive CatchupRep, so that it gets a chance to
    # order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    old_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc

    # Start view change (and hence catchup)
    ensure_view_change(looper, txnPoolNodeSet)

    # Check that `last_ordered_3pc` of `other_nodes` is the same
    for n1, n2 in combinations(other_nodes, 2):
        check_last_ordered_3pc(n1, n2)

    assert slow_master_replica.last_prepared_before_view_change == \
        old_last_ordered

    old_pc_count = slow_master_replica._ordering_service.spylog.count(
        slow_master_replica._ordering_service._validate)

    assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0

    # Repair the network so COMMITs are received, processed and stashed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica._ordering_service.spylog.count(
            slow_master_replica._ordering_service._validate) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # (delay_batches * Max3PCBatchSize * commits_count_in_phase) COMMITs
        # are stashed
        assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == \
            delay_batches * Max3PCBatchSize * (len(txnPoolNodeSet) - 1)

    looper.run(eventually(chk3, retryWait=1, timeout=15))

    # Fix catchup, so the node gets a chance to be caught up
    repair_node_catchup_less(other_nodes)

    def chk4():
        # Some COMMITs were received but stashed; they will be processed
        # after catchup
        assert slow_node.master_replica.stasher.stash_size(
            STASH_CATCH_UP) == 0

    looper.run(eventually(chk4, retryWait=1,
                          timeout=catchup_rep_delay + 50))

    def chk5():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(chk5,
                   retryWait=1,
                   timeout=waits.expectedPoolCatchupTime(
                       len(txnPoolNodeSet))))

    # Make sure that the pool is functional
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
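The before/after `spylog.count(...)` comparisons used throughout these tests depend on instrumented methods recording each call. A bare-bones sketch of that idiom; plenum's real spy log is attached by the test framework and records arguments and return values too, and keying by name here is a simplification:

from functools import wraps

class SpyLogSketch:
    def __init__(self):
        self._calls = []

    def count(self, method_name: str) -> int:
        return sum(1 for name in self._calls if name == method_name)

def spy_on(log: SpyLogSketch, fn):
    """Wrap fn so every call is recorded in the log by name."""
    @wraps(fn)
    def wrapper(*args, **kwargs):
        log._calls.append(fn.__name__)
        return fn(*args, **kwargs)
    return wrapper

A test then snapshots log.count('allLedgersCaughtUp') before an event and asserts the count grew afterwards, exactly as the assertions above do.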