def test_view_change_by_order_stashed_on_3_nodes_and_catchup_on_1_node(txnPoolNodeSet, looper,
                                                                       sdk_pool_handle,
                                                                       sdk_wallet_steward):
    '''
    - COMMITs are delayed on all nodes.
    - All nodes start a view change with a prepared certificate (for the delayed message).
    - COMMITs arrive during the view change for 3 nodes,
    - so these 3 nodes finish the view change by processing Commit and Ordered msgs
      during the view change (in between rounds of catchup).
    - The lagging (4th) node receives the missing txns as part of catch-up (during the
      view change) and also finishes the view change.
    '''
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = txnPoolNodeSet[:-1]
    slow_stasher = slow_node.nodeIbStasher
    fast_stashers = [n.nodeIbStasher for n in fast_nodes]
    all_stashers = [n.nodeIbStasher for n in txnPoolNodeSet]

    initial_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc
    txns_count = 4
    eventual_last_ordered = initial_last_ordered[0], initial_last_ordered[1] + txns_count
    batches_count = initial_last_ordered[1]

    with delay_rules(all_stashers, vcd_delay()):
        # the lagging node is slow in receiving Commits and catchup msgs
        with delay_rules(slow_stasher, cDelay()):
            with delay_rules(slow_stasher, lsDelay(), msg_rep_delay(types_to_delay=[LEDGER_STATUS])):
                # fast nodes will receive and order Commits for last_prepared_cert during view change
                with delay_rules(fast_stashers, cDelay()):
                    with delay_rules(fast_stashers, lsDelay(), msg_rep_delay(types_to_delay=[LEDGER_STATUS])):
                        sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_steward, txns_count)
                        batches_count += txns_count

                        looper.run(eventually(check_prepare_certificate, txnPoolNodeSet, batches_count))
                        check_last_ordered_3pc_on_master(txnPoolNodeSet, initial_last_ordered)

                        # trigger view change on all nodes
                        ensure_view_change(looper, txnPoolNodeSet)
                        looper.run(eventually(check_last_prepared_certificate_after_view_change_start,
                                              txnPoolNodeSet, eventual_last_ordered))

                # check that all txns are ordered till last prepared on fast nodes
                looper.run(eventually(check_last_ordered_3pc_on_master, fast_nodes,
                                      eventual_last_ordered, timeout=30))

            # check that all txns are ordered till last prepared on slow node
            # as a result of catchup
            looper.run(eventually(check_last_ordered_3pc_on_master, [slow_node],
                                  eventual_last_ordered, timeout=30))

    # wait for view change done on all nodes
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # make sure that the pool is functional
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)
def test_node_reject_invalid_txn_during_catchup(looper, sdk_pool_handle, sdk_wallet_client,
                                                tconf, tdir, txnPoolNodeSet,
                                                bad_node, lagging_node):
    """
    Make sure that a catching-up node blacklists nodes which send incorrect
    catchup replies.
    """
    normal_nodes = [node for node in txnPoolNodeSet
                    if node not in [bad_node, lagging_node]]
    normal_stashers = [node.nodeIbStasher for node in normal_nodes]

    with delay_rules_without_processing(lagging_node.nodeIbStasher, delay_3pc()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 5)

        # Perform catchup, while making sure that the cons proof from the bad node is received
        # before cons proofs from normal nodes, so the bad node can participate in catchup
        with delay_rules(normal_stashers, lsDelay()):
            lagging_node.start_catchup()

            node_leecher = lagging_node.ledgerManager._node_leecher
            audit_cons_proof_service = node_leecher._leechers[AUDIT_LEDGER_ID]._cons_proof_service
            looper.run(eventually(lambda: assert_in(bad_node.name,
                                                    audit_cons_proof_service._cons_proofs)))

        waitNodeDataEquality(looper, lagging_node, *normal_nodes)
        assert lagging_node.isNodeBlacklisted(bad_node.name)
def test_view_change_by_order_stashed_on_all(txnPoolNodeSet, looper, sdk_pool_handle,
                                             sdk_wallet_steward):
    '''
    - COMMITs are delayed on all nodes.
    - All nodes start a view change with a prepared certificate (for the delayed message).
    - COMMITs arrive during the view change for all nodes,
    - so all nodes finish the view change by processing Commit and Ordered msgs
      during the view change (in between rounds of catchup).
    '''
    all_stashers = [n.nodeIbStasher for n in txnPoolNodeSet]

    initial_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc
    txns_count = 4
    eventual_last_ordered = initial_last_ordered[0], initial_last_ordered[1] + txns_count

    with delay_rules(all_stashers, vcd_delay()):
        with delay_rules(all_stashers, cDelay()):
            with delay_rules(all_stashers, lsDelay(), msg_rep_delay(types_to_delay=[LEDGER_STATUS])):
                sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_steward, txns_count)
                looper.run(eventually(check_prepare_certificate, txnPoolNodeSet, txns_count))
                check_last_ordered_3pc_on_master(txnPoolNodeSet, initial_last_ordered)

                # trigger view change on all nodes
                ensure_view_change(looper, txnPoolNodeSet)
                looper.run(eventually(check_last_prepared_certificate_after_view_change_start,
                                      txnPoolNodeSet, eventual_last_ordered))

        # check that all txns are ordered till last prepared
        looper.run(eventually(check_last_ordered_3pc_on_master, txnPoolNodeSet,
                              eventual_last_ordered, timeout=30))

    # wait for view change done on all nodes
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # make sure that the pool is functional
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)
def test_catchup_from_unequal_nodes_without_reasking(looper, txnPoolNodeSet,
                                                     sdk_pool_handle, sdk_wallet_client):
    lagged_node_1 = txnPoolNodeSet[-1]
    lagged_node_2 = txnPoolNodeSet[-2]
    normal_nodes = [node for node in txnPoolNodeSet
                    if node not in [lagged_node_1, lagged_node_2]]
    normal_stashers = [node.nodeIbStasher for node in normal_nodes]

    with delay_rules_without_processing(lagged_node_1.nodeIbStasher, delay_3pc()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2)

        with delay_rules_without_processing(lagged_node_2.nodeIbStasher, delay_3pc()):
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 7)
            ensure_all_nodes_have_same_data(looper, normal_nodes, custom_timeout=30)

            # Perform catchup, while making sure that cons proof from lagging node is received
            # before cons proofs from normal nodes, so lagging node can participate in catchup
            with delay_rules(normal_stashers, lsDelay()):
                lagged_node_1.start_catchup()

                node_leecher = lagged_node_1.ledgerManager._node_leecher
                audit_cons_proof_service = node_leecher._leechers[AUDIT_LEDGER_ID]._cons_proof_service
                looper.run(eventually(lambda: assertExp(
                    lagged_node_2.name in audit_cons_proof_service._cons_proofs)))

            # Make sure catchup finishes successfully
            ensure_all_nodes_have_same_data(looper, set(txnPoolNodeSet) - {lagged_node_2},
                                            custom_timeout=30)
            assert lagged_node_1.ledgerManager._node_leecher._state == NodeLeecherService.State.Idle
def test_catchup_with_reask_ls(txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_steward,
                               tconf, tdir, allPluginsPath):
    '''
    Start a catchup.
    Delay MessageRep with LedgerStatuses twice.
    Check that the catchup finished.
    '''
    lagged_node = txnPoolNodeSet[-1]
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 5)
    lagged_node.nodeIbStasher.delay(msg_rep_delay(types_to_delay=[COMMIT]))

    with delay_rules_without_processing(lagged_node.nodeIbStasher,
                                        delay_3pc(),
                                        msg_rep_delay(types_to_delay=[COMMIT])):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 2)
        lagged_node.nodeIbStasher.drop_delayeds()

    with delay_rules_without_processing(lagged_node.nodeIbStasher,
                                        lsDelay(),
                                        msg_rep_delay(types_to_delay=[LEDGER_STATUS])):
        lagged_node.start_catchup()

        def chk():
            resp_ls_count = 0
            for msg in lagged_node.nodeIbStasher.delayeds:
                if isinstance(msg.item[0], MessageRep) and msg.item[0].msg_type == LEDGER_STATUS:
                    resp_ls_count += 1
            assert resp_ls_count >= (len(txnPoolNodeSet) - 1) * 2
            lagged_node.nodeIbStasher.drop_delayeds()

        looper.run(eventually(chk))

    waitNodeDataEquality(looper, lagged_node, *txnPoolNodeSet,
                         exclude_from_check=['check_last_ordered_3pc_backup'])
def test_reverted_unordered(txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client):
    """
    Before starting catchup, revert any uncommitted changes to state and ledger.
    This is to avoid re-application of requests that were ordered but stashed.

    Example scenario:
        prepared (1, 4)
        start_view_change
        start_catchup
        ...
        committed and sent Ordered (1, 2)
        ...
        preLedgerCatchUp
        force_process_ordered, take out (1, 2) and stash (1, 2)
        now process stashed Ordered (1, 2): its requests will be applied again

    Simulation: delay COMMITs to a node so that it cannot order requests but has
    prepared them. Then trigger a view change and make sure the slow node has not
    ordered the same number of requests as the others, but has prepared them, so
    it can order when it receives COMMITs while the view change is in progress.
    The slow node should revert unordered batches but it should eventually process
    the ordered requests, so delay LEDGER_STATUS too so that catchup is delayed.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_node.nodeIbStasher.delay(cDelay(120, 0))
    sent_batches = 5
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, 2 * sent_batches, sent_batches)

    # Fast nodes have same last ordered and same data
    last_ordered = [n.master_last_ordered_3PC for n in fast_nodes]
    assert check_if_all_equal_in_list(last_ordered)
    ensure_all_nodes_have_same_data(looper, fast_nodes)

    # Slow node has different last ordered than fast nodes
    assert last_ordered[0] != slow_node.master_last_ordered_3PC

    # Delay LEDGER_STATUS so catchup starts late
    slow_node.nodeIbStasher.delay(lsDelay(100))
    slow_node.nodeIbStasher.delay(msg_rep_delay(100))
    slow_node.nodeIbStasher.delay(cr_delay(100))

    # slow_node has not reverted batches
    assert sent_batches not in getAllReturnVals(
        slow_node.master_replica,
        slow_node.master_replica.revert_unordered_batches)

    ensure_view_change(looper, txnPoolNodeSet)

    def chk1():
        # slow_node reverted all batches
        rv = getAllReturnVals(slow_node.master_replica,
                              slow_node.master_replica.revert_unordered_batches)
        assert sent_batches in rv

    looper.run(eventually(chk1, retryWait=1))

    # After the view change slow_node has prepared the same requests as the fast
    # nodes have ordered
    assert last_ordered[0] == slow_node.master_replica.last_prepared_before_view_change

    # Deliver COMMITs
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # slow_node stashed commits
        assert slow_node.master_replica.stasher.num_stashed_catchup == \
            sent_batches * (len(txnPoolNodeSet) - 1)

    looper.run(eventually(chk2, retryWait=1))

    # Deliver LEDGER_STATUS so catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(LEDGER_STATUS)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(MESSAGE_RESPONSE)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(CATCHUP_REP)

    # Ensure all nodes have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    def chk3():
        # slow_node processed stashed messages successfully
        assert slow_node.master_replica.stasher.num_stashed_catchup == 0

    looper.run(eventually(chk3, retryWait=1))

    # Ensure pool is functional
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, 10, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_3pc_while_catchup_with_chkpoints_only(tdir, tconf, looper, chkFreqPatched,
                                               reqs_for_checkpoint, testNodeClass,
                                               txnPoolNodeSet, sdk_pool_handle,
                                               sdk_wallet_client, allPluginsPath):
    '''
    Check that catch-up is not started again even if a quorum of stashed
    checkpoints is received during catch-up.
    Assume that only checkpoints and no 3PC messages are received.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging_node, stopNode=True)
    looper.removeProdable(lagging_node)

    # Send more requests to active nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(Node.allLedgersCaughtUp)

    # delay all 3PC messages on the lagged node so that it
    # receives only Checkpoints and catch-up messages
    lagging_node.nodeIbStasher.delay(ppDelay())
    lagging_node.nodeIbStasher.delay(pDelay())
    lagging_node.nodeIbStasher.delay(cDelay())

    # delay CurrentState to avoid Primary Propagation (since it would lead to
    # more catch-ups, which are not needed in this test)
    with delay_rules(lagging_node.nodeIbStasher, cs_delay()):
        with delay_rules(lagging_node.nodeIbStasher, lsDelay(), msg_rep_delay()):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # wait till we get ledger statuses for messages missed while the node was offline,
            # so that now we can order more messages, and they will not be caught up, but stashed
            looper.run(
                eventually(lambda: assertExp(
                    lagging_node.nodeIbStasher.num_of_stashed(LedgerStatus) >= 3),
                    retryWait=1, timeout=60))

            assert lagging_node.mode != Mode.participating

            # make sure that more requests are being ordered while catch-up is in progress;
            # stash enough stable checkpoints for starting a catch-up
            num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
            num_reqs = reqs_for_checkpoint * num_checkpoints + 1
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, num_reqs)
            looper.run(
                eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes,
                           (0, num_reqs + 2)))

            # all good nodes stabilized checkpoint
            looper.run(eventually(chkChkpoints, rest_nodes, 2, 0))

            assert lagging_node.mode != Mode.participating
            # lagging node is catching up and stashing all checkpoints
            looper.run(
                eventually(lambda: assertExp(
                    get_stashed_checkpoints(lagging_node) == num_checkpoints * len(rest_nodes)),
                    timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

        # check that last_ordered is set
        looper.run(
            eventually(check_last_ordered_3pc_on_master, [lagging_node],
                       (0, num_reqs + 2)))

        # check that the catch-up is finished
        looper.run(
            eventually(lambda: assertExp(lagging_node.mode == Mode.participating),
                       retryWait=1,
                       timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

        # check that catch-up was started only once
        assert lagging_node.spylog.count(Node.allLedgersCaughtUp) == \
            initial_all_ledgers_caught_up + 1
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.spylog.count(Node.allLedgersCaughtUp) ==
                initial_all_ledgers_caught_up + 1)))
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.spylog.count(Node.start_catchup) == 1)))

    # do not check the audit ledger since we did not catch up the audit ledger
    # while txns were being ordered during catch-up
    waitNodeDataEquality(looper, *txnPoolNodeSet,
                         exclude_from_check='check_audit',
                         customTimeout=5)
def test_slow_node_reverts_unordered_state_during_catchup(looper,
                                                          txnPoolNodeSet,
                                                          sdk_pool_handle,
                                                          sdk_wallet_client):
    """
    Delay COMMITs to a node such that when it needs to catch up, it needs to
    revert some unordered state. Also, by this time the node should have
    received all COMMITs such that it will apply some of them (for which it has
    not received txns from catchup).
    To achieve this, delay COMMITs for a long time and do catchup for a state
    slightly older than the one received in LedgerStatus; once catchup
    completes, reset the delays and try to process the delayed COMMITs: some
    COMMITs will be rejected but some will be processed, since catchup was done
    for an older ledger.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 25

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))
    # Delay LEDGER_STATUS on the slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS)
    # is sent, and the node catches up 2 times.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from the slow node and
    # return a ConsistencyProof for all missing txns, so no stashed ones would be applied.
    slow_node.nodeIbStasher.delay(lsDelay(1000))

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                             delay_batches * Max3PCBatchSize)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)

    # `slow_node` is slow to receive CatchupRep, so that it
    # gets a chance to order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    old_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc

    # start view change (and hence catchup)
    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        check_last_ordered_3pc(n1, n2)

    assert slow_master_replica.last_prepared_before_view_change == old_last_ordered

    old_pc_count = slow_master_replica._ordering_service.spylog.count(
        slow_master_replica._ordering_service._validate)

    assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0

    # Repair the network so COMMITs are received, processed and stashed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica._ordering_service.spylog.count(
            slow_master_replica._ordering_service._validate) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # (delay_batches * Max3PCBatchSize * commits_count_in_phase) COMMITs are stashed
        assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == \
            delay_batches * Max3PCBatchSize * (len(txnPoolNodeSet) - 1)

    looper.run(eventually(chk3, retryWait=1, timeout=15))

    # fix catchup, so the node gets a chance to be caught-up
    repair_node_catchup_less(other_nodes)

    def chk4():
        # The COMMITs that were received and stashed are processed after catchup
        assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0

    looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 50))

    def chk5():
        # Catchup was done once
        assert slow_node.spylog.count(slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(eventually(chk5,
                          retryWait=1,
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # make sure that the pool is functional
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def slow_catch_up(nodes, timeout):
    for node in nodes:
        node.nodeIbStasher.delay(lsDelay(timeout))
        node.clientIbStasher.delay(lsDelay(timeout))
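# The helper above only installs the LEDGER_STATUS delays; releasing them is left to
# the caller. A minimal usage sketch follows (hypothetical, not part of the original
# suite): stall catch-up on a lagging node, run assertions while it is stalled, then
# release the delayed messages with reset_delays_and_process_delayeds(), as the other
# tests in this module do.
def example_slow_catch_up_usage(looper, txnPoolNodeSet):
    lagging_node = txnPoolNodeSet[-1]

    # stall catch-up by delaying LEDGER_STATUS on both the node and client stashers
    slow_catch_up([lagging_node], timeout=60)
    lagging_node.start_catchup()

    # ... assertions about the stalled catch-up would go here ...

    # release the delayed LEDGER_STATUS messages so catch-up can finish
    lagging_node.nodeIbStasher.reset_delays_and_process_delayeds()
    lagging_node.clientIbStasher.reset_delays_and_process_delayeds()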
def test_reverted_unordered(txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client):
    """
    Before starting catchup, revert any uncommitted changes to state and ledger.
    This is to avoid re-application of requests that were ordered but stashed.

    Example scenario:
        prepared (1, 4)
        startViewChange
        start_catchup
        ...
        committed and sent Ordered (1, 2)
        ...
        preLedgerCatchUp
        force_process_ordered, take out (1, 2) and stash (1, 2)
        now process stashed Ordered (1, 2): its requests will be applied again

    Simulation: delay COMMITs to a node so that it cannot order requests but has
    prepared them. Then trigger a view change and make sure the slow node has not
    ordered the same number of requests as the others, but has prepared them, so
    it can order when it receives COMMITs while the view change is in progress.
    The slow node should revert unordered batches but it should eventually process
    the ordered requests, so delay LEDGER_STATUS too so that catchup is delayed.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_node.nodeIbStasher.delay(cDelay(120, 0))
    sent_batches = 5
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, 2 * sent_batches, sent_batches)

    # Fast nodes have same last ordered and same data
    last_ordered = [n.master_last_ordered_3PC for n in fast_nodes]
    assert check_if_all_equal_in_list(last_ordered)
    ensure_all_nodes_have_same_data(looper, fast_nodes)

    # Slow node has different last ordered than fast nodes
    assert last_ordered[0] != slow_node.master_last_ordered_3PC

    # Delay LEDGER_STATUS so catchup starts late
    slow_node.nodeIbStasher.delay(lsDelay(100))
    slow_node.nodeIbStasher.delay(msg_rep_delay(100))

    # slow_node has not reverted batches
    assert sent_batches not in getAllReturnVals(
        slow_node.master_replica,
        slow_node.master_replica.revert_unordered_batches)

    ensure_view_change(looper, txnPoolNodeSet)

    def chk1():
        # slow_node reverted all batches
        rv = getAllReturnVals(slow_node.master_replica,
                              slow_node.master_replica.revert_unordered_batches)
        assert sent_batches in rv

    looper.run(eventually(chk1, retryWait=1))

    # After the view change slow_node has prepared the same requests as the fast
    # nodes have ordered
    assert last_ordered[0] == slow_node.master_replica.last_prepared_before_view_change

    # Deliver COMMITs
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # slow_node orders all requests as others have
        assert last_ordered[0] == slow_node.master_last_ordered_3PC

    looper.run(eventually(chk2, retryWait=1))

    # Deliver LEDGER_STATUS so catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(LEDGER_STATUS)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(MESSAGE_RESPONSE)

    # Ensure all nodes have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    def chk3():
        # slow_node processed stashed Ordered requests successfully
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert sent_batches in rv

    looper.run(eventually(chk3, retryWait=1))

    # Ensure pool is functional
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, 10, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_caught_up_for_current_view_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client):
    """
    One of the nodes experiences a poor network and loses 3PC messages. It has
    to do multiple rounds of catchup to be caught up.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    bad_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != bad_node]

    # Bad node does not process any Commit messages, equivalent to the messages being lost
    with delay_rules(bad_node.nodeIbStasher, cDelay()):
        # Delay LEDGER_STATUS on the slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS)
        # is sent, and the node catches up 2 times.
        # Otherwise other nodes may receive multiple LEDGER_STATUSes from the slow node and
        # return a ConsistencyProof for all missing txns, so no stashed ones would be applied.
        bad_node.nodeIbStasher.delay(lsDelay(1000))

        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, 6 * Max3PCBatchSize)
        waitNodeDataInequality(looper, bad_node, *other_nodes)

        # Patch all nodes to return ConsistencyProof of a smaller ledger to the
        # bad node but only once, so that the bad_node needs to do catchup again.
        make_a_node_catchup_twice(bad_node, other_nodes, DOMAIN_LEDGER_ID, Max3PCBatchSize)

        def is_catchup_needed_count():
            return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed,
                                        compare_val_to=True))

        def is_catchup_not_needed_count():
            return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed,
                                        compare_val_to=False))

        def has_ordered_till_last_prepared_certificate_count():
            return len(getAllReturnVals(bad_node,
                                        bad_node.has_ordered_till_last_prepared_certificate,
                                        compare_val_to=True))

        old_is_catchup_needed_count = is_catchup_needed_count()
        old_is_catchup_not_needed_count = is_catchup_not_needed_count()
        old_has_ordered_till_last_prepared_certificate_count = \
            has_ordered_till_last_prepared_certificate_count()

        ensure_view_change(looper, txnPoolNodeSet)
        checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
        ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet,
                                        exclude_from_check=['check_last_ordered_3pc',
                                                            'check_audit',
                                                            'check_last_ordered_3pc_backup'])

        assert is_catchup_needed_count() == old_is_catchup_needed_count
        assert is_catchup_not_needed_count() > old_is_catchup_not_needed_count
        # The bad_node caught up due to ordering till last prepared certificate
        assert has_ordered_till_last_prepared_certificate_count() > \
            old_has_ordered_till_last_prepared_certificate_count

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet,
                                    exclude_from_check=['check_last_ordered_3pc_backup'])
def test_checkpoints_after_view_change(tconf, looper, chkFreqPatched, reqs_for_checkpoint,
                                       txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    '''
    Tests that there are no infinite catchups if a quorum of stashed
    checkpoints is received during the view change.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    initial_all_ledgers_caught_up = lagging_node.spylog.count(Node.allLedgersCaughtUp)
    initial_start_catchup = lagging_node.spylog.count(Node.start_catchup)

    with delay_rules(lagging_node.nodeIbStasher, lsDelay()):
        with delay_rules(lagging_node.nodeIbStasher, vcd_delay()):
            ensure_view_change(looper, txnPoolNodeSet)
            looper.run(
                eventually(lambda: assertExp(lagging_node.view_change_in_progress is True),
                           timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))
            ensureElectionsDone(looper=looper, nodes=rest_nodes, instances_list=range(2))
            assert all(n.view_change_in_progress is False for n in rest_nodes)
            assert lagging_node.view_change_in_progress is True

            # make sure that more requests are being ordered while catch-up is in progress
            # on the lagging node;
            # stash enough stable checkpoints for starting a catch-up
            num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
            num_reqs = reqs_for_checkpoint * num_checkpoints + 1
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, num_reqs)
            looper.run(
                eventually(check_last_ordered_3pc_on_master, rest_nodes, (1, num_reqs + 1)))
            looper.run(
                eventually(check_last_ordered_3pc_on_backup, rest_nodes, (1, num_reqs + 1)))

            # all good nodes stabilized checkpoint
            looper.run(eventually(chkChkpoints, rest_nodes, 2, 0))

            assert get_stashed_checkpoints(lagging_node) == num_checkpoints * len(rest_nodes)

        # lagging node is doing the view change and stashing all checkpoints
        assert lagging_node.view_change_in_progress is True
        looper.run(
            eventually(lambda: assertExp(
                get_stashed_checkpoints(lagging_node) == 2 * len(rest_nodes)),
                timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # check that view change is finished
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    assert lagging_node.view_change_in_progress is False

    # check that last_ordered is set
    looper.run(
        eventually(check_last_ordered_3pc_on_master, [lagging_node], (1, num_reqs + 1)))
    looper.run(
        eventually(check_last_ordered_3pc_on_backup, [lagging_node], (1, num_reqs + 1)))

    # check that checkpoint is stabilized for master
    looper.run(eventually(chk_chkpoints_for_instance, [lagging_node], 0, 2, 0))

    # check that the catch-up is finished
    assert lagging_node.mode == Mode.participating
    assert lagging_node.spylog.count(Node.allLedgersCaughtUp) == \
        initial_all_ledgers_caught_up + 1
    assert lagging_node.spylog.count(Node.start_catchup) == initial_start_catchup + 1

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_caught_up_for_current_view_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client):
    """
    One of the nodes experiences a poor network and loses 3PC messages. It has
    to do multiple rounds of catchup to be caught up.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    bad_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != bad_node]
    orig_method = bad_node.master_replica.dispatchThreePhaseMsg

    # Bad node does not process any 3 phase messages, equivalent to the messages
    # being lost
    def bad_method(self, m, s):
        pass

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        bad_method, bad_node.master_replica)

    # Delay LEDGER_STATUS on the slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS)
    # is sent, and the node catches up 2 times.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from the slow node and
    # return a ConsistencyProof for all missing txns, so no stashed ones would be applied.
    bad_node.nodeIbStasher.delay(lsDelay(1000))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    waitNodeDataInequality(looper, bad_node, *other_nodes)

    # Patch all nodes to return ConsistencyProof of a smaller ledger to the
    # bad node but only once, so that the bad_node needs to do catchup again.
    make_a_node_catchup_twice(bad_node, other_nodes, DOMAIN_LEDGER_ID, Max3PCBatchSize)

    def is_catchup_needed_count():
        return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed,
                                    compare_val_to=True))

    def is_catchup_not_needed_count():
        return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed,
                                    compare_val_to=False))

    def has_ordered_till_last_prepared_certificate_count():
        return len(getAllReturnVals(bad_node,
                                    bad_node.has_ordered_till_last_prepared_certificate,
                                    compare_val_to=True))

    old_count_1 = is_catchup_needed_count()
    old_count_2 = has_ordered_till_last_prepared_certificate_count()
    old_count_3 = is_catchup_not_needed_count()

    ensure_view_change(looper, txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert is_catchup_needed_count() > old_count_1
    assert is_catchup_not_needed_count() > old_count_3
    # The bad_node caught up due to ordering till last prepared certificate
    assert has_ordered_till_last_prepared_certificate_count() > old_count_2

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        orig_method, bad_node.master_replica)
def test_slow_node_reverts_unordered_state_during_catchup(looper,
                                                          txnPoolNodeSet,
                                                          sdk_pool_handle,
                                                          sdk_wallet_client):
    """
    Delay COMMITs to a node such that when it needs to catch up, it needs to
    revert some unordered state. Also, by this time the node should have
    received all COMMITs such that it will apply some of them (for which it has
    not received txns from catchup).
    To achieve this, delay COMMITs for a long time and do catchup for a state
    slightly older than the one received in LedgerStatus; once catchup
    completes, reset the delays and try to process the delayed COMMITs: some
    COMMITs will be rejected but some will be processed, since catchup was done
    for an older ledger.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 25

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))
    # Delay LEDGER_STATUS on the slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS)
    # is sent, and the node catches up 2 times.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from the slow node and
    # return a ConsistencyProof for all missing txns, so no stashed ones would be applied.
    slow_node.nodeIbStasher.delay(lsDelay(1000))

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                             delay_batches * Max3PCBatchSize)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)

    # `slow_node` is slow to receive CatchupRep, so that it
    # gets a chance to order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    # start view change (and hence catchup)
    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        lst_3pc = check_last_ordered_3pc(n1, n2)

    def chk1():
        # `slow_node` has prepared all 3PC messages which
        # `other_nodes` have ordered
        assertEquality(slow_master_replica.last_prepared_before_view_change, lst_3pc)

    looper.run(eventually(chk1, retryWait=1))

    old_pc_count = slow_master_replica.spylog.count(
        slow_master_replica.can_process_since_view_change_in_progress)

    assert len(slow_node.stashedOrderedReqs) == 0

    # Repair the network so COMMITs are received, processed and stashed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica.spylog.count(
            slow_master_replica.can_process_since_view_change_in_progress) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # COMMITs are stashed
        assert len(slow_node.stashedOrderedReqs) == delay_batches * Max3PCBatchSize

    looper.run(eventually(chk3, retryWait=1, timeout=15))

    # fix catchup, so the node gets a chance to be caught-up
    repair_node_catchup_less(other_nodes)

    def chk4():
        # Some COMMITs were ordered but stashed, and they were processed
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert delay_batches in rv

    looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 5))

    def chk5():
        # Catchup was done once
        assert slow_node.spylog.count(slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(eventually(chk5,
                          retryWait=1,
                          timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet))))

    # make sure that the pool is functional
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)