def test_node_load_after_add_then_disconnect(sdk_new_node_caught_up, txnPoolNodeSet, tconf, looper, sdk_pool_handle, sdk_wallet_client, tdirWithPoolTxns, allPluginsPath, capsys): """ A node that restarts after some transactions should eventually get the transactions which happened while it was down :return: """ new_node = sdk_new_node_caught_up with capsys.disabled(): print("Stopping node {} with pool ledger size {}". format(new_node, new_node.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node) looper.removeProdable(new_node) client_batches = 80 txns_per_batch = 10 for i in range(client_batches): s = perf_counter() sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, txns_per_batch) with capsys.disabled(): print('{} executed {} client txns in {:.2f} seconds'. format(i + 1, txns_per_batch, perf_counter() - s)) with capsys.disabled(): print("Starting the stopped node, {}".format(new_node)) nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha) new_node = TestNode( new_node.name, basedirpath=tdirWithPoolTxns, base_data_dir=tdirWithPoolTxns, config=tconf, ha=nodeHa, cliha=nodeCHa, pluginPaths=allPluginsPath) looper.add(new_node) txnPoolNodeSet[-1] = new_node # Delay catchup reply processing so LedgerState does not change delay_catchup_reply = 5 new_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply)) looper.run(checkNodesConnected(txnPoolNodeSet)) # Make sure ledger starts syncing (sufficient consistency proofs received) looper.run(eventually(check_ledger_state, new_node, DOMAIN_LEDGER_ID, LedgerState.syncing, retryWait=.5, timeout=5)) # Not accurate timeout but a conservative one timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \ 2 * delay_catchup_reply waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4], customTimeout=timeout) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 5) waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4])
def test_view_change_during_unstash(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, tconf): slow_node = txnPoolNodeSet[-1] other_nodes = txnPoolNodeSet[:-1] slow_stasher = slow_node.nodeIbStasher other_stashers = [n.nodeIbStasher for n in other_nodes] all_stashers = [n.nodeIbStasher for n in txnPoolNodeSet] # Preload nodes with some transactions sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1) for node in txnPoolNodeSet: assert node.master_replica.last_ordered_3pc == (0, 1) # Prevent ordering of some requests start_delaying(all_stashers, delay_3pc(after=7, msgs=(Prepare, Commit))) # Stop ordering on slow node and send requests slow_node_after_5 = start_delaying(slow_stasher, delay_3pc(after=5, msgs=Commit)) slow_node_until_5 = start_delaying(slow_stasher, delay_3pc(after=0)) reqs_view_0 = sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 8) # Make pool order first 2 batches and pause pool_after_3 = start_delaying(other_stashers, delay_3pc(after=3)) looper.run(eventually(check_nodes_ordered_till, other_nodes, 0, 3)) # Start catchup, continue ordering everywhere (except two last batches on slow node) with delay_rules(slow_stasher, cr_delay()): slow_node._do_start_catchup(just_started=False) looper.run(eventually(check_catchup_is_started, slow_node)) stop_delaying_and_process(pool_after_3) looper.run(eventually(check_nodes_ordered_till, other_nodes, 0, 7)) # Finish catchup and continue processing on slow node looper.run(eventually(check_catchup_is_finished, slow_node)) stop_delaying_and_process(slow_node_until_5) looper.run(eventually(check_nodes_ordered_till, [slow_node], 0, 5)) # Start view change and allow slow node to get remaining commits with delay_rules(all_stashers, icDelay()): for node in txnPoolNodeSet: node.view_changer.on_master_degradation() looper.runFor(0.1) stop_delaying_and_process(slow_node_after_5) # Ensure that expected number of requests was ordered replies = sdk_get_replies(looper, reqs_view_0) for rep in replies[:6]: sdk_check_reply(rep) # Ensure that everything is ok ensureElectionsDone(looper, txnPoolNodeSet) ensure_all_nodes_have_same_data(looper, txnPoolNodeSet) sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
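# The helpers `check_nodes_ordered_till`, `check_catchup_is_started` and
# `check_catchup_is_finished` used above come from the project's test helpers and are
# not shown in this section. Below is a hedged, minimal sketch of what they could look
# like, inferred only from how they are called here; the real helpers may differ.
def check_nodes_ordered_till(nodes, view_no, pp_seq_no):
    # every node's master replica has ordered at least up to (view_no, pp_seq_no)
    for node in nodes:
        assert node.master_replica.last_ordered_3pc >= (view_no, pp_seq_no)


def check_catchup_is_started(node):
    # a node that is catching up is not in the participating mode
    assert node.mode != Mode.participating


def check_catchup_is_finished(node):
    # once catch-up completes the node is back in the participating mode
    assert node.mode == Mode.participating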
def setup(request, looper, txnPoolNodeSet):
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    # Delay catchup reply so that the test gets time to make the check,
    # this delay is reset after the check
    slow_node.nodeIbStasher.delay(cr_delay(100))
    slow_node.nodeIbStasher.delay(pDelay(100, 0))
    slow_node.nodeIbStasher.delay(cDelay(100, 0))
    if request.param == 'all':
        slow_node.nodeIbStasher.delay(ppDelay(100, 0))
    return slow_node, fast_nodes
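# Usage sketch (assumption): a test consuming the `setup` fixture above performs its
# check while the 100-second delays are in place and then clears them with the stasher
# API used elsewhere in these tests, e.g.:
def _example_reset_setup_delays(setup):
    slow_node, fast_nodes = setup
    # ... perform the check that needs slow_node to lag behind fast_nodes ...
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds()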
def scenario_txns_during_catchup(
        looper, tconf, tdir, allPluginsPath, do_post_node_creation, nodes,
        send_txns
):
    lagging_node = nodes[-1]
    rest_nodes = nodes[:-1]

    # Stop NodeX
    lagging_node.cleanupOnStopping = False
    disconnect_node_and_ensure_disconnected(looper,
                                            nodes,
                                            lagging_node.name,
                                            stopNode=True)
    looper.removeProdable(name=lagging_node.name)

    # Send transactions
    send_txns()
    ensure_all_nodes_have_same_data(looper, rest_nodes)

    # Start NodeX
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )
    do_post_node_creation(lagging_node)
    HelperNode.fill_auth_map_for_node(lagging_node, XFER_PUBLIC)
    HelperNode.fill_auth_map_for_node(lagging_node, NYM)
    nodes[-1] = lagging_node

    # Delay CatchupRep for the DOMAIN ledger on NodeX
    with delay_rules(
        lagging_node.nodeIbStasher, cr_delay(ledger_filter=DOMAIN_LEDGER_ID)
    ):
        # allow the started node to receive looper events
        looper.add(lagging_node)
        # ensure it has connected to the others
        looper.run(checkNodesConnected(nodes))

        # Send transactions
        send_txns()
        ensure_all_nodes_have_same_data(looper, rest_nodes)

    # Reset delays
    # Make sure that all nodes have equal state
    ensure_all_nodes_have_same_data(looper, nodes)

    # Send transactions
    send_txns()
    ensure_all_nodes_have_same_data(looper, rest_nodes)
def test_freshness_in_catchup(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_steward,
                              tconf,
                              ):
    """
    Check that InstanceChange messages with the reason "State signatures are not
    updated for too long" are not sent during a catch-up that happens without a
    view change.
    """
    view_no = txnPoolNodeSet[0].viewNo
    lagging_node = txnPoolNodeSet[-1]

    with delay_rules(lagging_node.nodeIbStasher, cr_delay(), cDelay()):
        sdk_send_random_and_check(looper, txnPoolNodeSet,
                                  sdk_pool_handle, sdk_wallet_steward, 1)
        lagging_node.start_catchup()
        looper.runFor(tconf.ACCEPTABLE_FRESHNESS_INTERVALS_COUNT *
                      tconf.STATE_FRESHNESS_UPDATE_INTERVAL + 5)
        print(lagging_node.view_changer.instance_changes._cache)
        assert not lagging_node.view_changer.instance_changes.has_inst_chng_from(
            view_no + 1, lagging_node.name)
def test_3pc_while_catchup_with_chkpoints(tdir, tconf, looper, chkFreqPatched, reqs_for_checkpoint, testNodeClass, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, allPluginsPath): # Prepare nodes lagging_node = txnPoolNodeSet[-1] rest_nodes = txnPoolNodeSet[:-1] # Check that requests executed well sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 10) # Stop one node waitNodeDataEquality(looper, lagging_node, *rest_nodes) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging_node, stopNode=True) looper.removeProdable(lagging_node) # Send more requests to active nodes sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1) waitNodeDataEquality(looper, *rest_nodes) # Restart stopped node and wait for successful catch up lagging_node = start_stopped_node(lagging_node, looper, tconf, tdir, allPluginsPath, start=False, ) initial_all_ledgers_caught_up = lagging_node.spylog.count(Node.allLedgersCaughtUp) # delay CurrentState to avoid Primary Propagation (since it will lead to more catch-ups not needed in this test). with delay_rules(lagging_node.nodeIbStasher, cs_delay()): with delay_rules(lagging_node.nodeIbStasher, cr_delay()): looper.add(lagging_node) txnPoolNodeSet[-1] = lagging_node looper.run(checkNodesConnected(txnPoolNodeSet)) # wait till we got catchup replies for messages missed while the node was offline, # so that now qwe can order more messages, and they will not be caught up, but stashed looper.run( eventually(lambda: assertExp(len(lagging_node.nodeIbStasher.delayeds) >= 3), retryWait=1, timeout=60)) assert lagging_node.mode == Mode.syncing # make sure that more requests are being ordered while catch-up is in progress # stash enough stable checkpoints for starting a catch-up sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint * (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1)) assert lagging_node.mode == Mode.syncing # check that the catch-up is finished looper.run( eventually( lambda: assertExp(lagging_node.mode == Mode.participating), retryWait=1, timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)) ) ) looper.run( eventually( lambda: assertExp( lagging_node.spylog.count(Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1) ) ) # check that catch-up was started only once looper.run( eventually( lambda: assertExp( lagging_node.spylog.count(Node.start_catchup) == 1) ) ) # check that the node was able to order requests stashed during catch-up waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
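# Note: `assertExp`, used inside `eventually` above and in the tests below, is imported
# from the project's test helpers. A minimal sketch of it (assuming it simply wraps a
# plain assertion so that `eventually` can retry the expression) is:
def assertExp(condition):
    assert condition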
def test_slow_node_reverts_unordered_state_during_catchup( looper, txnPoolNodeSet, client1, wallet1, client1Connected): """ Delay COMMITs to a node such that when it needs to catchup, it needs to revert some unordered state. Also till this time the node should have receive all COMMITs such that it will apply some of the COMMITs ( for which it has not received txns from catchup). For this delay COMMITs by long, do catchup for a little older than the state received in LedgerStatus, once catchup completes, reset delays and try to process delayed COMMITs, some COMMITs will be rejected but some will be processed since catchup was done for older ledger. """ sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 3 * Max3PCBatchSize) nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0) slow_node = nprs[-1].node other_nodes = [n for n in txnPoolNodeSet if n != slow_node] slow_master_replica = slow_node.master_replica commit_delay = 150 catchup_rep_delay = 15 # Delay COMMITs to one node slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0)) sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 6 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, other_nodes) waitNodeDataInequality(looper, slow_node, *other_nodes) # Make the slow node receive txns for a smaller ledger so it still finds # the need to catchup delay_batches = 2 make_a_node_catchup_twice(slow_node, other_nodes, DOMAIN_LEDGER_ID, delay_batches * Max3PCBatchSize) def is_catchup_needed_count(): return len( getAllReturnVals(slow_node, slow_node.is_catchup_needed, compare_val_to=True)) old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp) old_cn_count = is_catchup_needed_count() # `slow_node` is slow to receive CatchupRep, so that it # gets a chance to order COMMITs slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay)) ensure_view_change(looper, txnPoolNodeSet) # Check last ordered of `other_nodes` is same for n1, n2 in combinations(other_nodes, 2): lst_3pc = check_last_ordered_3pc(n1, n2) def chk1(): # `slow_node` has prepared all 3PC messages which # `other_nodes` have ordered assertEquality(slow_master_replica.last_prepared_before_view_change, lst_3pc) looper.run(eventually(chk1, retryWait=1)) old_pc_count = slow_master_replica.spylog.count( slow_master_replica.can_process_since_view_change_in_progress) # Repair the network so COMMITs are received and processed slow_node.reset_delays_and_process_delayeds(COMMIT) def chk2(): # COMMITs are processed for prepared messages assert slow_master_replica.spylog.count( slow_master_replica.can_process_since_view_change_in_progress ) > old_pc_count looper.run(eventually(chk2, retryWait=1, timeout=5)) def chk3(): # Some COMMITs were ordered but stashed and they were processed rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs) assert delay_batches in rv looper.run(eventually(chk3, retryWait=1, timeout=catchup_rep_delay + 5)) def chk4(): # Catchup was done once assert slow_node.spylog.count( slow_node.allLedgersCaughtUp) > old_lcu_count looper.run( eventually(chk4, retryWait=1, timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) def chk5(): # Once catchup was done, need of other catchup was not found assertEquality(is_catchup_needed_count(), old_cn_count) looper.run(eventually(chk5, retryWait=1, timeout=5)) checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 2 * Max3PCBatchSize) 
ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_3pc_while_catchup_with_chkpoints_only(tdir, tconf, looper, chkFreqPatched, reqs_for_checkpoint, testNodeClass, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, allPluginsPath): ''' Check that catch-up is not started again even if a quorum of stashed checkpoints is received during catch-up. Assume that only checkpoints and no 3PC messages are received. ''' # Prepare nodes lagging_node = txnPoolNodeSet[-1] rest_nodes = txnPoolNodeSet[:-1] # Check that requests executed well sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1) # Stop one node waitNodeDataEquality(looper, lagging_node, *rest_nodes) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging_node, stopNode=True) looper.removeProdable(lagging_node) # Send more requests to active nodes sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1) waitNodeDataEquality(looper, *rest_nodes) # Restart stopped node and wait for successful catch up lagging_node = start_stopped_node(lagging_node, looper, tconf, tdir, allPluginsPath, start=False, ) initial_all_ledgers_caught_up = lagging_node.spylog.count(Node.allLedgersCaughtUp) # delay all 3PC messages on the lagged node so that it # receives only Checkpoints and catch-up messages lagging_node.nodeIbStasher.delay(ppDelay()) lagging_node.nodeIbStasher.delay(pDelay()) lagging_node.nodeIbStasher.delay(cDelay()) with delay_rules(lagging_node.nodeIbStasher, lsDelay(), cr_delay()): looper.add(lagging_node) txnPoolNodeSet[-1] = lagging_node looper.run(checkNodesConnected(txnPoolNodeSet)) # wait till we got ledger statuses for messages missed while the node was offline, # so that now we can order more messages, and they will not be caught up, but stashed looper.run( eventually(lambda: assertExp(lagging_node.nodeIbStasher.num_of_stashed(LedgerStatus) >= 3), retryWait=1, timeout=60)) assert lagging_node.mode != Mode.participating # make sure that more requests are being ordered while catch-up is in progress # stash enough stable checkpoints for starting a catch-up num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 num_reqs = reqs_for_checkpoint * num_checkpoints + 1 sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, num_reqs) looper.run( eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes, (0, num_reqs + 2)) ) # all good nodes stabilized checkpoint looper.run(eventually(chkChkpoints, rest_nodes, 2, 0)) assert lagging_node.mode != Mode.participating # lagging node is catching up and stashing all checkpoints looper.run( eventually( lambda: assertExp(get_stashed_checkpoints(lagging_node) == num_checkpoints * len(rest_nodes)), timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)) ) ) # check that last_ordered is set looper.run( eventually(check_last_ordered_3pc_on_master, [lagging_node], (0, num_reqs + 2)) ) # check that the catch-up is finished looper.run( eventually( lambda: assertExp(lagging_node.mode == Mode.participating), retryWait=1, timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)) ) ) # check that catch-up was started twice, since we were able to catch-up till audit ledger only # for the first time, and after this the node sees a quorum of stashed checkpoints assert lagging_node.spylog.count(Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1 assert lagging_node.spylog.count(Node.start_catchup) == 1 waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_unstash_three_phase_msg_after_catchup(txnPoolNodeSet, looper, tconf, sdk_pool_handle, sdk_wallet_steward): """ 1. Delay Commit on Node4 2. Order 1 req 3. Delay Commit on all nodes 4. Order 1 req 5. Delay CatchupRep on Node4 6. Delay Ledger Status and ViewChangeDones on Nodes1-3 7. Start View change on all nodes 8. Wait until Node4 got 3 stashed CatchupReps 9. Reset delaying of Commits on all Nodes 10. Reset Ledger Status on Nodes1-3 11. Check that 3 nodes finished VC while Node4 is syncing and not finished 12. Reset CatchupRep on Node4 13. Check that Node4 finished VC, and there was just 1 round of catch-up """ slow_node = txnPoolNodeSet[-1] fast_nodes = txnPoolNodeSet[:-1] view_no = txnPoolNodeSet[0].viewNo old_stashed = slow_node.master_replica.stasher.stash_size(STASH_VIEW_3PC) last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc batches_count = last_ordered[1] with delay_rules( [n.nodeIbStasher for n in txnPoolNodeSet], msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE, COMMIT])): # Delay Commit messages for slow_node. slow_node.nodeIbStasher.delay(cDelay(sys.maxsize)) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 1) batches_count += 1 # Delay Commit messages for fast_nodes. for n in fast_nodes: n.nodeIbStasher.delay(cDelay(sys.maxsize)) request2 = sdk_send_random_request(looper, sdk_pool_handle, sdk_wallet_steward) batches_count += 1 def check_commits(commit_key): for n in fast_nodes: for r in n.replicas.values(): assert commit_key in r._ordering_service.commits assert len( r._ordering_service.commits[commit_key].voters) == 1 looper.run( eventually(check_commits, (view_no, last_ordered[1] + batches_count))) # Delay CatchupRep messages for the slow_node. with delay_rules([slow_node.nodeIbStasher], cr_delay()): with delay_rules([n.nodeIbStasher for n in fast_nodes], msg_rep_delay(types_to_delay=[LEDGER_STATUS])): for n in txnPoolNodeSet: n.start_catchup() looper.run( eventually( lambda: assertExp(slow_node.mode == Mode.discovering))) # Reset delay Commit messages for all nodes. for n in txnPoolNodeSet: n.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT) assert slow_node.mode == Mode.discovering looper.run( eventually(_check_nodes_stashed, fast_nodes, old_stashed, len(txnPoolNodeSet) - 1)) looper.run( eventually(_check_nodes_stashed, [slow_node], old_stashed, (len(txnPoolNodeSet) - 1) * 2)) sdk_get_and_check_replies(looper, [request2]) _check_nodes_stashed(fast_nodes, old_stashed, 0) assert get_pp_seq_no(txnPoolNodeSet) == batches_count ensure_all_nodes_have_same_data(looper, txnPoolNodeSet) sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)
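# `_check_nodes_stashed` is a module-level helper that is not shown here. A hedged,
# minimal version consistent with how it is called above (comparing the STASH_VIEW_3PC
# counter against the baseline captured in `old_stashed`) could be:
def _check_nodes_stashed(nodes, old_stashed, new_stashed):
    for node in nodes:
        assert node.master_replica.stasher.stash_size(STASH_VIEW_3PC) == \
               old_stashed + new_stashed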
def test_catchup_not_triggered_if_another_in_progress( looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, broken_node_and_others): """ A node misses 3pc messages and checkpoints during some period but later it stashes some amount of checkpoints and starts catchup. When the node is performing the catchup, it receives more checkpoints enough to start a new catchup but it does not start it because the first catchup is in progress. """ max_batch_size = chkFreqPatched.Max3PCBatchSize broken_node, other_nodes = broken_node_and_others logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints") send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint + max_batch_size) waitNodeDataInequality(looper, broken_node, *other_nodes) logger.info( "Step 2: The node receives 3PC-messages but cannot process them because of " "missed ones. But the node eventually stashes some amount of checkpoints " "and after that starts catchup") repaired_node = repair_broken_node(broken_node) initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup) initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp) with delay_rules(repaired_node.nodeIbStasher, cr_delay()): send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint - max_batch_size) ensure_all_nodes_have_same_data(looper, other_nodes) looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.discovering), timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) + waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)))) assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 logger.info( "Step 3: While doing the catchup, the node receives new checkpoints " "enough to start a new catchup but the node does not start it because " "the former is in progress") process_checkpoint_times_before = \ repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint) send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint) # Wait until the node receives the new checkpoints from all the other nodes looper.run( eventually(lambda: assertExp( repaired_node.master_replica._checkpointer.spylog.count(CheckpointService.process_checkpoint) - process_checkpoint_times_before == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * (len(txnPoolNodeSet) - 1)), timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)))) # New catchup is not started when another one is in progress assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 assert repaired_node.mode == Mode.discovering logger.info("Step 4: The node completes the catchup. The ledger has been " "updated to the level determined on its start") looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating), timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1 assert repaired_node.domainLedger.size == other_nodes[0].domainLedger.size
def test_3pc_while_catchup_with_chkpoints(tdir, tconf, looper, chkFreqPatched, reqs_for_checkpoint, testNodeClass, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, allPluginsPath): ''' Tests that 3PC messages and Checkpoints being ordered during catch-up are stashed and re-applied when catch-up is finished. Check that catch-up is not started again even if a quorum of stashed checkpoints is received. ''' batches_count = get_pp_seq_no(txnPoolNodeSet) # Prepare nodes lagging_node = txnPoolNodeSet[-1] rest_nodes = txnPoolNodeSet[:-1] # Check that requests executed well sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1) batches_count += 1 # Stop one node waitNodeDataEquality(looper, lagging_node, *rest_nodes) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging_node, stopNode=True) looper.removeProdable(lagging_node) # Send more requests to active nodes sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1) batches_count += 1 waitNodeDataEquality(looper, *rest_nodes) # Restart stopped node and wait for successful catch up lagging_node = start_stopped_node( lagging_node, looper, tconf, tdir, allPluginsPath, start=False, ) initial_all_ledgers_caught_up = lagging_node.spylog.count( Node.allLedgersCaughtUp) with delay_rules(lagging_node.nodeIbStasher, cr_delay(ledger_filter=DOMAIN_LEDGER_ID)): looper.add(lagging_node) txnPoolNodeSet[-1] = lagging_node looper.run(checkNodesConnected(txnPoolNodeSet)) # wait till we got catchup replies for messages missed while the node was offline, # so that now qwe can order more messages, and they will not be caught up, but stashed looper.run( eventually(lambda: assertExp( lagging_node.nodeIbStasher.num_of_stashed(CatchupRep) >= 3), retryWait=1, timeout=60)) assert lagging_node.mode == Mode.syncing # make sure that more requests are being ordered while catch-up is in progress # stash enough stable checkpoints for starting a catch-up num_checkpoints = Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 num_reqs = reqs_for_checkpoint * num_checkpoints + 1 sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, num_reqs) batches_count += num_reqs looper.run( eventually(check_last_ordered_3pc_on_all_replicas, rest_nodes, (0, batches_count))) # all good nodes stabilized checkpoint looper.run( eventually(check_for_nodes, rest_nodes, check_stable_checkpoint, 10)) # lagging node is catching up and stashing all checkpoints assert lagging_node.mode == Mode.syncing looper.run( eventually(lambda: assertExp( get_stashed_checkpoints(lagging_node) == num_checkpoints * len( rest_nodes)), timeout=waits.expectedPoolCatchupTime( len(txnPoolNodeSet)))) # check that last_ordered is set looper.run( eventually(check_last_ordered_3pc_on_all_replicas, [lagging_node], (0, batches_count))) # check that checkpoint is stabilized for master looper.run( eventually(check_for_instance, [lagging_node], 0, check_stable_checkpoint, 10)) # check that the catch-up is finished looper.run( eventually(lambda: assertExp(lagging_node.mode == Mode.participating), retryWait=1, timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) # check that catch-up was started only once looper.run( eventually(lambda: assertExp( lagging_node.spylog.count(Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1))) looper.run( eventually(lambda: assertExp( lagging_node.spylog.count(Node.start_catchup) == 1))) waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
def test_3pc_while_catchup(tdir, tconf, looper, testNodeClass, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, allPluginsPath): ''' Tests that requests being ordered during catch-up are stashed and re-applied when catch-up is finished ''' # Prepare nodes lagging_node = txnPoolNodeSet[-1] rest_nodes = txnPoolNodeSet[:-1] # Check that requests executed well sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 10) # Stop one node waitNodeDataEquality(looper, lagging_node, *rest_nodes) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging_node, stopNode=True) looper.removeProdable(lagging_node) # Send more requests to active nodes sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 10) waitNodeDataEquality(looper, *rest_nodes) # Restart stopped node and wait for successful catch up lagging_node = start_stopped_node( lagging_node, looper, tconf, tdir, allPluginsPath, start=False, ) initial_all_ledgers_caught_up = lagging_node.spylog.count( Node.allLedgersCaughtUp) assert all(replica.stasher.num_stashed_catchup == 0 for inst_id, replica in lagging_node.replicas.items()) with delay_rules(lagging_node.nodeIbStasher, cr_delay(ledger_filter=DOMAIN_LEDGER_ID)): looper.add(lagging_node) txnPoolNodeSet[-1] = lagging_node looper.run(checkNodesConnected(txnPoolNodeSet)) # wait till we got catchup replies for messages missed while the node was offline, # so that now we can order more messages, and they will not be caught up, but stashed looper.run( eventually(lambda: assertExp( lagging_node.nodeIbStasher.num_of_stashed(CatchupRep) >= 3), retryWait=1, timeout=60)) # make sure that more requests are being ordered while catch-up is in progress sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 10) assert lagging_node.mode == Mode.syncing assert all(replica.stasher.num_stashed_catchup > 0 for inst_id, replica in lagging_node.replicas.items()) # check that the catch-up is finished looper.run( eventually(lambda: assertExp(lagging_node.mode == Mode.participating), retryWait=1, timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) looper.run( eventually(lambda: assertExp( lagging_node.spylog.count(Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 1))) waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5) assert all(replica.stasher.num_stashed_catchup == 0 for inst_id, replica in lagging_node.replicas.items())
def test_node_catchup_after_restart_with_txns( sdk_new_node_caught_up, txnPoolNodeSet, tdir, tconf, sdk_node_set_with_node_added_after_some_txns, allPluginsPath): """ A node that restarts after some transactions should eventually get the transactions which happened while it was down :return: """ looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \ sdk_node_set_with_node_added_after_some_txns logger.debug("Stopping node {} with pool ledger size {}". format(new_node, new_node.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node) looper.removeProdable(new_node) # for n in txnPoolNodeSet[:4]: # for r in n.nodestack.remotes.values(): # if r.name == newNode.name: # r.removeStaleCorrespondents() # looper.run(eventually(checkNodeDisconnectedFrom, newNode.name, # txnPoolNodeSet[:4], retryWait=1, timeout=5)) # TODO: Check if the node has really stopped processing requests? logger.debug("Sending requests") more_requests = 5 sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, new_steward_wallet_handle, more_requests) logger.debug("Starting the stopped node, {}".format(new_node)) nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha) config_helper = PNodeConfigHelper(new_node.name, tconf, chroot=tdir) newNode = TestNode( new_node.name, config_helper=config_helper, config=tconf, ha=nodeHa, cliha=nodeCHa, pluginPaths=allPluginsPath) looper.add(newNode) txnPoolNodeSet[-1] = newNode # Make sure ledger is not synced initially check_ledger_state(newNode, DOMAIN_LEDGER_ID, LedgerState.not_synced) # Delay catchup reply processing so LedgerState does not change # TODO fix delay, sometimes it's not enough and lower 'check_ledger_state' # fails because newNode's domain ledger state is 'synced' delay_catchup_reply = 10 newNode.nodeIbStasher.delay(cr_delay(delay_catchup_reply)) looper.run(checkNodesConnected(txnPoolNodeSet)) # Make sure ledger starts syncing (sufficient consistency proofs received) looper.run(eventually(check_ledger_state, newNode, DOMAIN_LEDGER_ID, LedgerState.syncing, retryWait=.5, timeout=5)) confused_node = txnPoolNodeSet[0] new_node_ledger = newNode.ledgerManager.ledgerRegistry[DOMAIN_LEDGER_ID] cp = new_node_ledger.catchUpTill start, end = cp.seqNoStart, cp.seqNoEnd cons_proof = confused_node.ledgerManager._buildConsistencyProof( DOMAIN_LEDGER_ID, start, end) bad_send_time = None def chk(): nonlocal bad_send_time entries = newNode.ledgerManager.spylog.getAll( newNode.ledgerManager.canProcessConsistencyProof.__name__) for entry in entries: # `canProcessConsistencyProof` should return False after `syncing_time` if entry.result == False and entry.starttime > bad_send_time: return assert False def send_and_chk(ledger_state): nonlocal bad_send_time, cons_proof bad_send_time = perf_counter() confused_node.ledgerManager.sendTo(cons_proof, newNode.name) # Check that the ConsistencyProof messages rejected looper.run(eventually(chk, retryWait=.5, timeout=5)) check_ledger_state(newNode, DOMAIN_LEDGER_ID, ledger_state) send_and_chk(LedgerState.syncing) # Not accurate timeout but a conservative one timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \ 2 * delay_catchup_reply waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1], customTimeout=timeout) send_and_chk(LedgerState.synced)
def delay_catchup(ledger_id: int):
    _delayer = cr_delay(ledger_filter=ledger_id)
    _delayer.__name__ = "delay_catchup({})".format(ledger_id)
    return _delayer
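# Usage sketch (assumption: `lagging_node` comes from a pool fixture as in the tests
# above). The wrapper only gives the cr_delay stasher a readable name, so it can be used
# wherever a plain cr_delay() would be:
def _example_delay_domain_catchup(looper, txnPoolNodeSet, lagging_node):
    with delay_rules(lagging_node.nodeIbStasher, delay_catchup(DOMAIN_LEDGER_ID)):
        # CatchupReps for the domain ledger are stashed while inside this block
        looper.run(checkNodesConnected(txnPoolNodeSet))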
def test_unstash_three_phase_msg_after_catchup_in_view_change(
        txnPoolNodeSet, looper, tconf, sdk_pool_handle, sdk_wallet_steward):
    """
    1. Delay Commit on Node4
    2. Order 1 req
    3. Delay Commit on all nodes
    4. Order 1 req
    5. Delay CatchupRep on Node4
    6. Delay Ledger Status and ViewChangeDones on Nodes1-3
    7. Start View change on all nodes
    8. Wait until Node4 got 3 stashed CatchupReps
    9. Reset delaying of Commits on all Nodes
    10. Reset Ledger Status on Nodes1-3
    11. Check that 3 nodes finished VC while Node4 is syncing and not finished
    12. Reset CatchupRep on Node4
    13. Check that Node4 finished VC, and there was just 1 round of catch-up
    """
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = txnPoolNodeSet[:-1]
    view_no = txnPoolNodeSet[0].viewNo
    old_stashed = slow_node.master_replica.stasher.num_stashed_future_view
    last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc

    with delay_rules(
            [n.nodeIbStasher for n in txnPoolNodeSet],
            msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE, COMMIT])):

        # Delay Commit messages for slow_node.
        slow_node.nodeIbStasher.delay(cDelay(sys.maxsize))
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 1)

        # Delay Commit messages for fast_nodes.
        for n in fast_nodes:
            n.nodeIbStasher.delay(cDelay(sys.maxsize))

        request2 = sdk_send_random_request(looper, sdk_pool_handle,
                                           sdk_wallet_steward)

        def check_commits(commit_key):
            for n in fast_nodes:
                for r in n.replicas.values():
                    assert commit_key in r.commits
                    assert len(r.commits[commit_key].voters) == 1

        looper.run(eventually(check_commits, (view_no, last_ordered[1] + 2)))

        # Delay CatchupRep messages for the slow_node.
        with delay_rules([slow_node.nodeIbStasher], cr_delay()):
            with delay_rules([n.nodeIbStasher for n in fast_nodes],
                             vcd_delay()):
                with delay_rules(
                        [n.nodeIbStasher for n in fast_nodes],
                        msg_rep_delay(types_to_delay=[LEDGER_STATUS])):
                    for n in txnPoolNodeSet:
                        n.view_changer.on_master_degradation()
                    looper.run(
                        eventually(
                            lambda: assertExp(slow_node.mode == Mode.syncing)))

                    # Reset delay Commit messages for all nodes.
                    for n in txnPoolNodeSet:
                        n.nodeIbStasher.reset_delays_and_process_delayeds(
                            COMMIT)

                    assert slow_node.view_change_in_progress
                    assert slow_node.mode == Mode.syncing

                    looper.run(
                        eventually(_check_nodes_stashed, fast_nodes,
                                   old_stashed,
                                   len(txnPoolNodeSet) - 1))
                    looper.run(
                        eventually(_check_nodes_stashed, [slow_node],
                                   old_stashed,
                                   (len(txnPoolNodeSet) - 1) * 2))

            waitForViewChange(looper, fast_nodes, expectedViewNo=view_no + 1,
                              customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
            ensureElectionsDone(looper=looper, nodes=fast_nodes,
                                instances_list=range(
                                    fast_nodes[0].requiredNumberOfInstances),
                                customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
            sdk_get_and_check_replies(looper, [request2])

        waitForViewChange(looper, [slow_node], expectedViewNo=view_no + 1,
                          customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
        _check_nodes_stashed(fast_nodes, old_stashed, 0)
        assert all(n.master_replica.last_ordered_3pc ==
                   (last_ordered[0], last_ordered[1] + 2)
                   for n in txnPoolNodeSet)

    assert slow_node.catchup_rounds_without_txns == 1
def test_slow_node_reverts_unordered_state_during_catchup( looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client): """ Delay COMMITs to a node such that when it needs to catchup, it needs to revert some unordered state. Also till this time the node should have receive all COMMITs such that it will apply some of the COMMITs ( for which it has not received txns from catchup). For this delay COMMITs by long, do catchup for a little older than the state received in LedgerStatus, once catchup completes, reset delays and try to process delayed COMMITs, some COMMITs will be rejected but some will be processed since catchup was done for older ledger. """ sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 3 * Max3PCBatchSize) nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0) slow_node = nprs[-1].node other_nodes = [n for n in txnPoolNodeSet if n != slow_node] slow_master_replica = slow_node.master_replica commit_delay = 150 catchup_rep_delay = 25 # Delay COMMITs to one node slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0)) # Delay LEDGER_STAUS on slow node, so that only MESSAGE_REQUEST(LEDGER_STATUS) is sent, and the # node catch-ups 2 times. # Otherwise other nodes may receive multiple LEDGER_STATUSes from slow node, and return Consistency proof for all # missing txns, so no stashed ones are applied slow_node.nodeIbStasher.delay(lsDelay(1000)) # Make the slow node receive txns for a smaller ledger so it still finds # the need to catchup delay_batches = 2 make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID, delay_batches * Max3PCBatchSize) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 6 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, other_nodes) waitNodeDataInequality(looper, slow_node, *other_nodes) old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp) # `slow_node` is slow to receive CatchupRep, so that it # gets a chance to order COMMITs slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay)) old_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc # start view change (and hence catchup) ensure_view_change(looper, txnPoolNodeSet) # Check last ordered of `other_nodes` is same for n1, n2 in combinations(other_nodes, 2): check_last_ordered_3pc(n1, n2) assert slow_master_replica.last_prepared_before_view_change == old_last_ordered old_pc_count = slow_master_replica._ordering_service.spylog.count( slow_master_replica._ordering_service._validate) assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0 # Repair the network so COMMITs are received, processed and stashed slow_node.reset_delays_and_process_delayeds(COMMIT) def chk2(): # COMMITs are processed for prepared messages assert slow_master_replica._ordering_service.spylog.count( slow_master_replica._ordering_service._validate) > old_pc_count looper.run(eventually(chk2, retryWait=1, timeout=5)) def chk3(): # (delay_batches * Max3PCBatchSize * commits_count_in_phase) COMMITs are stashed assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == \ delay_batches * Max3PCBatchSize * (len(txnPoolNodeSet) - 1) looper.run(eventually(chk3, retryWait=1, timeout=15)) # fix catchup, so the node gets a chance to be caught-up repair_node_catchup_less(other_nodes) def chk4(): # Some COMMITs were received but stashed and # they will processed after catchup assert slow_node.master_replica.stasher.stash_size(STASH_CATCH_UP) == 0 looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 50)) 
def chk5(): # Catchup was done once assert slow_node.spylog.count( slow_node.allLedgersCaughtUp) > old_lcu_count looper.run( eventually(chk5, retryWait=1, timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) # make sure that the pool is functional checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2 * Max3PCBatchSize) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def delay_catchup(catchup_state: NodeLeecherService.State):
    ledger_id = NodeLeecherService.state_to_ledger[catchup_state]
    _delayer = cr_delay(ledger_filter=ledger_id)
    _delayer.__name__ = "delay_catchup({})".format(ledger_id)
    return _delayer
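# Usage sketch (assumption): this variant resolves the ledger to delay from a catch-up
# state of NodeLeecherService, so a test parametrized over catch-up states can stall
# exactly one phase of catch-up:
def _example_delay_one_catchup_phase(looper, txnPoolNodeSet, lagging_node, catchup_state):
    with delay_rules(lagging_node.nodeIbStasher, delay_catchup(catchup_state)):
        # only CatchupReps for the ledger behind `catchup_state` are stashed here
        looper.run(checkNodesConnected(txnPoolNodeSet))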
def test_slow_nodes_catchup_before_selecting_primary_in_new_view(
        looper,
        txnPoolNodeSet,
        steward1,
        stewardWallet,
        tconf,
        slow_node):
    """
    Delay 3PC to 1 node and then cause a view change so that by the time the
    view change happens (each node gets >n-f `INSTANCE_CHANGE`s), the slow node
    is behind the other nodes. It should initiate catchup to come to the same
    state as the other nodes.
    """
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = tconf.PerfCheckFreq

    # Bad network introduced
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(2 * delay, 0))
    slow_node.nodeIbStasher.delay(cDelay(3 * delay, 0))
    for i in range(2):
        sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 20)
    waitNodeDataInequality(looper, slow_node, *fast_nodes)

    catchup_reply_counts = {
        n.name: n.ledgerManager.spylog.count(n.ledgerManager.processCatchupRep)
        for n in txnPoolNodeSet
    }
    catchup_done_counts = {
        n.name: n.spylog.count(n.allLedgersCaughtUp)
        for n in txnPoolNodeSet
    }

    def slow_node_processed_some():
        assert slow_node.master_replica.batches  # The slow node has received some PRE-PREPAREs

    looper.run(
        eventually(slow_node_processed_some, retryWait=1, timeout=delay))

    # No reverts have been called by the slow node
    rv = getAllReturnVals(slow_node.replicas[0],
                          TestReplica.revert_unordered_batches)
    assert not rv or max(rv) == 0

    # Delay reception of catchup replies so ViewChangeDone can be received
    # before catchup completes
    delay_catchup_reply = 2
    slow_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply))

    ensure_view_change(looper, txnPoolNodeSet)
    # `slow_node` will not have elections done but others will.
    checkProtocolInstanceSetup(looper, fast_nodes,
                               numInstances=len(slow_node.replicas),
                               retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # `slow_node` does catchup, `fast_nodes` don't
    for n in txnPoolNodeSet:
        assert n.spylog.count(
            n.allLedgersCaughtUp) > catchup_done_counts[n.name]
        if n == slow_node:
            assert n.ledgerManager.spylog.count(
                n.ledgerManager.processCatchupRep) > catchup_reply_counts[
                n.name]
        else:
            assert n.ledgerManager.spylog.count(
                n.ledgerManager.processCatchupRep) == catchup_reply_counts[
                n.name]

    # Greater than 0 batches were reverted by the slow node
    assert max(
        getAllReturnVals(
            slow_node.master_replica,
            slow_node.master_replica.revert_unordered_batches)) > 0

    # Bad network repaired
    slow_node.reset_delays_and_process_delayeds()

    # Make sure pool is functional
    sendReqsToNodesAndVerifySuffReplies(looper, stewardWallet, steward1, 5)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_catchup_not_triggered_if_another_in_progress( looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, broken_node_and_others): """ A node misses 3pc messages and checkpoints during some period but later it stashes some amount of checkpoints and starts catchup. When the node is performing the catchup, it receives more checkpoints enough to start a new catchup but it does not start it because the first catchup is in progress. """ max_batch_size = chkFreqPatched.Max3PCBatchSize broken_node, other_nodes = broken_node_and_others logger.info("Step 1: The node misses quite a lot of 3PC-messages and checkpoints") send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, reqs_for_checkpoint + max_batch_size) waitNodeDataInequality(looper, broken_node, *other_nodes) logger.info( "Step 2: The node receives 3PC-messages but cannot process them because of " "missed ones. But the node eventually stashes some amount of checkpoints " "and after that starts catchup") repaired_node = repair_broken_node(broken_node) initial_do_start_catchup_times = repaired_node.spylog.count(Node._do_start_catchup) initial_all_ledgers_caught_up = repaired_node.spylog.count(Node.allLedgersCaughtUp) with delay_rules(repaired_node.nodeIbStasher, cr_delay()): send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint - max_batch_size) ensure_all_nodes_have_same_data(looper, other_nodes) target_ledger_size = other_nodes[0].domainLedger.size looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.syncing), timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)) + waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)))) assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 logger.info( "Step 3: While doing the catchup, the node receives new checkpoints " "enough to start a new catchup but the node does not start it because " "the former is in progress") process_checkpoint_times_before = repaired_node.master_replica.spylog.count(Replica.processCheckpoint) send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * reqs_for_checkpoint) # Wait until the node receives the new checkpoints from all the other nodes looper.run( eventually(lambda: assertExp(repaired_node.master_replica.spylog.count(Replica.processCheckpoint) - process_checkpoint_times_before == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * (len(txnPoolNodeSet) - 1)), timeout=waits.expectedPoolInterconnectionTime(len(txnPoolNodeSet)))) # New catchup is not started when another one is in progress assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 assert repaired_node.mode == Mode.syncing logger.info("Step 4: The node completes the catchup. The ledger has been " "updated to the level determined on its start") looper.run(eventually(lambda: assertExp(repaired_node.mode == Mode.participating), timeout=waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))) assert repaired_node.spylog.count(Node._do_start_catchup) - initial_do_start_catchup_times == 1 assert repaired_node.spylog.count(Node.allLedgersCaughtUp) - initial_all_ledgers_caught_up == 1 assert repaired_node.domainLedger.size == target_ledger_size
def test_reverted_unordered(txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client): """ Before starting catchup, revert any uncommitted changes to state and ledger. This is to avoid any re-application of requests that were ordered but stashed Example scenario prepared (1, 4) start_view_change start_catchup ... .... ... committed and send Ordered (1, 2) ... .... preLedgerCatchUp force_process_ordered, take out (1,2) and stash (1, 2) now process stashed Ordered(1,2), its requests will be applied again Simulation: Delay COMMITs to a node so that it can not order requests but has prepared them. Then trigger a view change and make sure the slow node has not ordered same number of requests as others but has prepared so it can order when it receives COMMITs while view change is in progress. The slow node should revert unordered batches and but it should eventually process the ordered requests, so delay LEDGER_STATUS too so catchup is delayed """ slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node fast_nodes = [n for n in txnPoolNodeSet if n != slow_node] slow_node.nodeIbStasher.delay(cDelay(120, 0)) sent_batches = 5 sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 2 * sent_batches, sent_batches) # Fast nodes have same last ordered and same data last_ordered = [n.master_last_ordered_3PC for n in fast_nodes] assert check_if_all_equal_in_list(last_ordered) ensure_all_nodes_have_same_data(looper, fast_nodes) # Slow nodes have different last ordered than fast nodes assert last_ordered[0] != slow_node.master_last_ordered_3PC # Delay LEDGER_STATUS so catchup starts late slow_node.nodeIbStasher.delay(lsDelay(100)) slow_node.nodeIbStasher.delay(msg_rep_delay(100)) slow_node.nodeIbStasher.delay(cr_delay(100)) # slow_node has not reverted batches assert sent_batches not in getAllReturnVals( slow_node.master_replica, slow_node.master_replica.revert_unordered_batches) ensure_view_change(looper, txnPoolNodeSet) def chk1(): # slow_node reverted all batches rv = getAllReturnVals( slow_node.master_replica, slow_node.master_replica.revert_unordered_batches) assert sent_batches in rv looper.run(eventually(chk1, retryWait=1)) # After the view change slow_node has prepared same requests as the fast # nodes have ordered assert last_ordered[ 0] == slow_node.master_replica.last_prepared_before_view_change # Deliver COMMITs slow_node.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT) def chk2(): # slow_node stashed commits assert slow_node.master_replica.stasher.num_stashed_catchup == \ sent_batches * (len(txnPoolNodeSet) - 1) looper.run(eventually(chk2, retryWait=1)) # Deliver LEDGER_STATUS so catchup can complete slow_node.nodeIbStasher.reset_delays_and_process_delayeds(LEDGER_STATUS) slow_node.nodeIbStasher.reset_delays_and_process_delayeds(MESSAGE_RESPONSE) slow_node.nodeIbStasher.reset_delays_and_process_delayeds(CATCHUP_REP) # Ensure all nodes have same data ensure_all_nodes_have_same_data(looper, txnPoolNodeSet) ensureElectionsDone(looper, txnPoolNodeSet) def chk3(): # slow_node processed stashed messages successfully assert slow_node.master_replica.stasher.num_stashed_catchup == 0 looper.run(eventually(chk3, retryWait=1)) # Ensure pool is functional sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 10, 2) ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
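# `check_if_all_equal_in_list`, used above to compare the fast nodes' last ordered 3PC
# keys, comes from the project's common utils; a hedged minimal equivalent is:
def check_if_all_equal_in_list(lst):
    return all(item == lst[0] for item in lst)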
def test_limited_stash_3pc_while_catchup(tdir, tconf, looper, testNodeClass,
                                         txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, allPluginsPath,
                                         chkFreqPatched):
    '''
    Test that the lagging_node can process messages from the catchup stash
    after catchup and request lost messages from other nodes.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests executed well
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop one node
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            lagging_node, stopNode=True)
    looper.removeProdable(lagging_node)

    # Order 2 checkpoints on rest_nodes (2 txns in 2 batches)
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * CHK_FREQ, 2)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart stopped node and wait for successful catch up
    lagging_node = start_stopped_node(
        lagging_node,
        looper,
        tconf,
        tdir,
        allPluginsPath,
        start=False,
    )

    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)

    with delay_rules(lagging_node.nodeIbStasher, cs_delay(),
                     msg_rep_delay(types_to_delay=[PREPARE, PREPREPARE])):
        with delay_rules(lagging_node.nodeIbStasher,
                         cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # Order 2 checkpoints in the first lagging node catchup
            # (2 txns in 2 batches)
            sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client,
                                                 2 * CHK_FREQ, 2)

        # Check that the first txn was ordered from the stash after the first catchup
        looper.run(
            eventually(lambda: assertExp(lagging_node.master_last_ordered_3PC[
                1] == txnPoolNodeSet[0].master_last_ordered_3PC[1] - 1),
                       retryWait=1,
                       timeout=waits.expectedPoolCatchupTime(
                           len(txnPoolNodeSet))))

        # Order 2 checkpoints in the second lagging node catchup
        # (2 txns in 2 batches)
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             2 * CHK_FREQ, 2)

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    looper.run(
        eventually(lambda: assertExp(
            all(lagging_node.master_last_ordered_3PC == n.master_last_ordered_3PC
                for n in txnPoolNodeSet))))

    # check that catch-up was started only twice
    assert lagging_node.spylog.count(
        Node.allLedgersCaughtUp) == initial_all_ledgers_caught_up + 2
def test_node_catchup_after_restart_with_txns( sdk_new_node_caught_up, txnPoolNodeSet, tdir, tconf, sdk_node_set_with_node_added_after_some_txns, allPluginsPath): """ A node that restarts after some transactions should eventually get the transactions which happened while it was down :return: """ looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \ sdk_node_set_with_node_added_after_some_txns logger.debug("Stopping node {} with pool ledger size {}".format( new_node, new_node.poolManager.txnSeqNo)) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node) looper.removeProdable(new_node) # for n in txnPoolNodeSet[:4]: # for r in n.nodestack.remotes.values(): # if r.name == newNode.name: # r.removeStaleCorrespondents() # looper.run(eventually(checkNodeDisconnectedFrom, newNode.name, # txnPoolNodeSet[:4], retryWait=1, timeout=5)) # TODO: Check if the node has really stopped processing requests? logger.debug("Sending requests") more_requests = 5 sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, new_steward_wallet_handle, more_requests) logger.debug("Starting the stopped node, {}".format(new_node)) nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha) config_helper = PNodeConfigHelper(new_node.name, tconf, chroot=tdir) newNode = TestNode(new_node.name, config_helper=config_helper, config=tconf, ha=nodeHa, cliha=nodeCHa, pluginPaths=allPluginsPath) looper.add(newNode) txnPoolNodeSet[-1] = newNode # Make sure ledger is not synced initially check_ledger_state(newNode, DOMAIN_LEDGER_ID, LedgerState.not_synced) # Delay catchup reply processing so LedgerState does not change # TODO fix delay, sometimes it's not enough and lower 'check_ledger_state' # fails because newNode's domain ledger state is 'synced' delay_catchup_reply = 10 newNode.nodeIbStasher.delay(cr_delay(delay_catchup_reply)) looper.run(checkNodesConnected(txnPoolNodeSet)) # Make sure ledger starts syncing (sufficient consistency proofs received) looper.run( eventually(check_ledger_state, newNode, DOMAIN_LEDGER_ID, LedgerState.syncing, retryWait=.5, timeout=5)) confused_node = txnPoolNodeSet[0] new_node_ledger = newNode.ledgerManager.ledgerRegistry[DOMAIN_LEDGER_ID] cp = new_node_ledger.catchUpTill start, end = cp.seqNoStart, cp.seqNoEnd cons_proof = confused_node.ledgerManager._buildConsistencyProof( DOMAIN_LEDGER_ID, start, end) bad_send_time = None def chk(): nonlocal bad_send_time entries = newNode.ledgerManager.spylog.getAll( newNode.ledgerManager.canProcessConsistencyProof.__name__) for entry in entries: # `canProcessConsistencyProof` should return False after `syncing_time` if entry.result == False and entry.starttime > bad_send_time: return assert False def send_and_chk(ledger_state): nonlocal bad_send_time, cons_proof bad_send_time = perf_counter() confused_node.ledgerManager.sendTo(cons_proof, newNode.name) # Check that the ConsistencyProof messages rejected looper.run(eventually(chk, retryWait=.5, timeout=5)) check_ledger_state(newNode, DOMAIN_LEDGER_ID, ledger_state) send_and_chk(LedgerState.syncing) # Not accurate timeout but a conservative one timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \ 2 * delay_catchup_reply waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1], customTimeout=timeout) assert new_node_ledger.num_txns_caught_up == more_requests send_and_chk(LedgerState.synced)
def test_slow_node_reverts_unordered_state_during_catchup(looper,
                                                          txnPoolNodeSet,
                                                          sdk_pool_handle,
                                                          sdk_wallet_client):
    """
    Delay COMMITs to a node such that when it needs to catchup, it needs to
    revert some unordered state. Also, till this time the node should have
    received all COMMITs such that it will apply some of the COMMITs (for
    which it has not received txns from catchup).
    For this, delay COMMITs for a long time, do catchup for a state a little
    older than the one received in LedgerStatus; once catchup completes, reset
    delays and try to process the delayed COMMITs. Some COMMITs will be
    rejected but some will be processed since catchup was done for an older
    ledger.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 3 * Max3PCBatchSize)
    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    slow_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_master_replica = slow_node.master_replica

    commit_delay = 150
    catchup_rep_delay = 25

    # Delay COMMITs to one node
    slow_node.nodeIbStasher.delay(cDelay(commit_delay, 0))
    # Delay LEDGER_STATUS on the slow node, so that only
    # MESSAGE_REQUEST(LEDGER_STATUS) is sent, and the node catches up 2 times.
    # Otherwise other nodes may receive multiple LEDGER_STATUSes from the slow
    # node, and return a consistency proof for all missing txns, so no stashed
    # ones are applied
    slow_node.nodeIbStasher.delay(lsDelay(1000))

    # Make the slow node receive txns for a smaller ledger so it still finds
    # the need to catchup
    delay_batches = 2
    make_a_node_catchup_less(slow_node, other_nodes, DOMAIN_LEDGER_ID,
                             delay_batches * Max3PCBatchSize)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    old_lcu_count = slow_node.spylog.count(slow_node.allLedgersCaughtUp)

    # `slow_node` is slow to receive CatchupRep, so that it
    # gets a chance to order COMMITs
    slow_node.nodeIbStasher.delay(cr_delay(catchup_rep_delay))

    # start view change (and hence catchup)
    ensure_view_change(looper, txnPoolNodeSet)

    # Check last ordered of `other_nodes` is same
    for n1, n2 in combinations(other_nodes, 2):
        lst_3pc = check_last_ordered_3pc(n1, n2)

    def chk1():
        # `slow_node` has prepared all 3PC messages which
        # `other_nodes` have ordered
        assertEquality(slow_master_replica.last_prepared_before_view_change,
                       lst_3pc)

    looper.run(eventually(chk1, retryWait=1))

    old_pc_count = slow_master_replica.spylog.count(
        slow_master_replica.can_process_since_view_change_in_progress)

    assert len(slow_node.stashedOrderedReqs) == 0

    # Repair the network so COMMITs are received, processed and stashed
    slow_node.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # COMMITs are processed for prepared messages
        assert slow_master_replica.spylog.count(
            slow_master_replica.can_process_since_view_change_in_progress) > old_pc_count

    looper.run(eventually(chk2, retryWait=1, timeout=5))

    def chk3():
        # COMMITs are stashed
        assert len(slow_node.stashedOrderedReqs) == \
               delay_batches * Max3PCBatchSize

    looper.run(eventually(chk3, retryWait=1, timeout=15))

    # fix catchup, so the node gets a chance to be caught-up
    repair_node_catchup_less(other_nodes)

    def chk4():
        # Some COMMITs were ordered but stashed and they were processed
        rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
        assert delay_batches in rv

    looper.run(eventually(chk4, retryWait=1, timeout=catchup_rep_delay + 5))

    def chk5():
        # Catchup was done once
        assert slow_node.spylog.count(
            slow_node.allLedgersCaughtUp) > old_lcu_count

    looper.run(
        eventually(
            chk5,
            retryWait=1,
            timeout=waits.expectedPoolCatchupTime(
                len(txnPoolNodeSet))))

    # make sure that the pool is functional
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)