def test_stashed_checkpoint_processing(chkFreqPatched, looper, txnPoolNodeSet,
                                       sdk_wallet_client, sdk_pool_handle):
    """
    One node in a pool of 5 nodes lags to order the last 3PC-batch in a
    checkpoint. By the moment when it eventually orders the 3PC-batch it has
    already received and stashed Checkpoint messages from two nodes, so it
    processes these stashed messages on completing the checkpoint. After this
    it receives Checkpoint messages from two other nodes, processes them and
    stabilizes the checkpoint.
    """
    epsilon = txnPoolNodeSet[-1]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 4)

    epsilon.nodeIbStasher.delay(cDelay())
    epsilon.nodeIbStasher.delay(chk_delay(sender_filter='Gamma'))
    epsilon.nodeIbStasher.delay(chk_delay(sender_filter='Delta'))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    stabilization_timeout = \
        waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.runFor(stabilization_timeout)

    for inst_id, replica in epsilon.replicas.items():
        check_stable_checkpoint(replica, 0)
        check_num_unstable_checkpoints(replica, 0)
        check_num_received_checkpoints(replica, 1)
        check_received_checkpoint_votes(replica, pp_seq_no=5, num_votes=2)

    epsilon.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def check():
        for inst_id, replica in epsilon.replicas.items():
            check_stable_checkpoint(replica, 0)
            check_num_unstable_checkpoints(replica, 1)
            check_last_checkpoint(replica, 5)
            check_num_received_checkpoints(replica, 1)
            check_last_received_checkpoint(replica, 5)

    looper.run(
        eventually(check,
                   timeout=waits.expectedOrderingTime(len(txnPoolNodeSet))))

    epsilon.nodeIbStasher.reset_delays_and_process_delayeds(CHECKPOINT)

    looper.runFor(stabilization_timeout)

    for inst_id, replica in epsilon.replicas.items():
        check_stable_checkpoint(replica, 5)
        check_num_unstable_checkpoints(replica, 0)
        check_num_received_checkpoints(replica, 0)
def test_watermarks_after_view_change(tdir, tconf, looper, txnPoolNodeSet,
                                      sdk_pool_handle, sdk_wallet_client):
    """
    Delay Commit, Checkpoint, InstanceChange and NewView messages for
    lagging_node. Start a view change. Check that the view change finishes.
    Reset delays. Check that lagging_node can order transactions and has the
    same data as the other nodes.
    """
    lagging_node = txnPoolNodeSet[-1]
    lagging_node.master_replica.config.LOG_SIZE = LOG_SIZE
    start_view_no = lagging_node.viewNo
    with delay_rules(lagging_node.nodeIbStasher,
                     cDelay(), chk_delay(), icDelay(), nv_delay()):
        trigger_view_change(txnPoolNodeSet)
        waitForViewChange(looper, txnPoolNodeSet[:-1],
                          expectedViewNo=start_view_no + 1,
                          customTimeout=waits.expectedPoolViewChangeStartedTimeout(
                              len(txnPoolNodeSet)))
        ensure_all_nodes_have_same_data(looper, txnPoolNodeSet[:-1])
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, 6)

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
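# `trigger_view_change` is provided by the test helpers but not shown here.
# A minimal sketch of the usual approach, assuming the
# `view_changer.on_master_degradation` API used by older Plenum test suites
# (an assumption; newer code may instead send an internal
# NodeNeedViewChange message on each node's internal bus):
def trigger_view_change(nodes):
    for node in nodes:
        # simulate detection of master degradation so that every node
        # votes for an instance change and the pool moves to the next view
        node.view_changer.on_master_degradation()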
def test_freeing_forwarded_not_preprepared_request(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, tconf, tdir, allPluginsPath):
    behind_node = txnPoolNodeSet[-1]
    behind_node.requests.clear()

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)

    # Delay Checkpoints on the last backup instance of behind_node
    # (chk_delay expects an instance id, hence .instId)
    with delay_rules(
            behind_node.nodeIbStasher,
            chk_delay(delay=sys.maxsize,
                      instId=behind_node.replicas.values()[-1].instId)):
        with delay_rules(behind_node.nodeIbStasher,
                         ppDelay(delay=sys.maxsize),
                         pDelay(delay=sys.maxsize),
                         cDelay(delay=sys.maxsize)):
            count = behind_node.spylog.count(behind_node.allLedgersCaughtUp)
            sdk_send_batches_of_random(looper, txnPoolNodeSet,
                                       sdk_pool_handle, sdk_wallet_steward,
                                       req_num, req_num)
            looper.run(
                eventually(node_caughtup, behind_node, count, retryWait=1))
            looper.run(
                eventually(
                    lambda: assertExp(len(behind_node.requests) == req_num)))

    # We execute caught-up requests
    looper.run(
        eventually(lambda: assertExp(len(behind_node.requests) == req_num)))
    assert all(r.executed for r in behind_node.requests.values()
               if behind_node.seqNoDB.get(r.request.key)[1])
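# `node_caughtup` is referenced above but not shown. A sketch of what it is
# expected to assert, inferred from the call site (the node plus the spylog
# count taken before the delayed traffic): at least one more catch-up round
# has completed since the count was captured. The real helper may differ.
def node_caughtup(node, old_count):
    # allLedgersCaughtUp fires once a catch-up round finishes
    assert node.spylog.count(node.allLedgersCaughtUp) > old_count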
def test_backup_replica_resumes_ordering_on_lag_if_checkpoints_belate(
        looper, chkFreqPatched, reqs_for_checkpoint,
        one_replica_and_others_in_backup_instance,
        sdk_pool_handle, sdk_wallet_client, view_change_done):
    """
    Verifies resumption of ordering 3PC-batches on a backup replica
    on detection of a lag in checkpoints in case it is detected after
    some batch in the next checkpoint has already been committed but cannot
    be ordered out of turn.
    """
    slow_replica, other_replicas = one_replica_and_others_in_backup_instance
    view_no = slow_replica.viewNo

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)

    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, 2)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Don't receive Commits from two replicas
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[0].node.name))
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[1].node.name))

    # Send a request for which the replica will not be able to order the batch
    # due to an insufficient count of Commits
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Receive further Commits from now on
    slow_replica.node.nodeIbStasher.drop_delayeds()
    slow_replica.node.nodeIbStasher.resetDelays()

    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, 2)),
                   slow_replica,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Send requests but in a quantity insufficient
    # for the catch-up number of checkpoints
    sdk_send_random_requests(
        looper, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint - 2)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Don't receive Checkpoints
    slow_replica.node.nodeIbStasher.delay(chk_delay(instId=1))

    # Send more requests to reach the catch-up number of checkpoints
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             reqs_for_checkpoint)

    # Send a request that starts a new checkpoint
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that the replica has not ordered any batches
    # after the very first one
    assert slow_replica.last_ordered_3pc == (view_no, 2)

    # Ensure that the watermarks have not been shifted since the view start
    assert slow_replica.h == 0
    assert slow_replica.H == LOG_SIZE

    # Ensure that there are some quorumed stashed checkpoints
    assert slow_replica.stashed_checkpoints_with_quorum()

    # Receive belated Checkpoints
    slow_replica.node.nodeIbStasher.reset_delays_and_process_delayeds()

    # Ensure that the replica has ordered the batch for the last sent request
    looper.run(
        eventually(lambda *args: assertExp(
                       slow_replica.last_ordered_3pc ==
                       (view_no,
                        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                        CHK_FREQ + 2)),
                   slow_replica,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Ensure that the watermarks have been shifted so that the lower watermark
    # now equals the end of the last stable checkpoint in the instance
    assert slow_replica.h == \
        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    assert slow_replica.H == \
        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ + LOG_SIZE

    # Ensure that now there are no quorumed stashed checkpoints
    assert not slow_replica.stashed_checkpoints_with_quorum()

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)

    looper.run(
        eventually(lambda: assertExp(
                       slow_replica.last_ordered_3pc ==
                       (view_no,
                        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                        CHK_FREQ + 3)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))
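# For illustration of the watermark arithmetic asserted above, assuming the
# values these tests are commonly patched with (hypothetically CHK_FREQ = 5,
# Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP = 4, and hence
# LOG_SIZE = (4 + 1) * 5 = 25): before the belated Checkpoints arrive the
# window is (h=0, H=25); once they are processed it shifts to (h=25, H=50),
# so the lower watermark lands exactly on the end of the newly stable
# checkpoint and ordering can resume.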
def test_stashed_checkpoint_processing(chkFreqPatched, looper, txnPoolNodeSet,
                                       sdk_wallet_client, sdk_pool_handle):
    """
    One node in a pool of 5 nodes lags to order the last 3PC-batch in a
    checkpoint. By the moment when it eventually orders the 3PC-batch it has
    already received and stashed Checkpoint messages from two nodes, so it
    processes these stashed messages on completing the checkpoint. After this
    it receives Checkpoint messages from two other nodes, processes them and
    stabilizes the checkpoint.
    """
    epsilon = txnPoolNodeSet[-1]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 4)

    epsilon.nodeIbStasher.delay(cDelay())
    epsilon.nodeIbStasher.delay(chk_delay(sender_filter='Gamma'))
    epsilon.nodeIbStasher.delay(chk_delay(sender_filter='Delta'))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    stabilization_timeout = \
        waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.runFor(stabilization_timeout)

    for inst_id, replica in epsilon.replicas.items():
        assert len(replica._checkpointer._checkpoint_state) == 1
        assert (1, 5) in replica._checkpointer._checkpoint_state
        assert replica._checkpointer._checkpoint_state[(1, 5)].seqNo == 4
        assert replica._checkpointer._checkpoint_state[(1, 5)].digest is None
        assert replica._checkpointer._checkpoint_state[(1, 5)].isStable is False
        assert len(replica._checkpointer._stashed_recvd_checkpoints) == 1
        assert 0 in replica._checkpointer._stashed_recvd_checkpoints
        assert len(replica._checkpointer._stashed_recvd_checkpoints[0]) == 1
        assert (1, 5) in replica._checkpointer._stashed_recvd_checkpoints[0]
        assert len(
            replica._checkpointer._stashed_recvd_checkpoints[0][(1, 5)]) == 2

    epsilon.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def check():
        for inst_id, replica in epsilon.replicas.items():
            assert len(replica._checkpointer._checkpoint_state) == 1
            assert (1, 5) in replica._checkpointer._checkpoint_state
            assert replica._checkpointer._checkpoint_state[(1, 5)].seqNo == 5
            assert replica._checkpointer._checkpoint_state[(1, 5)].digest \
                is not None
            assert replica._checkpointer._checkpoint_state[(1, 5)].isStable \
                is False
            assert len(replica._checkpointer._stashed_recvd_checkpoints) == 0

    looper.run(eventually(check, timeout=waits.expectedOrderingTime(
        len(txnPoolNodeSet))))

    epsilon.nodeIbStasher.reset_delays_and_process_delayeds(CHECKPOINT)

    looper.runFor(stabilization_timeout)

    for inst_id, replica in epsilon.replicas.items():
        assert len(replica._checkpointer._checkpoint_state) == 1
        assert (1, 5) in replica._checkpointer._checkpoint_state
        assert replica._checkpointer._checkpoint_state[(1, 5)].seqNo == 5
        assert replica._checkpointer._checkpoint_state[(1, 5)].digest is not None
        assert replica._checkpointer._checkpoint_state[(1, 5)].isStable is True
        assert len(replica._checkpointer._stashed_recvd_checkpoints) == 0
# Module-level flag read and updated by the test below via `global`; on the
# first run the high watermark is still unset (sys.maxsize)
first_run = True


def test_stashed_messages_processed_on_backup_replica_ordering_resumption(
        looper, chkFreqPatched, reqs_for_checkpoint,
        one_replica_and_others_in_backup_instance,
        sdk_pool_handle, sdk_wallet_client, view_change_done, txnPoolNodeSet):
    """
    Verifies resumption of ordering 3PC-batches on a backup replica
    on detection of a lag in checkpoints in case it is detected after
    some 3PC-messages related to the next checkpoint have already been stashed
    as laying outside of the watermarks.
    Please note that to verify this case the config is set up so that
    LOG_SIZE == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    """
    global first_run
    batches_count = get_pp_seq_no(txnPoolNodeSet)

    slow_replica, other_replicas = one_replica_and_others_in_backup_instance
    view_no = slow_replica.viewNo
    low_watermark = slow_replica.h

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Don't receive Commits from two replicas
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[0].node.name))
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[1].node.name))
    slow_replica.node.nodeIbStasher.delay(
        msg_rep_delay(types_to_delay=[COMMIT]))

    # Send a request for which the replica will not be able to order the batch
    # due to an insufficient count of Commits
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Receive further Commits from now on
    slow_replica.node.nodeIbStasher.drop_delayeds()
    slow_replica.node.nodeIbStasher.resetDelays()

    # Send requests but in a quantity insufficient
    # for the catch-up number of checkpoints
    reqs_until_checkpoints = reqs_for_checkpoint - \
        get_pp_seq_no([r.node for r in other_replicas]) % reqs_for_checkpoint
    sdk_send_random_requests(
        looper, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_until_checkpoints)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Don't receive Checkpoints
    slow_replica.node.nodeIbStasher.delay(chk_delay(instId=1))

    # Send more requests to reach the catch-up number of checkpoints
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             reqs_for_checkpoint)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that there are no 3PC-messages stashed
    # as laying outside of the watermarks
    assert slow_replica.stasher.stash_size(STASH_WATERMARKS) == 0

    # Send a request for which the batch will be outside of the watermarks
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that the replica has not ordered any batches
    # after the very first one
    assert slow_replica.last_ordered_3pc == (view_no, batches_count)

    # Ensure that the watermarks have not been shifted since the view start
    assert slow_replica.h == low_watermark
    assert slow_replica.H == (sys.maxsize if first_run
                              else low_watermark + LOG_SIZE)

    # Ensure that there are some quorumed stashed checkpoints
    check_num_quorumed_received_checkpoints(slow_replica, 1)

    # Ensure that now there are 3PC-messages stashed
    # as laying outside of the watermarks
    if not first_run:
        assert slow_replica.stasher.stash_size(STASH_WATERMARKS) == \
            incoming_3pc_msgs_count(len(txnPoolNodeSet))

    # Receive belated Checkpoints
    slow_replica.node.nodeIbStasher.reset_delays_and_process_delayeds()
    batches_count = get_pp_seq_no([r.node for r in other_replicas])

    # Ensure that the replica has ordered the batch for the last sent request
    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Ensure that the watermarks have been shifted so that the lower watermark
    # now equals the end of the last stable checkpoint in the instance
    assert slow_replica.h == low_watermark + \
        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    assert slow_replica.H == low_watermark + \
        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ + LOG_SIZE

    # Ensure that now there are no quorumed stashed checkpoints
    check_num_quorumed_received_checkpoints(slow_replica, 0)

    # Ensure that now there are no 3PC-messages stashed
    # as laying outside of the watermarks
    assert slow_replica.stasher.stash_size(STASH_WATERMARKS) == 0

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    first_run = False
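# `get_pp_seq_no` is used above but not shown. A sketch under the assumption
# that it reports the pp_seq_no of the last ordered 3PC-batch that the given
# nodes agree on for the instance under test (instance 1 here); the real
# helper may differ:
def get_pp_seq_no(nodes, inst_id=1):
    pp_seq_nos = {n.replicas[inst_id].last_ordered_3pc[1] for n in nodes}
    assert len(pp_seq_nos) == 1, 'nodes have diverged'
    return pp_seq_nos.pop()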
def test_non_primary_recvs_3phase_message_outside_watermarks(
        chkFreqPatched, reqs_for_logsize, looper, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_client):
    """
    A node is slow in receiving PRE-PREPAREs and PREPAREs. A lot of requests
    are sent and the slow node has started receiving COMMITs outside of its
    watermarks and so stashes them. Also this node is slow in receiving
    CHECKPOINTs. So a catch-up does not occur on it.

    Then the slow node eventually receives the sent PRE-PREPAREs and PREPAREs
    and so orders the 3PC-batches between its watermarks. The other nodes
    discard the COMMITs from the slow node since they have already achieved
    stable checkpoints for these COMMITs.

    After that the slow node eventually receives the sent CHECKPOINTs from
    the other nodes and so stabilizes its own completed checkpoints and
    updates its watermarks. A catch-up is not triggered because no received
    checkpoints are stashed. Since now the watermarks have been updated,
    the slow node processes the 3PC-messages stashed earlier and its ledger
    becomes equal to the ledgers of the other nodes.
    """
    backupInstId = 1
    npr = getNonPrimaryReplicas(txnPoolNodeSet, backupInstId)

    slowReplica = npr[0]
    slowNode = slowReplica.node

    slowNode.nodeIbStasher.delay(ppDelay(300, backupInstId))
    slowNode.nodeIbStasher.delay(pDelay(300, backupInstId))
    slowNode.nodeIbStasher.delay(chk_delay(300))

    initialDomainLedgerSize = slowNode.domainLedger.size
    oldStashCount = slowReplica.stasher.num_stashed_watermarks
    # Pin the high watermark so that batches beyond LOG_SIZE fall outside it
    slowReplica.H = LOG_SIZE

    # 1. Send more requests than fit between the watermarks on the slow node
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqs_for_logsize + 2)

    # Verify that the slow node stashes the batches outside of its watermarks
    newStashCount = slowReplica.stasher.num_stashed_watermarks
    assert newStashCount > oldStashCount

    oldDiscardCounts = discardCounts(
        [n.replicas[backupInstId] for n in txnPoolNodeSet if n != slowNode],
        'achieved stable checkpoint')

    # 2. Deliver the sent PRE-PREPAREs and PREPAREs to the slow node
    slowNode.nodeIbStasher.reset_delays_and_process_delayeds(PREPREPARE,
                                                             PREPARE)

    # Verify that the slow node orders the 3PC-batches between its watermarks
    # but no more
    looper.runFor(waits.expectedTransactionExecutionTime(len(txnPoolNodeSet)))

    checkNodeDataForInequality(slowNode,
                               *[n for n in txnPoolNodeSet if n != slowNode])
    assert slowNode.domainLedger.size - initialDomainLedgerSize == \
        reqs_for_logsize

    # Also verify that the other nodes discard the COMMITs from the slow node
    # since they have already achieved stable checkpoints for these COMMITs
    counts = discardCounts(
        [n.replicas[backupInstId] for n in txnPoolNodeSet if n != slowNode],
        'achieved stable checkpoint')
    for nm, count in counts.items():
        assert count > oldDiscardCounts[nm]

    oldCatchupTimes = slowNode.spylog.count(Node.start_catchup)

    # 3. Deliver the sent CHECKPOINTs to the slow node
    slowNode.nodeIbStasher.reset_delays_and_process_delayeds(CHECKPOINT)

    # Verify that the slow node processes the 3PC-messages stashed earlier and
    # its ledger becomes equal to the ledgers of the other nodes while no
    # catch-up is performed
    waitNodeDataEquality(looper, slowNode,
                         *[n for n in txnPoolNodeSet if n != slowNode])
    assert slowNode.domainLedger.size - initialDomainLedgerSize == \
        reqs_for_logsize + 2
    newCatchupTimes = slowNode.spylog.count(Node.start_catchup)
    assert newCatchupTimes == oldCatchupTimes
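# `discardCounts` is used above but not shown. A sketch, assuming the
# `countDiscarded` spy helper from the suite, which counts a replica's
# discard log entries whose reason contains the given substring:
def discardCounts(replicas, pat):
    return {r.name: countDiscarded(r, pat) for r in replicas}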
def test_stashed_messages_processed_on_backup_replica_ordering_resumption(
        looper, chkFreqPatched, reqs_for_checkpoint,
        one_replica_and_others_in_backup_instance,
        sdk_pool_handle, sdk_wallet_client, view_change_done):
    """
    Verifies resumption of ordering 3PC-batches on a backup replica
    on detection of a lag in checkpoints in case it is detected after
    some 3PC-messages related to the next checkpoint have already been stashed
    as laying outside of the watermarks.
    Please note that to verify this case the config is set up so that
    LOG_SIZE == (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    """
    slow_replica, other_replicas = one_replica_and_others_in_backup_instance
    view_no = slow_replica.viewNo

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)

    looper.run(
        eventually(lambda: assertExp(slow_replica.last_ordered_3pc ==
                                     (view_no, 1)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Don't receive Commits from two replicas
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[0].node.name))
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[1].node.name))

    # Send a request for which the replica will not be able to order the batch
    # due to an insufficient count of Commits
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Receive further Commits from now on
    slow_replica.node.nodeIbStasher.drop_delayeds()
    slow_replica.node.nodeIbStasher.resetDelays()

    # Send requests but in a quantity insufficient
    # for the catch-up number of checkpoints
    sdk_send_random_requests(
        looper, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint - 2)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Don't receive Checkpoints
    slow_replica.node.nodeIbStasher.delay(chk_delay(instId=1))

    # Send more requests to reach the catch-up number of checkpoints
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             reqs_for_checkpoint)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that there are no 3PC-messages stashed
    # as laying outside of the watermarks
    assert not slow_replica.stashingWhileOutsideWaterMarks

    # Send a request for which the batch will be outside of the watermarks
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that the replica has not ordered any batches
    # after the very first one
    assert slow_replica.last_ordered_3pc == (view_no, 1)

    # Ensure that the watermarks have not been shifted since the view start
    assert slow_replica.h == 0
    assert slow_replica.H == LOG_SIZE

    # Ensure that there are some quorumed stashed checkpoints
    assert slow_replica.stashed_checkpoints_with_quorum()

    # Ensure that now there are 3PC-messages stashed
    # as laying outside of the watermarks
    assert slow_replica.stashingWhileOutsideWaterMarks

    # Receive belated Checkpoints
    slow_replica.node.nodeIbStasher.reset_delays_and_process_delayeds()

    # Ensure that the replica has ordered the batch for the last sent request
    looper.run(
        eventually(lambda: assertExp(
                       slow_replica.last_ordered_3pc ==
                       (view_no,
                        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                        CHK_FREQ + 1)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Ensure that the watermarks have been shifted so that the lower watermark
    # now equals the end of the last stable checkpoint in the instance
    assert slow_replica.h == \
        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    assert slow_replica.H == \
        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ + LOG_SIZE

    # Ensure that now there are no quorumed stashed checkpoints
    assert not slow_replica.stashed_checkpoints_with_quorum()

    # Ensure that now there are no 3PC-messages stashed
    # as laying outside of the watermarks
    assert not slow_replica.stashingWhileOutsideWaterMarks

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)

    looper.run(
        eventually(lambda: assertExp(
                       slow_replica.last_ordered_3pc ==
                       (view_no,
                        (Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) *
                        CHK_FREQ + 2)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))
def test_non_primary_recvs_3phase_message_outside_watermarks(
        chkFreqPatched, reqs_for_logsize, looper, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_client):
    """
    A node is slow in receiving PRE-PREPAREs and PREPAREs. A lot of requests
    are sent and the slow node has started receiving COMMITs outside of its
    watermarks and so stashes them. Also this node is slow in receiving
    CHECKPOINTs. So a catch-up does not occur on it.

    Then the slow node eventually receives the sent PRE-PREPAREs and PREPAREs
    and so orders the 3PC-batches between its watermarks. The other nodes
    discard the COMMITs from the slow node since they have already achieved
    stable checkpoints for these COMMITs.

    After that the slow node eventually receives the sent CHECKPOINTs from
    the other nodes and so stabilizes its own completed checkpoints and
    updates its watermarks. A catch-up is not triggered because no received
    checkpoints are stashed. Since now the watermarks have been updated,
    the slow node processes the 3PC-messages stashed earlier and its ledger
    becomes equal to the ledgers of the other nodes.
    """
    backupInstId = 1
    npr = getNonPrimaryReplicas(txnPoolNodeSet, backupInstId)

    slowReplica = npr[0]
    slowNode = slowReplica.node

    slowNode.nodeIbStasher.delay(ppDelay(300, backupInstId))
    slowNode.nodeIbStasher.delay(pDelay(300, backupInstId))
    slowNode.nodeIbStasher.delay(chk_delay(300))

    initialDomainLedgerSize = slowNode.domainLedger.size
    oldStashCount = slowReplica.spylog.count(
        TestReplica.stashOutsideWatermarks.__name__)

    # 1. Send more requests than fit between the watermarks on the slow node
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqs_for_logsize + 2)

    # Verify that the slow node stashes the batches outside of its watermarks
    newStashCount = slowReplica.spylog.count(
        TestReplica.stashOutsideWatermarks.__name__)
    assert newStashCount > oldStashCount

    oldDiscardCounts = discardCounts(
        [n.replicas[backupInstId] for n in txnPoolNodeSet if n != slowNode],
        'achieved stable checkpoint')

    # 2. Deliver the sent PRE-PREPAREs and PREPAREs to the slow node
    slowNode.nodeIbStasher.reset_delays_and_process_delayeds(PREPREPARE,
                                                             PREPARE)

    # Verify that the slow node orders the 3PC-batches between its watermarks
    # but no more
    looper.runFor(waits.expectedTransactionExecutionTime(len(txnPoolNodeSet)))

    checkNodeDataForInequality(slowNode,
                               *[n for n in txnPoolNodeSet if n != slowNode])
    assert slowNode.domainLedger.size - initialDomainLedgerSize == \
        reqs_for_logsize

    # Also verify that the other nodes discard the COMMITs from the slow node
    # since they have already achieved stable checkpoints for these COMMITs
    counts = discardCounts(
        [n.replicas[backupInstId] for n in txnPoolNodeSet if n != slowNode],
        'achieved stable checkpoint')
    for nm, count in counts.items():
        assert count > oldDiscardCounts[nm]

    oldCatchupTimes = slowNode.spylog.count(Node.start_catchup)

    # 3. Deliver the sent CHECKPOINTs to the slow node
    slowNode.nodeIbStasher.reset_delays_and_process_delayeds(CHECKPOINT)

    # Verify that the slow node processes the 3PC-messages stashed earlier and
    # its ledger becomes equal to the ledgers of the other nodes while no
    # catch-up is performed
    waitNodeDataEquality(looper, slowNode,
                         *[n for n in txnPoolNodeSet if n != slowNode])
    assert slowNode.domainLedger.size - initialDomainLedgerSize == \
        reqs_for_logsize + 2
    newCatchupTimes = slowNode.spylog.count(Node.start_catchup)
    assert newCatchupTimes == oldCatchupTimes
def test_process_three_phase_msg_and_stashed_for_next_checkpoint(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client,
        chkFreqPatched):
    """
    1. Delay checkpoint processing on the slow_node so that a checkpoint
       cannot be finalized on this node.
    2. Order requests to finalize a checkpoint.
    3. Check that the checkpoint is finalized on all nodes except the
       slow_node.
    4. Order a new request.
    5. Check that the slow_node could not order this request and stashed all
       3PC-messages.
    6. Reset delays.
    7. Check that the last request is ordered on the slow_node, the checkpoint
       is finalized and the stashed messages are removed.
    """
    for n in txnPoolNodeSet:
        for r in n.replicas.values():
            r.update_watermark_from_3pc()
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = txnPoolNodeSet[:-1]
    old_stashed = {inst_id: r.stasher.num_stashed_watermarks
                   for inst_id, r in slow_node.replicas.items()}
    last_ordered = {inst_id: r.last_ordered_3pc
                    for inst_id, r in slow_node.replicas.items()}

    with delay_rules([slow_node.nodeIbStasher, ],
                     msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE,
                                                   COMMIT])):
        with delay_rules([slow_node.nodeIbStasher, ], chk_delay()):
            sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client,
                                                 num_reqs=1 * CHK_FREQ,
                                                 num_batches=CHK_FREQ)
            ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

            looper.run(eventually(_check_checkpoint_finalize,
                                  fast_nodes, 1, CHK_FREQ))
            sdk_send_random_and_check(looper, txnPoolNodeSet,
                                      sdk_pool_handle, sdk_wallet_client, 1)

            stashed_messages = incoming_3pc_msgs_count(len(txnPoolNodeSet))
            assert all(r.stasher.num_stashed_watermarks ==
                       old_stashed[inst_id] + stashed_messages
                       for inst_id, r in slow_node.replicas.items())

            _check_batches_ordered(slow_node, last_ordered, CHK_FREQ)
            for n in fast_nodes:
                _check_batches_ordered(n, last_ordered, CHK_FREQ + 1)

    looper.run(eventually(_check_checkpoint_finalize,
                          [slow_node, ], 1, CHK_FREQ))
    looper.run(eventually(_check_batches_ordered,
                          slow_node, last_ordered, CHK_FREQ + 1))
    assert all(r.stasher.num_stashed_watermarks == old_stashed[inst_id]
               for inst_id, r in slow_node.replicas.items())
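# The helpers used above are not shown. Hedged sketches inferred from the
# call sites (the real helpers may inspect different internals).
# `incoming_3pc_msgs_count` gives the number of 3PC messages a non-primary
# node receives for one batch in a pool of `nodes_count` nodes: one
# PrePrepare from the primary, a Prepare from every other non-primary, and a
# Commit from every other node — which is why the stash grows by exactly this
# amount per out-of-watermark batch.
def incoming_3pc_msgs_count(nodes_count):
    return 1 + (nodes_count - 2) + (nodes_count - 1)


def _check_checkpoint_finalize(nodes, start_pp_seq_no, end_pp_seq_no):
    # a checkpoint covering [start_pp_seq_no, end_pp_seq_no] has stabilized
    # on the master replica of every given node (reusing the
    # check_stable_checkpoint helper seen earlier in this suite)
    for n in nodes:
        check_stable_checkpoint(n.master_replica, end_pp_seq_no)


def _check_batches_ordered(node, last_ordered, num_batches):
    # every replica of the node has ordered exactly `num_batches` batches
    # beyond the snapshot captured in `last_ordered`
    for inst_id, r in node.replicas.items():
        view_no, pp_seq_no = last_ordered[inst_id]
        assert r.last_ordered_3pc == (view_no, pp_seq_no + num_batches)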
def test_stashed_checkpoint_processing(chkFreqPatched, looper, txnPoolNodeSet,
                                       sdk_wallet_client, sdk_pool_handle):
    """
    One node in a pool of 5 nodes lags to order the last 3PC-batch in a
    checkpoint. By the moment when it eventually orders the 3PC-batch it has
    already received and stashed Checkpoint messages from two nodes, so it
    processes these stashed messages on completing the checkpoint. After this
    it receives Checkpoint messages from two other nodes, processes them and
    stabilizes the checkpoint.
    """
    epsilon = txnPoolNodeSet[-1]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 4)

    epsilon.nodeIbStasher.delay(cDelay())
    epsilon.nodeIbStasher.delay(chk_delay(sender_filter='Gamma'))
    epsilon.nodeIbStasher.delay(chk_delay(sender_filter='Delta'))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    stabilization_timeout = \
        waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.runFor(stabilization_timeout)

    for inst_id, replica in epsilon.replicas.items():
        assert len(replica.checkpoints) == 1
        assert (1, 5) in replica.checkpoints
        assert replica.checkpoints[(1, 5)].seqNo == 4
        assert replica.checkpoints[(1, 5)].digest is None
        assert replica.checkpoints[(1, 5)].isStable is False
        assert len(replica.stashedRecvdCheckpoints) == 1
        assert 0 in replica.stashedRecvdCheckpoints
        assert len(replica.stashedRecvdCheckpoints[0]) == 1
        assert (1, 5) in replica.stashedRecvdCheckpoints[0]
        assert len(replica.stashedRecvdCheckpoints[0][(1, 5)]) == 2

    epsilon.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def check():
        for inst_id, replica in epsilon.replicas.items():
            assert len(replica.checkpoints) == 1
            assert (1, 5) in replica.checkpoints
            assert replica.checkpoints[(1, 5)].seqNo == 5
            assert replica.checkpoints[(1, 5)].digest is not None
            assert replica.checkpoints[(1, 5)].isStable is False
            assert len(replica.stashedRecvdCheckpoints) == 0

    looper.run(eventually(check, timeout=waits.expectedOrderingTime(
        len(txnPoolNodeSet))))

    epsilon.nodeIbStasher.reset_delays_and_process_delayeds(CHECKPOINT)

    looper.runFor(stabilization_timeout)

    for inst_id, replica in epsilon.replicas.items():
        assert len(replica.checkpoints) == 1
        assert (1, 5) in replica.checkpoints
        assert replica.checkpoints[(1, 5)].seqNo == 5
        assert replica.checkpoints[(1, 5)].digest is not None
        assert replica.checkpoints[(1, 5)].isStable is True
        assert len(replica.stashedRecvdCheckpoints) == 0