def test_second_checkpoint_after_catchup_can_be_stabilized(
        chkFreqPatched, looper, txnPoolNodeSet, sdk_wallet_steward,
        sdk_wallet_client, sdk_pool_handle, tdir, tconf, allPluginsPath):
    _, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        'EpsilonSteward', 'Epsilon', tdir, tconf,
        allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
    # Epsilon did not participate in ordering of the batch with EpsilonSteward
    # NYM transaction and the batch with Epsilon NODE transaction.
    # Epsilon got these transactions via catch-up.

    master_replica = new_node.replicas._master_replica

    check_stable_checkpoint(master_replica, 0)
    check_num_received_checkpoints(master_replica, 0)

    assert master_replica.h == 2
    assert master_replica.H == 17

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    for replica in new_node.replicas.values():
        assert replica.h == 2
        assert replica.H == 17

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6)

    stabilization_timeout = \
        waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.runFor(stabilization_timeout)

    for replica in new_node.replicas.values():
        check_stable_checkpoint(replica, 5)
        check_num_unstable_checkpoints(replica, 0)

        # nothing is stashed since it's ordered during catch-up
        check_num_received_checkpoints(replica, 0)

        assert replica.h == 5
        assert replica.H == 20

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    looper.runFor(stabilization_timeout)

    for replica in new_node.replicas.values():
        check_stable_checkpoint(replica, 10)
        check_num_unstable_checkpoints(replica, 0)

        # nothing is stashed since it's ordered during catch-up
        check_num_received_checkpoints(replica, 0)

        assert replica.h == 10
        assert replica.H == 25
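# Illustrative sketch (not part of the test): the h/H values asserted above
# follow from the checkpoint window arithmetic. The constants below are
# assumptions inferred from the asserted numbers (checkpoints every 5 batches,
# a window of 15); the real values come from chkFreqPatched and tconf.
def _sketch_watermark_arithmetic():
    chk_freq = 5             # assumed CHK_FREQ as patched by chkFreqPatched
    log_size = 3 * chk_freq  # assumed LOG_SIZE, i.e. H - h stays equal to 15

    def watermarks_after_stabilization(stable_pp_seq_no):
        # Stabilizing a checkpoint moves the low watermark to its pp_seq_no
        # and keeps the high watermark LOG_SIZE above it.
        return stable_pp_seq_no, stable_pp_seq_no + log_size

    # After catch-up the replica starts at (2, 17); stabilizing the checkpoint
    # at pp_seq_no=5 shifts the window to (5, 20), the next one to (10, 25).
    assert watermarks_after_stabilization(5) == (5, 20)
    assert watermarks_after_stabilization(10) == (10, 25)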
def test_stashed_checkpoint_processing(chkFreqPatched, looper, txnPoolNodeSet,
                                       sdk_wallet_client, sdk_pool_handle):
    """
    One node in a pool of 5 nodes lags in ordering the last 3PC-batch of a
    checkpoint. By the time it eventually orders that 3PC-batch it has already
    received and stashed Checkpoint messages from two nodes, so it processes
    these stashed messages on completing the checkpoint. After this it receives
    Checkpoint messages from the two other nodes, processes them and stabilizes
    the checkpoint.
    """
    epsilon = txnPoolNodeSet[-1]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 4)

    epsilon.nodeIbStasher.delay(cDelay())
    epsilon.nodeIbStasher.delay(chk_delay(sender_filter='Gamma'))
    epsilon.nodeIbStasher.delay(chk_delay(sender_filter='Delta'))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    stabilization_timeout = \
        waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.runFor(stabilization_timeout)

    for inst_id, replica in epsilon.replicas.items():
        check_stable_checkpoint(replica, 0)
        check_num_unstable_checkpoints(replica, 0)
        check_num_received_checkpoints(replica, 1)
        check_received_checkpoint_votes(replica, pp_seq_no=5, num_votes=2)

    epsilon.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def check():
        for inst_id, replica in epsilon.replicas.items():
            check_stable_checkpoint(replica, 0)
            check_num_unstable_checkpoints(replica, 1)
            check_last_checkpoint(replica, 5)
            check_num_received_checkpoints(replica, 1)
            check_last_received_checkpoint(replica, 5)

    looper.run(
        eventually(check,
                   timeout=waits.expectedOrderingTime(len(txnPoolNodeSet))))

    epsilon.nodeIbStasher.reset_delays_and_process_delayeds(CHECKPOINT)

    stabilization_timeout = \
        waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.runFor(stabilization_timeout)

    for inst_id, replica in epsilon.replicas.items():
        check_stable_checkpoint(replica, 5)
        check_num_unstable_checkpoints(replica, 0)
        check_num_received_checkpoints(replica, 0)
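# Illustrative sketch (not part of the test): a hypothetical tracker showing
# the stash-then-process flow exercised above. The class and method names and
# the quorum value are assumptions for illustration only; they do not mirror
# plenum's real checkpoint service API.
def _sketch_stashed_checkpoint_processing():
    from collections import defaultdict

    class ToyCheckpointTracker:
        def __init__(self, quorum):
            self.quorum = quorum            # votes needed to stabilize, incl. own
            self.votes = defaultdict(set)   # pp_seq_no -> senders of Checkpoints
            self.local_complete = set()     # checkpoints this replica generated
            self.stable = 0                 # pp_seq_no of the last stable one

        def add_vote(self, pp_seq_no, sender):
            # A vote received before the local checkpoint completes is simply
            # remembered ("stashed") until _try_stabilize can act on it.
            self.votes[pp_seq_no].add(sender)
            self._try_stabilize(pp_seq_no)

        def complete_local(self, pp_seq_no):
            # Ordering the last batch of the window completes the local
            # checkpoint, letting previously received votes count.
            self.local_complete.add(pp_seq_no)
            self._try_stabilize(pp_seq_no)

        def _try_stabilize(self, pp_seq_no):
            if (pp_seq_no in self.local_complete
                    and len(self.votes[pp_seq_no]) + 1 >= self.quorum):
                self.stable = pp_seq_no

    # Mirrors the test: two votes arrive early and are stashed, the delayed
    # Commits then complete the local checkpoint, and the two delayed
    # Checkpoints finally stabilize it.
    tracker = ToyCheckpointTracker(quorum=4)
    tracker.add_vote(5, 'Alpha')
    tracker.add_vote(5, 'Beta')
    assert tracker.stable == 0       # local checkpoint not completed yet
    tracker.complete_local(5)
    assert tracker.stable == 0       # own + 2 votes is still below the quorum
    tracker.add_vote(5, 'Gamma')
    tracker.add_vote(5, 'Delta')
    assert tracker.stable == 5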
def test_second_checkpoint_after_catchup_can_be_stabilized(
        chkFreqPatched, looper, txnPoolNodeSet, sdk_wallet_steward,
        sdk_wallet_client, sdk_pool_handle, tdir, tconf, allPluginsPath):
    lagging_node = txnPoolNodeSet[-1]
    with delay_rules_without_processing(lagging_node.nodeIbStasher, cDelay()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client,
                                  tconf.Max3PCBatchSize * CHK_FREQ * 2)
    waitNodeDataEquality(looper, lagging_node, *txnPoolNodeSet[:-1])
    # The lagging node got the missed transactions via catch-up.

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2)

    master_replica = lagging_node.master_replica

    check_stable_checkpoint(master_replica, 10)
    check_num_received_checkpoints(master_replica, 0)

    assert master_replica.h == 10
    assert master_replica.H == 25

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    for replica in lagging_node.replicas.values():
        assert replica.h == 10
        assert replica.H == 25

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 6)

    stabilization_timeout = \
        waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.runFor(stabilization_timeout)

    for replica in lagging_node.replicas.values():
        check_stable_checkpoint(replica, 15)
        check_num_unstable_checkpoints(replica, 0)

        # nothing is stashed since it's ordered during catch-up
        check_num_received_checkpoints(replica, 0)

        assert replica.h == 15
        assert replica.H == 30

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    looper.runFor(stabilization_timeout)

    for replica in lagging_node.replicas.values():
        check_stable_checkpoint(replica, 20)
        check_num_unstable_checkpoints(replica, 0)

        # nothing is stashed since it's ordered during catch-up
        check_num_received_checkpoints(replica, 0)

        assert replica.h == 20
        assert replica.H == 35
def test_lagged_checkpoint_completion(chkFreqPatched, looper, txnPoolNodeSet,
                                      sdk_wallet_client, sdk_pool_handle):
    """
    One node in a pool lags in ordering the last 3PC-batch of a checkpoint,
    so that by the time it eventually orders this 3PC-batch and thus completes
    the checkpoint it has already received and stashed the corresponding
    Checkpoint messages from all the other nodes. The test verifies that the
    node successfully processes the stashed Checkpoint messages and stabilizes
    the checkpoint.
    """
    slow_node = txnPoolNodeSet[-1]

    # All the nodes in the pool normally order all the 3PC-batches in the
    # checkpoint except the last one. The last 3PC-batch in the checkpoint is
    # ordered by all the nodes except one slow node, which lags in receiving
    # Commits.
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 4)

    slow_node.nodeIbStasher.delay(cDelay())

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # All the other nodes complete the checkpoint and send Checkpoint messages
    # to others. The slow node receives and stashes these messages because it
    # has not completed the checkpoint yet.
    def check():
        for replica in slow_node.replicas.values():
            check_stable_checkpoint(replica, 0)
            check_num_unstable_checkpoints(replica, 0)
            check_num_received_checkpoints(replica, 1)
            check_received_checkpoint_votes(replica, pp_seq_no=5,
                                            num_votes=len(txnPoolNodeSet) - 1)

    stabilization_timeout = \
        waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    looper.run(eventually(check, timeout=stabilization_timeout))

    # Eventually the slow node receives Commits, orders the last 3PC-batch in
    # the checkpoint and thus completes it, processes the stashed Checkpoint
    # messages and stabilizes the checkpoint.
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds()

    looper.runFor(waits.expectedOrderingTime(len(txnPoolNodeSet)))

    for replica in slow_node.replicas.values():
        check_stable_checkpoint(replica, 5)
        check_num_unstable_checkpoints(replica, 0)
        check_num_received_checkpoints(replica, 0)
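# Illustrative sketch (not part of the test): a toy stasher showing what the
# delay/release pattern used in these tests amounts to. This is NOT plenum's
# Stasher/cDelay machinery (which has per-type delay rules and timeouts); the
# names and behaviour here are simplified assumptions.
def _sketch_delay_then_release():
    class ToyStasher:
        def __init__(self):
            self.delayed_types = set()
            self.stashed = []

        def delay(self, msg_type):
            # Start holding back messages of the given type.
            self.delayed_types.add(msg_type)

        def deliver(self, msg_type, payload, handler):
            # A message of a delayed type is stashed instead of handled.
            if msg_type in self.delayed_types:
                self.stashed.append((msg_type, payload, handler))
            else:
                handler(payload)

        def reset_delays_and_process_delayeds(self, *msg_types):
            # Drop the delay rules and replay held-back messages, optionally
            # restricted to the given message types.
            to_release = set(msg_types) if msg_types else set(self.delayed_types)
            self.delayed_types -= to_release
            still_stashed = []
            for msg_type, payload, handler in self.stashed:
                if msg_type in to_release:
                    handler(payload)
                else:
                    still_stashed.append((msg_type, payload, handler))
            self.stashed = still_stashed

    delivered = []
    stasher = ToyStasher()
    stasher.delay('COMMIT')
    stasher.deliver('COMMIT', 'commit-1', delivered.append)
    assert delivered == []                    # held back while delayed
    stasher.reset_delays_and_process_delayeds('COMMIT')
    assert delivered == ['commit-1']          # replayed on release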
def test_backup_replica_resumes_ordering_on_lag_in_checkpoints(
        looper, chkFreqPatched, reqs_for_checkpoint,
        one_replica_and_others_in_backup_instance,
        sdk_pool_handle, sdk_wallet_client, view_change_done,
        txnPoolNodeSet):
    """
    Verifies resumption of ordering 3PC-batches on a backup replica
    on detection of a lag in checkpoints
    """
    slow_replica, other_replicas = one_replica_and_others_in_backup_instance
    view_no = slow_replica.viewNo
    batches_count = slow_replica.last_ordered_3pc[1]

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    low_watermark = slow_replica.h

    looper.run(
        eventually(lambda: assert_eq(slow_replica.last_ordered_3pc,
                                     (view_no, batches_count)),
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Don't receive Commits from two replicas
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[0].node.name))
    slow_replica.node.nodeIbStasher.delay(
        cDelay(instId=1, sender_filter=other_replicas[1].node.name))

    # Send a request for which the replica will not be able to order the batch
    # due to an insufficient count of Commits
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Recover reception of Commits
    slow_replica.node.nodeIbStasher.drop_delayeds()
    slow_replica.node.nodeIbStasher.resetDelays()

    # Send requests but in a quantity insufficient
    # for catch-up number of checkpoints
    reqs_until_checkpoints = reqs_for_checkpoint - other_replicas[0].last_ordered_3pc[1]
    sdk_send_random_requests(
        looper, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_until_checkpoints)
    looper.runFor(waits.expectedTransactionExecutionTime(nodeCount))

    # Ensure that the replica has not ordered any batches
    # after the very first one
    assert slow_replica.last_ordered_3pc == (view_no, batches_count)

    # Ensure that the watermarks have not been shifted since the view start
    assert slow_replica.h == low_watermark
    assert slow_replica.H == low_watermark + LOG_SIZE

    # Ensure that the collections related to requests, batches and
    # own checkpoints are not empty.
    # (Note that a primary replica removes requests from requestQueues
    # when creating a batch with them.)
    if slow_replica.isPrimary:
        assert slow_replica._ordering_service.sent_preprepares
    else:
        assert slow_replica._ordering_service.requestQueues[DOMAIN_LEDGER_ID]
        assert slow_replica._ordering_service.prePrepares
    assert slow_replica._ordering_service.prepares
    assert slow_replica._ordering_service.commits
    assert slow_replica._ordering_service.batches
    check_num_unstable_checkpoints(slow_replica, 0)
    check_num_quorumed_received_checkpoints(slow_replica, 1)

    # Send more requests to reach catch-up number of checkpoints
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqs_for_checkpoint)
    batches_count += 1
    batches_count += reqs_until_checkpoints
    batches_count += reqs_for_checkpoint

    # Ensure that the replica has adjusted last_ordered_3pc to the end
    # of the last checkpoint
    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    # Ensure that the watermarks have been shifted so that the lower watermark
    # has the same value as last_ordered_3pc
    assert slow_replica.h == low_watermark + (
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ
    assert slow_replica.H == low_watermark + (
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1) * CHK_FREQ + LOG_SIZE

    # Ensure that the collections related to requests, batches and
    # own checkpoints have been cleared
    assert not slow_replica._ordering_service.requestQueues[DOMAIN_LEDGER_ID]
    assert not slow_replica._ordering_service.sent_preprepares
    assert not slow_replica._ordering_service.prePrepares
    assert not slow_replica._ordering_service.prepares
    assert not slow_replica._ordering_service.commits
    assert not slow_replica._ordering_service.batches
    check_num_unstable_checkpoints(slow_replica, 0)
    check_num_quorumed_received_checkpoints(slow_replica, 0)

    # Send a request and ensure that the replica orders the batch for it
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    looper.run(
        eventually(lambda *args: assertExp(slow_replica.last_ordered_3pc ==
                                           (view_no, batches_count)),
                   slow_replica,
                   retryWait=1,
                   timeout=waits.expectedTransactionExecutionTime(nodeCount)))

    slow_replica._checkpointer._received_checkpoints.clear()
    batches_count = get_pp_seq_no(txnPoolNodeSet)
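# Illustrative sketch (not part of the test): the watermark jump asserted
# above after the checkpoint-lag catch-up. The constants are assumptions for
# illustration (CHK_FREQ patched to 5, LOG_SIZE of 15, and a value of 4 for
# STASHED_CHECKPOINTS_BEFORE_CATCHUP); in the test itself they come from
# chkFreqPatched, LOG_SIZE and Replica.
def _sketch_watermarks_after_checkpoint_catchup():
    chk_freq = 5
    log_size = 3 * chk_freq
    stashed_checkpoints_before_catchup = 4  # assumed; use Replica's constant

    def watermarks_after_catchup(low_watermark):
        # The replica jumps past the checkpoints it could not stabilize and
        # re-opens a full LOG_SIZE window above the new low watermark.
        shift = (stashed_checkpoints_before_catchup + 1) * chk_freq
        return low_watermark + shift, low_watermark + shift + log_size

    # Starting from a low watermark of 0 the window would jump to (25, 40).
    assert watermarks_after_catchup(0) == (25, 40)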