def test_bls_not_depend_on_node_reg(looper, txnPoolNodeSet,
                                    sdk_pool_handle, sdk_wallet_client):
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         3, 3)
    node = txnPoolNodeSet[2]
    last_pre_prepare = \
        node.master_replica._ordering_service.prePrepares[
            node.master_replica.last_ordered_3pc]
    bls = getattr(last_pre_prepare, f.BLS_MULTI_SIG.nm)

    # Get a random participant
    node_name = next(iter(bls[1]))

    # Remove that node from this node's registry
    HA = deepcopy(node.nodeReg[node_name])
    del node.nodeReg[node_name]

    state_root_hash = get_last_ordered_state_root_hash(node)
    node.master_replica._bls_bft_replica._bls_bft.bls_key_register._load_keys_for_root(
        state_root_hash)

    # The PRE-PREPARE can still be validated
    assert node.master_replica._bls_bft_replica._bls_bft.bls_key_register.get_key_by_name(
        node_name)

    node.nodeReg[node_name] = HA

def test_clearing_forwarded_preprepared_request(looper, chkFreqPatched,
                                                reqs_for_checkpoint,
                                                txnPoolNodeSet,
                                                sdk_pool_handle,
                                                sdk_wallet_steward):
    # Case when the backup ordered correctly but the primary had problems.
    # As a result, the master will execute the caught-up txns, and they
    # will be removed from the request queues.
    behind_node = txnPoolNodeSet[-1]

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)
    with delay_rules(behind_node.nodeIbStasher,
                     pDelay(delay=sys.maxsize, instId=0),
                     cDelay(delay=sys.maxsize, instId=0)):
        count = behind_node.spylog.count(behind_node.allLedgersCaughtUp)
        sdk_send_batches_of_random(looper, txnPoolNodeSet,
                                   sdk_pool_handle, sdk_wallet_steward,
                                   req_num, req_num)
        looper.run(eventually(node_caughtup, behind_node, count, retryWait=1))

    assert len(behind_node.requests) == 0
    assert all(len(q) == 0
               for r in behind_node.replicas.values()
               for q in r.requestQueues.values())
    assert len(behind_node.clientAuthNr._verified_reqs) == 0
    assert len(behind_node.requestSender) == 0

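# `node_caughtup` is a helper from the surrounding suite, not shown in this
# section. A minimal sketch consistent with how it is called above (a
# plausible reconstruction, not necessarily the canonical implementation):
def node_caughtup(node, old_count):
    # The node has caught up once `allLedgersCaughtUp` has fired at least
    # once more than before the delays were applied.
    assert node.spylog.count(node.allLedgersCaughtUp) > old_count
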
def test_commits_recvd_first(looper, txnPoolNodeSet,
                             sdk_wallet_client, sdk_pool_handle):
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 50
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(delay, 0))

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=20, num_batches=4)

    assert not slow_node.master_replica.prePrepares
    assert not slow_node.master_replica.prepares
    assert not slow_node.master_replica.commits
    assert len(slow_node.master_replica.commitsWaitingForPrepare) > 0

    slow_node.reset_delays_and_process_delayeds()
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    assert check_if_all_equal_in_list(
        [n.master_replica.ordered for n in txnPoolNodeSet])
    assert slow_node.master_replica.prePrepares
    assert slow_node.master_replica.prepares
    assert slow_node.master_replica.commits
    assert not slow_node.master_replica.commitsWaitingForPrepare

def test_freeing_forwarded_not_preprepared_request(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, tconf, tdir, allPluginsPath):
    behind_node = txnPoolNodeSet[-1]
    behind_node.requests.clear()

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)
    with delay_rules(
            behind_node.nodeIbStasher,
            chk_delay(delay=sys.maxsize,
                      instId=behind_node.replicas.values()[-1])):
        with delay_rules(behind_node.nodeIbStasher,
                         ppDelay(delay=sys.maxsize),
                         pDelay(delay=sys.maxsize),
                         cDelay(delay=sys.maxsize)):
            count = behind_node.spylog.count(behind_node.allLedgersCaughtUp)
            sdk_send_batches_of_random(looper, txnPoolNodeSet,
                                       sdk_pool_handle, sdk_wallet_steward,
                                       req_num, req_num)
            looper.run(
                eventually(node_caughtup, behind_node, count, retryWait=1))
            looper.run(
                eventually(
                    lambda: assertExp(len(behind_node.requests) == req_num)))

        # The caught-up requests are executed
        looper.run(
            eventually(lambda: assertExp(len(behind_node.requests) == req_num)))
        assert all(r.executed for r in behind_node.requests.values()
                   if behind_node.seqNoDB.get(r.request.key)[1])

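# `assertExp` comes from the plenum test helpers; it just turns a boolean
# expression into an assertion so that it can be retried via `eventually`.
# A minimal equivalent:
def assertExp(condition):
    assert condition
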
def test_deletion_non_forwarded_request(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, tconf, tdir, allPluginsPath):
    behind_node = txnPoolNodeSet[-1]
    for key in behind_node.requests:
        behind_node.replicas.values()[1].discard_req_key(1, key)
    behind_node.requests.clear()

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)
    behind_node.quorums.propagate = Quorum(len(txnPoolNodeSet) + 1)

    with delay_rules(behind_node.nodeIbStasher,
                     ppDelay(delay=sys.maxsize),
                     pDelay(delay=sys.maxsize),
                     cDelay(delay=sys.maxsize)):
        count = behind_node.spylog.count(behind_node.allLedgersCaughtUp)
        sdk_send_batches_of_random(looper, txnPoolNodeSet,
                                   sdk_pool_handle, sdk_wallet_steward,
                                   req_num, req_num)
        looper.run(eventually(node_caughtup, behind_node, count, retryWait=1))

    # The caught-up requests are cleared
    looper.run(eventually(lambda: assertExp(len(behind_node.requests) == 0)))
    assert all(len(q) == 0
               for r in behind_node.replicas.values()
               for q in r._ordering_service.requestQueues.values())
    assert len(behind_node.clientAuthNr._verified_reqs) == 0
    assert len(behind_node.requestSender) == 0

def test_freeing_forwarded_preprepared_request(looper, chkFreqPatched,
                                               reqs_for_checkpoint,
                                               txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_steward):
    # Case when both the backup and the primary had problems
    behind_node = txnPoolNodeSet[-1]

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)
    with delay_rules(
            behind_node.nodeIbStasher,
            pDelay(delay=sys.maxsize),
            cDelay(delay=sys.maxsize)):
        count = behind_node.spylog.count(behind_node.allLedgersCaughtUp)
        sdk_send_batches_of_random(looper, txnPoolNodeSet,
                                   sdk_pool_handle, sdk_wallet_steward,
                                   req_num, req_num)
        looper.run(eventually(node_caughtup, behind_node, count, retryWait=1))

    looper.run(
        eventually(lambda: assertExp(len(behind_node.requests) == req_num)))
    assert all(r.executed for r in behind_node.requests.values()
               if behind_node.seqNoDB.get(r.request.key)[1])

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)

    # Master and backup replicas do not stash new requests and
    # successfully order them
    assert len(behind_node.requests) == req_num

def test_freeing_forwarded_preprepared_request(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward):
    # Case when both the backup and the primary had problems
    behind_node = txnPoolNodeSet[-1]

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)
    with delay_rules(behind_node.nodeIbStasher,
                     pDelay(delay=sys.maxsize),
                     cDelay(delay=sys.maxsize)):
        count = behind_node.spylog.count(behind_node.allLedgersCaughtUp)
        sdk_send_batches_of_random(looper, txnPoolNodeSet,
                                   sdk_pool_handle, sdk_wallet_steward,
                                   req_num, req_num)
        looper.run(eventually(node_caughtup, behind_node, count, retryWait=1))

    assert len(behind_node.requests) == req_num
    assert all(r.executed for r in behind_node.requests.values()
               if behind_node.seqNoDB.get(r.request.key)[1])

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)

    # Master and backup replicas do not stash new requests and
    # successfully order them
    assert len(behind_node.requests) == req_num

def test_dequeue_and_validate_commits(looper, txnPoolNodeSet,
                                      sdk_wallet_client, sdk_pool_handle):
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 50

    with delay_rules(slow_node.nodeIbStasher,
                     pDelay(delay),
                     msg_rep_delay(delay, [PREPARE, PREPREPARE])):
        with delay_rules(slow_node.nodeIbStasher, ppDelay(delay)):
            sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client,
                                                 num_reqs=1, num_batches=1)

            assert not slow_node.master_replica._ordering_service.prePrepares
            assert not slow_node.master_replica._ordering_service.prepares
            assert not slow_node.master_replica._ordering_service.commits
            assert len(slow_node.master_replica._ordering_service.
                       commitsWaitingForPrepare) > 0

    waitNodeDataEquality(looper, slow_node, *other_nodes)
    assert check_if_all_equal_in_list(
        [n.master_replica._ordering_service.ordered for n in txnPoolNodeSet])

    assert slow_node.master_replica._ordering_service.prePrepares
    assert slow_node.master_replica._ordering_service.prepares
    assert slow_node.master_replica._ordering_service.commits
    assert not slow_node.master_replica._ordering_service.commitsWaitingForPrepare
    assert all(slow_node.master_replica.last_ordered_3pc ==
               n.master_replica.last_ordered_3pc
               for n in other_nodes)

def test_deletion_non_forwarded_request(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, tconf, tdir, allPluginsPath):
    behind_node = txnPoolNodeSet[-1]
    for key in behind_node.requests:
        behind_node.replicas.values()[1].discard_req_key(1, key)
    behind_node.requests.clear()

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)
    behind_node.quorums.propagate = Quorum(len(txnPoolNodeSet) + 1)

    with delay_rules(behind_node.nodeIbStasher,
                     ppDelay(delay=sys.maxsize),
                     pDelay(delay=sys.maxsize),
                     cDelay(delay=sys.maxsize)):
        count = behind_node.spylog.count(behind_node.allLedgersCaughtUp)
        sdk_send_batches_of_random(looper, txnPoolNodeSet,
                                   sdk_pool_handle, sdk_wallet_steward,
                                   req_num, req_num)
        looper.run(eventually(node_caughtup, behind_node, count, retryWait=1))

    # The caught-up requests are cleared
    assert len(behind_node.requests) == 0
    assert all(len(q) == 0
               for r in behind_node.replicas.values()
               for q in r.requestQueues.values())
    assert len(behind_node.clientAuthNr._verified_reqs) == 0
    assert len(behind_node.requestSender) == 0

def test_bls_not_depend_on_node_reg(looper, txnPoolNodeSet,
                                    sdk_pool_handle, sdk_wallet_client):
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         3, 3)
    node = txnPoolNodeSet[2]
    last_pre_prepare = \
        node.master_replica.prePrepares[node.master_replica.last_ordered_3pc]
    bls = getattr(last_pre_prepare, f.BLS_MULTI_SIG.nm)

    # Get a random participant
    node_name = next(iter(bls[1]))

    # Remove that node from this node's registry
    HA = deepcopy(node.nodeReg[node_name])
    del node.nodeReg[node_name]

    state_root_hash = get_last_ordered_state_root_hash(node)
    node.master_replica._bls_bft_replica._bls_bft.bls_key_register._load_keys_for_root(
        state_root_hash)

    # The PRE-PREPARE can still be validated
    assert node.master_replica._bls_bft_replica._bls_bft.bls_key_register.get_key_by_name(
        node_name)

    node.nodeReg[node_name] = HA

def test_order_after_demote_and_restart(looper, txnPoolNodeSet,
                                        sdk_pool_handle, sdk_wallet_client,
                                        tdir, tconf, allPluginsPath,
                                        sdk_wallet_stewards):
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         3, 3)

    primary_node = txnPoolNodeSet[0]
    node_to_stop = txnPoolNodeSet[1]
    node_to_demote = txnPoolNodeSet[2]
    txnPoolNodeSet.remove(node_to_demote)

    node_to_stop.cleanupOnStopping = True
    node_to_stop.stop()
    looper.removeProdable(node_to_stop)
    ensure_node_disconnected(looper, node_to_stop, txnPoolNodeSet, timeout=2)

    demote_node(looper, sdk_wallet_stewards[2], sdk_pool_handle,
                node_to_demote)

    config_helper = PNodeConfigHelper(node_to_stop.name, tconf, chroot=tdir)
    restarted_node = TestNode(node_to_stop.name,
                              config_helper=config_helper,
                              config=tconf,
                              pluginPaths=allPluginsPath,
                              ha=node_to_stop.nodestack.ha,
                              cliha=node_to_stop.clientstack.ha)
    looper.add(restarted_node)
    txnPoolNodeSet[1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         1, 1)

    def get_current_bls_keys(node):
        return node.master_replica._bls_bft_replica._bls_bft.bls_key_register._current_bls_keys

    assert get_current_bls_keys(restarted_node) == get_current_bls_keys(primary_node)

def test_no_preprepare_requested(looper, txnPoolNodeSet,
                                 sdk_wallet_client, sdk_pool_handle,
                                 teardown):
    """
    The node misses Propagates, so the request is not finalised; it
    therefore stashes the PRE-PREPARE but does not request a PRE-PREPARE
    on receiving a PREPARE
    """
    slow_node, other_nodes, _, _ = split_nodes(txnPoolNodeSet)
    slow_node.nodeIbStasher.delay(ppgDelay(20))
    slow_node.nodeIbStasher.delay(msg_rep_delay(20, [PROPAGATE, ]))

    old_count_resp = count_requested_preprepare_resp(slow_node)
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=4, num_batches=2)

    # The slow node is behind
    checkNodeDataForInequality(slow_node, *other_nodes)

    # PRE-PREPAREs were not requested
    assert count_requested_preprepare_resp(slow_node) == old_count_resp

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds()

    # The slow node has processed all requests
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # PRE-PREPAREs were not requested
    assert count_requested_preprepare_resp(slow_node) == old_count_resp

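# `split_nodes` and the `count_requested_preprepare_*` counters are defined
# in the suite's conftest. Minimal sketches consistent with how they are
# used above (the counter hook names are assumptions, not verified API):
def split_nodes(nodes):
    primary_node = get_master_primary_node(nodes)
    slow_node = getNonPrimaryReplicas(nodes, 0)[-1].node
    other_nodes = [n for n in nodes if n != slow_node]
    other_non_primary_nodes = [n for n in nodes
                               if n not in (slow_node, primary_node)]
    return slow_node, other_nodes, primary_node, other_non_primary_nodes


def count_requested_preprepare_req(node):
    # Times the master replica requested a missing PRE-PREPARE
    r = node.master_replica
    return get_count(r, r._request_pre_prepare)


def count_requested_preprepare_resp(node):
    # Times a requested PRE-PREPARE response was processed
    r = node.master_replica
    return get_count(r, r.process_requested_pre_prepare)
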
def test_handle_delayed_preprepares(looper, txnPoolNodeSet,
                                    sdk_wallet_client, sdk_pool_handle,
                                    teardown):
    """
    Make a node send a PRE-PREPARE again after the slow node has ordered
    """
    slow_node, other_nodes, primary_node, other_non_primary_nodes = \
        split_nodes(txnPoolNodeSet)

    # This node will send the PRE-PREPARE again
    orig_method = primary_node.handlers[PREPREPARE].serve
    last_pp = None

    def patched_method(self, msg):
        nonlocal last_pp
        last_pp = orig_method(msg)
        return last_pp

    primary_node.handlers[PREPREPARE].serve = types.MethodType(
        patched_method, primary_node.handlers[PREPREPARE])

    # Delay PRE-PREPAREs by a large amount, simulating loss
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=10, num_batches=5)
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    slow_master_replica = slow_node.master_replica
    count_pr_req = get_count(slow_master_replica,
                             slow_master_replica.process_requested_pre_prepare)
    count_pr_tpc = get_count(slow_master_replica,
                             slow_master_replica.processThreePhaseMsg)

    primary_node.sendToNodes(MessageRep(**{
        f.MSG_TYPE.nm: PREPREPARE,
        f.PARAMS.nm: {
            f.INST_ID.nm: last_pp.instId,
            f.VIEW_NO.nm: last_pp.viewNo,
            f.PP_SEQ_NO.nm: last_pp.ppSeqNo
        },
        f.MSG.nm: last_pp
    }), names=[slow_node.name, ])

    def chk():
        # `process_requested_pre_prepare` is called, but
        # `processThreePhaseMsg` is not
        assert get_count(
            slow_master_replica,
            slow_master_replica.process_requested_pre_prepare) > count_pr_req
        assert get_count(
            slow_master_replica,
            slow_master_replica.processThreePhaseMsg) == count_pr_tpc

    looper.run(eventually(chk, retryWait=1))

def test_order_after_demote_and_restart(looper, txnPoolNodeSet,
                                        sdk_pool_handle, sdk_wallet_client,
                                        tdir, tconf, allPluginsPath,
                                        sdk_wallet_stewards):
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         3, 3)

    primary_node = txnPoolNodeSet[0]
    node_to_stop = txnPoolNodeSet[1]
    node_to_demote = txnPoolNodeSet[2]
    txnPoolNodeSet.remove(node_to_demote)

    node_to_stop.cleanupOnStopping = True
    node_to_stop.stop()
    looper.removeProdable(node_to_stop)
    ensure_node_disconnected(looper, node_to_stop, txnPoolNodeSet, timeout=2)

    demote_node(looper, sdk_wallet_stewards[2], sdk_pool_handle,
                node_to_demote)

    config_helper = PNodeConfigHelper(node_to_stop.name, tconf, chroot=tdir)
    restarted_node = TestNode(node_to_stop.name,
                              config_helper=config_helper,
                              config=tconf,
                              pluginPaths=allPluginsPath,
                              ha=node_to_stop.nodestack.ha,
                              cliha=node_to_stop.clientstack.ha)
    looper.add(restarted_node)
    txnPoolNodeSet[1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                        check_primaries=False)

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         1, 1)

    def get_current_bls_keys(node):
        return node.master_replica._bls_bft_replica._bls_bft.bls_key_register._current_bls_keys

    assert get_current_bls_keys(restarted_node) == get_current_bls_keys(primary_node)

def test_handle_delayed_preprepares(looper, txnPoolNodeSet,
                                    sdk_wallet_client, sdk_pool_handle,
                                    teardown, monkeypatch):
    """
    Make a node send a PRE-PREPARE again after the slow node has ordered
    """
    slow_node, other_nodes, primary_node, other_non_primary_nodes = \
        split_nodes(txnPoolNodeSet)

    # This node will send the PRE-PREPARE again
    handler = primary_node.master_replica._message_req_service.handlers[
        PREPREPARE]
    orig_method = handler.process_message_req
    last_pp = None

    def patched_method(self, msg):
        nonlocal last_pp
        last_pp = orig_method(msg)
        return last_pp

    handler.process_message_req = types.MethodType(patched_method, handler)

    # Delay PRE-PREPAREs by a large amount, simulating loss
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=10, num_batches=5)
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    slow_master_replica = slow_node.master_replica
    count_pr_req = get_count(
        slow_master_replica._message_req_service,
        slow_master_replica._message_req_service.process_message_rep)

    primary_node.sendToNodes(MessageRep(**{
        f.MSG_TYPE.nm: PREPREPARE,
        f.PARAMS.nm: {
            f.INST_ID.nm: last_pp.instId,
            f.VIEW_NO.nm: last_pp.viewNo,
            f.PP_SEQ_NO.nm: last_pp.ppSeqNo
        },
        f.MSG.nm: last_pp
    }), names=[slow_node.name, ])

    def chk():
        assert get_count(
            slow_master_replica._message_req_service,
            slow_master_replica._message_req_service.process_message_rep) > count_pr_req
        assert slow_master_replica._ordering_service.spylog.getLast(
            "_validate").result[0] == -1

    looper.run(eventually(chk, retryWait=1))

def test_view_change_done_delayed(txnPoolNodeSet, looper,
                                  sdk_pool_handle, sdk_wallet_client):
    """
    A node is slow, so it is behind the other nodes. After a view change it
    catches up, but its ViewChangeDone messages are delayed too. The node
    should start participating only when it has caught up and has received
    a quorum of ViewChangeDone messages.
    """
    nprs = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)]
    slow_node = nprs[-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay_3pc = 10
    delay_vcd = 25
    delay_3pc_messages([slow_node], 0, delay_3pc)
    slow_node.nodeIbStasher.delay(vcd_delay(delay_vcd))

    def chk(node):
        assert node.view_changer.has_acceptable_view_change_quorum
        assert node.view_changer._primary_verified
        assert node.isParticipating
        assert None not in {r.isPrimary for r in node.replicas.values()}

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         5 * 4, 4)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After the view change, the slow node successfully completes catchup
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Other nodes complete the view change, select a primary and participate
    for node in other_nodes:
        looper.run(eventually(chk, node, retryWait=1))

    # Since `ViewChangeDone` messages are delayed, slow_node is not able
    # to select a primary and participate
    assert not slow_node.view_changer.has_acceptable_view_change_quorum
    assert not slow_node.view_changer._primary_verified
    assert not slow_node.isParticipating
    assert {r.isPrimary for r in slow_node.replicas.values()} == {None}

    # Send requests to make sure the pool is functional
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 5)

    # Repair the network
    slow_node.reset_delays_and_process_delayeds()

    # `slow_node` selects a primary and participates
    looper.run(eventually(chk, slow_node, retryWait=1))

    # Processes the requests received while it lacked a primary
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Send more requests and compare the data of all nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 5)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

def test_node_requests_missing_preprepare_malicious(looper, txnPoolNodeSet,
                                                    sdk_wallet_client,
                                                    sdk_pool_handle,
                                                    malicious_setup,
                                                    teardown):
    """
    A node has a bad network connection with the primary and thus loses a
    PRE-PREPARE. It requests the PRE-PREPARE from the non-primaries once it
    has sufficient PREPAREs, but one of the non-primaries does not send the
    PRE-PREPARE.
    """
    # primary_node = get_master_primary_node(txnPoolNodeSet)
    # slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    # other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    # bad_node = [n for n in other_nodes if n != primary_node][0]
    # good_non_primary_node = [n for n in other_nodes if n != slow_node
    #                          and n != bad_node and n != primary_node][0]
    primary_node, bad_node, good_non_primary_node, slow_node, other_nodes, \
        bad_method, orig_method = malicious_setup

    slow_node.nodeIbStasher.delay(ppDelay(300, 0))

    def get_reply_count_frm(node):
        return sum([1 for entry in
                    slow_node.spylog.getAll(slow_node.process_message_rep)
                    if entry.params['msg'].msg_type == PREPREPARE and
                    entry.params['frm'] == node.name])

    old_reply_count_from_bad_node = get_reply_count_frm(bad_node)
    old_reply_count_from_good_node = get_reply_count_frm(good_non_primary_node)
    old_discarded = countDiscarded(slow_node.master_replica,
                                   'does not have expected state')

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=10, num_batches=2)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    assert check_if_all_equal_in_list(
        [n.master_replica.ordered for n in txnPoolNodeSet])
    assert not slow_node.master_replica.requested_pre_prepares

    if bad_method.__name__ == 'do_not_send':
        assert get_reply_count_frm(bad_node) == old_reply_count_from_bad_node
    else:
        assert countDiscarded(slow_node.master_replica,
                              'does not have expected state') > old_discarded

    assert get_reply_count_frm(good_non_primary_node) > \
        old_reply_count_from_good_node

    slow_node.reset_delays_and_process_delayeds()
    bad_node.nodeMsgRouter.routes[MessageReq] = orig_method

def test_view_change_after_max_catchup_rounds(txnPoolNodeSet, looper,
                                              sdk_pool_handle,
                                              sdk_wallet_client):
    """
    The node should do only a fixed number of catchup rounds. To test this,
    delay Prepares and Commits for 2 non-primary nodes by a large amount,
    which is equivalent to the loss of those Prepares and Commits. Make
    sure the 2 nodes have a different last prepared certificate from the
    other two. Then do a view change; make sure the view change completes
    and the pool does not process the requests that were prepared by only
    a subset of the nodes.
    """
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * 3, 3)

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ledger_summary = txnPoolNodeSet[0].ledger_summary

    slow_nodes = [r.node for r in
                  getNonPrimaryReplicas(txnPoolNodeSet, 0)[-2:]]
    fast_nodes = [n for n in txnPoolNodeSet if n not in slow_nodes]

    # Make the slow nodes slow to process Prepares and Commits
    for node in slow_nodes:
        node.nodeIbStasher.delay(pDelay(120, 0))
        node.nodeIbStasher.delay(cDelay(120, 0))

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 5)
    looper.runFor(3)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    def last_prepared(nodes):
        lst = [n.master_replica.last_prepared_certificate_in_view()
               for n in nodes]
        # All nodes have the same last prepared certificate
        assert check_if_all_equal_in_list(lst)
        return lst[0]

    last_prepared_slow = last_prepared(slow_nodes)
    last_prepared_fast = last_prepared(fast_nodes)

    # Check that `slow_nodes` and `fast_nodes` set a different last_prepared
    assert last_prepared_fast != last_prepared_slow

    # View change completes
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # The requests which were prepared by only a subset of the nodes were
    # not ordered
    assert txnPoolNodeSet[0].ledger_summary == ledger_summary

    for node in slow_nodes:
        node.nodeIbStasher.reset_delays_and_process_delayeds()

    # Make sure the pool is functional
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         10, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    last_prepared(txnPoolNodeSet)

def test_discard_3PC_messages_for_already_ordered(looper, txnPoolNodeSet,
                                                  sdk_wallet_client,
                                                  sdk_pool_handle):
    """
    Nodes discard any 3PC messages for already ordered 3PC keys
    (view_no, pp_seq_no). Delay all 3PC messages to a node so that it
    cannot respond to them until the other nodes have ordered them. When
    the slow node finally gets them, it will respond, but the other nodes
    will not process its responses and will discard them.
    """
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 20
    delay_3pc_messages([slow_node], 0, delay)
    delay_3pc_messages([slow_node], 1, delay)

    sent_batches = 3
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=2 * sent_batches,
                                         num_batches=sent_batches)

    def chk(node, inst_id, p_count, c_count):
        # A node will keep recording PREPAREs, even beyond n-f-1 of them,
        # as long as the request is not ordered
        assert len(node.replicas[inst_id].prepares) >= p_count
        assert len(node.replicas[inst_id].commits) == c_count

    def count_discarded(inst_id, count):
        for node in other_nodes:
            assert countDiscarded(node.replicas[inst_id],
                                  'already ordered 3 phase message') == count

    # `slow_node` did not receive any PREPAREs or COMMITs
    chk(slow_node, 0, 0, 0)

    # `other_nodes` have not discarded any 3PC message
    count_discarded(0, 0)

    # `other_nodes` have not recorded any PREPAREs or COMMITs from `slow_node`
    chk_commits_prepares_recvd(0, other_nodes, slow_node)

    slow_node.reset_delays_and_process_delayeds()
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # `slow_node` did receive the correct number of PREPAREs and COMMITs
    looper.run(eventually(chk, slow_node, 0, sent_batches - 1, sent_batches,
                          retryWait=1))

    # `other_nodes` have not recorded any PREPAREs or COMMITs from `slow_node`
    chk_commits_prepares_recvd(0, other_nodes, slow_node)

    # `other_nodes` have discarded the PREPAREs and COMMITs for all batches
    count_discarded(0, 2 * sent_batches)

def sdk_ensure_pool_functional(looper, nodes, sdk_wallet, sdk_pool,
                               num_reqs=10, num_batches=2):
    sdk_send_batches_of_random_and_check(looper, nodes, sdk_pool, sdk_wallet,
                                         num_reqs, num_batches)
    ensure_all_nodes_have_same_data(looper, nodes)

def test_node_request_preprepare(looper, txnPoolNodeSet,
                                 sdk_wallet_client, sdk_pool_handle,
                                 teardown):
    """
    A node requests a PRE-PREPARE only once after getting PREPAREs.
    """
    slow_node, other_nodes, primary_node, \
        other_primary_nodes = split_nodes(txnPoolNodeSet)

    # Drop PrePrepares and Prepares
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    slow_node.nodeIbStasher.delay(pDelay(300, 0))

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=10, num_batches=5)
    slow_node.nodeIbStasher.drop_delayeds()
    slow_node.nodeIbStasher.resetDelays()

    old_count_req = count_requested_preprepare_req(slow_node)
    old_count_resp = count_requested_preprepare_resp(slow_node)

    def chk(increase=True):
        # The PRE-PREPARE was requested
        assert count_requested_preprepare_req(slow_node) > old_count_req
        # But a response to the request is processed only once
        assert count_requested_preprepare_resp(slow_node) - old_count_resp == \
            (1 if increase else 0)

    for pp in primary_node.master_replica._ordering_service.sent_preprepares.values():
        for rep in [n.master_replica for n in other_primary_nodes]:
            prepare = Prepare(rep.instId,
                              pp.viewNo,
                              pp.ppSeqNo,
                              pp.ppTime,
                              pp.digest,
                              pp.stateRootHash,
                              pp.txnRootHash,
                              pp.auditTxnRootHash)
            rep.send(prepare)

        looper.run(eventually(chk, True, retryWait=1))

        old_count_resp = count_requested_preprepare_resp(slow_node)

        prepare = Prepare(rep.instId,
                          pp.viewNo,
                          pp.ppSeqNo,
                          pp.ppTime,
                          pp.digest,
                          pp.stateRootHash,
                          pp.txnRootHash,
                          pp.auditTxnRootHash)
        rep.send(prepare)

        looper.run(eventually(chk, False, retryWait=1))

        old_count_req = count_requested_preprepare_req(slow_node)
        old_count_resp = count_requested_preprepare_resp(slow_node)

def test_node_requests_missing_preprepare(looper, txnPoolNodeSet,
                                          sdk_wallet_client,
                                          sdk_pool_handle, teardown):
    """
    A node has a bad network connection with the primary and thus loses a
    PRE-PREPARE. It requests the PRE-PREPARE from the primary once it has
    sufficient PREPAREs.
    """
    slow_node, other_nodes, primary_node, other_non_primary_nodes = \
        split_nodes(txnPoolNodeSet)

    # Delay PRE-PREPAREs by a large amount, simulating loss
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    old_count_pp = get_count(
        slow_node.master_replica,
        slow_node.master_replica._ordering_service.process_preprepare)
    old_count_mrq = {n.name: get_count(n, n.process_message_req)
                     for n in other_nodes}
    old_count_mrp = get_count(slow_node, slow_node.process_message_rep)

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=15, num_batches=5)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    assert not slow_node.master_replica.requested_pre_prepares

    # `slow_node` processed the PRE-PREPARE
    # assert get_count(slow_node.master_replica,
    #                  slow_node.master_replica._ordering_service.process_preprepare) > old_count_pp

    # `slow_node` did receive a `MessageRep`
    assert get_count(slow_node, slow_node.process_message_rep) > old_count_mrp

    # The primary node should have received a `MessageReq`, and the other
    # nodes should not have
    recv_reqs = set()
    for n in other_non_primary_nodes:
        if get_count(n, n.process_message_req) > old_count_mrq[n.name]:
            recv_reqs.add(n.name)

    assert get_count(primary_node, primary_node.process_message_req) > \
        old_count_mrq[primary_node.name]
    assert len(recv_reqs) == 0

    # All nodes, including the `slow_node`, ordered the same requests
    assert check_if_all_equal_in_list(
        [n.master_replica._ordering_service.ordered for n in txnPoolNodeSet])

def test_node_requests_missing_preprepare(looper, txnPoolNodeSet,
                                          sdk_wallet_client,
                                          sdk_pool_handle, teardown):
    """
    A node has a bad network connection with the primary and thus loses a
    PRE-PREPARE. It requests the PRE-PREPARE from the primary once it has
    sufficient PREPAREs.
    """
    slow_node, other_nodes, primary_node, other_non_primary_nodes = \
        split_nodes(txnPoolNodeSet)

    # Delay PRE-PREPAREs by a large amount, simulating loss
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    old_count_pp = get_count(slow_node.master_replica,
                             slow_node.master_replica.processPrePrepare)
    old_count_mrq = {n.name: get_count(n, n.process_message_req)
                     for n in other_nodes}
    old_count_mrp = get_count(slow_node, slow_node.process_message_rep)

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=15, num_batches=5)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    assert not slow_node.master_replica.requested_pre_prepares

    # `slow_node` processed the PRE-PREPARE
    assert get_count(slow_node.master_replica,
                     slow_node.master_replica.processPrePrepare) > old_count_pp

    # `slow_node` did receive a `MessageRep`
    assert get_count(slow_node, slow_node.process_message_rep) > old_count_mrp

    # The primary node should have received a `MessageReq`, and the other
    # nodes should not have
    recv_reqs = set()
    for n in other_non_primary_nodes:
        if get_count(n, n.process_message_req) > old_count_mrq[n.name]:
            recv_reqs.add(n.name)

    assert get_count(primary_node, primary_node.process_message_req) > \
        old_count_mrq[primary_node.name]
    assert len(recv_reqs) == 0

    # All nodes, including the `slow_node`, ordered the same requests
    assert check_if_all_equal_in_list(
        [n.master_replica.ordered for n in txnPoolNodeSet])

def test_non_primary_recvs_3phase_message_outside_watermarks(
        chkFreqPatched, looper, txnPoolNodeSet, sdk_pool_handle,
        sdk_wallet_client, reqs_for_logsize):
    """
    A node is slow in processing PRE-PREPAREs and PREPAREs while a lot of
    requests happen, so the slow node starts getting 3 phase messages
    outside of its watermarks. Check that it queues up the requests outside
    its watermarks and processes more requests once it has received a
    stable checkpoint. It sends the other nodes 3 phase messages older than
    their stable checkpoint, so they should discard them.
    """
    delay = 15
    instId = 1
    reqsToSend = reqs_for_logsize + 2

    npr = getNonPrimaryReplicas(txnPoolNodeSet, instId)
    slowReplica = npr[0]
    slowNode = slowReplica.node
    slowNode.nodeIbStasher.delay(ppDelay(delay, instId))
    slowNode.nodeIbStasher.delay(pDelay(delay, instId))
    slowReplica.H = 5

    def discardCounts(replicas, pat):
        counts = {}
        for r in replicas:
            counts[r.name] = countDiscarded(r, pat)
        return counts

    oldStashCount = slowReplica.spylog.count(
        TestReplica.stashOutsideWatermarks.__name__)
    oldDiscardCounts = discardCounts([n.replicas[instId]
                                      for n in txnPoolNodeSet
                                      if n != slowNode],
                                     'achieved stable checkpoint')

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=1 * 7, num_batches=7)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqsToSend)

    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet))
    looper.run(eventually(checkNodeDataForEquality, slowNode,
                          *[n for n in txnPoolNodeSet if n != slowNode],
                          retryWait=1, timeout=timeout))

    newStashCount = slowReplica.spylog.count(
        TestReplica.stashOutsideWatermarks.__name__)
    assert newStashCount > oldStashCount

    def chk():
        counts = discardCounts([n.replicas[instId]
                                for n in txnPoolNodeSet
                                if n != slowNode],
                               'achieved stable checkpoint')
        for nm, count in counts.items():
            assert count > oldDiscardCounts[nm]

    timeout = (waits.expectedNodeToNodeMessageDeliveryTime() *
               len(txnPoolNodeSet) + delay)
    looper.run(eventually(chk, retryWait=1, timeout=timeout))

def test_replica_clear_collections_after_view_change(looper, txnPoolNodeSet,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client,
                                                     tconf, tdir,
                                                     allPluginsPath,
                                                     sdk_wallet_steward,
                                                     chkFreqPatched,
                                                     reqs_for_checkpoint):
    """
    1. Delay commits on one instance.
    2. Order a transaction on the master.
    3. Do a view change.
    4. Send 2 batches to finalize the checkpoint and clean the request
       queues (1 batch is sent automatically to propagate primaries).
    5. Check that one request is left on the node and the backup's
       requestQueues are empty.
    """
    stashers = [n.nodeIbStasher for n in txnPoolNodeSet]
    with delay_rules(stashers, cDelay(delay=sys.maxsize, instId=1)):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 1)

        for node in txnPoolNodeSet:
            node.view_changer.on_master_degradation()
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                          customTimeout=2 * tconf.NEW_VIEW_TIMEOUT)

        # + 1 because lastPrePrepareSeqNo was not dropped after the
        # view change
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             num_reqs=reqs_for_checkpoint + 1)

    def check_request_queues():
        assert len(txnPoolNodeSet[0].requests) == 1
        for n in txnPoolNodeSet:
            assert len(n.replicas[1]._ordering_service.
                       requestQueues[DOMAIN_LEDGER_ID]) == 0

    looper.run(eventually(check_request_queues))

def test_replica_removing_with_backup_degraded(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client,
                                               sdk_wallet_steward,
                                               tconf, tdir, allPluginsPath):
    """
    A node will change view when a quorum of nodes agree that the master's
    performance degraded, even if it does not find the master degraded
    itself.
    """
    start_replicas_count = txnPoolNodeSet[0].replicas.num_replicas
    view_no = txnPoolNodeSet[0].viewNo
    instance_to_remove = 1
    stashers = [node.nodeIbStasher for node in txnPoolNodeSet]
    with delay_rules(stashers, cDelay(delay=sys.maxsize,
                                      instId=instance_to_remove)):
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             num_reqs=10, num_batches=5)

        # check that the replicas were removed
        def check_replica_removed_on_all_nodes(inst_id=instance_to_remove):
            for n in txnPoolNodeSet:
                check_replica_removed(n, start_replicas_count, inst_id)
                assert not n.monitor.isMasterDegraded()

        looper.run(eventually(check_replica_removed_on_all_nodes,
                              timeout=120))

    # start a view change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=view_no + 1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # check that all replicas were restored
    assert all(start_replicas_count == node.replicas.num_replicas
               for node in txnPoolNodeSet)

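# `check_replica_removed` is a helper from the replica-removal tests.
# A plausible sketch, assuming the `replicas` container drops the removed
# instance id (an assumption, not the verified implementation):
def check_replica_removed(node, start_replicas_count, instance_id):
    assert node.replicas.num_replicas == start_replicas_count - 1
    assert instance_id not in node.replicas.keys()
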
def test_freeing_forwarded_not_preprepared_request(
        looper, chkFreqPatched, reqs_for_checkpoint, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_steward, tconf, tdir, allPluginsPath):
    behind_node = txnPoolNodeSet[-1]
    behind_node.requests.clear()

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_steward,
                                         CHK_FREQ, CHK_FREQ)
    with delay_rules(behind_node.nodeIbStasher,
                     ppDelay(delay=sys.maxsize),
                     pDelay(delay=sys.maxsize),
                     cDelay(delay=sys.maxsize)):
        count = behind_node.spylog.count(behind_node.allLedgersCaughtUp)
        sdk_send_batches_of_random(looper, txnPoolNodeSet,
                                   sdk_pool_handle, sdk_wallet_steward,
                                   req_num, req_num)
        looper.run(eventually(node_caughtup, behind_node, count, retryWait=1))

    # The caught-up requests are executed
    assert len(behind_node.requests) == req_num
    assert all(r.executed for r in behind_node.requests.values()
               if behind_node.seqNoDB.get(r.request.key)[1])

def do_test_replica_removing_with_backup_degraded(looper, txnPoolNodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_client, tconf):
    """
    A node will change view when a quorum of nodes agree that the master's
    performance degraded, even if it does not find the master degraded
    itself.
    """
    start_replicas_count = txnPoolNodeSet[0].replicas.num_replicas
    view_no = txnPoolNodeSet[0].viewNo
    instance_to_remove = 1
    stashers = [node.nodeIbStasher for node in txnPoolNodeSet]
    with delay_rules(stashers, cDelay(delay=sys.maxsize,
                                      instId=instance_to_remove)):
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             num_reqs=10, num_batches=5)

        # check that the replicas were removed
        def check_replica_removed_on_all_nodes(inst_id=instance_to_remove):
            for n in txnPoolNodeSet:
                check_replica_removed(n, start_replicas_count, inst_id)
                assert not n.monitor.isMasterDegraded()

        looper.run(eventually(check_replica_removed_on_all_nodes,
                              timeout=120))

    # start a view change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=view_no + 1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # check that all replicas were restored
    assert all(start_replicas_count == node.replicas.num_replicas
               for node in txnPoolNodeSet)

def test_1_node_got_no_preprepare(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, tconf, chkFreqPatched):
    master_node = txnPoolNodeSet[0]
    behind_node = txnPoolNodeSet[-1]
    delta = tconf.CHK_FREQ * 3
    num_of_batches = 1

    # Nodes order batches
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)
    assert behind_node.master_last_ordered_3PC == \
        master_node.master_last_ordered_3PC

    # Emulate connection problems: behind_node doesn't receive pre-prepares
    router_dont_accept_messages_from(behind_node, master_node.name)

    # Send some txns; behind_node can't order them while the pool keeps
    # working
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)
    with pytest.raises(AssertionError):
        nodes_last_ordered_equal(behind_node, master_node)

    # behind_node has requested the pre-prepare and won't request it again.
    # It will catch up at the closest stable checkpoint.

    # Remove connection problems
    reset_router_accepting(behind_node)

    # Send txns
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)

    # behind_node stashes new 3pc messages; it is not ordering and not
    # participating in consensus
    assert len(behind_node.master_replica._ordering_service.
               prePreparesPendingPrevPP) == 1
    with pytest.raises(AssertionError):
        nodes_last_ordered_equal(behind_node, master_node)

    # After reaching a stable checkpoint, behind_node starts ordering
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         delta, delta)

    # Pool is working
    looper.run(eventually(nodes_last_ordered_equal, behind_node, master_node))

def test_1_node_got_no_preprepare(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, tconf, chkFreqPatched):
    master_node = txnPoolNodeSet[0]
    behind_node = txnPoolNodeSet[-1]
    delta = tconf.CHK_FREQ * 3
    num_of_batches = 1

    # Nodes order batches
    sdk_send_batches_of_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        num_of_batches, num_of_batches)
    assert behind_node.master_last_ordered_3PC == \
        master_node.master_last_ordered_3PC

    # Emulate connection problems: behind_node doesn't receive pre-prepares
    router_dont_accept_messages_from(behind_node, master_node.name)

    # Send some txns; behind_node can't order them while the pool keeps
    # working
    sdk_send_batches_of_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        num_of_batches, num_of_batches)
    with pytest.raises(AssertionError):
        nodes_last_ordered_equal(behind_node, master_node)

    # behind_node has requested the pre-prepare and won't request it again.
    # It will catch up at the closest stable checkpoint.

    # Remove connection problems
    reset_router_accepting(behind_node)

    # Send txns
    sdk_send_batches_of_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        num_of_batches, num_of_batches)

    # behind_node stashes new 3pc messages; it is not ordering and not
    # participating in consensus
    assert len(behind_node.master_replica.prePreparesPendingPrevPP) == 1
    with pytest.raises(AssertionError):
        nodes_last_ordered_equal(behind_node, master_node)

    # After reaching a stable checkpoint, behind_node starts ordering
    sdk_send_batches_of_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        delta, delta)

    # Pool is working
    looper.run(eventually(nodes_last_ordered_equal, behind_node, master_node))

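# `nodes_last_ordered_equal` is used above both as a plain assertion and
# inside `eventually`. A minimal sketch consistent with that usage:
def nodes_last_ordered_equal(*nodes):
    # Every node must agree on the last ordered (view_no, pp_seq_no) pair
    last_ordered = nodes[0].master_last_ordered_3PC
    for n in nodes[1:]:
        assert n.master_last_ordered_3PC == last_ordered
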
def test_replica_clear_collections_after_view_change(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        tconf, tdir, allPluginsPath, sdk_wallet_steward, chkFreqPatched):
    """
    1. Delay commits on one instance.
    2. Order a transaction on the master.
    3. Do a view change.
    4. Send 2 batches to finalize the checkpoint and clean the request
       queues.
    5. Check that the node's requests contain all items from the
       requestQueues.
    """
    stashers = [n.nodeIbStasher for n in txnPoolNodeSet]
    with delay_rules(stashers, cDelay(delay=sys.maxsize, instId=1)):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, 1)

        for node in txnPoolNodeSet:
            node.view_changer.on_master_degradation()
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                          customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             num_reqs=1 * CHK_FREQ,
                                             num_batches=2)

    def check_request_queues():
        assert len(txnPoolNodeSet[0].requests) == 0
        for reqs in txnPoolNodeSet[0].replicas[1].requestQueues.values():
            assert all(req in txnPoolNodeSet[0].requests for req in reqs)

    looper.run(eventually(check_request_queues))

def test_watermarks_restored_after_stable(looper, chkFreqPatched,
                                          txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client):
    """
    A backup replica doesn't participate in consensus, and hence doesn't
    update its watermarks. If it then gets a quorum of stashed checkpoints
    (in fact Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 checkpoints,
    where we have a quorum of n-f-1 for each), it updates the watermarks
    and starts to participate in consensus.
    """
    # 1. Patch a backup replica on a non-primary node so that it doesn't
    # participate in consensus, so that its watermarks are not updated.
    broken_replica, non_broken_replica = break_backup_replica(txnPoolNodeSet)

    # 2. Send a number of requests which is less than
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed checkpoints,
    # but sufficient for one watermark change (on a non-broken replica).
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=1 * 9, num_batches=9)
    assert broken_replica.last_ordered_3pc == (0, 0)
    assert broken_replica.h == 0
    assert broken_replica.H == 20
    assert non_broken_replica.last_ordered_3pc == (0, 9)
    assert non_broken_replica.h == 5
    assert non_broken_replica.H == 25

    # 3. Send requests to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed checkpoints.
    # The broken replica should adjust last_ordered_3pc and shift its
    # watermarks.
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=1, num_batches=1)
    assert broken_replica.last_ordered_3pc == (0, 10)
    assert broken_replica.h == 10
    assert broken_replica.H == 30
    assert non_broken_replica.last_ordered_3pc == (0, 10)
    assert non_broken_replica.h == 10
    assert non_broken_replica.H == 30

    # 4. Repair the broken replica and make sure that it participates in
    # consensus (after the watermarks were corrected).
    repair_broken_replica(broken_replica)
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=7, num_batches=7)
    assert broken_replica.last_ordered_3pc == (0, 17)
    assert broken_replica.h == 15
    assert broken_replica.H == 35
    assert non_broken_replica.last_ordered_3pc == (0, 17)
    assert non_broken_replica.h == 15
    assert non_broken_replica.H == 35

def test_1_node_get_only_preprepare(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client, tconf, chkFreqPatched):
    master_node = txnPoolNodeSet[0]
    behind_node = txnPoolNodeSet[-1]
    delta = tconf.CHK_FREQ * 3
    num_of_batches = 1

    # Nodes order batches
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)
    assert behind_node.master_last_ordered_3PC == \
        master_node.master_last_ordered_3PC

    # Emulate connection problems: behind_node receives only pre-prepares
    dont_send_prepare_and_commit_to(txnPoolNodeSet[:-1], behind_node.name)

    # Send some txns; behind_node can't order them while the pool keeps
    # working
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)
    # assert behind_node.master_last_ordered_3PC[1] + num_of_batches == \
    #     master_node.master_last_ordered_3PC[1]

    # Remove connection problems
    reset_sending(txnPoolNodeSet[:-1])

    # Send txns
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)

    # After reaching a stable checkpoint, behind_node starts ordering
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         delta, delta)

    # Pool is working
    looper.run(eventually(nodes_last_ordered_equal, behind_node, master_node))

def test_checkpoint_across_views(sent_batches, chkFreqPatched, looper,
                                 txnPoolNodeSet, sdk_pool_handle,
                                 sdk_wallet_client):
    """
    Test checkpointing across views. This test checks that checkpointing
    and garbage collection work correctly no matter whether the view change
    happened before or after a checkpoint.
    """
    batch_size = chkFreqPatched.Max3PCBatchSize
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         batch_size * sent_batches,
                                         sent_batches)

    # Check that correct garbage collection happens
    non_gced_batch_count = (sent_batches - CHK_FREQ
                            if sent_batches >= CHK_FREQ
                            else sent_batches)
    looper.run(eventually(checkRequestCounts, txnPoolNodeSet,
                          batch_size * non_gced_batch_count,
                          non_gced_batch_count,
                          retryWait=1))

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # Check that proper clean-up is done after the view change
    for node in txnPoolNodeSet:
        for r in node.replicas.values():
            assert not r.checkpoints
            # No stashed checkpoints for the previous view
            assert not [view_no for view_no in r.stashedRecvdCheckpoints
                        if view_no < r.viewNo]
            assert r._h == 0
            assert r._lastPrePrepareSeqNo == 0
            assert r.h == 0
            assert r.H == r._h + chkFreqPatched.LOG_SIZE

    checkRequestCounts(txnPoolNodeSet, 0, 0)

    # Checkpointing works even after the view change
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         batch_size * sent_batches,
                                         sent_batches)
    looper.run(eventually(checkRequestCounts, txnPoolNodeSet,
                          batch_size * non_gced_batch_count,
                          non_gced_batch_count,
                          retryWait=1))

    # Send more batches so that one more checkpoint happens. This is done
    # so that when this test finishes, all requests are garbage collected
    # and the next run of this test (with the next param) has the
    # calculations correct.
    more = CHK_FREQ - non_gced_batch_count
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         batch_size * more, more)
    looper.run(eventually(checkRequestCounts, txnPoolNodeSet, 0, 0,
                          retryWait=1))

def test_2_nodes_get_only_preprepare(looper, txnPoolNodeSet, sdk_pool_handle,
                                     sdk_wallet_client, tconf, chkFreqPatched):
    master_node = txnPoolNodeSet[0]
    behind_nodes = txnPoolNodeSet[-2:]
    delta = tconf.CHK_FREQ * 3
    num_of_batches = 1

    # Nodes order batches
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)
    nodes_last_ordered_equal(*txnPoolNodeSet)

    # Emulate connection problems: the 1st behind node receives only pre-prepares
    dont_send_prepare_and_commit_to(txnPoolNodeSet[:-2], behind_nodes[0].name)

    # Send some txns; the 1st behind node can't order them while the pool keeps working
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)
    assert behind_nodes[0].master_last_ordered_3PC[1] + num_of_batches == \
           master_node.master_last_ordered_3PC[1]

    # Remove connection problems
    reset_sending(txnPoolNodeSet[:-2])

    # Send txns
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)

    # The 1st behind node gets new prepares but still can't order, because
    # it can't reach a prepare quorum for the previous batch
    assert behind_nodes[0].master_last_ordered_3PC[1] + num_of_batches * 2 == \
           master_node.master_last_ordered_3PC[1]

    # Emulate connection problems: the 2nd behind node receives only pre-prepares
    dont_send_prepare_and_commit_to(txnPoolNodeSet[:-2], behind_nodes[1].name)

    # Send some txns; the 2nd behind node can't order them while the pool keeps working
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)
    assert behind_nodes[1].master_last_ordered_3PC[1] + num_of_batches == \
           master_node.master_last_ordered_3PC[1]

    # Remove connection problems
    reset_sending(txnPoolNodeSet[:-2])

    # Send txns
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_of_batches, num_of_batches)

    # The 2nd behind node gets new prepares but still can't order, because
    # it can't reach a prepare quorum for the previous batch
    assert behind_nodes[1].master_last_ordered_3PC[1] + num_of_batches * 2 == \
           master_node.master_last_ordered_3PC[1]

    # After reaching a stable checkpoint, the behind nodes start ordering
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         delta, delta)

    # Pool is working
    looper.run(eventually(nodes_last_ordered_equal, *behind_nodes, master_node))
def test_limited_stash_3pc_while_catchup(tdir, tconf, looper, testNodeClass,
                                         txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, allPluginsPath,
                                         chkFreqPatched):
    '''
    Test that the lagging node can process messages from the catchup stash
    after catchup and request lost messages from other nodes.
    '''
    # Prepare nodes
    lagging_node = txnPoolNodeSet[-1]
    rest_nodes = txnPoolNodeSet[:-1]

    # Check that requests are executed normally
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)

    # Stop one node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            lagging_node, stopNode=True)
    looper.removeProdable(lagging_node)

    # Order 2 checkpoints on rest_nodes (2 txns in 2 batches)
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * CHK_FREQ, 2)
    waitNodeDataEquality(looper, *rest_nodes)

    # Restart the stopped node and wait for successful catchup
    lagging_node = start_stopped_node(lagging_node, looper, tconf, tdir,
                                      allPluginsPath, start=False)
    initial_all_ledgers_caught_up = lagging_node.spylog.count(
        Node.allLedgersCaughtUp)
    with delay_rules(lagging_node.nodeIbStasher, cs_delay(),
                     msg_rep_delay(types_to_delay=[PREPARE, PREPREPARE])):
        with delay_rules(lagging_node.nodeIbStasher,
                         cr_delay(ledger_filter=DOMAIN_LEDGER_ID)):
            looper.add(lagging_node)
            txnPoolNodeSet[-1] = lagging_node
            looper.run(checkNodesConnected(txnPoolNodeSet))

            # Order 2 checkpoints during the lagging node's first catchup
            # (2 txns in 2 batches)
            sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client,
                                                 2 * CHK_FREQ, 2)

        # Check that the first txn was ordered from the stash after the
        # first catchup
        looper.run(
            eventually(lambda: assertExp(
                lagging_node.master_last_ordered_3PC[1] ==
                txnPoolNodeSet[0].master_last_ordered_3PC[1] - 1),
                       retryWait=1,
                       timeout=waits.expectedPoolCatchupTime(
                           len(txnPoolNodeSet))))

        # Order 2 checkpoints during the lagging node's second catchup
        # (2 txns in 2 batches)
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             2 * CHK_FREQ, 2)

    waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=5)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    looper.run(
        eventually(lambda: assertExp(all(
            n.master_last_ordered_3PC == lagging_node.master_last_ordered_3PC
            for n in txnPoolNodeSet))))

    # Check that catchup was started only twice
    assert lagging_node.spylog.count(Node.allLedgersCaughtUp) == \
           initial_all_ledgers_caught_up + 2
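# NOTE: the nested `delay_rules` blocks above drive the two catchup rounds.
# The idiom, in isolation (illustrative sketch only; the function name is
# hypothetical):
def two_round_catchup_pattern(stasher, do_work):
    # Outer rule: hold ConsistencyProof-related traffic for the whole block.
    with delay_rules(stasher, cs_delay()):
        # Inner rule: additionally hold CatchupRep messages.
        with delay_rules(stasher, cr_delay()):
            do_work()
        # Leaving the inner block removes its rule and replays the stashed
        # CatchupReps, so the first catchup can complete here.
    # Leaving the outer block replays the remaining stashed messages.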
def test_watermarks_restored_after_stable(looper, chkFreqPatched,
                                          txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client):
    """
    A backup replica that doesn't participate in consensus doesn't update
    its watermarks. Once it gets a quorum of stashed checkpoints (in fact
    Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 checkpoints, each with
    a quorum of n-f-1), it updates the watermarks and starts to
    participate in consensus.
    """
    # 1. Patch a backup replica on a non-primary node so that it doesn't
    # participate in consensus and hence doesn't update its watermarks.
    broken_replica, non_broken_replica = break_backup_replica(txnPoolNodeSet)

    # 2. Send fewer requests than needed for
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed checkpoints,
    # but enough for one watermark change (on a non-broken replica).
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=1 * 9, num_batches=9)
    assert broken_replica.last_ordered_3pc == (0, 0)
    assert broken_replica.h == 0
    assert broken_replica.H == sys.maxsize
    assert non_broken_replica.last_ordered_3pc == (0, 9)
    assert non_broken_replica.h == 5
    assert non_broken_replica.H == 25

    # 3. Send requests to reach Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1
    # quorumed checkpoints. The broken replica should adjust
    # last_ordered_3pc and shift its watermarks.
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=1, num_batches=1)
    assert broken_replica.last_ordered_3pc == (0, 10)
    assert broken_replica.h == 10
    assert broken_replica.H == 30
    assert non_broken_replica.last_ordered_3pc == (0, 10)
    assert non_broken_replica.h == 10
    assert non_broken_replica.H == 30

    # 4. Repair the broken replica and make sure that it participates in
    # consensus (now that the watermarks have been corrected).
    repair_broken_replica(broken_replica)
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=7, num_batches=7)
    assert broken_replica.last_ordered_3pc == (0, 17)
    assert broken_replica.h == 15
    assert broken_replica.H == 35
    assert non_broken_replica.last_ordered_3pc == (0, 17)
    assert non_broken_replica.h == 15
    assert non_broken_replica.H == 35
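# NOTE: the watermark numbers asserted above follow from the patched
# checkpoint settings. A worked sketch of that arithmetic; CHK_FREQ = 5 and
# LOG_SIZE = 20 are inferred from the asserts themselves (H - h == 20
# throughout), not read from chkFreqPatched here.
def _watermark_arithmetic_example():
    chk_freq, log_size = 5, 20  # assumed patched values

    # 9 ordered batches complete one stable checkpoint at seqNo 5 on a
    # healthy replica, so its watermarks move to (5, 25):
    h = (9 // chk_freq) * chk_freq
    assert (h, h + log_size) == (5, 25)

    # The 10th batch completes the (STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1)-th
    # quorumed checkpoint, so the broken replica adjusts last_ordered_3pc
    # to 10 and shifts its watermarks to (10, 30):
    assert (10, 10 + log_size) == (10, 30)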
def test_reverted_unordered(txnPoolNodeSet, looper, sdk_pool_handle,
                            sdk_wallet_client):
    """
    Before starting catchup, revert any uncommitted changes to state and
    ledger. This is to avoid re-application of requests that were ordered
    but stashed.
    Example scenario:
        prepared (1, 4)
        start_view_change
        start_catchup
        ...
        committed and send Ordered (1, 2)
        ...
        preLedgerCatchUp
        force_process_ordered, take out (1, 2) and stash (1, 2)
        now process stashed Ordered (1, 2); its requests will be applied again

    Simulation: delay COMMITs to a node so that it cannot order requests
    but has prepared them. Then trigger a view change and make sure the
    slow node has not ordered the same number of requests as the others
    but has prepared them, so it can order once it receives COMMITs while
    the view change is in progress. The slow node should revert unordered
    batches, but it should eventually process the ordered requests, so
    LEDGER_STATUS is delayed too to postpone catchup.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_node.nodeIbStasher.delay(cDelay(120, 0))
    sent_batches = 5
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * sent_batches, sent_batches)

    # Fast nodes have the same last ordered and the same data
    last_ordered = [n.master_last_ordered_3PC for n in fast_nodes]
    assert check_if_all_equal_in_list(last_ordered)
    ensure_all_nodes_have_same_data(looper, fast_nodes)

    # The slow node has a different last ordered than the fast nodes
    assert last_ordered[0] != slow_node.master_last_ordered_3PC

    # Delay LEDGER_STATUS so catchup starts late
    slow_node.nodeIbStasher.delay(lsDelay(100))
    slow_node.nodeIbStasher.delay(msg_rep_delay(100))
    slow_node.nodeIbStasher.delay(cr_delay(100))

    # slow_node has not reverted batches
    assert sent_batches not in getAllReturnVals(
        slow_node.master_replica,
        slow_node.master_replica.revert_unordered_batches)

    ensure_view_change(looper, txnPoolNodeSet)

    def chk1():
        # slow_node reverted all batches
        rv = getAllReturnVals(
            slow_node.master_replica,
            slow_node.master_replica.revert_unordered_batches)
        assert sent_batches in rv

    looper.run(eventually(chk1, retryWait=1))

    # After the view change slow_node has prepared the same requests the
    # fast nodes have ordered
    assert last_ordered[0] == \
           slow_node.master_replica.last_prepared_before_view_change

    # Deliver COMMITs
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # slow_node stashed commits
        assert slow_node.master_replica.stasher.num_stashed_catchup == \
               sent_batches * (len(txnPoolNodeSet) - 1)

    looper.run(eventually(chk2, retryWait=1))

    # Deliver LEDGER_STATUS so catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(LEDGER_STATUS)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(MESSAGE_RESPONSE)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(CATCHUP_REP)

    # Ensure all nodes have the same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    def chk3():
        # slow_node processed stashed messages successfully
        assert slow_node.master_replica.stasher.num_stashed_catchup == 0

    looper.run(eventually(chk3, retryWait=1))

    # Ensure the pool is functional
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         10, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
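# NOTE: `check_if_all_equal_in_list` is a plenum test utility; its
# contract, as used above, is simply:
def check_if_all_equal_in_list(lst):
    # True when every element equals the first one.
    return all(item == lst[0] for item in lst)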
def test_process_three_phase_msg_and_stashed_for_next_checkpoint(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client,
        chkFreqPatched):
    """
    1. Delay checkpoint processing on the slow_node so that checkpoints
       cannot be finalized on it.
    2. Order enough requests to finalize a checkpoint.
    3. Check that the checkpoint is finalized on all nodes except the slow_node.
    4. Order a new request.
    5. Check that the slow_node could not order this request and stashed
       all 3PC messages.
    6. Reset the delays.
    7. Check that the last request is ordered on the slow_node, the
       checkpoint is finalized and the stashed messages were removed.
    """
    for n in txnPoolNodeSet:
        for r in n.replicas.values():
            r.update_watermark_from_3pc()
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = txnPoolNodeSet[:-1]
    old_stashed = {inst_id: r.stasher.num_stashed_watermarks
                   for inst_id, r in slow_node.replicas.items()}
    last_ordered = {inst_id: r.last_ordered_3pc
                    for inst_id, r in slow_node.replicas.items()}
    with delay_rules([slow_node.nodeIbStasher, ],
                     msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE, COMMIT])):
        with delay_rules([slow_node.nodeIbStasher, ], chk_delay()):
            sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client,
                                                 num_reqs=1 * CHK_FREQ,
                                                 num_batches=CHK_FREQ)
            ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
            looper.run(eventually(_check_checkpoint_finalize, fast_nodes,
                                  1, CHK_FREQ))
            sdk_send_random_and_check(looper, txnPoolNodeSet,
                                      sdk_pool_handle, sdk_wallet_client, 1)
            stashed_messages = incoming_3pc_msgs_count(len(txnPoolNodeSet))
            assert all(r.stasher.num_stashed_watermarks ==
                       old_stashed[inst_id] + stashed_messages
                       for inst_id, r in slow_node.replicas.items())
            _check_batches_ordered(slow_node, last_ordered, CHK_FREQ)
            for n in fast_nodes:
                _check_batches_ordered(n, last_ordered, CHK_FREQ + 1)

    looper.run(eventually(_check_checkpoint_finalize, [slow_node, ],
                          1, CHK_FREQ))
    looper.run(eventually(_check_batches_ordered, slow_node, last_ordered,
                          CHK_FREQ + 1))
    assert all(r.stasher.num_stashed_watermarks == old_stashed[inst_id]
               for inst_id, r in slow_node.replicas.items())
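# NOTE: `_check_checkpoint_finalize` and `_check_batches_ordered` are local
# helpers of this test module and are not shown here. A sketch of the
# conditions they are assumed to assert; the checkpoint attribute walk is
# an assumption of this sketch.
def _check_batches_ordered(node, last_ordered, num_batches):
    # Every replica advanced last_ordered_3pc by `num_batches` relative
    # to the captured snapshot.
    for inst_id, r in node.replicas.items():
        view_no, pp_seq_no = last_ordered[inst_id]
        assert r.last_ordered_3pc == (view_no, pp_seq_no + num_batches)


def _check_checkpoint_finalize(nodes, start, end):
    # The checkpoint covering (start, end) is present on every node's
    # master replica (attribute names are assumptions).
    for n in nodes:
        assert any(cp.seqNoStart == start and cp.seqNoEnd == end
                   for cp in n.master_replica.checkpoints)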
def test_node_request_preprepare(looper, txnPoolNodeSet, sdk_wallet_client,
                                 sdk_pool_handle, teardown):
    """
    Node requests PRE-PREPARE only once after getting PREPAREs.
    """
    slow_node, other_nodes, primary_node, \
        other_primary_nodes = split_nodes(txnPoolNodeSet)

    # Drop PrePrepares and Prepares
    slow_node.nodeIbStasher.delay(ppDelay(300, 0))
    slow_node.nodeIbStasher.delay(pDelay(300, 0))

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=10, num_batches=5)
    slow_node.nodeIbStasher.drop_delayeds()
    slow_node.nodeIbStasher.resetDelays()

    old_count_req = count_requested_preprepare_req(slow_node)
    old_count_resp = count_requested_preprepare_resp(slow_node)

    def chk(increase=True):
        # The requesting method was called
        assert count_requested_preprepare_req(slow_node) > old_count_req
        # A PRE-PREPARE was requested exactly once (or not at all)
        assert count_requested_preprepare_resp(slow_node) - old_count_resp == \
               (1 if increase else 0)

    for pp in primary_node.master_replica.sentPrePrepares.values():
        for rep in [n.master_replica for n in other_primary_nodes]:
            prepare = Prepare(rep.instId,
                              pp.viewNo,
                              pp.ppSeqNo,
                              pp.ppTime,
                              pp.digest,
                              pp.stateRootHash,
                              pp.txnRootHash)
            rep.send(prepare)

    looper.run(eventually(chk, True, retryWait=1))

    old_count_resp = count_requested_preprepare_resp(slow_node)

    # Re-send the last PREPARE (`pp` and `rep` intentionally keep the last
    # values from the loop above); no new PRE-PREPARE request should follow
    prepare = Prepare(rep.instId,
                      pp.viewNo,
                      pp.ppSeqNo,
                      pp.ppTime,
                      pp.digest,
                      pp.stateRootHash,
                      pp.txnRootHash)
    rep.send(prepare)

    looper.run(eventually(chk, False, retryWait=1))

    old_count_req = count_requested_preprepare_req(slow_node)
    old_count_resp = count_requested_preprepare_resp(slow_node)
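# NOTE: `split_nodes` is assumed to partition the pool into the slow node,
# the remaining nodes, the master primary and the rest. A sketch under
# that assumption (the exact split is not defined in this module):
def split_nodes(nodes):
    # The slow node is the last non-primary on the master instance;
    # `isPrimary` is the replica's primary flag.
    primary_node = [n for n in nodes if n.master_replica.isPrimary][0]
    slow_node = getNonPrimaryReplicas(nodes, 0)[-1].node
    other_nodes = [n for n in nodes if n != slow_node]
    other_primary_nodes = [n for n in other_nodes if n != primary_node]
    return slow_node, other_nodes, primary_node, other_primary_nodes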