def test_non_primary_accepts_pre_prepare_time(looper, txnPoolNodeSet,
                                              sdk_wallet_client, sdk_pool_handle):
    """
    One of the non-primary replicas has an incorrect clock, so it thinks the
    PRE-PREPARE has an incorrect time
    """
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=2)
    # The replica having the bad clock
    confused_npr = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1]

    make_clock_faulty(confused_npr.node)

    old_acceptable_rvs = getAllReturnVals(
        confused_npr, confused_npr.is_pre_prepare_time_acceptable)
    old_susp_count = get_timestamp_suspicion_count(confused_npr.node)
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=2)

    assert get_timestamp_suspicion_count(confused_npr.node) > old_susp_count

    new_acceptable_rvs = getAllReturnVals(
        confused_npr, confused_npr.is_pre_prepare_time_acceptable)

    # `is_pre_prepare_time_acceptable` first returned False and then True
    # (the return values are listed newest-first)
    assert [True, False, *old_acceptable_rvs] == new_acceptable_rvs
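# NOTE: these tests use two helpers that are not defined in this file:
# `make_clock_faulty` and `get_timestamp_suspicion_count`.  The sketches below
# show roughly what they are assumed to do, based only on how the tests call
# them and on the clock-repair code in `test_nodes_with_bad_clock` further
# down; they are illustrative, not the repository's actual helpers.
import types
from random import randint

from plenum.common.util import get_utc_epoch
from plenum.server.suspicion_codes import Suspicions
from plenum.test.helper import countDiscarded


def make_clock_faulty_sketch(node, clock_slow_by_sec=None, ppr_always_wrong=True):
    # Skew the node's notion of "now" by more than the acceptable PRE-PREPARE
    # deviation, so its timestamp checks start failing.
    if clock_slow_by_sec is None:
        clock_slow_by_sec = \
            node.config.ACCEPTABLE_DEVIATION_PREPREPARE_SECS + randint(5, 15)

    def utc_epoch(self) -> int:
        return get_utc_epoch() - clock_slow_by_sec

    node.utc_epoch = types.MethodType(utc_epoch, node)
    # The real helper also honours `ppr_always_wrong`, which presumably makes
    # the node (as primary) put an unacceptable time into its own PRE-PREPAREs;
    # that part is not sketched here.


def get_timestamp_suspicion_count_sketch(node):
    # Assumed implementation: count how many messages the master replica
    # discarded with the "PRE-PREPARE time is wrong" suspicion.
    return countDiscarded(node.master_replica,
                          Suspicions.PPR_TIME_WRONG.reason)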
def test_new_node_accepts_timestamp(tconf, looper, txnPoolNodeSet,
                                    nodeSetWithNodeAddedAfterSomeTxns,
                                    client1, wallet1, client1Connected):
    """
    A new node joins the pool and is able to function properly without
    raising any timestamp suspicions
    """
    _, new_node, _, _, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    old_susp_count = get_timestamp_suspicion_count(new_node)
    # Don't wait for the node to catch up, start sending requests
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 10)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

    # No suspicions were raised by new_node
    assert get_timestamp_suspicion_count(new_node) == old_susp_count

    # All nodes should reply
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1,
                                              Max3PCBatchSize * 3)
    # No suspicions were raised by new_node
    assert get_timestamp_suspicion_count(new_node) == old_susp_count

    suspicions = {node.name: get_timestamp_suspicion_count(node)
                  for node in txnPoolNodeSet}
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1,
                                              Max3PCBatchSize * 3)
    for node in txnPoolNodeSet:
        assert suspicions[node.name] == get_timestamp_suspicion_count(node)
def test_pp_obsolescence_check_fail_for_delayed(tdir, tconf,
                                                looper,
                                                txnPoolNodeSet,
                                                sdk_pool_handle,
                                                sdk_wallet_client):
    delay = PATCHED_ACCEPTABLE_DEVIATION_PREPREPARE_SECS + 1
    lagging_node = txnPoolNodeSet[-1]

    # Prevent lagging node from ordering
    with delay_rules(lagging_node.nodeIbStasher,
                     ppDelay(), pDelay(), cDelay()):
        # Order request on all nodes except lagging one
        sdk_send_random_and_check(looper, txnPoolNodeSet,
                                  sdk_pool_handle, sdk_wallet_client, 1)
        looper.run(asyncio.sleep(delay))

    # Now delayed 3PC messages reach lagging node, so any delayed transactions
    # can be processed (PrePrepare would be discarded but requested after that),
    # ensure that all nodes will have same data after that
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    pp_count = get_count(lagging_node.master_replica,
                         lagging_node.master_replica.processPrePrepare)

    assert pp_count > 0
    assert get_timestamp_suspicion_count(lagging_node) == pp_count
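# `PATCHED_ACCEPTABLE_DEVIATION_PREPREPARE_SECS` is referenced here but not
# defined in this file.  The obsolescence tests are assumed to shrink
# ACCEPTABLE_DEVIATION_PREPREPARE_SECS through a module-scoped `tconf` fixture
# so the looper only has to sleep a few seconds; a sketch of such a fixture
# follows (the value and the override are assumptions, not taken from this
# file).
import pytest

PATCHED_ACCEPTABLE_DEVIATION_PREPREPARE_SECS = 5  # assumed small test value


@pytest.fixture(scope="module")
def tconf(tconf):
    old_value = tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS
    tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = \
        PATCHED_ACCEPTABLE_DEVIATION_PREPREPARE_SECS
    yield tconf
    tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = old_value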
def test_first_audit_catchup_during_ordering(tdir, tconf, looper,
                                             txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client):
    lagging_node = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]
    other_stashers = [node.nodeIbStasher for node in other_nodes]

    def lagging_node_state() -> NodeLeecherService.State:
        return lagging_node.ledgerManager._node_leecher._state

    def check_lagging_node_is_not_syncing_audit():
        assert lagging_node_state() != NodeLeecherService.State.SyncingAudit

    # Prevent the lagging node from catching up the domain ledger
    # (and hence from finishing catchup)
    with delay_rules(other_stashers, delay_domain_ledger_catchup()):
        # Start catchup on the lagging node
        lagging_node.start_catchup()
        assert lagging_node_state() == NodeLeecherService.State.SyncingAudit

        # Ensure that the audit ledger is caught up by the lagging node
        looper.run(eventually(check_lagging_node_is_not_syncing_audit))
        assert lagging_node_state() != NodeLeecherService.State.Idle

        # Order a request on all nodes except the lagging one, where the
        # 3PC messages are stashed
        sdk_send_random_and_check(looper, txnPoolNodeSet,
                                  sdk_pool_handle, sdk_wallet_client, 1)

    # Now catchup should end and the lagging node starts processing stashed
    # PPs and resumes ordering

    # Ensure that all nodes have the same data after that
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # Ensure that no suspicions about obsolete PPs have been raised
    assert get_timestamp_suspicion_count(lagging_node) == 0
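# `delay_domain_ledger_catchup` above (and `delay_audit_ledger_catchup` used
# later) are assumed to be thin wrappers around plenum's catchup-reply delayer,
# restricted to a single ledger.  The wrappers below are a sketch under that
# assumption; `cr_delay`, `DOMAIN_LEDGER_ID` and `AUDIT_LEDGER_ID` are existing
# plenum names, while the wrapper bodies themselves are illustrative.
from plenum.common.constants import AUDIT_LEDGER_ID, DOMAIN_LEDGER_ID
from plenum.test.delayers import cr_delay


def delay_domain_ledger_catchup():
    # Delay CATCHUP_REP messages for the domain ledger so the lagging node
    # cannot finish catchup while the rule is active.
    return cr_delay(ledger_filter=DOMAIN_LEDGER_ID)


def delay_audit_ledger_catchup():
    # Same idea for the audit ledger, which is synced first during catchup.
    return cr_delay(ledger_filter=AUDIT_LEDGER_ID)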
# Standalone excerpt: the consistency check used in the wrong-clock tests
# (susp_counts, ledger_sizes and faulty_node come from the enclosing test).
def chk():
    for node in [n for n in txnPoolNodeSet if n != faulty_node]:
        # Each non-faulty node raises a suspicion
        assert get_timestamp_suspicion_count(node) > susp_counts[node.name]
        # Ledger does not change
        assert node.domainLedger.size == ledger_sizes[node.name]

    assert faulty_node.domainLedger.size == ledger_sizes[faulty_node.name]
def test_nodes_with_bad_clock(tconf, looper, txnPoolNodeSet,
                              sdk_wallet_client, sdk_pool_handle):
    """
    All nodes have bad clocks but they eventually get repaired. This models
    nodes being cut off from the NTP server for some time (or NTP sync being
    disabled) and then, without a node restart, NTP sync being enabled again.
    """
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=Max3PCBatchSize * 3)
    ledger_sizes = {node.name: node.domainLedger.size
                    for node in txnPoolNodeSet}
    susp_counts = {node.name: get_timestamp_suspicion_count(node)
                   for node in txnPoolNodeSet}
    for node in txnPoolNodeSet:
        make_clock_faulty(
            node,
            clock_slow_by_sec=node.config.ACCEPTABLE_DEVIATION_PREPREPARE_SECS +
            randint(5, 15),
            ppr_always_wrong=False)

    for _ in range(5):
        sdk_send_random_request(looper, sdk_pool_handle, sdk_wallet_client)
        looper.runFor(.2)

    # Let some time pass
    looper.runFor(3)

    def chk():
        for node in txnPoolNodeSet:
            # Each node raises a suspicion
            assert get_timestamp_suspicion_count(node) > susp_counts[node.name]
            # Ledger does not change
            assert node.domainLedger.size == ledger_sizes[node.name]

    looper.run(eventually(chk, retryWait=1))

    # Fix clocks
    for node in txnPoolNodeSet:
        def utc_epoch(self) -> int:
            return get_utc_epoch()

        node.utc_epoch = types.MethodType(utc_epoch, node)

    # Let some more time pass
    looper.runFor(3)

    # All nodes reply
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=Max3PCBatchSize * 2)
def test_new_primary_has_wrong_clock(tconf, looper, txnPoolNodeSet,
                                     sdk_wallet_client, sdk_pool_handle):
    """
    One of the non-primary nodes has a bad clock: it raises suspicions but
    still orders requests after getting PREPAREs. Then a view change happens
    and this non-primary with the bad clock becomes the new primary, but it is
    not able to get any of its PRE-PREPAREs ordered. Eventually another view
    change happens, a new primary is elected and the pool is functional again.
    :return:
    """
    # The node having the bad clock; this node will be primary after the view
    # change
    faulty_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[0].node
    make_clock_faulty(faulty_node)
    assert not faulty_node.master_replica.isPrimary

    ledger_sizes = {node.name: node.domainLedger.size
                    for node in txnPoolNodeSet}
    susp_counts = {node.name: get_timestamp_suspicion_count(node)
                   for node in txnPoolNodeSet}
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # After the view change, faulty_node is primary.
    # But after it sends its first PRE-PREPARE, a new view change happens
    assert txnPoolNodeSet[2].master_replica.isPrimary

    def chk():
        for node in txnPoolNodeSet:
            assert node.viewNo == 2
        for node in [n for n in txnPoolNodeSet if n != faulty_node]:
            # Each non-faulty node raises a suspicion
            assert get_timestamp_suspicion_count(node) > susp_counts[node.name]
            # No transaction for view 1 was written to the audit ledger
            assert any(txn[1]['txn']['data']['viewNo'] == 2
                       for txn in node.auditLedger.getAllTxn())
            assert not any(txn[1]['txn']['data']['viewNo'] == 1
                           for txn in node.auditLedger.getAllTxn())
        assert faulty_node.domainLedger.size == ledger_sizes[faulty_node.name]

    looper.run(eventually(chk, retryWait=1))

    # All nodes reply
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=Max3PCBatchSize * 2)
def test_new_node_accepts_timestamp(tconf, looper, txnPoolNodeSet,
                                    sdk_node_created_after_some_txns,
                                    sdk_wallet_client, sdk_pool_handle):
    """
    A new node joins the pool and is able to function properly without
    raising any timestamp suspicions
    """
    _, new_node, _, _ = sdk_node_created_after_some_txns
    old_susp_count = get_timestamp_suspicion_count(new_node)
    # Don't wait for the node to catch up, start sending requests
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, count=10)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])

    # No suspicions were raised by new_node
    assert get_timestamp_suspicion_count(new_node) == old_susp_count

    # All nodes should reply
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, count=Max3PCBatchSize * 3)
    # No suspicions were raised by new_node
    assert get_timestamp_suspicion_count(new_node) == old_susp_count

    suspicions = {node.name: get_timestamp_suspicion_count(node)
                  for node in txnPoolNodeSet}
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, count=Max3PCBatchSize * 3)
    for node in txnPoolNodeSet:
        assert suspicions[node.name] == get_timestamp_suspicion_count(node)
def test_stashed_pp_pass_obsolescence_check(tdir, tconf, looper,
                                            txnPoolNodeSet,
                                            sdk_pool_handle,
                                            sdk_wallet_client):
    lagging_node = txnPoolNodeSet[-1]

    def lagging_node_state() -> NodeLeecherService.State:
        return lagging_node.ledgerManager._node_leecher._state

    # TODO INDY-2047: fill the domain ledger with some requests
    # as a workaround for the issue
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    # Prevent the lagging node from catching up the audit ledger
    # (and hence from finishing catchup)
    with delay_rules(lagging_node.nodeIbStasher, delay_audit_ledger_catchup()):
        # Start catchup on the lagging node
        lagging_node.ledgerManager.start_catchup()
        assert lagging_node_state() == NodeLeecherService.State.SyncingAudit

        # Order a request on all nodes except the lagging one, where the
        # 3PC messages are stashed
        sdk_send_random_and_check(looper, txnPoolNodeSet,
                                  sdk_pool_handle, sdk_wallet_client, 1)

        # The lagging node is still syncing the audit ledger
        assert lagging_node_state() == NodeLeecherService.State.SyncingAudit

        # Delay the end of catchup long enough to exceed the PRE-PREPARE
        # ACCEPTABLE_DEVIATION_PREPREPARE_SECS window
        looper.runFor(PATCHED_ACCEPTABLE_DEVIATION_PREPREPARE_SECS + 1)

    # Now catchup should end and the lagging node starts processing stashed
    # PPs and resumes ordering

    # Ensure that all nodes have the same data after that
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # Ensure that no suspicions about obsolete PPs have been raised
    assert get_timestamp_suspicion_count(lagging_node) == 0
def test_new_primary_has_wrong_clock(tconf, looper, txnPoolNodeSet,
                                     sdk_wallet_client, sdk_pool_handle):
    """
    One of the non-primary nodes has a bad clock: it raises suspicions but
    still orders requests after getting PREPAREs. Then a view change happens
    and this non-primary with the bad clock becomes the new primary, but it is
    not able to get any of its PRE-PREPAREs ordered. Eventually another view
    change happens, a new primary is elected and the pool is functional again.
    :return:
    """
    # The node having the bad clock; this node will be primary after the view
    # change
    faulty_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[0].node
    make_clock_faulty(faulty_node)
    assert not faulty_node.master_replica.isPrimary

    # faulty_node replies too
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=Max3PCBatchSize * 3)
    ledger_sizes = {node.name: node.domainLedger.size
                    for node in txnPoolNodeSet}
    susp_counts = {node.name: get_timestamp_suspicion_count(node)
                   for node in txnPoolNodeSet}
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # After the view change, faulty_node is primary
    assert faulty_node.master_replica.isPrimary

    old_view_no = txnPoolNodeSet[0].viewNo

    # Delay instance changes so a view change doesn't happen in the middle of
    # this test
    stashers = (n.nodeIbStasher for n in txnPoolNodeSet)
    with delay_rules(stashers, icDelay()):
        # Requests are sent
        for _ in range(5):
            sdk_send_random_requests(looper, sdk_pool_handle,
                                     sdk_wallet_client, count=2)
            looper.runFor(2)

        def chk():
            for node in txnPoolNodeSet:
                assert node.viewNo == old_view_no
            for node in [n for n in txnPoolNodeSet if n != faulty_node]:
                # Each non-faulty node raises a suspicion
                assert get_timestamp_suspicion_count(node) > susp_counts[node.name]
                # Ledger does not change
                assert node.domainLedger.size == ledger_sizes[node.name]
            assert faulty_node.domainLedger.size == ledger_sizes[faulty_node.name]

        looper.run(eventually(chk, retryWait=1))

    # Eventually another view change happens
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet, old_view_no + 1,
                          retryWait=1, timeout=2 * tconf.PerfCheckFreq))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # After the view change, faulty_node is no longer the primary
    assert not faulty_node.master_replica.isPrimary

    # All nodes reply
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=Max3PCBatchSize * 2)
def test_new_primary_has_wrong_clock(tconf, looper, txnPoolNodeSet,
                                     sdk_wallet_client, sdk_pool_handle):
    """
    One of the non-primary nodes has a bad clock: it raises suspicions but
    still orders requests after getting PREPAREs. Then a view change happens
    and this non-primary with the bad clock becomes the new primary, but it is
    not able to get any of its PRE-PREPAREs ordered. Eventually another view
    change happens, a new primary is elected and the pool is functional again.
    :return:
    """
    # The node having the bad clock; this node will be primary after the view
    # change
    faulty_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[0].node
    make_clock_faulty(faulty_node)
    assert not faulty_node.master_replica.isPrimary

    ledger_sizes = {node.name: node.domainLedger.size
                    for node in txnPoolNodeSet}
    susp_counts = {node.name: get_timestamp_suspicion_count(node)
                   for node in txnPoolNodeSet}
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # After the view change, faulty_node is primary
    assert faulty_node.master_replica.isPrimary

    old_view_no = txnPoolNodeSet[0].viewNo

    # Delay instance changes so a view change doesn't happen in the middle of
    # this test
    stashers = (n.nodeIbStasher for n in txnPoolNodeSet)
    with delay_rules(stashers, icDelay()):
        # Requests are sent
        for _ in range(5):
            sdk_send_random_requests(looper, sdk_pool_handle,
                                     sdk_wallet_client, count=2)
            looper.runFor(2)

        def chk():
            for node in txnPoolNodeSet:
                assert node.viewNo == old_view_no
            for node in [n for n in txnPoolNodeSet if n != faulty_node]:
                # Each non-faulty node raises a suspicion
                assert get_timestamp_suspicion_count(node) > susp_counts[node.name]
                # Ledger does not change
                assert node.domainLedger.size == ledger_sizes[node.name]
            assert faulty_node.domainLedger.size == ledger_sizes[faulty_node.name]

        looper.run(eventually(chk, retryWait=1))

    # Eventually another view change happens
    ensure_view_change(looper, txnPoolNodeSet)
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet, old_view_no + 1,
                          retryWait=1, timeout=2 * tconf.PerfCheckFreq))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # After the view change, faulty_node is no longer the primary
    assert not faulty_node.master_replica.isPrimary

    # All nodes reply
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=Max3PCBatchSize * 2)