def test_non_primary_accepts_pre_prepare_time(looper, txnPoolNodeSet,
                                              sdk_wallet_client, sdk_pool_handle):
    """
    One of the non-primary has an in-correct clock so it thinks PRE-PREPARE
    has incorrect time
    """
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=2)
    # send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 2)
    # The replica having the bad clock
    confused_npr = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1]

    make_clock_faulty(confused_npr.node)

    # Snapshot spy state before sending more requests so we can assert on
    # the delta afterwards.
    old_acceptable_rvs = getAllReturnVals(
        confused_npr, confused_npr.is_pre_prepare_time_acceptable)
    old_susp_count = get_timestamp_suspicion_count(confused_npr.node)
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              count=2)

    # The faulty clock must have produced at least one timestamp suspicion
    assert get_timestamp_suspicion_count(confused_npr.node) > old_susp_count

    new_acceptable_rvs = getAllReturnVals(
        confused_npr, confused_npr.is_pre_prepare_time_acceptable)

    # `is_pre_prepare_time_acceptable` first returned False then returned True
    # (spy return values are ordered newest-first, hence [True, False, ...])
    assert [True, False, *old_acceptable_rvs] == new_acceptable_rvs
def test_view_change_timeout(nodeSet, looper, up, wallet1, client1):
    """
    Check view change restarted if it is not completed in time
    """
    m_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    initial_view_no = waitForViewChange(looper, nodeSet)
    # Setting view change timeout to low value to make test pass quicker
    for node in nodeSet:
        node._view_change_timeout = 5

    # Delaying view change messages to make first view change fail
    # due to timeout
    for node in nodeSet:
        node.nodeIbStasher.delay(vcd_delay(delay=50))

    # Record per-node spy stats for `_check_view_change_completed` so we can
    # later assert that the timeout handler fired (call count grew and it
    # returned True at least once more).
    times = {}
    for node in nodeSet:
        times[node.name] = {
            'called': get_count(node, node._check_view_change_completed),
            'returned_true': len(getAllReturnVals(
                node, node._check_view_change_completed, compare_val_to=True))
        }

    for node in nodeSet:
        node.startViewChange(initial_view_no + 1)

    # First view change should fail, because of delayed
    # instance change messages. This then leads to new view change that we need.
    with pytest.raises(AssertionError):
        ensureElectionsDone(looper=looper, nodes=nodeSet, customTimeout=10)

    # Resetting delays to let second view change go well
    reset_delays_and_process_delayeds(nodeSet)

    # This view change should be completed with no problems
    ensureElectionsDone(looper=looper, nodes=nodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=nodeSet)
    new_m_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    assert m_primary_node.name != new_m_primary_node.name

    # The timeout method has been called at least once
    for node in nodeSet:
        assert get_count(node, node._check_view_change_completed) > times[node.name]['called']
        assert len(getAllReturnVals(node,
                                    node._check_view_change_completed,
                                    compare_val_to=True)) > times[node.name]['returned_true']

    # Multiple view changes have been initiated
    for node in nodeSet:
        assert (node.viewNo - initial_view_no) > 1

    ensure_pool_functional(looper, nodeSet, wallet1, client1)
def count_requested_preprepare_resp(node):
    """Return how many times the master replica successfully requested a
    PRE-PREPARE (i.e. `_request_pre_prepare_if_possible` returned True)."""
    replica = node.master_replica
    successful_requests = getAllReturnVals(
        replica, replica._request_pre_prepare_if_possible, compare_val_to=True)
    return len(successful_requests)
def count_requested_preprepare_resp(node):
    """Return how many times the master replica's ordering service
    requested a PRE-PREPARE for a PREPARE (calls that returned True)."""
    ordering_service = node.master_replica._ordering_service
    true_returns = getAllReturnVals(
        ordering_service,
        ordering_service._request_pre_prepare_for_prepare,
        compare_val_to=True)
    return len(true_returns)
def newNodeCaughtUp(txnPoolNodeSet, nodeSetWithNodeAddedAfterSomeTxns):
    """Fixture: wait for the newly added node to catch up with the pool and
    verify its catchup bookkeeping is clean. Returns the new node."""
    looper, newNode, _, _, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:4])
    check_last_3pc_master(newNode, txnPoolNodeSet[:4])

    # Check if catchup done once
    catchup_done_once = True
    for li in newNode.ledgerManager.ledgerRegistry.values():
        catchup_done_once = catchup_done_once and (li.num_txns_caught_up > 0)

    if not catchup_done_once:
        # It might be the case that node has to do catchup again, in that case
        # check the return value of `num_txns_caught_up_in_last_catchup` to be
        # greater than 0
        assert max(
            getAllReturnVals(
                newNode, newNode.num_txns_caught_up_in_last_catchup)) > 0

    # Once caught up, no catchup replies should remain buffered per ledger
    for li in newNode.ledgerManager.ledgerRegistry.values():
        assert not li.receivedCatchUpReplies
        assert not li.recvdCatchupRepliesFrm

    return newNode
def test_view_change_restarted_by_timeout_if_next_primary_disconnected(
        txnPoolNodeSet, looper, setup, wallet1, client1):
    """
    Verifies that a view change is restarted by timeout
    if the next primary has been disconnected
    """
    _, initial_view_no, timeout_callback_stats = setup

    start_view_change(txnPoolNodeSet, initial_view_no + 1)

    # Stop the would-be primary so the first view change cannot complete
    alive_nodes = stop_next_primary(txnPoolNodeSet)

    ensureElectionsDone(looper=looper, nodes=alive_nodes, numInstances=3)

    # There were 2 view changes
    for node in alive_nodes:
        assert (node.viewNo - initial_view_no) == 2

    # The timeout method was called 1 time
    for node in alive_nodes:
        assert get_count(node, node._check_view_change_completed) - \
               timeout_callback_stats[node.name]['called'] == 1
        assert len(getAllReturnVals(node, node._check_view_change_completed,
                                    compare_val_to=True)) - \
               timeout_callback_stats[node.name]['returned_true'] == 1
def sdk_new_node_caught_up(txnPoolNodeSet,
                           sdk_node_set_with_node_added_after_some_txns):
    """Fixture: wait for the newly added node to catch up with the pool
    (node-leecher based catchup) and return it."""
    looper, new_node, _, _ = sdk_node_set_with_node_added_after_some_txns
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4],
                         exclude_from_check=['check_last_ordered_3pc_backup'])
    check_last_3pc_master(new_node, txnPoolNodeSet[:4])

    # Check if catchup done once
    catchup_done_once = True
    for leecher in new_node.ledgerManager._node_leecher._leechers.values():
        catchup_done_once = catchup_done_once and (leecher.num_txns_caught_up > 0)

    if not catchup_done_once:
        # It might be the case that node has to do catchup again, in that case
        # check the return value of `num_txns_caught_up_in_last_catchup` to be
        # greater than 0
        assert max(
            getAllReturnVals(new_node,
                             new_node.num_txns_caught_up_in_last_catchup)) > 0

    return new_node
def sdk_new_node_caught_up(txnPoolNodeSet,
                           sdk_node_set_with_node_added_after_some_txns):
    """Fixture: wait for the newly added node to catch up with the pool
    (ledgerRegistry-based catchup) and return it."""
    looper, new_node, _, _ = sdk_node_set_with_node_added_after_some_txns
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4])
    check_last_3pc_master(new_node, txnPoolNodeSet[:4])

    # Check if catchup done once
    catchup_done_once = True
    for li in new_node.ledgerManager.ledgerRegistry.values():
        catchup_done_once = catchup_done_once and (li.num_txns_caught_up > 0)

    if not catchup_done_once:
        # It might be the case that node has to do catchup again, in that case
        # check the return value of `num_txns_caught_up_in_last_catchup` to be
        # greater than 0
        assert max(
            getAllReturnVals(
                new_node, new_node.num_txns_caught_up_in_last_catchup)) > 0

    # Once caught up, no catchup replies should remain buffered per ledger
    for li in new_node.ledgerManager.ledgerRegistry.values():
        assert not li.receivedCatchUpReplies
        assert not li.recvdCatchupRepliesFrm

    return new_node
def test_view_change_timeout_next_primary_is_disconnected(
        nodeSet, looper, up, setup, wallet1, client1):
    """
    Check view change restarted if it is not completed in time
    """
    m_primary_node, initial_view_no, times = setup
    start_view_change(nodeSet, initial_view_no + 1)
    # Stop the next primary so the first view change cannot complete
    alive_nodes = stop_next_primary(nodeSet)
    ensureElectionsDone(looper=looper, nodes=alive_nodes, numInstances=3)
    # there were 2 view changes
    for node in alive_nodes:
        assert (node.viewNo - initial_view_no) == 2
    # The timeout method has been called at least once
    for node in alive_nodes:
        assert get_count(
            node, node._check_view_change_completed) > times[node.name]['called']
        assert len(
            getAllReturnVals(
                node,
                node._check_view_change_completed,
                compare_val_to=True)) > times[node.name]['returned_true']
def get_last_returned_val():
    """Return the most recent `getReplyFromLedger` result, asserting that
    every node in the pool returned the same value."""
    latest = [getAllReturnVals(n, n.getReplyFromLedger)[0]
              for n in txnPoolNodeSet]
    # All nodes must agree on the last returned value
    assert latest.count(latest[0]) == len(txnPoolNodeSet)
    return latest[0]
def get_last_returned_val():
    """Return the most recent `getReplyFromLedgerForRequest` result,
    asserting that every node in the pool returned the same value."""
    latest = []
    for n in txnPoolNodeSet:
        latest.append(getAllReturnVals(n, n.getReplyFromLedgerForRequest)[0])
    # All nodes must agree on the last returned value
    assert latest.count(latest[0]) == len(txnPoolNodeSet)
    return latest[0]
def checkPrepareReqSent(replica: TestReplica, identifier: str, reqId: int):
    """Verify every `canSendPrepare` call saw the expected request
    (matching `identifier` and `reqId`) and that all calls returned truthy."""
    calls = getAllArgs(replica, replica.canSendPrepare)
    results = getAllReturnVals(replica, replica.canSendPrepare)
    for call in calls:
        request = call['request']
        assert request.identifier == identifier
        assert request.reqId == reqId
    # Each invocation must have decided that the PREPARE could be sent
    assert all(results)
def testInstChangeWithMoreReqLat(looper, setup):
    """High master request latency on every node must drive the pool to
    view number 1."""
    for node in setup.nodes:
        node.checkPerformance()
        latency_flags = getAllReturnVals(node.monitor,
                                         node.monitor.isMasterReqLatencyTooHigh)
        # The monitor must have flagged master latency at least once
        assert any(latency_flags)

    looper.run(eventually(partial(checkViewNoForNodes, setup.nodes, 1),
                          retryWait=1, timeout=20))
def testInstChangeWithMoreReqLat(looper, setup):
    """High master request latency on every node must trigger a view change."""
    for node in setup.nodes:
        node.checkPerformance()
        # The monitor must have reported the master's latency as too high
        assert any(getAllReturnVals(node.monitor,
                                    node.monitor.isMasterReqLatencyTooHigh))
    waitForViewChange(looper, setup.nodes)
def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client):
    """
    A node is slow to receive COMMIT messages so after a view change it
    starts catchup. But before it can start requesting txns, the COMMITs
    messages are received and are ordered. The node should not request any
    transactions.
    :return:
    """
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           2 * 3,
                                           3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    # Fix: renamed misspelled local `delat_cp` -> `delay_cp`
    delay_cm = 30
    delay_cp = 100
    slow_node.nodeIbStasher.delay(cDelay(delay_cm))
    # The slow node receives consistency proofs after some delay, this delay
    # gives the opportunity to deliver all 3PC messages
    slow_node.nodeIbStasher.delay(cpDelay(delay_cp))

    # Count of `getCatchupReqs` which is called to construct the `CatchupReq`
    # to be sent
    def domain_cr_count():
        return sum(1 for entry in
                   slow_node.ledgerManager.spylog.getAll(
                       slow_node.ledgerManager.getCatchupReqs) if
                   entry.params['consProof'].ledgerId == DOMAIN_LEDGER_ID)

    old_count = domain_cr_count()
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle,
                                           sdk_wallet_client,
                                           2 * sent_batches,
                                           sent_batches)
    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the `slow_node` is behind
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Unstash only COMMIT messages
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(Commit.typename)

    looper.runFor(2)

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        ConsistencyProof.typename)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # No `CatchupReq`s constructed, hence no `CatchupReq`s could have
    # been sent
    assert domain_cr_count() == old_count
    # Some stashed ordered requests have been processed
    rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
    assert sent_batches in rv

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client, tdir, tconf, allPluginsPath): """ View change occurs when master's primary is disconnected """ # Setup nodes = txnPoolNodeSet old_view_no = checkViewNoForNodes(nodes) old_pr_node = get_master_primary_node(nodes) # Stop primary disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, old_pr_node, stopNode=True) looper.removeProdable(old_pr_node) remaining_nodes = list(set(nodes) - {old_pr_node}) # Sometimes it takes time for nodes to detect disconnection ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20) looper.runFor(tconf.ToleratePrimaryDisconnection + 2) # Give some time to detect disconnection and then verify that view has # changed and new primary has been elected waitForViewChange(looper, remaining_nodes, old_view_no + 1) ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes) new_pr_node = get_master_primary_node(remaining_nodes) assert old_pr_node != new_pr_node sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 5) # Check if old primary can join the pool and still functions old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir, allPluginsPath) txnPoolNodeSet = remaining_nodes + [old_pr_node] looper.run( eventually(checkViewNoForNodes, txnPoolNodeSet, old_view_no + 1, timeout=tconf.VIEW_CHANGE_TIMEOUT)) # After node catches up it set view_no from audit ledger and do not need to do view_change assert len( getAllReturnVals(old_pr_node.view_changer, old_pr_node.view_changer.start_view_change, compare_val_to=True)) == 0 ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) assert not old_pr_node.view_changer._next_view_indications
def checkPrepareReqSent(replica: TestReplica, key: str, view_no: int):
    """Assert `canPrepare` saw a PRE-PREPARE for `key` in view `view_no`
    and that the matching call returned truthy."""
    calls = getAllArgs(replica, replica.canPrepare)
    results = getAllReturnVals(replica, replica.canPrepare)
    req_idrs = []
    for call in calls:
        pp = call["ppReq"]
        if pp.viewNo == view_no:
            req_idrs.append(pp.reqIdr)
    assert [key] in req_idrs
    # The matching call must have allowed the PREPARE to be sent
    assert results[req_idrs.index([key])]
def check_timeout_callback_called(times):
    """Assert that on every node `_check_view_change_completed` was called
    (and returned True) exactly `times` more times than was recorded in
    `timeout_callback_stats`."""
    for node in txnPoolNodeSet:
        stats = timeout_callback_stats[node.name]
        calls_now = get_count(node, node._check_view_change_completed)
        true_now = len(getAllReturnVals(node,
                                        node._check_view_change_completed,
                                        compare_val_to=True))
        assert calls_now - stats['called'] == times
        assert true_now - stats['returned_true'] == times
def checkPrepareReqSent(replica: TestReplica, key: str, view_no: int):
    """Check that a PREPARE was sent for `key` within view `view_no`."""
    all_args = getAllArgs(replica, replica.canPrepare)
    all_rvs = getAllReturnVals(replica, replica.canPrepare)
    matching = [a["ppReq"].reqIdr
                for a in all_args
                if a["ppReq"].viewNo == view_no]
    assert [key] in matching
    position = matching.index([key])
    # The call that saw `key` must have returned truthy
    assert all_rvs[position]
def test_node_request_propagates(looper, setup, txnPoolNodeSet,
                                 sdk_wallet_client, sdk_pool_handle):
    """
    One of node lacks sufficient propagates
    """
    faulty_node, recv_client_requests = setup

    # Snapshot spy counters so all assertions below work on deltas
    old_count_recv_ppg = get_count(faulty_node, faulty_node.processPropagate)
    old_count_recv_req = get_count(faulty_node, faulty_node.processRequest)
    old_count_request_propagates = get_count(
        faulty_node, faulty_node.request_propagates)

    def sum_of_sent_batches():
        # Sum over both replicas of the faulty node
        return faulty_node.replicas[0].lastPrePrepareSeqNo + \
               faulty_node.replicas[1].lastPrePrepareSeqNo

    old_sum_of_sent_batches = sum_of_sent_batches()

    sent_reqs = 5
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              sent_reqs)

    assert get_count(
        faulty_node, faulty_node.processPropagate) > old_count_recv_ppg
    if recv_client_requests:
        assert get_count(
            faulty_node, faulty_node.processRequest) > old_count_recv_req
    else:
        assert get_count(
            faulty_node, faulty_node.processRequest) == old_count_recv_req

    # Attempt to request PROPAGATEs was made as many number of times as the
    # number of sent batches in both replicas since both replicas
    # independently request PROPAGATEs
    assert get_count(faulty_node, faulty_node.request_propagates) - \
           old_count_request_propagates == (sum_of_sent_batches() -
                                            old_sum_of_sent_batches)

    requested_propagate_counts = getAllReturnVals(
        faulty_node, faulty_node.request_propagates)

    # The last attempt to request PROPAGATEs was not successful
    # (spy return values are ordered newest-first, so index 0 is the last call)
    assert requested_propagate_counts[0] == 0
    # The first attempt to request PROPAGATEs was successful as PROPAGATEs
    # were requested for all nodes
    assert requested_propagate_counts[1] == sent_reqs

    faulty_node.nodeIbStasher.reset_delays_and_process_delayeds()
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle, num_reqs=4)
def new_node_in_correct_view(all_nodes_view_change, looper, txnPoolNodeSet,
                             one_node_added, wallet1, client1):
    """Fixture: a freshly added node ends up in the pool's current view and
    the pool still processes requests."""
    new_node = one_node_added
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet,
                          retryWait=1, timeout=10))
    successful_starts = getAllReturnVals(
        new_node, new_node._start_view_change_if_possible,
        compare_val_to=True)
    # The new node must have managed to start a view change at least once
    assert len(successful_starts) > 0
    assert not new_node._next_view_indications
    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 2)
def testInstChangeWithMoreReqLat(looper, setup):
    """After all nodes flag too-high master latency, the view number must
    increment by one."""
    nodes = setup.nodes
    # All nodes must currently share a single view number
    view_nos = {n.viewNo for n in nodes}
    assert len(view_nos) == 1
    current_view_no = view_nos.pop()

    for node in nodes:
        node.checkPerformance()
        assert any(getAllReturnVals(node.monitor,
                                    node.monitor.isMasterReqLatencyTooHigh))

    waitForViewChange(looper, nodes, expectedViewNo=current_view_no + 1)
def new_node_in_correct_view(all_nodes_view_change, looper, txnPoolNodeSet,
                             sdk_one_node_added, sdk_pool_handle,
                             sdk_wallet_client):
    """Fixture: verify the SDK-added node joined in the correct view and
    the pool still orders requests."""
    new_node = sdk_one_node_added
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet,
                          retryWait=1, timeout=10))
    vc = new_node.view_changer
    # The node's view changer must have started a view change at least once
    assert len(getAllReturnVals(vc, vc._start_view_change_if_possible,
                                compare_val_to=True)) > 0
    assert not vc._next_view_indications
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2)
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1, client1, client1Connected, tconf, tdirWithPoolTxns, allPluginsPath): """ View change occurs when master's primary is disconnected """ # Setup nodes = txnPoolNodeSet old_view_no = checkViewNoForNodes(nodes) old_pr_node = get_master_primary_node(nodes) # Stop primary disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, old_pr_node, stopNode=True) looper.removeProdable(old_pr_node) remaining_nodes = list(set(nodes) - {old_pr_node}) # Sometimes it takes time for nodes to detect disconnection ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20) looper.runFor(tconf.ToleratePrimaryDisconnection + 2) # Give some time to detect disconnection and then verify that view has # changed and new primary has been elected waitForViewChange(looper, remaining_nodes, old_view_no + 1) ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes) new_pr_node = get_master_primary_node(remaining_nodes) assert old_pr_node != new_pr_node sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5) # Check if old primary can join the pool and still functions old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdirWithPoolTxns, allPluginsPath) txnPoolNodeSet = remaining_nodes + [old_pr_node] looper.run( eventually(checkViewNoForNodes, txnPoolNodeSet, old_view_no + 1, timeout=10)) assert len( getAllReturnVals(old_pr_node, old_pr_node._start_view_change_if_possible, compare_val_to=True)) > 0 ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) assert not old_pr_node._next_view_indications
def new_node_in_correct_view(all_nodes_view_change, looper, txnPoolNodeSet,
                             one_node_added, sdk_pool_handle,
                             sdk_wallet_client):
    """Fixture: the newly added node reaches the pool's view and the pool
    still orders transactions."""
    added = one_node_added
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet,
                          retryWait=1, timeout=10))
    changer = added.view_changer
    started_ok = getAllReturnVals(changer,
                                  changer._start_view_change_if_possible,
                                  compare_val_to=True)
    # At least one successful `_start_view_change_if_possible` call
    assert len(started_ok) > 0
    assert not changer._next_view_indications
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2)
def testInstChangeWithMoreReqLat(looper, setup):
    """Checks that excessive master latency drives the pool to view no 1."""
    nodes = setup.nodes
    for n in nodes:
        n.checkPerformance()
        too_high = getAllReturnVals(n.monitor,
                                    n.monitor.isMasterReqLatencyTooHigh)
        # Latency must have been flagged at least once
        assert any(too_high)

    view_check = partial(checkViewNoForNodes, nodes, 1)
    looper.run(eventually(view_check, retryWait=1, timeout=20))
def test_node_request_propagates(looper, setup, txnPoolNodeSet,
                                 sdk_wallet_client, sdk_pool_handle):
    """
    One of node lacks sufficient propagates
    """
    faulty_node, recv_client_requests = setup

    # Snapshot spy counters so all assertions below work on deltas
    old_count_recv_ppg = get_count(faulty_node, faulty_node.processPropagate)
    old_count_recv_req = get_count(faulty_node, faulty_node.processRequest)
    old_count_request_propagates = get_count(faulty_node,
                                             faulty_node.request_propagates)

    def sum_of_sent_batches():
        # Sum over both replicas of the faulty node
        return faulty_node.replicas[0].lastPrePrepareSeqNo + \
               faulty_node.replicas[1].lastPrePrepareSeqNo

    old_sum_of_sent_batches = sum_of_sent_batches()

    sent_reqs = 5
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              sent_reqs)

    assert get_count(faulty_node,
                     faulty_node.processPropagate) > old_count_recv_ppg
    if recv_client_requests:
        assert get_count(faulty_node,
                         faulty_node.processRequest) > old_count_recv_req
    else:
        assert get_count(faulty_node,
                         faulty_node.processRequest) == old_count_recv_req

    # Attempt to request PROPAGATEs was made as many number of times as the
    # number of sent batches in both replicas since both replicas
    # independently request PROPAGATEs
    assert get_count(faulty_node, faulty_node.request_propagates) - \
           old_count_request_propagates == (sum_of_sent_batches() -
                                            old_sum_of_sent_batches)

    requested_propagate_counts = getAllReturnVals(
        faulty_node, faulty_node.request_propagates)

    # The last attempt to request PROPAGATEs was not successful
    # (spy return values are ordered newest-first, so index 0 is the last call)
    assert requested_propagate_counts[0] == 0
    # The first attempt to request PROPAGATEs was successful as PROPAGATEs
    # were requested for all nodes
    assert requested_propagate_counts[1] == sent_reqs

    faulty_node.nodeIbStasher.reset_delays_and_process_delayeds()
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle, num_reqs=4)
def testInstChangeWithMoreReqLat(looper, setup):
    """Too-high master latency must eventually bump the view number by one."""
    # TODO for now, view_change procedure can take more that 15 minutes
    # (5 minutes for catchup and 10 minutes for primary's answer).
    # Therefore, view_change triggering by max latency now is not indicative.
    for node in setup.nodes:
        node.checkPerformance()
        flagged = getAllReturnVals(node.monitor,
                                   node.monitor.isMasterReqLatencyTooHigh)
        assert any(flagged)

    waitForViewChange(looper, setup.nodes,
                      expectedViewNo=setup.old_view_no + 1)
def testInstChangeWithMoreReqLat(looper, setup):
    """A view change to `old_view_no + 1` must follow high master latency."""
    # TODO for now, view_change procedure can take more that 15 minutes
    # (5 minutes for catchup and 10 minutes for primary's answer).
    # Therefore, view_change triggering by max latency now is not indicative.
    nodes = setup.nodes
    target_view_no = setup.old_view_no + 1
    for node in nodes:
        node.checkPerformance()
        assert any(getAllReturnVals(node.monitor,
                                    node.monitor.isMasterReqLatencyTooHigh))
    waitForViewChange(looper, nodes, expectedViewNo=target_view_no)
def setup(nodeSet, looper):
    """Prepare the pool for view-change-timeout tests.

    Lowers every node's view change timeout and records, per node, how many
    times `_check_view_change_completed` has been called and how many times
    it returned True, so tests can assert on the deltas.
    """
    m_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    initial_view_no = waitForViewChange(looper, nodeSet)
    # Setting view change timeout to low value to make test pass quicker
    for node in nodeSet:
        node._view_change_timeout = view_change_timeout

    times = {
        node.name: {
            'called': get_count(node, node._check_view_change_completed),
            'returned_true': len(getAllReturnVals(
                node, node._check_view_change_completed,
                compare_val_to=True)),
        }
        for node in nodeSet
    }
    return m_primary_node, initial_view_no, times
def test_old_non_primary_restart_after_view_change(new_node_in_correct_view,
                                                   looper, txnPoolNodeSet,
                                                   tdir, allPluginsPath,
                                                   tconf, sdk_pool_handle,
                                                   sdk_wallet_client):
    """
    An existing non-primary node crashes and then view change happens,
    the crashed node comes back up after view change
    """
    node_to_stop = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    old_view_no = node_to_stop.viewNo

    # Stop non-primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_stop, stopNode=True)
    looper.removeProdable(node_to_stop)
    remaining_nodes = list(set(txnPoolNodeSet) - {node_to_stop})

    # Send some requests before view change
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)
    ensure_view_change(looper, remaining_nodes,
                       custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper, remaining_nodes)
    # Send some requests after view change
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    restarted_node = start_stopped_node(node_to_stop, looper, tconf, tdir,
                                        allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [restarted_node]
    looper.run(
        eventually(checkViewNoForNodes, txnPoolNodeSet,
                   old_view_no + 1, timeout=30))

    # The restarted node must have moved to the new view on its own
    assert len(
        getAllReturnVals(
            restarted_node.view_changer,
            restarted_node.view_changer._start_view_change_if_possible,
            compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    assert not restarted_node.view_changer._next_view_indications
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client, tdir, tconf, allPluginsPath): """ View change occurs when master's primary is disconnected """ # Setup nodes = txnPoolNodeSet old_view_no = checkViewNoForNodes(nodes) old_pr_node = get_master_primary_node(nodes) # Stop primary disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, old_pr_node, stopNode=True) looper.removeProdable(old_pr_node) remaining_nodes = list(set(nodes) - {old_pr_node}) # Sometimes it takes time for nodes to detect disconnection ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20) looper.runFor(tconf.ToleratePrimaryDisconnection + 2) # Give some time to detect disconnection and then verify that view has # changed and new primary has been elected waitForViewChange(looper, remaining_nodes, old_view_no + 1) ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes) new_pr_node = get_master_primary_node(remaining_nodes) assert old_pr_node != new_pr_node sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 5) # Check if old primary can join the pool and still functions old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir, allPluginsPath) txnPoolNodeSet = remaining_nodes + [old_pr_node] looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet, old_view_no + 1, timeout=tconf.VIEW_CHANGE_TIMEOUT)) assert len(getAllReturnVals(old_pr_node.view_changer, old_pr_node.view_changer._start_view_change_if_possible, compare_val_to=True)) > 0 ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) assert not old_pr_node.view_changer._next_view_indications
def test_view_change_retry_by_timeout(
        txnPoolNodeSet, looper, setup, sdk_pool_handle, sdk_wallet_client):
    """
    Verifies that a view change is restarted if it is not completed in time
    """
    m_primary_node, initial_view_no, timeout_callback_stats = setup

    delay_view_change_done_msg(txnPoolNodeSet)

    start_view_change(txnPoolNodeSet, initial_view_no + 1)
    # First view change should fail, because of delayed ViewChangeDone
    # messages. This then leads to new view change that we need.
    with pytest.raises(AssertionError):
        ensureElectionsDone(looper=looper,
                            nodes=txnPoolNodeSet,
                            customTimeout=view_change_timeout + 2)

    # Resetting delays to let second view change go well
    reset_delays_and_process_delayeds(txnPoolNodeSet)

    # This view change should be completed with no problems
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name

    # The timeout method was called one time
    for node in txnPoolNodeSet:
        assert get_count(node, node._check_view_change_completed) - \
               timeout_callback_stats[node.name]['called'] == 1
        assert len(getAllReturnVals(node, node._check_view_change_completed,
                                    compare_val_to=True)) - \
               timeout_callback_stats[node.name]['returned_true'] == 1

    # 2 view changes have been initiated
    for node in txnPoolNodeSet:
        assert node.viewNo - initial_view_no == 2

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
def test_node_request_propagates(looper, setup, txnPoolNodeSet, client1,
                                 wallet1, client1Connected, request):
    """
    One of node lacks sufficient propagates
    """
    faulty_node, recv_client_requests = setup

    # Snapshot spy counters so all assertions below work on deltas
    old_count_recv_ppg = get_count(faulty_node, faulty_node.processPropagate)
    old_count_recv_req = get_count(faulty_node, faulty_node.processRequest)
    old_count_request_propagates = get_count(faulty_node,
                                             faulty_node.request_propagates)

    sent_reqs = 5
    send_reqs_to_nodes_and_verify_all_replies(looper,
                                              wallet1,
                                              client1,
                                              sent_reqs)

    assert get_count(faulty_node,
                     faulty_node.processPropagate) > old_count_recv_ppg
    if recv_client_requests:
        assert get_count(faulty_node,
                         faulty_node.processRequest) > old_count_recv_req
    else:
        assert get_count(faulty_node,
                         faulty_node.processRequest) == old_count_recv_req

    # Attempt to request PROPAGATEs was made twice, since the faulty node has 2 replicas
    assert get_count(
        faulty_node,
        faulty_node.request_propagates) - old_count_request_propagates == 2

    requested_propagate_counts = getAllReturnVals(
        faulty_node, faulty_node.request_propagates)

    # The last attempt to request PROPAGATEs was not successful
    # (spy return values are ordered newest-first, so index 0 is the last call)
    assert requested_propagate_counts[0] == 0
    # The first attempt to request PROPAGATEs was successful as PROPAGATEs
    # were requested for all nodes
    assert requested_propagate_counts[1] == sent_reqs

    faulty_node.nodeIbStasher.reset_delays_and_process_delayeds()
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1, 4)
def test_old_non_primary_restart_after_view_change(new_node_in_correct_view,
                                                   looper, txnPoolNodeSet,
                                                   tdir, allPluginsPath,
                                                   tconf, sdk_pool_handle,
                                                   sdk_wallet_client):
    """
    An existing non-primary node crashes and then view change happens,
    the crashed node comes back up after view change
    """
    node_to_stop = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    old_view_no = node_to_stop.viewNo

    # Stop non-primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_stop, stopNode=True)
    looper.removeProdable(node_to_stop)
    remaining_nodes = list(set(txnPoolNodeSet) - {node_to_stop})

    # Send some requests before view change
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)
    ensure_view_change(looper, remaining_nodes,
                       custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper, remaining_nodes)
    # Send some requests after view change
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    restarted_node = start_stopped_node(node_to_stop, looper, tconf, tdir,
                                        allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [restarted_node]
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet,
                          old_view_no + 1, timeout=30))

    # The restarted node must have moved to the new view on its own
    assert len(getAllReturnVals(
        restarted_node.view_changer,
        restarted_node.view_changer._start_view_change_if_possible,
        compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    assert not restarted_node.view_changer._next_view_indications
def test_view_change_timeout(nodeSet, looper, up, setup, wallet1, client1):
    """
    Check view change restarted if it is not completed in time
    """
    m_primary_node, initial_view_no, times = setup

    delay_view_change_msg(nodeSet)

    start_view_change(nodeSet, initial_view_no + 1)
    # First view change should fail, because of delayed
    # instance change messages. This then leads to new view change that we
    # need.
    with pytest.raises(AssertionError):
        ensureElectionsDone(looper=looper, nodes=nodeSet, customTimeout=10)

    # Resetting delays to let second view change go well
    reset_delays_and_process_delayeds(nodeSet)

    # This view change should be completed with no problems
    ensureElectionsDone(looper=looper, nodes=nodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=nodeSet)
    new_m_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    assert m_primary_node.name != new_m_primary_node.name

    # The timeout method has been called at least once
    for node in nodeSet:
        assert get_count(
            node, node._check_view_change_completed) > times[node.name]['called']
        assert len(
            getAllReturnVals(
                node,
                node._check_view_change_completed,
                compare_val_to=True)) > times[node.name]['returned_true']

    # Multiple view changes have been initiated
    for node in nodeSet:
        assert (node.viewNo - initial_view_no) > 1

    ensure_pool_functional(looper, nodeSet, wallet1, client1)
def chk4():
    """Check that the stashed-then-processed Ordered batches include the
    expected `delay_batches` count."""
    processed_counts = getAllReturnVals(slow_node,
                                        slow_node.processStashedOrderedReqs)
    # Some COMMITs were ordered but stashed and they were processed
    assert delay_batches in processed_counts
def test_reverted_unordered(txnPoolNodeSet, looper, sdk_pool_handle,
                            sdk_wallet_client):
    """
    Before starting catchup, revert any uncommitted changes to state and
    ledger. This is to avoid any re-application of requests that were
    ordered but stashed.

    Example scenario::

        prepared (1, 4)
        startViewChange
        start_catchup
        ...
        committed and send Ordered (1, 2)
        ...
        preLedgerCatchUp
        force_process_ordered, take out (1,2) and stash (1, 2)
        now process stashed Ordered(1,2), its requests will be applied again

    Simulation: Delay COMMITs to a node so that it can not order requests
    but has prepared them. Then trigger a view change and make sure the
    slow node has not ordered same number of requests as others but has
    prepared so it can order when it receives COMMITs while view change is
    in progress. The slow node should revert unordered batches and but it
    should eventually process the ordered requests, so delay LEDGER_STATUS
    too so catchup is delayed
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    # Delay COMMITs on instance 0 so slow_node prepares but cannot order
    slow_node.nodeIbStasher.delay(cDelay(120, 0))
    sent_batches = 5
    sdk_send_batches_of_random_and_check(looper,
                                         txnPoolNodeSet,
                                         sdk_pool_handle,
                                         sdk_wallet_client,
                                         2 * sent_batches,
                                         sent_batches)

    # Fast nodes have same last ordered and same data
    last_ordered = [n.master_last_ordered_3PC for n in fast_nodes]
    assert check_if_all_equal_in_list(last_ordered)
    ensure_all_nodes_have_same_data(looper, fast_nodes)

    # Slow nodes have different last ordered than fast nodes
    assert last_ordered[0] != slow_node.master_last_ordered_3PC

    # Delay LEDGER_STATUS so catchup starts late
    slow_node.nodeIbStasher.delay(lsDelay(100))
    slow_node.nodeIbStasher.delay(msg_rep_delay(100))

    # slow_node has not reverted batches yet
    assert sent_batches not in getAllReturnVals(
        slow_node.master_replica,
        slow_node.master_replica.revert_unordered_batches)

    ensure_view_change(looper, txnPoolNodeSet)

    def chk1():
        # slow_node reverted all batches
        rv = getAllReturnVals(
            slow_node.master_replica,
            slow_node.master_replica.revert_unordered_batches)
        assert sent_batches in rv

    looper.run(eventually(chk1, retryWait=1))

    # After the view change slow_node has prepared same requests as the fast
    # nodes have ordered
    assert last_ordered[0] == \
        slow_node.master_replica.last_prepared_before_view_change

    # Deliver COMMITs
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(COMMIT)

    def chk2():
        # slow_node orders all requests as others have
        assert last_ordered[0] == slow_node.master_last_ordered_3PC

    looper.run(eventually(chk2, retryWait=1))

    # Deliver LEDGER_STATUS so catchup can complete
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(LEDGER_STATUS)
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(MESSAGE_RESPONSE)

    # Ensure all nodes have same data
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    def chk3():
        # slow_node processed stashed Ordered requests successfully
        rv = getAllReturnVals(slow_node,
                              slow_node.processStashedOrderedReqs)
        assert sent_batches in rv

    looper.run(eventually(chk3, retryWait=1))

    # Ensure pool is functional
    sdk_send_batches_of_random_and_check(looper,
                                         txnPoolNodeSet,
                                         sdk_pool_handle,
                                         sdk_wallet_client,
                                         10,
                                         2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def chk1():
    """Check that slow_node reverted all the unordered batches."""
    replica = slow_node.master_replica
    reverted = getAllReturnVals(replica, replica.revert_unordered_batches)
    assert sent_batches in reverted
def count_requested_preprepare_resp(node):
    """Return how many times the node's master replica requested a
    PRE-PREPARE (i.e. `_request_pre_prepare_for_prepare` returned True)."""
    master = node.master_replica
    successful = getAllReturnVals(master,
                                  master._request_pre_prepare_for_prepare,
                                  compare_val_to=True)
    return len(successful)
def chk3():
    """Check that slow_node processed its stashed Ordered requests."""
    processed = getAllReturnVals(slow_node,
                                 slow_node.processStashedOrderedReqs)
    assert sent_batches in processed
def caught_up_for_current_view_count():
    """Number of times `caught_up_for_current_view` returned True on bad_node."""
    true_returns = getAllReturnVals(bad_node,
                                    bad_node.caught_up_for_current_view,
                                    compare_val_to=True)
    return len(true_returns)
def is_catchup_not_needed_count():
    """Number of times `is_catchup_needed` returned False on bad_node."""
    false_returns = getAllReturnVals(bad_node,
                                     bad_node.is_catchup_needed,
                                     compare_val_to=False)
    return len(false_returns)
def is_catchup_needed_count():
    """Number of times `is_catchup_needed` returned True on bad_node."""
    true_returns = getAllReturnVals(bad_node,
                                    bad_node.is_catchup_needed,
                                    compare_val_to=True)
    return len(true_returns)
def has_ordered_till_last_prepared_certificate_count():
    """Number of times `has_ordered_till_last_prepared_certificate`
    returned True on bad_node."""
    true_returns = getAllReturnVals(
        bad_node,
        bad_node.has_ordered_till_last_prepared_certificate,
        compare_val_to=True)
    return len(true_returns)
def _check_view_change_completed_true(node):
    """Number of times `_check_view_change_completed` returned True
    (i.e. a view-change timeout was detected) on the given node."""
    timeout_detections = getAllReturnVals(node,
                                          node._check_view_change_completed,
                                          compare_val_to=True)
    return len(timeout_detections)
def chk1():
    """Check that slow_node reverted all the unordered batches."""
    master = slow_node.master_replica
    revert_counts = getAllReturnVals(master,
                                     master.revert_unordered_batches)
    assert sent_batches in revert_counts