def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1,
                                               client1, client1Connected, tconf):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    viewNoBefore = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    stopNodes([old_pr_node], looper)
    looper.removeProdable(old_pr_node)
    remainingNodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remainingNodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remainingNodes, viewNoBefore + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remainingNodes)
    new_pr_node = get_master_primary_node(remainingNodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

def test_propagate_primary_after_primary_restart_view_0(
        looper, txnPoolNodeSet, tconf, sdk_pool_handle, sdk_wallet_steward,
        tdir, allPluginsPath):
    """
    Delay instance change msgs to prevent view change during primary restart
    to test propagate primary for the primary node.
    ppSeqNo should be > 0 to be able to check that propagate primary restores
    all indices correctly in the case viewNo == 0.
    """
    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_steward, sdk_pool_handle)

    old_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert old_ppseqno > 0

    old_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    old_primary = get_master_primary_node(txnPoolNodeSet)

    delay_instance_change(txnPoolNodeSet, IC_DELAY_SEC)

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_primary, stopNode=True)

    looper.removeProdable(old_primary)

    logger.info("Restart node {}".format(old_primary))

    restartedNode = start_stopped_node(old_primary, looper, tconf, tdir,
                                       allPluginsPath,
                                       delay_instance_change_msgs=False)
    idx = [i for i, n in enumerate(txnPoolNodeSet)
           if n.name == restartedNode.name][0]
    txnPoolNodeSet[idx] = restartedNode

    restartedNode.nodeIbStasher.delay(icDelay(IC_DELAY_SEC))

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    new_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    assert new_viewNo == old_viewNo

    new_primary = get_master_primary_node(txnPoolNodeSet)
    assert new_primary.name == old_primary.name

    # check that ppSeqNo is still the same across the pool
    _get_ppseqno(txnPoolNodeSet)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_steward, sdk_pool_handle)

    new_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert new_ppseqno > old_ppseqno

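# NOTE: `_get_ppseqno` is used by the propagate-primary tests but is not
# defined in this collection. A minimal sketch, assuming it only needs to
# return a single pool-wide ppSeqNo and assert that all nodes agree on it
# (which is how the tests use it); the real helper may differ.
def _get_ppseqno(nodes):
    # Collect the ppSeqNo of the last 3PC batch ordered by every master
    # replica and require the whole pool to agree on it.
    ppseqnos = set()
    for node in nodes:
        _, pp_seq_no = node.master_replica.last_ordered_3pc
        ppseqnos.add(pp_seq_no)
    assert len(ppseqnos) == 1, "nodes disagree on ppSeqNo: {}".format(ppseqnos)
    return ppseqnos.pop()
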
def test_view_change_timeout(nodeSet, looper, up, wallet1, client1):
    """
    Check view change restarted if it is not completed in time
    """

    m_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    initial_view_no = waitForViewChange(looper, nodeSet)
    # Setting view change timeout to low value to make test pass quicker
    for node in nodeSet:
        node._view_change_timeout = 5

    # Delaying view change messages to make first view change fail
    # due to timeout
    for node in nodeSet:
        node.nodeIbStasher.delay(vcd_delay(delay=50))

    # Delaying PRE-PREPARE messages from nodes and
    # sending request to force view change
    # for i in range(3):
    #     delayNonPrimaries(nodeSet, instId=i, delay=10)
    # sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 4)

    times = {}
    for node in nodeSet:
        times[node.name] = {
            'called': get_count(node, node._check_view_change_completed),
            'returned_true': len(getAllReturnVals(
                node, node._check_view_change_completed, compare_val_to=True))
        }

    for node in nodeSet:
        node.startViewChange(initial_view_no + 1)

    # First view change should fail, because of delayed
    # instance change messages. This then leads to the new view change that
    # we need.
    with pytest.raises(AssertionError):
        ensureElectionsDone(looper=looper, nodes=nodeSet, customTimeout=10)

    # Resetting delays to let second view change go well
    reset_delays_and_process_delayeds(nodeSet)

    # This view change should be completed with no problems
    ensureElectionsDone(looper=looper, nodes=nodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=nodeSet)
    new_m_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    assert m_primary_node.name != new_m_primary_node.name

    # The timeout method has been called at least once
    for node in nodeSet:
        assert get_count(node, node._check_view_change_completed) > \
            times[node.name]['called']
        assert len(getAllReturnVals(node, node._check_view_change_completed,
                                    compare_val_to=True)) > \
            times[node.name]['returned_true']

    # Multiple view changes have been initiated
    for node in nodeSet:
        assert (node.viewNo - initial_view_no) > 1

    ensure_pool_functional(looper, nodeSet, wallet1, client1)

def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client, tdir,
                                                     tconf, allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)
    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir,
                                     allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(eventually(checkViewNoForNodes,
                          txnPoolNodeSet, old_view_no + 1,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))

    # After the node catches up, it sets view_no from the audit ledger and
    # does not need to start another view change
    assert len(getAllReturnVals(old_pr_node.view_changer,
                                old_pr_node.view_changer.start_view_change,
                                compare_val_to=True)) == 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert not old_pr_node.view_changer._next_view_indications

def test_view_change_on_quorum_of_master_degraded(txnPoolNodeSet, looper,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward, viewNo):
    """
    Node will change view even though it does not find the master to be
    degraded when a quorum of nodes agree that master performance degraded
    """

    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    pr = getPrimaryReplica(txnPoolNodeSet, 0)
    reluctantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = node_sent_instance_changes_count(n)

    # Node reluctant to change view, never says master is degraded
    reluctantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, reluctantNode.monitor)

    backup_replica = txnPoolNodeSet[0].replicas[1]
    backup_last_ordered_before = backup_replica.last_ordered_3pc
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 4)
    # make sure that backups also ordered at least 1 batch to be able to
    # track performance degradation
    looper.run(eventually(lambda: assertExp(
        backup_replica.last_ordered_3pc > backup_last_ordered_before)))

    for n in txnPoolNodeSet:
        n.checkPerformance()

    # Check that view change happened for all nodes
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    # All nodes except the reluctant node should have sent a view change and
    # thus must have called `sendInstanceChange`
    for n in txnPoolNodeSet:
        if n.name != reluctantNode.name:
            assert node_sent_instance_changes_count(n) > sentInstChanges.get(n.name, 0)
        else:
            assert node_sent_instance_changes_count(n) == sentInstChanges.get(n.name, 0)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

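# NOTE: `node_sent_instance_changes_count` is called above but not defined in
# this collection. A minimal sketch, assuming it just counts sent instance
# changes the same way the other variants of this test in this collection do
# via the spy log; the real helper may read a different counter.
def node_sent_instance_changes_count(node):
    return node.view_changer.spylog.count(ViewChanger.sendInstanceChange.__name__)
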
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1,
                                               client1, client1Connected,
                                               tconf, tdirWithPoolTxns,
                                               allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)
    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf,
                                     tdirWithPoolTxns, allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(eventually(checkViewNoForNodes,
                          txnPoolNodeSet, old_view_no + 1, timeout=10))

    assert len(getAllReturnVals(old_pr_node,
                                old_pr_node._start_view_change_if_possible,
                                compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert not old_pr_node._next_view_indications

def test_propagate_primary_after_primary_restart_view_1(
        looper, txnPoolNodeSet, tconf, sdk_pool_handle, sdk_wallet_steward,
        tdir, allPluginsPath):
    """
    Delay instance change msgs to prevent view change during primary restart
    to test propagate primary for the primary node.
    ppSeqNo should be > 0 to be able to check that propagate primary restores
    all indices correctly in the case viewNo > 0.
    """
    ensure_view_change(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, expectedViewNo=1)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_steward, sdk_pool_handle)

    old_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert old_ppseqno > 0

    old_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    old_primary = get_master_primary_node(txnPoolNodeSet)

    delay_instance_change(txnPoolNodeSet, IC_DELAY_SEC)

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_primary, stopNode=True)

    looper.removeProdable(old_primary)

    logger.info("Restart node {}".format(old_primary))

    restartedNode = start_stopped_node(old_primary, looper, tconf, tdir,
                                       allPluginsPath,
                                       delay_instance_change_msgs=False)
    idx = [i for i, n in enumerate(txnPoolNodeSet)
           if n.name == restartedNode.name][0]
    txnPoolNodeSet[idx] = restartedNode

    restartedNode.nodeIbStasher.delay(icDelay(IC_DELAY_SEC))

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    new_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    assert new_viewNo == old_viewNo

    new_primary = get_master_primary_node(txnPoolNodeSet)
    assert new_primary.name == old_primary.name

    # check that ppSeqNo is still the same across the pool
    _get_ppseqno(txnPoolNodeSet)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_steward, sdk_pool_handle)

    new_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert new_ppseqno > old_ppseqno

def test_view_change_on_quorum_of_master_degraded(txnPoolNodeSet, looper,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward, viewNo):
    """
    Node will change view even though it does not find the master to be
    degraded when a quorum of nodes agree that master performance degraded
    """

    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    pr = getPrimaryReplica(txnPoolNodeSet, 0)
    reluctantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = ViewChanger.sendInstanceChange.__name__
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = n.view_changer.spylog.count(instChngMethodName)

    # Node reluctant to change view, never says master is degraded
    reluctantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, reluctantNode.monitor)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 4)

    for n in txnPoolNodeSet:
        n.checkPerformance()

    # Check that view change happened for all nodes
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    # All nodes except the reluctant node should have sent a view change and
    # thus must have called `sendInstanceChange`
    for n in txnPoolNodeSet:
        if n.name != reluctantNode.name:
            assert n.view_changer.spylog.count(instChngMethodName) > \
                sentInstChanges.get(n.name, 0)
        else:
            assert n.view_changer.spylog.count(instChngMethodName) == \
                sentInstChanges.get(n.name, 0)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

def test_removed_replica_restored_on_view_change(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        tconf, tdir, allPluginsPath, chkFreqPatched, view_change):
    """
    1. Remove a replica on some node which is not master primary
    2. Reconnect the node which was master primary so far
    3. Check that nodes and replicas are correctly restored
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    node = get_last_master_non_primary_node(txnPoolNodeSet)
    start_replicas_count = node.replicas.num_replicas
    instance_id = start_replicas_count - 1

    node.replicas.remove_replica(instance_id)
    check_replica_removed(node, start_replicas_count, instance_id)

    # trigger view change on all nodes
    master_primary = get_master_primary_node(txnPoolNodeSet)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            master_primary)
    txnPoolNodeSet.remove(master_primary)
    looper.removeProdable(master_primary)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)
    restarted_node = start_stopped_node(master_primary, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet.append(restarted_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.NEW_VIEW_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    assert start_replicas_count == node.replicas.num_replicas
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

def test_instance_change_before_vc(looper, txnPoolNodeSet, tconf,
                                   sdk_pool_handle, sdk_wallet_steward):
    master_node = get_master_primary_node(txnPoolNodeSet)
    old_view = master_node.viewNo
    expected_view_no = old_view + 1

    panic_node = txnPoolNodeSet[-1]
    panic_node.view_changer.on_master_degradation()

    def has_inst_chng_in_validator_info():
        for node in txnPoolNodeSet:
            latest_info = node._info_tool.info
            ic_queue = latest_info['Node_info']['View_change_status']['IC_queue']
            assert expected_view_no in ic_queue
            assert ic_queue[expected_view_no]["Voters"][panic_node.name]['reason'] == \
                Suspicions.PRIMARY_DEGRADED.code

    looper.run(eventually(has_inst_chng_in_validator_info))

    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()

    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet,
                          expected_view_no, retryWait=1,
                          timeout=tconf.NEW_VIEW_TIMEOUT))
    waitNodeDataEquality(looper, master_node, *txnPoolNodeSet)

    def is_inst_chngs_cleared():
        for node in txnPoolNodeSet:
            latest_info = node._info_tool.info
            assert latest_info['Node_info']['View_change_status']['IC_queue'] == {}

    looper.run(eventually(is_inst_chngs_cleared))

def split_nodes(nodes):
    primary_node = get_master_primary_node(nodes)
    slow_node = getNonPrimaryReplicas(nodes, 0)[-1].node
    other_nodes = [n for n in nodes if n != slow_node]
    other_non_primary_nodes = [n for n in nodes
                               if n not in (slow_node, primary_node)]
    return slow_node, other_nodes, primary_node, other_non_primary_nodes

def test_view_change_retry_by_timeout(
        txnPoolNodeSet, looper, tconf, setup, sdk_pool_handle, sdk_wallet_client):
    """
    Verifies that a view change is restarted if it is not completed in time
    """
    m_primary_node, initial_view_no, timeout_callback_stats = setup
    stashers = [n.nodeIbStasher for n in txnPoolNodeSet]

    with delay_rules(stashers, vcd_delay()):
        start_view_change(txnPoolNodeSet, initial_view_no + 1)
        # First view change should fail, because of delayed ViewChangeDone
        # messages. This then leads to new view change that we need.
        with pytest.raises(AssertionError):
            ensureElectionsDone(looper=looper,
                                nodes=txnPoolNodeSet,
                                customTimeout=1.5 * VIEW_CHANGE_TIMEOUT)

    # Now as ViewChangeDone messages are unblocked view changes should finish successfully
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name

    # The timeout method was called one time
    check_watchdog_called_expected_times(txnPoolNodeSet, timeout_callback_stats, 1)

    # 2 view changes have been initiated
    for node in txnPoolNodeSet:
        assert node.viewNo - initial_view_no == 2

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client, sdk_pool_handle)

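# NOTE: `start_view_change` is used by the retry-by-timeout tests but not
# defined in this collection. A minimal sketch, assuming it simply asks every
# node to move to the given view, mirroring the explicit per-node call in the
# old test_view_change_timeout above; newer code may route this through
# node.view_changer instead.
def start_view_change(nodes, next_view_no):
    for node in nodes:
        node.startViewChange(next_view_no)
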
def test_number_txns_in_catchup_and_vc_queue_valid(looper, txnPoolNodeSet,
                                                   tconf, sdk_pool_handle,
                                                   sdk_wallet_steward):
    num_txns = 5
    master_node = get_master_primary_node(txnPoolNodeSet)
    old_view = master_node.viewNo
    expected_view_no = old_view + 1
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            master_node, stopNode=False)
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet[1:],
                          expected_view_no, retryWait=1,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))
    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, num_txns)
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, master_node)
    waitNodeDataEquality(looper, master_node, *txnPoolNodeSet[-1:])
    latest_info = master_node._info_tool.info

    assert latest_info['Node_info']['Catchup_status'][
        'Number_txns_in_catchup'][1] == num_txns
    assert latest_info['Node_info']['View_change_status'][
        'View_No'] == expected_view_no
    node_names = [n.name for n in txnPoolNodeSet[1:]]
    for node_name in node_names:
        assert latest_info['Node_info']['View_change_status']['VCDone_queue'][
            node_name][0] == master_node.master_primary_name
        assert latest_info['Node_info']['View_change_status']['VCDone_queue'][
            node_name][1]
    assert latest_info['Node_info']['View_change_status'][
        'Last_complete_view_no'] == expected_view_no

def test_removed_replica_restored_on_view_change(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        tconf, tdir, allPluginsPath, chkFreqPatched, view_change):
    """
    1. Remove a replica on some node which is not master primary
    2. Reconnect the node which was master primary so far
    3. Check that nodes and replicas are correctly restored
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    node = get_last_master_non_primary_node(txnPoolNodeSet)
    start_replicas_count = node.replicas.num_replicas
    instance_id = start_replicas_count - 1

    node.replicas.remove_replica(instance_id)
    check_replica_removed(node, start_replicas_count, instance_id)

    # trigger view change on all nodes
    master_primary = get_master_primary_node(txnPoolNodeSet)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            master_primary)
    txnPoolNodeSet.remove(master_primary)
    looper.removeProdable(master_primary)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)
    restarted_node = start_stopped_node(master_primary, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet.append(restarted_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    assert start_replicas_count == node.replicas.num_replicas

def test_view_change_retry_by_timeout(txnPoolNodeSet, looper, tconf, setup,
                                      sdk_pool_handle, sdk_wallet_client):
    """
    Verifies that a view change is restarted if it is not completed in time
    """
    m_primary_node, initial_view_no, timeout_callback_stats = setup
    stashers = [n.nodeIbStasher for n in txnPoolNodeSet]

    with delay_rules(stashers, nv_delay()):
        start_view_change(txnPoolNodeSet, initial_view_no + 1)
        # First view change should fail, because of delayed NEW_VIEW
        # messages. This then leads to new view change that we need.
        with pytest.raises(AssertionError):
            ensureElectionsDone(looper=looper,
                                nodes=txnPoolNodeSet,
                                customTimeout=1.5 * NEW_VIEW_TIMEOUT)

    # Now as NEW_VIEW messages are unblocked view changes should finish successfully
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name

    # The timeout method was called one time
    check_watchdog_called_expected_times(txnPoolNodeSet, timeout_callback_stats, 1)

    # 2 view changes have been initiated
    for node in txnPoolNodeSet:
        assert node.viewNo - initial_view_no == 2

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client, sdk_pool_handle)

def test_number_txns_in_catchup_and_vc_queue_valid(looper, txnPoolNodeSet,
                                                   tconf, sdk_pool_handle,
                                                   sdk_wallet_steward, tdir,
                                                   allPluginsPath):
    num_txns = 5
    master_node = get_master_primary_node(txnPoolNodeSet)
    master_node_index = txnPoolNodeSet.index(master_node)
    other_nodes = txnPoolNodeSet.copy()
    other_nodes.remove(master_node)
    old_view = master_node.viewNo
    expected_view_no = old_view + 1
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            master_node, stopNode=True)
    looper.removeProdable(master_node)
    looper.run(eventually(checkViewNoForNodes, other_nodes, expected_view_no,
                          retryWait=1, timeout=tconf.NEW_VIEW_TIMEOUT))
    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_send_random_and_check(looper, other_nodes, sdk_pool_handle,
                              sdk_wallet_steward, num_txns)

    master_node = start_stopped_node(master_node, looper, tconf,
                                     tdir, allPluginsPath)
    txnPoolNodeSet[master_node_index] = master_node
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, master_node, *txnPoolNodeSet[-1:],
                         exclude_from_check=['check_last_ordered_3pc_backup'])

    latest_info = master_node._info_tool.info
    assert latest_info['Node_info']['Catchup_status']['Number_txns_in_catchup'][1] == num_txns
    assert latest_info['Node_info']['View_change_status']['View_No'] == expected_view_no
    for n in other_nodes:
        assert n._info_tool.info['Node_info']['View_change_status'][
            'Last_complete_view_no'] == expected_view_no

def test_primary_receives_delayed_prepares(looper, txnPoolNodeSet,
                                           sdk_wallet_client,
                                           sdk_pool_handle):
    """
    Primary gets all PREPAREs after COMMITs
    """
    delay = 50
    primary_node = get_master_primary_node(txnPoolNodeSet)
    other_nodes = [n for n in txnPoolNodeSet if n != primary_node]
    primary_node.nodeIbStasher.delay(pDelay(delay, 0))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, count=10)

    for node in other_nodes:
        assert node.master_replica.prePrepares
        assert node.master_replica.prepares
        assert node.master_replica.commits

    assert primary_node.master_replica.sentPrePrepares
    assert not primary_node.master_replica.prepares
    assert primary_node.master_replica.commits

def test_view_change_on_quorum_of_master_degraded(txnPoolNodeSet, looper,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward, viewNo):
    """
    Node will change view even though it does not find the master to be
    degraded when a quorum of nodes agree that master performance degraded
    """

    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    pr = getPrimaryReplica(txnPoolNodeSet, 0)
    reluctantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = ViewChanger.sendInstanceChange.__name__
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = n.view_changer.spylog.count(instChngMethodName)

    # Node reluctant to change view, never says master is degraded
    reluctantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, reluctantNode.monitor)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 4)

    # Check that view change happened for all nodes
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    # All nodes except the reluctant node should have sent a view change and
    # thus must have called `sendInstanceChange`
    for n in txnPoolNodeSet:
        if n.name != reluctantNode.name:
            assert n.view_changer.spylog.count(instChngMethodName) > \
                sentInstChanges.get(n.name, 0)
        else:
            assert n.view_changer.spylog.count(instChngMethodName) == \
                sentInstChanges.get(n.name, 0)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

def test_view_change_on_performance_degraded(looper, nodeSet, up, viewNo,
                                             wallet1, client1):
    """
    Test that a view change is done when the performance of master goes down
    Send multiple requests from the client and delay some requests by master
    instance so that there is a view change. All nodes will agree that master
    performance degraded
    """
    old_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))

    simulate_slow_master(looper, nodeSet, wallet1, client1)
    waitForViewChange(looper, nodeSet, expectedViewNo=viewNo + 1)

    ensureElectionsDone(looper=looper, nodes=nodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=nodeSet)
    new_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    assert old_primary_node.name != new_primary_node.name

def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client, tdir,
                                                     tconf, allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """

    # Setup
    nodes = txnPoolNodeSet

    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)
    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir,
                                     allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(eventually(checkViewNoForNodes,
                          txnPoolNodeSet, old_view_no + 1,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))

    assert len(getAllReturnVals(old_pr_node.view_changer,
                                old_pr_node.view_changer._start_view_change_if_possible,
                                compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert not old_pr_node.view_changer._next_view_indications

def simulate_slow_master(looper, nodeSet, wallet, client, delay=10, num_reqs=4):
    m_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(nodeSet, 0, delay)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, num_reqs)
    return m_primary_node

def simulate_slow_master(looper, txnPoolNodeSet, sdk_pool_handle,
                         sdk_wallet_steward, delay=10, num_reqs=4):
    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, delay)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, num_reqs)
    return m_primary_node

def test_view_change_on_performance_degraded(looper, txnPoolNodeSet, viewNo, sdk_pool_handle, sdk_wallet_steward): """ Test that a view change is done when the performance of master goes down Send multiple requests from the client and delay some requests by master instance so that there is a view change. All nodes will agree that master performance degraded """ old_primary_node = get_master_primary_node(list(txnPoolNodeSet)) simulate_slow_master(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward) waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1) ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) new_primary_node = get_master_primary_node(list(txnPoolNodeSet)) assert old_primary_node.name != new_primary_node.name
def test_view_change_on_performance_degraded(looper, txnPoolNodeSet, viewNo, sdk_pool_handle, sdk_wallet_steward): """ Test that a view change is done when the performance of master goes down Send multiple requests from the client and delay some requests by master instance so that there is a view change. All nodes will agree that master performance degraded """ old_primary_node = get_master_primary_node(list(txnPoolNodeSet)) trigger_view_change(txnPoolNodeSet) waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1) ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet) ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet) new_primary_node = get_master_primary_node(list(txnPoolNodeSet)) assert old_primary_node.name != new_primary_node.name waitNodeDataEquality(looper, *txnPoolNodeSet)
def test_view_not_changed_when_short_disconnection(txnPoolNodeSet, looper,
                                                   sdk_pool_handle,
                                                   sdk_wallet_client, tdir,
                                                   tconf, allPluginsPath):
    """
    When primary is disconnected but not long enough to trigger the timeout,
    view change should not happen
    """
    pr_node = get_master_primary_node(txnPoolNodeSet)
    view_no = checkViewNoForNodes(txnPoolNodeSet)

    prp_inst_chg_calls = {
        node.name: node.spylog.count(node.propose_view_change.__name__)
        for node in txnPoolNodeSet if node != pr_node
    }

    recv_inst_chg_calls = {
        node.name: node.spylog.count(
            node.view_changer.process_instance_change_msg.__name__)
        for node in txnPoolNodeSet if node != pr_node
    }

    # Disconnect master's primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, pr_node,
                                            timeout=2)
    txnPoolNodeSet.remove(pr_node)
    looper.removeProdable(name=pr_node.name)

    timeout = min(tconf.ToleratePrimaryDisconnection - 1, 1)

    # Reconnect master's primary
    pr_node = start_stopped_node(pr_node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(pr_node)

    def chk2():
        # Schedule an instance change but do not send it
        # since primary joins again
        for node in txnPoolNodeSet:
            if node != pr_node:
                assert node.spylog.count(node.propose_view_change.__name__
                                         ) > prp_inst_chg_calls[node.name]
                assert node.view_changer.spylog.count(
                    node.view_changer.process_instance_change_msg.__name__) == \
                    recv_inst_chg_calls[node.name]

    looper.run(eventually(chk2, retryWait=.2, timeout=timeout + 1))

    assert checkViewNoForNodes(txnPoolNodeSet) == view_no

    # Send some requests and make sure the requests execute
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

def test_choose_ts_from_state(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward):
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 1)
    primary_node = get_master_primary_node(txnPoolNodeSet)
    expected_ts = get_utc_epoch() + 30
    req_handler = primary_node.get_req_handler(DOMAIN_LEDGER_ID)
    req_handler.ts_store.set(expected_ts, req_handler.state.headHash)
    primary_node.master_replica.last_accepted_pre_prepare_time = None
    reply = sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_steward, 1)[0][1]
    assert abs(expected_ts - int(get_txn_time(reply['result']))) < 3

def setup(nodeSet, looper):
    m_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    initial_view_no = waitForViewChange(looper, nodeSet)
    # Setting view change timeout to low value to make test pass quicker
    for node in nodeSet:
        node._view_change_timeout = view_change_timeout

    times = {}
    for node in nodeSet:
        times[node.name] = {
            'called': get_count(node, node._check_view_change_completed),
            'returned_true': len(getAllReturnVals(
                node, node._check_view_change_completed, compare_val_to=True))
        }
    return m_primary_node, initial_view_no, times

def malicious_setup(request, txnPoolNodeSet):
    primary_node = get_master_primary_node(txnPoolNodeSet)
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    bad_node = [n for n in other_nodes if n != primary_node][0]
    good_non_primary_node = [n for n in other_nodes
                             if n != slow_node and n != bad_node and
                             n != primary_node][0]

    if request.param == 'do_not_send':
        orig_method = bad_node.nodeMsgRouter.routes[MessageReq]

        def do_not_send(self, msg, frm):
            if msg.msg_type == PREPREPARE:
                return
            else:
                return orig_method(msg, frm)

        bad_node.nodeMsgRouter.routes[MessageReq] = types.MethodType(
            do_not_send, bad_node)
        return primary_node, bad_node, good_non_primary_node, slow_node, \
            other_nodes, do_not_send, orig_method

    if request.param == 'send_bad':
        orig_method = bad_node.nodeMsgRouter.routes[MessageReq]

        def send_bad(self, msg, frm):
            if msg.msg_type == PREPREPARE:
                resp = self.replicas[msg.params['instId']].getPrePrepare(
                    msg.params['viewNo'], msg.params['ppSeqNo'])
                resp = updateNamedTuple(resp, digest='11908ffq')
                self.sendToNodes(MessageRep(**{
                    f.MSG_TYPE.nm: msg.msg_type,
                    f.PARAMS.nm: msg.params,
                    f.MSG.nm: resp
                }), names=[frm, ])
            else:
                return orig_method(msg, frm)

        bad_node.nodeMsgRouter.routes[MessageReq] = types.MethodType(
            send_bad, bad_node)
        return primary_node, bad_node, good_non_primary_node, slow_node, \
            other_nodes, send_bad, orig_method

def test_view_change_retry_by_timeout(
        txnPoolNodeSet, looper, setup, sdk_pool_handle, sdk_wallet_client):
    """
    Verifies that a view change is restarted if it is not completed in time
    """
    m_primary_node, initial_view_no, timeout_callback_stats = setup

    delay_view_change_done_msg(txnPoolNodeSet)

    start_view_change(txnPoolNodeSet, initial_view_no + 1)
    # First view change should fail, because of delayed ViewChangeDone
    # messages. This then leads to new view change that we need.
    with pytest.raises(AssertionError):
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                            customTimeout=view_change_timeout + 2)

    # Resetting delays to let second view change go well
    reset_delays_and_process_delayeds(txnPoolNodeSet)

    # This view change should be completed with no problems
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name

    # The timeout method was called one time
    for node in txnPoolNodeSet:
        assert get_count(node, node._check_view_change_completed) - \
            timeout_callback_stats[node.name]['called'] == 1
        assert len(getAllReturnVals(node, node._check_view_change_completed,
                                    compare_val_to=True)) - \
            timeout_callback_stats[node.name]['returned_true'] == 1

    # 2 view changes have been initiated
    for node in txnPoolNodeSet:
        assert node.viewNo - initial_view_no == 2

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client, sdk_pool_handle)

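# NOTE: `delay_view_change_done_msg` is used above but not defined here. A
# minimal sketch, assuming it stashes incoming ViewChangeDone messages on
# every node (the newer variant of this test achieves the same with
# delay_rules(stashers, vcd_delay())); the real helper may take a delay value.
def delay_view_change_done_msg(nodes):
    for node in nodes:
        node.nodeIbStasher.delay(vcd_delay())
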
def test_primary_receives_delayed_prepares(looper, txnPoolNodeSet, client1,
                                           wallet1, client1Connected):
    """
    Primary gets all PREPAREs after COMMITs
    """
    delay = 50
    primary_node = get_master_primary_node(txnPoolNodeSet)
    other_nodes = [n for n in txnPoolNodeSet if n != primary_node]
    primary_node.nodeIbStasher.delay(pDelay(delay, 0))

    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 10)

    for node in other_nodes:
        assert node.master_replica.prePrepares
        assert node.master_replica.prepares
        assert node.master_replica.commits

    assert primary_node.master_replica.sentPrePrepares
    assert not primary_node.master_replica.prepares
    assert primary_node.master_replica.commits

def test_choose_ts_from_state(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward):
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 1)
    primary_node = get_master_primary_node(txnPoolNodeSet)
    expected_ts = get_utc_epoch() + 30
    req_handler = primary_node.write_manager.request_handlers[NYM][0]
    req_handler.database_manager.ts_store.set(expected_ts,
                                              req_handler.state.headHash)
    primary_node.master_replica._ordering_service.last_accepted_pre_prepare_time = None
    reply = sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_steward, 1)[0][1]
    assert abs(expected_ts - int(get_txn_time(reply['result']))) < 3

def test_view_change_timeout(nodeSet, looper, up, setup, wallet1, client1):
    """
    Check view change restarted if it is not completed in time
    """
    m_primary_node, initial_view_no, times = setup

    delay_view_change_msg(nodeSet)

    start_view_change(nodeSet, initial_view_no + 1)
    # First view change should fail, because of delayed
    # instance change messages. This then leads to new view change that we
    # need.
    with pytest.raises(AssertionError):
        ensureElectionsDone(looper=looper, nodes=nodeSet, customTimeout=10)

    # Resetting delays to let second view change go well
    reset_delays_and_process_delayeds(nodeSet)

    # This view change should be completed with no problems
    ensureElectionsDone(looper=looper, nodes=nodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=nodeSet)
    new_m_primary_node = get_master_primary_node(list(nodeSet.nodes.values()))
    assert m_primary_node.name != new_m_primary_node.name

    # The timeout method has been called at least once
    for node in nodeSet:
        assert get_count(
            node, node._check_view_change_completed) > times[node.name]['called']
        assert len(getAllReturnVals(
            node, node._check_view_change_completed,
            compare_val_to=True)) > times[node.name]['returned_true']

    # Multiple view changes have been initiated
    for node in nodeSet:
        assert (node.viewNo - initial_view_no) > 1

    ensure_pool_functional(looper, nodeSet, wallet1, client1)

def test_view_change_on_start(tconf, txnPoolNodeSet, looper,
                              sdk_pool_handle, sdk_wallet_client):
    """
    Do a view change without any requests
    """
    old_view_no = txnPoolNodeSet[0].viewNo
    master_primary = get_master_primary_node(txnPoolNodeSet)
    other_nodes = [n for n in txnPoolNodeSet if n != master_primary]
    delay_3pc = 10
    delay_3pc_messages(txnPoolNodeSet, 0, delay_3pc)
    sent_batches = 2
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             sent_batches * tconf.Max3PCBatchSize)

    def chk1():
        t_root, s_root = check_uncommitteds_equal(other_nodes)
        assert master_primary.domainLedger.uncommittedRootHash != t_root
        assert master_primary.states[DOMAIN_LEDGER_ID].headHash != s_root

    looper.run(eventually(chk1, retryWait=1))

    timeout = tconf.PerfCheckFreq + \
        waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))
    waitForViewChange(looper, txnPoolNodeSet, old_view_no + 1,
                      customTimeout=timeout)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    check_uncommitteds_equal(txnPoolNodeSet)

    reset_delays_and_process_delayeds(txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize,
                              add_delay_to_timeout=delay_3pc)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

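# NOTE: `check_uncommitteds_equal` is used above but not defined here. A
# minimal sketch, assuming it asserts that the given nodes share the same
# uncommitted domain-ledger root and uncommitted state head and returns that
# pair for further comparison; the real helper may check more ledgers.
def check_uncommitteds_equal(nodes):
    txn_roots = {n.domainLedger.uncommittedRootHash for n in nodes}
    state_roots = {n.states[DOMAIN_LEDGER_ID].headHash for n in nodes}
    assert len(txn_roots) == 1
    assert len(state_roots) == 1
    return txn_roots.pop(), state_roots.pop()
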
def setup(txnPoolNodeSet, looper):
    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    initial_view_no = waitForViewChange(looper, txnPoolNodeSet)
    timeout_callback_stats = _check_view_change_completed_stats(txnPoolNodeSet)
    return m_primary_node, initial_view_no, timeout_callback_stats

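# NOTE: `_check_view_change_completed_stats` is not defined in this
# collection. A minimal sketch, mirroring the inline bookkeeping done by the
# older setup() fixture above: per node, remember how often the
# _check_view_change_completed watchdog was called and how often it returned
# True, so the tests can assert on the deltas later.
def _check_view_change_completed_stats(nodes):
    return {
        node.name: {
            'called': get_count(node, node._check_view_change_completed),
            'returned_true': len(getAllReturnVals(
                node, node._check_view_change_completed, compare_val_to=True))
        }
        for node in nodes
    }
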
def disconnect_master_primary(nodes):
    pr_node = get_master_primary_node(nodes)
    for node in nodes:
        if node != pr_node:
            node.nodestack.getRemote(pr_node.nodestack.name).disconnect()
    return pr_node

def slow_nodes(node_set):
    return [get_master_primary_node(node_set),
            get_first_master_non_primary_node(node_set)]

def test_view_not_changed_when_short_disconnection(txnPoolNodeSet, looper,
                                                   sdk_pool_handle,
                                                   sdk_wallet_client, tconf):
    """
    When primary is disconnected but not long enough to trigger the timeout,
    view change should not happen
    """
    pr_node = get_master_primary_node(txnPoolNodeSet)
    view_no = checkViewNoForNodes(txnPoolNodeSet)

    lost_pr_calls = {node.name: node.spylog.count(
        node.lost_master_primary.__name__)
        for node in txnPoolNodeSet if node != pr_node}

    prp_inst_chg_calls = {node.name: node.spylog.count(
        node.propose_view_change.__name__)
        for node in txnPoolNodeSet if node != pr_node}

    recv_inst_chg_calls = {node.name: node.spylog.count(
        node.view_changer.process_instance_change_msg.__name__)
        for node in txnPoolNodeSet if node != pr_node}

    def chk1():
        # Check that non-primary nodes detects losing connection with
        # primary
        for node in txnPoolNodeSet:
            if node != pr_node:
                assert node.spylog.count(node.lost_master_primary.__name__) \
                    > lost_pr_calls[node.name]

    def chk2():
        # Schedule an instance change but do not send it
        # since primary joins again
        for node in txnPoolNodeSet:
            if node != pr_node:
                assert node.spylog.count(node.propose_view_change.__name__) \
                    > prp_inst_chg_calls[node.name]
                assert node.view_changer.spylog.count(
                    node.view_changer.process_instance_change_msg.__name__) \
                    == recv_inst_chg_calls[node.name]

    # Disconnect master's primary
    for node in txnPoolNodeSet:
        if node != pr_node:
            node.nodestack.getRemote(pr_node.nodestack.name).disconnect()

    timeout = min(tconf.ToleratePrimaryDisconnection - 1, 1)
    looper.run(eventually(chk1, retryWait=.2, timeout=timeout))

    # Reconnect master's primary
    for node in txnPoolNodeSet:
        if node != pr_node:
            node.nodestack.retryDisconnected()

    looper.run(eventually(chk2, retryWait=.2, timeout=timeout + 1))

    def chk3():
        # Check the view does not change
        with pytest.raises(AssertionError):
            assert checkViewNoForNodes(txnPoolNodeSet) == view_no + 1

    looper.run(eventually(chk3, retryWait=1, timeout=10))

    # Send some requests and make sure the requests execute
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

def test_view_change_after_back_to_quorum_with_disconnected_primary(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    assert len(txnPoolNodeSet) == 4

    pr_node = get_master_primary_node(txnPoolNodeSet)
    assert pr_node.name == "Alpha"

    # 1. Initiate view change by primary (Alpha) restart
    nodes = ensure_view_change_by_primary_restart(looper,
                                                  txnPoolNodeSet,
                                                  tconf,
                                                  tdir,
                                                  allPluginsPath,
                                                  customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # Now primary should be Beta
    pr_node = get_master_primary_node(nodes)
    assert pr_node.name == "Beta"

    # 2. Stop non-primary node Delta, no view changes are expected
    non_primary_to_stop = [n for n in nodes if n.name == "Delta"][0]
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, non_primary_to_stop)
    looper.removeProdable(non_primary_to_stop)

    remaining_nodes = list(set(nodes) - {non_primary_to_stop})
    # Primary is going to be stopped, remember the instance change message
    # counts to ensure that no view change happens while the number of
    # connected nodes is less than a quorum.
    ic_cnt = {}
    for n in remaining_nodes:
        ic_cnt[n.name] = n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)

    # 3. Disconnect primary
    disconnect_node_and_ensure_disconnected(
        looper, remaining_nodes, pr_node)
    looper.removeProdable(pr_node)

    # Wait for more than the ToleratePrimaryDisconnection timeout and check
    # that no IC messages are present.
    looper.runFor(tconf.ToleratePrimaryDisconnection + 5)
    remaining_nodes = list(set(remaining_nodes) - {pr_node})
    for n in remaining_nodes:
        assert ic_cnt[n.name] == n.view_changer.spylog.count(
            ViewChanger.sendInstanceChange.__name__)

    view_no = checkViewNoForNodes(remaining_nodes)

    # 4. Start Delta (non-primary); now primary (Beta) is disconnected but
    # there is a quorum to choose a new one.
    restartedNode = start_stopped_node(non_primary_to_stop, looper, tconf,
                                       tdir, allPluginsPath,
                                       delay_instance_change_msgs=False)
    remaining_nodes = remaining_nodes + [restartedNode]

    # 5. Check that view change happened.
    waitForViewChange(looper, remaining_nodes, expectedViewNo=(view_no + 1),
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # ensure pool is working properly
    sdk_send_random_and_check(looper, remaining_nodes, sdk_pool_handle,
                              sdk_wallet_client, 3)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)

def test_view_not_changed_when_primary_disconnected_from_less_than_quorum(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client):
    """
    Less than a quorum of nodes lose connection with the primary; this should
    not trigger a view change as the protocol can move ahead
    """
    pr_node = get_master_primary_node(txnPoolNodeSet)
    npr = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    partitioned_rep = npr[0]
    partitioned_node = partitioned_rep.node

    lost_pr_calls = partitioned_node.spylog.count(
        partitioned_node.lost_master_primary.__name__)

    recv_inst_chg_calls = {node.name: node.spylog.count(
        node.view_changer.process_instance_change_msg.__name__)
        for node in txnPoolNodeSet
        if node != partitioned_node and node != pr_node}

    view_no = checkViewNoForNodes(txnPoolNodeSet)
    orig_retry_meth = partitioned_node.nodestack.retryDisconnected

    def wont_retry(self, exclude=None):
        # Do not attempt to retry connection
        pass

    # simulating a partition here
    # Disconnect a node from only the primary of the master and don't retry
    # to connect to it
    partitioned_node.nodestack.retryDisconnected = types.MethodType(
        wont_retry, partitioned_node.nodestack)
    r = partitioned_node.nodestack.getRemote(pr_node.nodestack.name)
    r.disconnect()

    def chk1():
        # Check that the partitioned node detects losing connection with
        # primary and sends an instance change which is received by other
        # nodes except the primary (since its disconnected from primary)
        assert partitioned_node.spylog.count(
            partitioned_node.lost_master_primary.__name__) > lost_pr_calls
        for node in txnPoolNodeSet:
            if node != partitioned_node and node != pr_node:
                assert node.view_changer.spylog.count(
                    node.view_changer.process_instance_change_msg.__name__) > \
                    recv_inst_chg_calls[node.name]

    looper.run(eventually(chk1, retryWait=1, timeout=10))

    def chk2():
        # Check the view does not change
        with pytest.raises(AssertionError):
            assert checkViewNoForNodes(txnPoolNodeSet) == view_no + 1

    looper.run(eventually(chk2, retryWait=1, timeout=10))

    # Send some requests and make sure the requests execute
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Repair the connection so the node is no longer partitioned
    partitioned_node.nodestack.retryDisconnected = types.MethodType(
        orig_retry_meth, partitioned_node.nodestack)

    # Send some requests and make sure the requests execute
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Partitioned node should have the same ledger and state as others
    # eventually
    waitNodeDataEquality(looper, partitioned_node,
                         *[n for n in txnPoolNodeSet if n != partitioned_node])
