def test_watermarks_after_view_change(tdir, tconf,
                                      looper,
                                      txnPoolNodeSet,
                                      sdk_pool_handle,
                                      sdk_wallet_client):
    """
    Delay Commit, Checkpoint, InstanceChange and ViewChangeDone messages for lagging_node.
    Start ViewChange.
    Check that ViewChange finished.
    Reset delays.
    Check that lagging_node can order transactions and has the same data as other nodes.
    """
    lagging_node = txnPoolNodeSet[-1]
    lagging_node.master_replica.config.LOG_SIZE = LOG_SIZE
    start_view_no = lagging_node.viewNo
    with delay_rules(lagging_node.nodeIbStasher, cDelay(), chk_delay(), icDelay(), nv_delay()):
        trigger_view_change(txnPoolNodeSet)
        waitForViewChange(looper,
                          txnPoolNodeSet[:-1],
                          expectedViewNo=start_view_no + 1,
                          customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(txnPoolNodeSet)))
        ensure_all_nodes_have_same_data(looper, txnPoolNodeSet[:-1])
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 6)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def testSuspendNode(looper, sdk_pool_handle, sdk_wallet_trustee, nodeSet,
                    tdir, tconf, allPluginsPath):
    """
    Suspend a node and then cancel suspension. Suspend while suspended
    to test that there is no error.
    """
    start_view_no = nodeSet[0].viewNo
    new_steward_wallet, new_node = sdk_node_theta_added(looper,
                                                        nodeSet,
                                                        tdir,
                                                        tconf,
                                                        sdk_pool_handle,
                                                        sdk_wallet_trustee,
                                                        allPluginsPath,
                                                        node_config_helper_class=NodeConfigHelper,
                                                        testNodeClass=TestNode,
                                                        name="Node-" + randomString(5))
    waitForViewChange(looper=looper, txnPoolNodeSet=nodeSet,
                      expectedViewNo=start_view_no + 1)
    ensureElectionsDone(looper=looper, nodes=nodeSet)

    demote_node(looper, sdk_wallet_trustee, sdk_pool_handle, new_node)
    _wait_view_change_finish(looper, nodeSet[:-1], start_view_no + 1)

    demote_node(looper, sdk_wallet_trustee, sdk_pool_handle, new_node)

    promote_node(looper, sdk_wallet_trustee, sdk_pool_handle, new_node)
    _wait_view_change_finish(looper, nodeSet[:-1], start_view_no + 2)

    promote_node(looper, sdk_wallet_trustee, sdk_pool_handle, new_node)
def testDoNotSendInstChngMsgIfMasterDoesntSeePerformanceProblem(
        txnPoolNodeSet, looper, ensureView):
    """
    A node that received an INSTANCE_CHANGE message must not send an
    INSTANCE_CHANGE message if it doesn't observe too much difference in
    performance between its replicas.
    """
    curViewNo = ensureView

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = ViewChanger.sendInstanceChange.__name__
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = n.view_changer.spylog.count(instChngMethodName)

    # Send an instance change message to all nodes
    icMsg = txnPoolNodeSet[0].view_changer._create_instance_change_msg(curViewNo, 0)
    txnPoolNodeSet[0].send(icMsg)

    # Check that the message is discarded.
    waitForViewChange(looper, txnPoolNodeSet)

    # No node should have sent a view change and thus must not have called
    # `sendInstanceChange`
    for n in txnPoolNodeSet:
        assert n.view_changer.spylog.count(instChngMethodName) == \
            sentInstChanges.get(n.name, 0)
def test_no_view_change_until_synced(txnPoolNodeSet, looper, mode):
    # emulate catchup by setting non-synced status
    for node in txnPoolNodeSet:
        node.mode = mode

    check_instance_change_count(txnPoolNodeSet, 0)

    # start View Change
    old_view_no = checkViewNoForNodes(txnPoolNodeSet)
    old_meths = do_view_change(txnPoolNodeSet)
    for node in txnPoolNodeSet:
        node.view_changer.sendInstanceChange(old_view_no + 1)

    # make sure View Change is not started
    check_no_view_change(looper, txnPoolNodeSet)
    assert old_view_no == checkViewNoForNodes(txnPoolNodeSet)

    # emulate finishing of catchup by setting Participating status
    revert_do_view_change(txnPoolNodeSet, old_meths)
    for node in txnPoolNodeSet:
        node.mode = Mode.participating

    # make sure that View Change happened
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=old_view_no + 1)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_replica_removing_before_vc_with_primary_disconnected(looper,
                                                              txnPoolNodeSet,
                                                              sdk_pool_handle,
                                                              sdk_wallet_client,
                                                              tconf, tdir,
                                                              allPluginsPath,
                                                              chkFreqPatched,
                                                              view_change):
    """
    1. Remove replica
    2. Reconnect master primary
    3. Check that nodes and replicas are correctly restored
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    node = txnPoolNodeSet[0]
    start_replicas_count = node.replicas.num_replicas
    instance_id = start_replicas_count - 1
    node.replicas.remove_replica(instance_id)
    _check_replica_removed(node, start_replicas_count, instance_id)
    assert not node.monitor.isMasterDegraded()
    assert len(node.requests) == 0

    # trigger view change on all nodes
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node)
    txnPoolNodeSet.remove(node)
    looper.removeProdable(node)
    node = start_stopped_node(node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    assert start_replicas_count == node.replicas.num_replicas
def testDiscardInstChngMsgFrmPastView(txnPoolNodeSet, looper, ensureView):
    """
    Once a view change is done, any further INSTANCE_CHANGE messages for that
    view must be discarded by the node.
    """
    curViewNo = ensureView

    # Send an instance change for an old instance message to all nodes
    icMsg = txnPoolNodeSet[0].view_changer._create_instance_change_msg(curViewNo, 0)
    txnPoolNodeSet[0].send(icMsg)

    # ensure every node but Alpha discards the invalid instance change request
    timeout = waits.expectedPoolViewChangeStartedTimeout(len(txnPoolNodeSet))

    # Check that the message is discarded.
    looper.run(eventually(checkDiscardMsg, txnPoolNodeSet, icMsg,
                          'which is not more than its view no',
                          txnPoolNodeSet[0], timeout=timeout))

    waitForViewChange(looper, txnPoolNodeSet)
def test_nodes_make_view_change_only_on_master_suspicious(looper, txnPoolNodeSet):
    old_view = txnPoolNodeSet[0].viewNo
    master_primary = txnPoolNodeSet[0].replicas[0]
    backup_primary = txnPoolNodeSet[1].replicas[1]
    assert master_primary.isPrimary is True
    assert backup_primary.isPrimary is True

    # make both primaries send batches with a bogus digest
    master_primary._ordering_service.replica_batch_digest = lambda reqs: 'asd'
    backup_primary._ordering_service.replica_batch_digest = lambda reqs: 'asd'

    non_primary_backup = txnPoolNodeSet[0].replicas[1]
    old_pp = non_primary_backup._ordering_service.spylog.count(
        non_primary_backup._ordering_service.process_preprepare)

    def pp_processed(replica, old_pp):
        assert replica._ordering_service.spylog.count(
            replica._ordering_service.process_preprepare) == old_pp + 1

    backup_primary._ordering_service._do_send_3pc_batch(DOMAIN_LEDGER_ID)
    looper.run(eventually(pp_processed, non_primary_backup, old_pp))
    assert all(node.view_changer.spylog.count(ViewChanger.sendInstanceChange) == 0
               for node in txnPoolNodeSet)
    waitForViewChange(looper, txnPoolNodeSet, old_view)

    non_primary_master = txnPoolNodeSet[1].replicas[0]
    old_pp = non_primary_master._ordering_service.spylog.count(
        non_primary_master._ordering_service.process_preprepare)
    master_primary._ordering_service._do_send_3pc_batch(DOMAIN_LEDGER_ID)
    looper.run(eventually(pp_processed, non_primary_master, old_pp))
    waitForViewChange(looper, txnPoolNodeSet, old_view + 1)
def test_removed_replica_restored_on_view_change(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client, tconf,
                                                 tdir, allPluginsPath,
                                                 chkFreqPatched, view_change):
    """
    1. Remove replica on some node which is not master primary
    2. Reconnect the node which was master primary so far
    3. Check that nodes and replicas are correctly restored
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    node = get_last_master_non_primary_node(txnPoolNodeSet)
    start_replicas_count = node.replicas.num_replicas
    instance_id = start_replicas_count - 1

    node.replicas.remove_replica(instance_id)
    check_replica_removed(node, start_replicas_count, instance_id)

    # trigger view change on all nodes
    master_primary = get_master_primary_node(txnPoolNodeSet)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, master_primary)
    txnPoolNodeSet.remove(master_primary)
    looper.removeProdable(master_primary)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)
    restarted_node = start_stopped_node(master_primary, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(restarted_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.NEW_VIEW_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    assert start_replicas_count == node.replicas.num_replicas
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1,
                                               client1, client1Connected, tconf):
    """
    View change occurs when master's primary is disconnected
    """
    # Setup
    nodes = txnPoolNodeSet
    viewNoBefore = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    stopNodes([old_pr_node], looper)
    looper.removeProdable(old_pr_node)
    remainingNodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remainingNodes, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remainingNodes, viewNoBefore + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remainingNodes)
    new_pr_node = get_master_primary_node(remainingNodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
def test_no_view_change_until_synced(txnPoolNodeSet, looper, mode):
    # emulate catchup by setting non-synced status
    for node in txnPoolNodeSet:
        node.mode = mode

    check_stashed_instance_changes(txnPoolNodeSet, 0)

    # start View Change
    old_view_no = checkViewNoForNodes(txnPoolNodeSet)
    old_meths = do_view_change(txnPoolNodeSet)
    for node in txnPoolNodeSet:
        vct_service = node.master_replica._view_change_trigger_service
        vct_service._send_instance_change(old_view_no + 1, Suspicions.PRIMARY_DEGRADED)

    # make sure View Change is not started
    check_no_view_change(looper, txnPoolNodeSet)
    assert old_view_no == checkViewNoForNodes(txnPoolNodeSet)

    # emulate finishing of catchup by setting Participating status
    revert_do_view_change(txnPoolNodeSet, old_meths)
    for node in txnPoolNodeSet:
        node.mode = Mode.participating
        node.master_replica.stasher.process_all_stashed(STASH_CATCH_UP)

    # make sure that View Change happened
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=old_view_no + 1)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_removed_replica_restored_on_view_change(looper, txnPoolNodeSet,
                                                 sdk_pool_handle,
                                                 sdk_wallet_client, tconf,
                                                 tdir, allPluginsPath,
                                                 chkFreqPatched, view_change):
    """
    1. Remove replica on some node which is not master primary
    2. Reconnect the node which was master primary so far
    3. Check that nodes and replicas are correctly restored
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    node = get_last_master_non_primary_node(txnPoolNodeSet)
    start_replicas_count = node.replicas.num_replicas
    instance_id = start_replicas_count - 1

    node.replicas.remove_replica(instance_id)
    check_replica_removed(node, start_replicas_count, instance_id)

    # trigger view change on all nodes
    master_primary = get_master_primary_node(txnPoolNodeSet)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, master_primary)
    txnPoolNodeSet.remove(master_primary)
    looper.removeProdable(master_primary)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)
    restarted_node = start_stopped_node(master_primary, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(restarted_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    assert start_replicas_count == node.replicas.num_replicas
def test_demote_promote_restart_after_promotion_7_nodes(txnPoolNodeSet, looper,
                                                        sdk_pool_handle,
                                                        sdk_wallet_steward,
                                                        tdir, tconf,
                                                        allPluginsPath):
    demoted_node = txnPoolNodeSet[-1]
    rest_nodes = [n for n in txnPoolNodeSet if n != demoted_node]

    starting_view_no = checkViewNoForNodes(txnPoolNodeSet)
    demote_node(looper, sdk_wallet_steward, sdk_pool_handle, demoted_node)
    waitForViewChange(looper, rest_nodes, expectedViewNo=starting_view_no + 1)
    ensureElectionsDone(looper, rest_nodes)
    ensure_all_nodes_have_same_data(looper, rest_nodes)
    sdk_send_random_and_check(looper, rest_nodes, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    starting_view_no = checkViewNoForNodes(rest_nodes)
    promote_node(looper, sdk_wallet_steward, sdk_pool_handle, demoted_node)
    waitForViewChange(looper, rest_nodes, expectedViewNo=starting_view_no + 1)
    ensureElectionsDone(looper, rest_nodes, instances_list=[0, 1, 2])
    ensure_all_nodes_have_same_data(looper, rest_nodes)

    restart_node(looper, txnPoolNodeSet, demoted_node, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward, sdk_pool_handle)
def test_no_propagated_future_view_change_until_synced(txnPoolNodeSet, looper, mode):
    # the last node is a lagging one, which will receive ViewChangeDone messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node_index = (viewNo + 3) % len(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[lagged_node_index]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate catchup by setting non-synced status
    lagged_node.mode = mode
    old_view_no = checkViewNoForNodes([lagged_node])

    check_future_vcd_count(lagged_node, 0)

    # delay INSTANCE_CHANGE on the lagged node, so all nodes except the lagging one finish View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes, instances=range(2))
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        check_no_view_change(looper, lagged_node)
        assert old_view_no == checkViewNoForNodes([lagged_node])

    # emulate finishing of catchup by setting Participating status
    lagged_node.mode = Mode.participating

    # make sure that View Change happened on the lagging node
    waitForViewChange(looper, [lagged_node], expectedViewNo=old_view_no + 1,
                      customTimeout=10)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_no_propagated_future_view_change_until_synced(txnPoolNodeSet, looper, mode):
    # the last node is a lagging one, which will receive ViewChangeDone messages for a future view
    viewNo = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node_index = (viewNo + 3) % len(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[lagged_node_index]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate catchup by setting non-synced status
    lagged_node.mode = mode
    old_view_no = checkViewNoForNodes([lagged_node])

    check_future_vcd_count(lagged_node, 0)

    # delay INSTANCE_CHANGE on the lagged node, so all nodes except the lagging one finish View Change
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes, numInstances=2)
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        check_no_view_change(looper, lagged_node)
        assert old_view_no == checkViewNoForNodes([lagged_node])

    # emulate finishing of catchup by setting Participating status
    lagged_node.mode = Mode.participating

    # make sure that View Change happened on the lagging node
    waitForViewChange(looper, [lagged_node], expectedViewNo=old_view_no + 1,
                      customTimeout=10)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_recover_stop_primaries(looper, checkpoint_size, txnPoolNodeSet,
                                allPluginsPath, tdir, tconf, client1,
                                wallet1, client1Connected):
    """
    Test that we can recover after having more than f nodes disconnected:
    - stop current master primary (Alpha)
    - send txns
    - restart current master primary (Beta)
    - send txns
    """

    active_nodes = list(txnPoolNodeSet)
    assert 4 == len(active_nodes)
    initial_view_no = active_nodes[0].viewNo

    logger.info("Stop first node (current Primary)")
    _, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Make sure view changed")
    expected_view_no = initial_view_no + 1
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensureElectionsDone(looper=looper, nodes=active_nodes, numInstances=2)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("send at least one checkpoint")
    assert nodes_do_not_have_checkpoints(*active_nodes)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        numReqs=2 * checkpoint_size)
    assert nodes_have_checkpoints(*active_nodes)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Stop second node (current Primary) so the primary loses its state")
    stopped_node, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Restart the primary node")
    restarted_node = start_stopped_node(stopped_node, looper, tconf, tdir, allPluginsPath)
    assert nodes_do_not_have_checkpoints(restarted_node)
    assert nodes_have_checkpoints(*active_nodes)
    active_nodes = active_nodes + [restarted_node]

    logger.info("Check that primary selected")
    ensureElectionsDone(looper=looper, nodes=active_nodes,
                        numInstances=2, customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Check if the pool is able to process requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        numReqs=10 * checkpoint_size)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)
    assert nodes_have_checkpoints(*active_nodes)
def testInstChangeWithMoreReqLat(looper, setup):
    nodes = setup.nodes
    for node in nodes:
        node.checkPerformance()
        assert any(getAllReturnVals(node.monitor,
                                    node.monitor.isMasterReqLatencyTooHigh))

    waitForViewChange(looper, nodes)
def test_selection_f_plus_one_quorum(looper, txnPoolNodeSet, allPluginsPath,
                                     tdir, tconf, sdk_pool_handle, sdk_wallet_client):
    """
    Check that quorum f + 1 is used for primary selection when initiated by
    CurrentState messages.

    Assumes that the view change quorum is n - f.
    Assumes that primaries are selected in round robin fashion.
    """
    # Ensure that we have 4 nodes in total
    all_nodes = list(txnPoolNodeSet)
    assert 4 == len(all_nodes)
    alpha, beta, delta, gamma = all_nodes
    initial_view_no = alpha.viewNo

    # Make one node lagging by switching it off for some time
    lagging_node = gamma
    non_lagging_nodes = [alpha, beta, delta]
    disconnect_node_and_ensure_disconnected(looper,
                                            all_nodes,
                                            lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Make nodes to perform view change
    ensure_view_change(looper, non_lagging_nodes)
    ensureElectionsDone(looper=looper, nodes=non_lagging_nodes,
                        instances_list=range(2))
    ensure_all_nodes_have_same_data(looper, nodes=non_lagging_nodes)

    # Stop two more of the active nodes
    # (but not the primary, which is Beta because of round robin selection)
    stopped_nodes = [alpha]  # TODO: add one more here
    for stopped_node in stopped_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                txnPoolNodeSet,
                                                stopped_node,
                                                stopNode=True)
        looper.removeProdable(stopped_node)

    # Start lagging node back
    restarted_node = start_stopped_node(lagging_node, looper, tconf, tdir, allPluginsPath)
    active_nodes = [beta, delta, restarted_node]

    # Check that primary selected
    expected_view_no = initial_view_no + 1
    ensureElectionsDone(looper=looper, nodes=active_nodes,
                        instances_list=range(2), customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client, tdir,
                                                     tconf, allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """
    # Setup
    nodes = txnPoolNodeSet
    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)
    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(eventually(checkViewNoForNodes,
                          txnPoolNodeSet, old_view_no + 1,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))

    # After the node catches up it sets view_no from the audit ledger and
    # does not need to do a view change
    assert len(getAllReturnVals(old_pr_node.view_changer,
                                old_pr_node.view_changer.start_view_change,
                                compare_val_to=True)) == 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    assert not old_pr_node.view_changer._next_view_indications
def do_view_change_with_propagate_primary_on_one_delayed_node(
        slow_node, nodes, looper, sdk_pool_handle, sdk_wallet_client):

    slow_stasher = slow_node.nodeIbStasher

    fast_nodes = [n for n in nodes if n != slow_node]

    stashers = [n.nodeIbStasher for n in nodes]

    # Get last prepared certificate in pool
    lpc = last_prepared_certificate(nodes)
    # Get pool current view no
    view_no = lpc[0]

    with delay_rules(slow_stasher, icDelay()):
        with delay_rules(slow_stasher, vcd_delay()):
            with delay_rules(stashers, cDelay()):
                # Send request
                request = sdk_send_random_request(looper, sdk_pool_handle, sdk_wallet_client)

                # Wait until this request is prepared on N-f nodes
                looper.run(eventually(check_last_prepared_certificate_on_quorum,
                                      nodes, (lpc[0], lpc[1] + 1)))

                # Trigger view change
                for n in nodes:
                    n.view_changer.on_master_degradation()

                # Wait until view change is completed on all nodes except the slow one
                waitForViewChange(looper,
                                  fast_nodes,
                                  expectedViewNo=view_no + 1,
                                  customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(nodes)))
                wait_for_elections_done_on_given_nodes(looper,
                                                       fast_nodes,
                                                       getRequiredInstances(len(nodes)),
                                                       timeout=waits.expectedPoolElectionTimeout(len(nodes)))

            # Now all the nodes receive Commits
            # The slow node will accept Commits and order the 3PC-batch in the old view
            looper.runFor(waits.expectedOrderingTime(getNoInstances(len(nodes))))

        # Now the slow node receives ViewChangeDones
        waitForViewChange(looper,
                          [slow_node],
                          expectedViewNo=view_no + 1,
                          customTimeout=waits.expectedPoolViewChangeStartedTimeout(len(nodes)))
        wait_for_elections_done_on_given_nodes(looper,
                                               [slow_node],
                                               getRequiredInstances(len(nodes)),
                                               timeout=waits.expectedPoolElectionTimeout(len(nodes)))

    # Now the slow node receives InstanceChanges but discards them because it has
    # already started propagating primary to the same view.

    # Finish request gracefully
    sdk_get_reply(looper, request)
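# --- Usage sketch (an assumption, not part of the original suite) ---
# The helper above is designed to be driven from a thin test that only picks
# the slow node and forwards the standard fixtures. The test name below is
# hypothetical; the fixture names mirror those used elsewhere in this file.
def test_view_change_with_propagate_primary_on_one_delayed_node(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    do_view_change_with_propagate_primary_on_one_delayed_node(
        txnPoolNodeSet[-1], txnPoolNodeSet, looper,
        sdk_pool_handle, sdk_wallet_client)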
def test_disable_view_change(disable_view_change_config, looper, nodeSet, up,
                             viewNo, wallet1, client1):
    assert disable_view_change_config
    assert isinstance(disable_view_change_config.unsafe, set)
    assert 'disable_view_change' in disable_view_change_config.unsafe

    simulate_slow_master(looper, nodeSet, wallet1, client1)

    with pytest.raises(AssertionError):
        waitForViewChange(looper, nodeSet, expectedViewNo=viewNo + 1)
def test_view_change_on_quorum_of_master_degraded(txnPoolNodeSet, looper,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward, viewNo):
    """
    Node will change view even though it does not find the master to be
    degraded when a quorum of nodes agree that master performance degraded
    """
    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    pr = getPrimaryReplica(txnPoolNodeSet, 0)
    reluctantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = node_sent_instance_changes_count(n)

    # Node reluctant to change view, never says master is degraded
    reluctantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, reluctantNode.monitor)

    backup_replica = txnPoolNodeSet[0].replicas[1]
    backup_last_ordered_before = backup_replica.last_ordered_3pc
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 4)
    # make sure that backups also ordered at least 1 batch to be able to track performance degradation
    looper.run(eventually(
        lambda: assertExp(backup_replica.last_ordered_3pc > backup_last_ordered_before)))

    for n in txnPoolNodeSet:
        n.checkPerformance()

    # Check that view change happened for all nodes
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    # All nodes except the reluctant node should have sent a view change and
    # thus must have called `sendInstanceChange`
    for n in txnPoolNodeSet:
        if n.name != reluctantNode.name:
            assert node_sent_instance_changes_count(n) > sentInstChanges.get(n.name, 0)
        else:
            assert node_sent_instance_changes_count(n) == sentInstChanges.get(n.name, 0)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper, wallet1,
                                               client1, client1Connected, tconf,
                                               tdirWithPoolTxns, allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """
    # Setup
    nodes = txnPoolNodeSet
    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)
    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf,
                                     tdirWithPoolTxns, allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(eventually(checkViewNoForNodes,
                          txnPoolNodeSet, old_view_no + 1,
                          timeout=10))
    assert len(getAllReturnVals(old_pr_node,
                                old_pr_node._start_view_change_if_possible,
                                compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    assert not old_pr_node._next_view_indications
def testInstChangeWithMoreReqLat(looper, setup):
    nodes = setup.nodes
    old_view_nos = set([n.viewNo for n in nodes])
    assert len(old_view_nos) == 1
    old_view_no = old_view_nos.pop()

    for node in nodes:
        node.checkPerformance()
        assert any(getAllReturnVals(node.monitor,
                                    node.monitor.isMasterReqLatencyTooHigh))

    waitForViewChange(looper, nodes, expectedViewNo=old_view_no + 1)
def test_quorum_after_f_plus_2_nodes_including_primary_turned_off_and_later_on(
        looper, allPluginsPath, tdir, tconf, txnPoolNodeSet, wallet1, client1):
    nodes = txnPoolNodeSet

    request1 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request1])

    stop_node(nodes[0], looper, nodes)
    waitForViewChange(looper, nodes[1:], expectedViewNo=1)
    ensureElectionsDone(looper, nodes[1:],
                        numInstances=getRequiredInstances(nodeCount))

    request2 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request2])

    stop_node(nodes[1], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[2:], expectedViewNo=1)

    request3 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request3, looper, client1, nodes)

    stop_node(nodes[2], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[3:], expectedViewNo=1)

    request4 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request4, looper, client1, nodes)

    nodes[2] = start_stopped_node(nodes[2], looper, tconf, tdir, allPluginsPath)
    looper.runFor(waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[3:], expectedViewNo=1)

    request5 = sendRandomRequest(wallet1, client1)
    verify_request_not_replied_and_not_ordered(request5, looper, client1, nodes)

    nodes[1] = start_stopped_node(nodes[1], looper, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, nodes[1:],
                        numInstances=getRequiredInstances(nodeCount))
    checkViewNoForNodes(nodes[1:], expectedViewNo=1)

    request6 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request6])

    nodes[0] = start_stopped_node(nodes[0], looper, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, nodes,
                        numInstances=getRequiredInstances(nodeCount))
    checkViewNoForNodes(nodes, expectedViewNo=1)

    request7 = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[request7])
def testInstChangeWithMoreReqLat(looper, setup):
    # TODO: for now, the view_change procedure can take more than 15 minutes
    # (5 minutes for catchup and 10 minutes for the primary's answer).
    # Therefore, view_change triggering by max latency is not indicative for now.
    nodes = setup.nodes
    old_view_no = setup.old_view_no
    for node in nodes:
        node.checkPerformance()
        assert any(getAllReturnVals(node.monitor,
                                    node.monitor.isMasterReqLatencyTooHigh))

    waitForViewChange(looper, nodes, expectedViewNo=old_view_no + 1)
def test_disable_view_change(disable_view_change_config, looper, txnPoolNodeSet,
                             viewNo, sdk_pool_handle, sdk_wallet_steward):
    assert disable_view_change_config
    assert isinstance(disable_view_change_config.unsafe, set)
    assert 'disable_view_change' in disable_view_change_config.unsafe

    simulate_slow_master(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward)

    with pytest.raises(AssertionError):
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)
def demote_another_one(rest_pool):
    demoted_node = rest_pool[-1]
    rest_pool = [n for n in rest_pool if n != demoted_node]

    starting_view_no = checkViewNoForNodes(rest_pool)

    demote_node(looper, sdk_wallet_steward, sdk_pool_handle, demoted_node)
    waitForViewChange(looper, rest_pool, expectedViewNo=starting_view_no + 1)
    ensureElectionsDone(looper, rest_pool, customTimeout=60)
    ensure_all_nodes_have_same_data(looper, rest_pool)

    return rest_pool
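# --- Usage sketch (an assumption, not taken from the suite) ---
# `demote_another_one` closes over the looper and SDK fixtures, so it is meant
# to be called from inside the enclosing test, peeling off one node per call
# while a functional quorum remains. A hypothetical driver loop:
#
#     rest_pool = list(txnPoolNodeSet)
#     while len(rest_pool) > 4:
#         rest_pool = demote_another_one(rest_pool)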
def test_recover_stop_primaries_no_view_change(looper, checkpoint_size,
                                               txnPoolNodeSet, allPluginsPath,
                                               tdir, tconf, sdk_pool_handle,
                                               sdk_wallet_steward):
    """
    Test that we can recover after having more than f nodes disconnected:
    - send txns
    - stop current master primary
    - restart current master primary
    - send txns
    """

    active_nodes = list(txnPoolNodeSet)
    assert 4 == len(active_nodes)
    initial_view_no = active_nodes[0].viewNo

    logger.info("send at least one checkpoint")
    assert nodes_do_not_have_checkpoints(*active_nodes)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2 * checkpoint_size)
    assert nodes_have_checkpoints(*active_nodes)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Stop first node (current Primary)")
    stopped_node, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Restart the primary node")
    restarted_node = start_stopped_node(stopped_node, looper, tconf, tdir, allPluginsPath)
    assert nodes_do_not_have_checkpoints(restarted_node)
    assert nodes_have_checkpoints(*active_nodes)
    active_nodes = active_nodes + [restarted_node]

    logger.info("Check that primary selected")
    ensureElectionsDone(looper=looper, nodes=active_nodes,
                        instances_list=range(2), customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=0)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes,
                                    exclude_from_check=['check_last_ordered_3pc_backup'])

    logger.info("Check if the pool is able to process requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 10 * checkpoint_size)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes,
                                    exclude_from_check=['check_last_ordered_3pc_backup'])
    assert nodes_have_checkpoints(*active_nodes)
def test_process_three_phase_msg_and_stashed_future_view(
        txnPoolNodeSet, looper, tconf, sdk_pool_handle, sdk_wallet_steward):
    """
    1. Delay ViewChangeDone messages for the slow_node.
    2. Start view change on all nodes.
    3. Order a new request.
    4. Check that the slow_node could not order this request, stashed all 3PC
       messages, and that the other nodes ordered it.
    5. Reset delays.
    6. Check that the last request is ordered on the slow_node and the stashed
       messages were removed.
    """
    slow_node = txnPoolNodeSet[-1]
    fast_nodes = txnPoolNodeSet[:-1]
    view_no = slow_node.viewNo
    old_stashed = {inst_id: r.stasher.stash_size(STASH_VIEW_3PC)
                   for inst_id, r in slow_node.replicas.items()}

    with delay_rules([slow_node.nodeIbStasher, ],
                     msg_rep_delay(types_to_delay=[PREPREPARE, PREPARE, COMMIT])):
        with delay_rules([slow_node.nodeIbStasher, ], nv_delay()):
            for n in txnPoolNodeSet:
                n.view_changer.on_master_degradation()
            waitForViewChange(looper, fast_nodes, expectedViewNo=view_no + 1,
                              customTimeout=2 * tconf.NEW_VIEW_TIMEOUT)
            ensureElectionsDone(looper=looper, nodes=fast_nodes,
                                instances_list=range(fast_nodes[0].requiredNumberOfInstances))
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_steward, 1)
            assert slow_node.view_change_in_progress
            # 1 - pre-prepare msg
            # (len(txnPoolNodeSet) - 2) - prepare msgs
            # (len(txnPoolNodeSet) - 1) - commit msgs
            stashed_master_messages = 2 * (1 + (len(txnPoolNodeSet) - 2) +
                                           (len(txnPoolNodeSet) - 1))
            assert slow_node.master_replica.stasher.stash_size(STASH_VIEW_3PC) == \
                old_stashed[0] + stashed_master_messages

    def chk():
        for inst_id, r in slow_node.replicas.items():
            assert r.last_ordered_3pc[1] == 2
            assert r.stasher.stash_size(STASH_VIEW_3PC) == 0

    looper.run(eventually(chk))
    waitNodeDataEquality(looper, slow_node, *fast_nodes)
def test_add_node_delay_commit_on_one(looper, txnPoolNodeSet,
                                      sdk_pool_handle, sdk_wallet_steward,
                                      tdir, tconf, allPluginsPath):
    view_no = txnPoolNodeSet[-1].viewNo

    # Add a new node, but don't allow Delta to be aware of it.
    # We do not want it in Delta's node registry.
    with delay_rules(txnPoolNodeSet[-1].nodeIbStasher, cDelay()):
        _, new_node = sdk_add_new_steward_and_node(looper, sdk_pool_handle,
                                                   sdk_wallet_steward,
                                                   'New_Steward', 'Epsilon',
                                                   tdir, tconf,
                                                   allPluginsPath=allPluginsPath)
        txnPoolNodeSet.append(new_node)
        looper.run(checkNodesConnected(txnPoolNodeSet[:-2] + [new_node]))

    waitForViewChange(looper, txnPoolNodeSet, view_no + 1)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_view_change_on_quorum_of_master_degraded(txnPoolNodeSet, looper,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward, viewNo):
    """
    Node will change view even though it does not find the master to be
    degraded when a quorum of nodes agree that master performance degraded
    """
    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    pr = getPrimaryReplica(txnPoolNodeSet, 0)
    reluctantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = ViewChanger.sendInstanceChange.__name__
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = n.view_changer.spylog.count(instChngMethodName)

    # Node reluctant to change view, never says master is degraded
    reluctantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, reluctantNode.monitor)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 4)

    for n in txnPoolNodeSet:
        n.checkPerformance()

    # Check that view change happened for all nodes
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    # All nodes except the reluctant node should have sent a view change and
    # thus must have called `sendInstanceChange`
    for n in txnPoolNodeSet:
        if n.name != reluctantNode.name:
            assert n.view_changer.spylog.count(instChngMethodName) > \
                sentInstChanges.get(n.name, 0)
        else:
            assert n.view_changer.spylog.count(instChngMethodName) == \
                sentInstChanges.get(n.name, 0)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def check_view_change_one_slow_node(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client, vc_counts,
                                    slow_node_is_next_primary,
                                    delay_commit=True,
                                    delay_pre_prepare=True):
    current_view_no = checkViewNoForNodes(txnPoolNodeSet)
    expected_view_no = current_view_no + vc_counts
    next_primary = get_next_primary_name(txnPoolNodeSet, expected_view_no)
    pretenders = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet)
                  if not r.isPrimary]
    if slow_node_is_next_primary:
        delayed_node = [n for n in pretenders if n.name == next_primary][0]
    else:
        delayed_node = [n for n in pretenders if n.name != next_primary][0]
    fast_nodes = [node for node in txnPoolNodeSet if node != delayed_node]

    delayers = []
    if delay_pre_prepare:
        delayers.append(ppDelay())
        delayers.append(msg_rep_delay(types_to_delay=[PREPREPARE]))
    if delay_commit:
        delayers.append(cDelay())

    # delay OldViewPrePrepareReply so that the slow node doesn't receive
    # PrePrepares before the re-ordering phase finishes
    with delay_rules(delayed_node.nodeIbStasher, old_view_pp_reply_delay()):
        with delay_rules_without_processing(delayed_node.nodeIbStasher, *delayers):
            sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                      sdk_wallet_client, 1)
            trigger_view_change(txnPoolNodeSet)
            if vc_counts == 2:
                for node in txnPoolNodeSet:
                    node.master_replica.internal_bus.send(
                        NodeNeedViewChange(current_view_no + 2))

        waitForViewChange(looper=looper, txnPoolNodeSet=txnPoolNodeSet,
                          expectedViewNo=expected_view_no)
        ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=30)

        # wait till fast nodes finish re-ordering
        looper.run(eventually(check_has_commits, fast_nodes))

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
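# --- Usage sketch (an assumption, not part of the original suite) ---
# A hypothetical parametrized wrapper showing how `check_view_change_one_slow_node`
# is meant to be driven: a single view change, with the slow node either being or
# not being the next primary. The test name is illustrative only; `pytest` is
# assumed to be imported, as it is used elsewhere in this file.
@pytest.mark.parametrize("slow_node_is_next_primary", [True, False])
def test_view_change_one_slow_node_sketch(looper, txnPoolNodeSet, sdk_pool_handle,
                                          sdk_wallet_client, slow_node_is_next_primary):
    check_view_change_one_slow_node(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client, vc_counts=1,
                                    slow_node_is_next_primary=slow_node_is_next_primary)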
def test_recover_stop_primaries(looper, checkpoint_size, txnPoolNodeSet,
                                allPluginsPath, tdir, tconf,
                                sdk_pool_handle, sdk_wallet_steward):
    """
    Test that we can recover after having more than f nodes disconnected:
    - stop current master primary (Alpha)
    - send txns
    - restart current master primary (Beta)
    - send txns
    """

    active_nodes = list(txnPoolNodeSet)
    assert 4 == len(active_nodes)
    initial_view_no = active_nodes[0].viewNo

    logger.info("Stop first node (current Primary)")
    _, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Make sure view changed")
    expected_view_no = initial_view_no + 1
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensureElectionsDone(looper=looper, nodes=active_nodes, instances_list=range(2))
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("send at least one checkpoint")
    assert nodes_do_not_have_checkpoints(*active_nodes)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2 * checkpoint_size)
    assert nodes_have_checkpoints(*active_nodes)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Stop second node (current Primary) so the primary loses its state")
    stopped_node, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Restart the primary node")
    restarted_node = start_stopped_node(stopped_node, looper, tconf, tdir, allPluginsPath)
    assert nodes_do_not_have_checkpoints(restarted_node)
    assert nodes_have_checkpoints(*active_nodes)
    active_nodes = active_nodes + [restarted_node]

    logger.info("Check that primary selected")
    ensureElectionsDone(looper=looper, nodes=active_nodes,
                        instances_list=range(2), customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Check if the pool is able to process requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 10 * checkpoint_size)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)
    assert nodes_have_checkpoints(*active_nodes)
def test_view_change_min_catchup_timeout(txnPoolNodeSet, looper,
                                         sdk_pool_handle, sdk_wallet_client,
                                         tconf, viewNo):
    """
    One of the conditions to finish catch-up during view change is to have
    MAX_CATCHUPS_DONE_DURING_VIEW_CHANGE rounds of catch-up without any new
    transactions caught up.
    But this should not finish very quickly.
    So, we should try to catch up until MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE.

    In the test:
    - Before starting view change, mock `has_ordered_till_last_prepared_certificate`
      so that it always returns False.
    - This means that the only condition on which we can finish catch-up is the
      MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE timeout together with having more
      than MAX_CATCHUPS_DONE_DURING_VIEW_CHANGE rounds of catch-up without new txns caught up.
    - Check that view change is not finished until MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE
    - Check that view change is eventually finished after MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE
    """
    # 1. Send some txns
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 4)

    # 2. make the only condition to finish catch-up be
    # MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE
    patch_has_ordered_till_last_prepared_certificate(txnPoolNodeSet)

    # 3. start view change
    expected_view_no = viewNo + 1
    for node in txnPoolNodeSet:
        node.view_changer.startViewChange(expected_view_no)

    # 4. check that it's not finished till
    # MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE
    no_view_change_timeout = tconf.MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE - 1
    with pytest.raises(EventuallyTimeoutException):
        ensureElectionsDone(looper=looper,
                            nodes=txnPoolNodeSet,
                            customTimeout=no_view_change_timeout)

    # 5. make sure that view change is finished eventually
    # (it should be finished quite soon after we waited for MIN_TIMEOUT_CATCHUPS_DONE_DURING_VIEW_CHANGE)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet, customTimeout=2)
    waitForViewChange(looper=looper, txnPoolNodeSet=txnPoolNodeSet,
                      expectedViewNo=expected_view_no)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # 6. ensure that the pool is still functional.
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
def test_restart_to_same_view_with_killed_primary(looper, txnPoolNodeSet, tconf,
                                                  tdir, allPluginsPath,
                                                  sdk_pool_handle, sdk_wallet_client):
    restart_timeout = tconf.ToleratePrimaryDisconnection + \
                      waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))

    primary = txnPoolNodeSet[0]
    alive_nodes = txnPoolNodeSet[1:]
    minority = alive_nodes[-1:]
    majority = alive_nodes[:-1]

    # Move to higher view by killing primary
    primary.cleanupOnStopping = True
    primary.stop()
    looper.removeProdable(primary)
    ensure_node_disconnected(looper, primary, txnPoolNodeSet)
    waitForViewChange(looper, alive_nodes, 1, customTimeout=VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper, alive_nodes, instances_list=range(3))

    # Add transaction to ledger
    sdk_send_random_and_check(looper, alive_nodes, sdk_pool_handle, sdk_wallet_client, 1)

    # Restart majority group
    majority_before_restart = majority.copy()
    restart_nodes(looper, alive_nodes, majority, tconf, tdir, allPluginsPath,
                  after_restart_timeout=restart_timeout,
                  start_one_by_one=False, wait_for_elections=False)
    waitForViewChange(looper, majority, 1, customTimeout=2.1 * VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper, majority, instances_list=range(3))

    # Check that nodes in minority group are aware that they might have inconsistent 3PC state
    for node in minority:
        assert node.spylog.count(node.on_inconsistent_3pc_state) == 1

    # Check that nodes in majority group didn't think they might have inconsistent 3PC state
    for node in majority_before_restart:
        assert node.spylog.count(node.on_inconsistent_3pc_state) == 0

    # Check that nodes in majority group don't think they might have inconsistent 3PC state
    for node in majority:
        assert node.spylog.count(node.on_inconsistent_3pc_state) == 0

    # Restart minority group
    restart_nodes(looper, alive_nodes, minority, tconf, tdir, allPluginsPath,
                  after_restart_timeout=restart_timeout,
                  start_one_by_one=False, wait_for_elections=False)
    ensureElectionsDone(looper, alive_nodes, instances_list=range(3))

    # Check that all nodes are still functional
    sdk_ensure_pool_functional(looper, alive_nodes, sdk_wallet_client, sdk_pool_handle)
def test_restart_primaries_then_demote(looper, txnPoolNodeSet,
                                       tconf, tdir, allPluginsPath,
                                       sdk_pool_handle,
                                       sdk_wallet_stewards):
    sdk_wallet_steward = sdk_wallet_stewards[0]

    logger.info("1. Restart Node1")
    pool_of_nodes = ensure_view_change_by_primary_restart(looper,
                                                          txnPoolNodeSet,
                                                          tconf,
                                                          tdir,
                                                          allPluginsPath,
                                                          customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    # ensure pool is working properly
    sdk_send_random_and_check(looper, pool_of_nodes, sdk_pool_handle,
                              sdk_wallet_steward, 1)

    logger.info("2. Restart Node2")
    pool_of_nodes = ensure_view_change_by_primary_restart(looper,
                                                          pool_of_nodes,
                                                          tconf,
                                                          tdir,
                                                          allPluginsPath,
                                                          customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    # ensure pool is working properly
    sdk_send_random_and_check(looper, pool_of_nodes, sdk_pool_handle,
                              sdk_wallet_steward, 1)

    logger.info("3. Demote Node3")
    # demote the node
    pool_of_nodes = demote_primary_node(looper,
                                        txnPoolNodeSet,
                                        pool_of_nodes,
                                        sdk_pool_handle,
                                        sdk_wallet_stewards)
    # make sure view changed
    waitForViewChange(looper, pool_of_nodes, expectedViewNo=3)

    # ensure pool is working properly
    sdk_send_random_and_check(looper, pool_of_nodes, sdk_pool_handle,
                              sdk_wallet_steward, 10)
    ensure_all_nodes_have_same_data(looper, nodes=pool_of_nodes)
def test_view_change_on_quorum_of_master_degraded(txnPoolNodeSet, looper,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward, viewNo):
    """
    Node will change view even though it does not find the master to be
    degraded when a quorum of nodes agree that master performance degraded
    """
    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's performance falls and view changes
    delayNonPrimaries(txnPoolNodeSet, 0, 10)

    pr = getPrimaryReplica(txnPoolNodeSet, 0)
    reluctantNode = pr.node

    # Count sent instance changes of all nodes
    sentInstChanges = {}
    instChngMethodName = ViewChanger.sendInstanceChange.__name__
    for n in txnPoolNodeSet:
        sentInstChanges[n.name] = n.view_changer.spylog.count(instChngMethodName)

    # Node reluctant to change view, never says master is degraded
    reluctantNode.monitor.isMasterDegraded = types.MethodType(
        lambda x: False, reluctantNode.monitor)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 4)

    # Check that view change happened for all nodes
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    # All nodes except the reluctant node should have sent a view change and
    # thus must have called `sendInstanceChange`
    for n in txnPoolNodeSet:
        if n.name != reluctantNode.name:
            assert n.view_changer.spylog.count(instChngMethodName) > \
                sentInstChanges.get(n.name, 0)
        else:
            assert n.view_changer.spylog.count(instChngMethodName) == \
                sentInstChanges.get(n.name, 0)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client, tdir,
                                                     tconf, allPluginsPath):
    """
    View change occurs when master's primary is disconnected
    """
    # Setup
    nodes = txnPoolNodeSet
    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)
    remaining_nodes = list(set(nodes) - {old_pr_node})
    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet, old_view_no + 1,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))
    assert len(getAllReturnVals(old_pr_node.view_changer,
                                old_pr_node.view_changer._start_view_change_if_possible,
                                compare_val_to=True)) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    assert not old_pr_node.view_changer._next_view_indications
def test_replica_removing_with_primary_disconnected(looper, txnPoolNodeSet,
                                                    sdk_pool_handle,
                                                    sdk_wallet_client, tconf,
                                                    tdir, allPluginsPath):
    """
    1. Remove backup primary node.
    2. Check that replicas with the disconnected primary were removed.
    3. Recover the removed node.
    4. Start View Change.
    5. Check that all replicas were restored.
    """
    start_replicas_count = txnPoolNodeSet[0].replicas.num_replicas
    instance_to_remove = 1
    node = txnPoolNodeSet[instance_to_remove]

    # remove backup primary node.
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node)
    txnPoolNodeSet.remove(node)
    looper.removeProdable(node)

    # check that replicas were removed
    def check_replica_removed_on_all_nodes():
        for node in txnPoolNodeSet:
            check_replica_removed(node,
                                  start_replicas_count,
                                  instance_to_remove)

    looper.run(eventually(check_replica_removed_on_all_nodes,
                          timeout=tconf.TolerateBackupPrimaryDisconnection * 4))
    assert not node.monitor.isMasterDegraded()
    assert len(node.requests) == 0

    # recover the removed node
    node = start_stopped_node(node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # start View Change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # check that all replicas were restored
    assert start_replicas_count == node.replicas.num_replicas
def test_disable_view_change(disable_view_change_config, looper,
                             txnPoolNodeSet, viewNo, sdk_pool_handle,
                             sdk_wallet_steward):
    assert disable_view_change_config
    assert isinstance(disable_view_change_config.unsafe, set)
    assert 'disable_view_change' in disable_view_change_config.unsafe

    simulate_slow_master(looper, txnPoolNodeSet, sdk_pool_handle,
                         sdk_wallet_steward)

    with pytest.raises(AssertionError):
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)
def test_view_change_on_performance_degraded(looper, txnPoolNodeSet, viewNo,
                                             sdk_pool_handle,
                                             sdk_wallet_steward):
    """
    Test that a view change is done when the performance of master goes down.
    Send multiple requests from the client and delay some requests by master
    instance so that there is a view change. All nodes will agree that master
    performance degraded.
    """
    old_primary_node = get_master_primary_node(list(txnPoolNodeSet))

    simulate_slow_master(looper, txnPoolNodeSet, sdk_pool_handle,
                         sdk_wallet_steward)

    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    new_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert old_primary_node.name != new_primary_node.name
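# `simulate_slow_master`, used by test_disable_view_change and
# test_view_change_on_performance_degraded, is assumed to degrade the master
# instance by delaying PRE-PREPAREs on its non-primary replicas and then
# sending enough requests for the monitor to notice the lag. A minimal sketch
# under that assumption (the parameter defaults are illustrative):
def simulate_slow_master(looper, txnPoolNodeSet, sdk_pool_handle,
                         sdk_wallet_steward, delay=10, num_reqs=4):
    # slow down the master instance (instId=0) on all non-primary replicas
    delayNonPrimaries(txnPoolNodeSet, 0, delay)
    # send requests so the master's throughput visibly lags the backups
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, num_reqs)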
def do_test_replica_removing_with_backup_degraded(looper,
                                                  txnPoolNodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_client,
                                                  tconf):
    """
    1. Degrade a backup instance by delaying its Commit messages.
    2. Check that the degraded backup replica was removed on all nodes
       while the master instance is not considered degraded.
    3. Start View Change.
    4. Check that all replicas were restored.
    """
    start_replicas_count = txnPoolNodeSet[0].replicas.num_replicas
    view_no = txnPoolNodeSet[0].viewNo
    instance_to_remove = 1
    stashers = [node.nodeIbStasher for node in txnPoolNodeSet]
    with delay_rules(stashers, cDelay(delay=sys.maxsize,
                                      instId=instance_to_remove)):
        sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             num_reqs=10,
                                             num_batches=5)

        # check that replicas were removed
        def check_replica_removed_on_all_nodes(inst_id=instance_to_remove):
            for n in txnPoolNodeSet:
                check_replica_removed(n, start_replicas_count, inst_id)
                assert not n.monitor.isMasterDegraded()

        looper.run(eventually(check_replica_removed_on_all_nodes,
                              timeout=120))

    # start View Change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=view_no + 1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # check that all replicas were restored
    assert all(start_replicas_count == node.replicas.num_replicas
               for node in txnPoolNodeSet)
def testDiscardInstChngMsgFrmPastView(txnPoolNodeSet, looper, ensureView):
    """
    Once a view change is done, any further INSTANCE_CHANGE messages for that
    view must be discarded by the node.
    """
    curViewNo = ensureView

    # Send an instance change message for the current (already completed)
    # view to all nodes
    icMsg = txnPoolNodeSet[0].view_changer._create_instance_change_msg(curViewNo, 0)
    txnPoolNodeSet[0].send(icMsg)

    # ensure every node but Alpha discards the invalid instance change request
    timeout = waits.expectedPoolViewChangeStartedTimeout(len(txnPoolNodeSet))

    # Check that the message is discarded.
    looper.run(eventually(checkDiscardMsg, txnPoolNodeSet, icMsg,
                          'which is not more than its view no',
                          txnPoolNodeSet[0], timeout=timeout))

    waitForViewChange(looper, txnPoolNodeSet)
def test_view_change_timeout_reset_on_next_view(txnPoolNodeSet, looper, tconf):
    # Check that all nodes are in view 0
    assert all(n.viewNo == 0 for n in txnPoolNodeSet)

    stashers = [n.nodeIbStasher for n in txnPoolNodeSet]
    with delay_rules(stashers, vcd_delay()):
        # Start first view change
        for n in txnPoolNodeSet:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1)
        looper.runFor(0.6 * VIEW_CHANGE_TIMEOUT)

        # Start second view change
        for n in txnPoolNodeSet:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=2)
        looper.runFor(0.6 * VIEW_CHANGE_TIMEOUT)

    # Ensure only 2 view changes happened
    ensureElectionsDone(looper, txnPoolNodeSet)
    for n in txnPoolNodeSet:
        assert n.viewNo == 2
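# test_view_change_timeout_reset_on_next_view relies on a module-level
# VIEW_CHANGE_TIMEOUT constant and implicitly assumes the pool configuration
# uses the same value. A minimal sketch of such a setup, following the
# tconf-override fixture pattern used in these tests; the constant's value is
# illustrative, and in the actual suite this would live in that test's own
# module or conftest rather than here.
VIEW_CHANGE_TIMEOUT = 20

@pytest.fixture(scope="module")
def tconf(tconf):
    old_timeout = tconf.VIEW_CHANGE_TIMEOUT
    tconf.VIEW_CHANGE_TIMEOUT = VIEW_CHANGE_TIMEOUT
    yield tconf
    tconf.VIEW_CHANGE_TIMEOUT = old_timeout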
def test_delayed_instance_changes_after_vcd_for_next_view(looper, txnPoolNodeSet):
    '''
    A node is doing a view change to view=1, while the other nodes have
    already finished the view change to view=2.
    The node receives a quorum of VIEW_CHANGE_DONE (VCD) messages for view=2
    before a quorum of InstanceChange messages for view=2.
    Nevertheless, the node should not start a view change to view=2 without a
    quorum of InstanceChanges, that is it should not go into propagate primary
    mode since it's already in a view change state.
    The node should eventually finish the view change to view=2 once it
    receives all VCD and InstanceChange messages for view=2.
    '''
    nodes = txnPoolNodeSet
    slow_node = nodes[-1]
    fast_nodes = [n for n in nodes if n != slow_node]
    slow_stasher = slow_node.nodeIbStasher

    # 1. DO FIRST VIEW CHANGE

    # delay VCD for the first ViewChange
    with delay_rules(slow_stasher, vcd_delay()):
        # Trigger view change
        for n in nodes:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, nodes, expectedViewNo=1)

        # make sure view change is finished on all nodes except the slow one
        ensureElectionsDone(looper, fast_nodes, instances_list=range(3))

        # drop all VCD to view=1
        slow_stasher.drop_delayeds()

    # 2. DO SECOND VIEW CHANGE

    # delay Instance Changes so that the slow node receives VCD for view=2
    # before a quorum of InstanceChanges for that view, while still doing
    # the view change to view=1
    with delay_rules(slow_stasher, icDelay()):
        # Trigger view change
        for n in nodes:
            n.view_changer.on_master_degradation()
        waitForViewChange(looper, fast_nodes, expectedViewNo=2)

        # make sure view change is finished on all nodes except the slow one
        ensureElectionsDone(looper, fast_nodes, instances_list=range(3))

        # slow node is still on view=1
        assert slow_node.viewNo == 1
        assert slow_node.view_change_in_progress

        # make sure that the slow node has received VCD msgs for view=2
        # and has not received InstanceChange msgs for view=2
        check_vcd_msgs(slow_node, expected_view_no=2, expected_count=len(fast_nodes))
        check_no_ic_msgs(slow_node, expected_view_no=2)

    # 3. RESET DELAYS AND CHECK

    waitForViewChange(looper, nodes, expectedViewNo=2)
    ensureElectionsDone(looper, nodes)
    assert not slow_node.view_change_in_progress

    ensure_all_nodes_have_same_data(looper, nodes=nodes)
def setup(txnPoolNodeSet, looper):
    m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    initial_view_no = waitForViewChange(looper, txnPoolNodeSet)
    timeout_callback_stats = _check_view_change_completed_stats(txnPoolNodeSet)
    return m_primary_node, initial_view_no, timeout_callback_stats
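# `_check_view_change_completed_stats` in the setup fixture above is assumed
# to snapshot, per node, how many times the view-change-completion check has
# fired so far, so that tests can later compare against this baseline via the
# spylog. A minimal sketch under that assumption; the spied method name is an
# assumption, not confirmed by this excerpt.
def _check_view_change_completed_stats(txnPoolNodeSet):
    method_name = '_check_view_change_completed'
    return {node.name: node.view_changer.spylog.count(method_name)
            for node in txnPoolNodeSet}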
def test_replica_removing_after_view_change(looper, txnPoolNodeSet,
                                            sdk_pool_handle, sdk_wallet_client,
                                            tconf, tdir, allPluginsPath):
    """
    1. Remove backup primary node.
    2. Check that replicas with the disconnected primary were removed.
    3. Start View Change.
    4. Check that the new replica with the disconnected primary is removed and
       the other replicas are working correctly.
    5. Recover the removed node.
    6. Start View Change.
    7. Check that all replicas were restored.
    """
    start_replicas_count = txnPoolNodeSet[0].replicas.num_replicas
    instance_to_remove = txnPoolNodeSet[0].requiredNumberOfInstances - 1
    removed_node = txnPoolNodeSet[instance_to_remove]
    # remove backup primary node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, removed_node)
    txnPoolNodeSet.remove(removed_node)
    looper.removeProdable(removed_node)

    # check that replicas were removed
    def check_replica_removed_on_all_nodes(inst_id=instance_to_remove):
        for n in txnPoolNodeSet:
            check_replica_removed(n, start_replicas_count, inst_id)
            assert not n.monitor.isMasterDegraded()
            assert len(n.requests) == 0

    looper.run(eventually(check_replica_removed_on_all_nodes,
                          timeout=tconf.TolerateBackupPrimaryDisconnection * 2))

    # start View Change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # after the view change the disconnected node is primary of another instance
    instance_to_remove -= 1
    instances = list(range(txnPoolNodeSet[0].requiredNumberOfInstances))
    instances.remove(instance_to_remove)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                        instances_list=instances,
                        customTimeout=tconf.TolerateBackupPrimaryDisconnection * 4)
    # check that the replica of the new primary-less instance was removed
    looper.run(eventually(check_replica_removed_on_all_nodes, instance_to_remove,
                          timeout=tconf.TolerateBackupPrimaryDisconnection * 2))

    # recover the removed node
    removed_node = start_stopped_node(removed_node, looper, tconf,
                                      tdir, allPluginsPath)
    txnPoolNodeSet.append(removed_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # start View Change
    for node in txnPoolNodeSet:
        node.view_changer.on_master_degradation()
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                        instances_list=range(txnPoolNodeSet[0].requiredNumberOfInstances),
                        customTimeout=tconf.TolerateBackupPrimaryDisconnection * 2)

    # check that all replicas were restored
    assert start_replicas_count == removed_node.replicas.num_replicas
def test_quorum_after_f_plus_2_nodes_including_primary_turned_off_and_later_on(
        looper, allPluginsPath, tdir, tconf, txnPoolNodeSet,
        sdk_pool_handle, sdk_wallet_client):
    timeout = sdk_eval_timeout(1, len(txnPoolNodeSet))
    nodes = txnPoolNodeSet

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop the primary: the remaining nodes still form a quorum, so they
    # complete a view change to view 1 and keep ordering
    stop_node(nodes[0], looper, nodes)
    waitForViewChange(looper, nodes[1:], expectedViewNo=1)
    ensureElectionsDone(looper, nodes[1:],
                        instances_list=range(getRequiredInstances(nodeCount)))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Stop a second node: the pool loses its quorum, no further view change
    # happens and requests can no longer be ordered
    stop_node(nodes[1], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[2:], expectedViewNo=1)

    sdk_reqs3 = sdk_send_random_requests(looper, sdk_pool_handle,
                                         sdk_wallet_client, 1)
    with pytest.raises(PoolLedgerTimeoutException):
        req_res = sdk_get_replies(looper, sdk_reqs3, timeout=timeout)
        sdk_check_reply(req_res[0])

    # Stop a third node: still no quorum
    stop_node(nodes[2], looper, nodes)
    looper.runFor(tconf.ToleratePrimaryDisconnection +
                  waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[3:], expectedViewNo=1)

    sdk_reqs4 = sdk_send_random_requests(looper, sdk_pool_handle,
                                         sdk_wallet_client, 1)
    with pytest.raises(PoolLedgerTimeoutException):
        req_res = sdk_get_replies(looper, sdk_reqs4, timeout=timeout)
        sdk_check_reply(req_res[0])

    # Restart one stopped node: quorum is still not reached
    nodes[2] = start_stopped_node(nodes[2], looper, tconf, tdir, allPluginsPath)
    looper.runFor(waits.expectedPoolElectionTimeout(len(nodes)))
    checkViewNoForNodes(nodes[3:], expectedViewNo=1)

    sdk_reqs5 = sdk_send_random_requests(looper, sdk_pool_handle,
                                         sdk_wallet_client, 1)
    with pytest.raises(PoolLedgerTimeoutException):
        req_res = sdk_get_replies(looper, sdk_reqs5, timeout=timeout)
        sdk_check_reply(req_res[0])

    # Restart another node: quorum is restored and the pool can order again
    nodes[1] = start_stopped_node(nodes[1], looper, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, nodes[1:],
                        instances_list=range(getRequiredInstances(nodeCount)),
                        customTimeout=60)
    checkViewNoForNodes(nodes[1:], expectedViewNo=1)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Restart the last stopped node: the whole pool is back in view 1
    nodes[0] = start_stopped_node(nodes[0], looper, tconf, tdir, allPluginsPath)
    ensureElectionsDone(looper, nodes,
                        instances_list=range(getRequiredInstances(nodeCount)),
                        customTimeout=60)
    checkViewNoForNodes(nodes, expectedViewNo=1)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
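# `stop_node`, used in the quorum test above, is assumed to fully take a node
# out of the pool: disconnect it and stop running it on the looper, mirroring
# the disconnect/removeProdable pattern used elsewhere in this module. A
# minimal sketch under that assumption:
def stop_node(node_to_stop, looper, pool_nodes):
    disconnect_node_and_ensure_disconnected(looper, pool_nodes,
                                            node_to_stop, stopNode=True)
    looper.removeProdable(node_to_stop)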
def test_disconnected_node_with_lagged_view_pulls_up_its_view_on_reconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    Verifies that a disconnected node with a lagged view accepts the current
    view from the other nodes on re-connection.
    Steps:
    1. Provoke view change to 1.
    2. Ensure that all the nodes complete view change to 1.
    3. Disconnect one node from the rest of the nodes in the pool.
    4. Provoke view change to 2.
    5. Ensure that all the nodes except for the disconnected one complete
       view change to 2 and the disconnected node remains in view 1.
    6. Provoke view change to 3.
    7. Ensure that all the nodes except for the disconnected one complete
       view change to 3 and the disconnected node remains in view 1.
    8. Connect the disconnected node to the rest of the nodes in the pool.
    9. Ensure that the re-connected node completes view change to 3.
    10. Ensure that all the nodes participate in consensus.
    """
    checkViewNoForNodes(txnPoolNodeSet, 0)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    lagged_node = getNonPrimaryReplicas(txnPoolNodeSet)[-1].node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            lagged_node, stopNode=False)
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        instances_list=range(getRequiredInstances(len(txnPoolNodeSet))))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 2)
    checkViewNoForNodes([lagged_node], 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        instances_list=range(getRequiredInstances(len(txnPoolNodeSet))))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 3)
    checkViewNoForNodes([lagged_node], 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagged_node)
    waitForViewChange(looper, [lagged_node], 3,
                      customTimeout=waits.expectedPoolElectionTimeout(
                          len(txnPoolNodeSet)))
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 3)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_view_change_after_back_to_quorum_with_disconnected_primary(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client,
        tdir, tconf, allPluginsPath):
    assert len(txnPoolNodeSet) == 4

    pr_node = get_master_primary_node(txnPoolNodeSet)
    assert pr_node.name == "Alpha"

    # 1. Initiate view change by restarting the primary (Alpha)
    nodes = ensure_view_change_by_primary_restart(looper,
                                                  txnPoolNodeSet,
                                                  tconf,
                                                  tdir,
                                                  allPluginsPath,
                                                  customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # Now primary should be Beta
    pr_node = get_master_primary_node(nodes)
    assert pr_node.name == "Beta"

    # 2. Stop non-primary node Delta; no view changes are expected
    non_primary_to_stop = [n for n in nodes if n.name == "Delta"][0]
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, non_primary_to_stop)
    looper.removeProdable(non_primary_to_stop)

    remaining_nodes = list(set(nodes) - {non_primary_to_stop})
    # Primary is going to be stopped; remember the instance change message
    # counts to ensure that no view change happens while the number of
    # connected nodes is less than a quorum.
    ic_cnt = {}
    for n in remaining_nodes:
        ic_cnt[n.name] = n.view_changer.spylog.count(ViewChanger.sendInstanceChange.__name__)

    # 3. Disconnect primary
    disconnect_node_and_ensure_disconnected(
        looper, remaining_nodes, pr_node)
    looper.removeProdable(pr_node)

    # Wait for more than the ToleratePrimaryDisconnection timeout and check
    # that no instance change messages were sent.
    looper.runFor(tconf.ToleratePrimaryDisconnection + 5)

    remaining_nodes = list(set(remaining_nodes) - {pr_node})
    for n in remaining_nodes:
        assert ic_cnt[n.name] == n.view_changer.spylog.count(ViewChanger.sendInstanceChange.__name__)

    view_no = checkViewNoForNodes(remaining_nodes)

    # 4. Start Delta (non-primary); the primary (Beta) is still disconnected,
    # but there is now a quorum to choose a new one.
    restartedNode = start_stopped_node(non_primary_to_stop, looper, tconf,
                                       tdir, allPluginsPath,
                                       delay_instance_change_msgs=False)
    remaining_nodes = remaining_nodes + [restartedNode]

    # 5. Check that view change happened.
    waitForViewChange(looper, remaining_nodes,
                      expectedViewNo=(view_no + 1),
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)

    # ensure pool is working properly
    sdk_send_random_and_check(looper, remaining_nodes, sdk_pool_handle,
                              sdk_wallet_client, 3)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
def ensureView(txnPoolNodeSet, looper):
    """
    Ensure that all the nodes in the txnPoolNodeSet are in the same view.
    """
    return waitForViewChange(looper, txnPoolNodeSet)