def restart_nodes(looper, nodeSet, restart_set, tconf, tdir, allPluginsPath,
                  after_restart_timeout=None, start_one_by_one=True,
                  wait_for_elections=True):
    """
    Stop every node in ``restart_set`` and start a fresh ``TestNode`` in its place.

    Both ``nodeSet`` and ``restart_set`` are updated in place: each stopped
    node is replaced, at the same index, by its newly created instance.

    :param looper: looper the nodes are removed from / added to
    :param nodeSet: all pool nodes (mutated in place)
    :param restart_set: nodes to restart (mutated in place)
    :param after_restart_timeout: if truthy, run the looper for this long
        after the nodes are disconnected and before they are started again
    :param start_one_by_one: wait for connections after each started node
        instead of once at the end
    :param wait_for_elections: call ``ensureElectionsDone`` on the whole pool
        when everything is started
    """
    for stopping in restart_set:
        stopping.cleanupOnStopping = True
        stopping.stop()
        looper.removeProdable(stopping)

    # Nodes that were never stopped; restarted ones are added as they come up
    connected = [node for node in nodeSet if node not in restart_set]
    for stopping in restart_set:
        ensure_node_disconnected(looper, stopping, nodeSet, timeout=2)

    if after_restart_timeout:
        looper.runFor(after_restart_timeout)

    # Iterate over a snapshot because restart_set is rewritten in the loop
    for old_node in restart_set.copy():
        helper = PNodeConfigHelper(old_node.name, tconf, chroot=tdir)
        fresh_node = TestNode(old_node.name,
                              config_helper=helper,
                              config=tconf,
                              pluginPaths=allPluginsPath,
                              ha=old_node.nodestack.ha,
                              cliha=old_node.clientstack.ha)
        looper.add(fresh_node)

        nodeSet[nodeSet.index(old_node)] = fresh_node
        restart_set[restart_set.index(old_node)] = fresh_node
        connected.append(fresh_node)

        if start_one_by_one:
            looper.run(checkNodesConnected(connected))

    if not start_one_by_one:
        looper.run(checkNodesConnected(nodeSet))

    if wait_for_elections:
        ensureElectionsDone(looper=looper, nodes=nodeSet)
def test_restart_majority_to_same_view(looper, txnPoolNodeSet, tconf, tdir, allPluginsPath,
                                       sdk_pool_handle, sdk_wallet_client):
    """
    Restart the first three pool nodes, then the remaining ones, and check
    how many times each group reported a possibly inconsistent 3PC state.
    """

    def inconsistent_3pc_reports(pool_node):
        return pool_node.spylog.count(pool_node.on_inconsistent_3pc_state)

    # Add transaction to ledger
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

    majority = txnPoolNodeSet[:3]
    minority = txnPoolNodeSet[3:]

    # Restart majority group
    downtime = tconf.ToleratePrimaryDisconnection + \
        waits.expectedPoolElectionTimeout(len(txnPoolNodeSet))
    # restart_nodes replaces the entries of `majority`, so remember the old instances
    stopped_majority = majority.copy()
    restart_nodes(looper, txnPoolNodeSet, majority, tconf, tdir, allPluginsPath,
                  after_restart_timeout=downtime, start_one_by_one=False,
                  wait_for_elections=False)
    ensureElectionsDone(looper, majority, instances_list=range(2))

    # Nodes in minority group are aware that they might have inconsistent 3PC state
    for pool_node in minority:
        assert inconsistent_3pc_reports(pool_node) == 1

    # Nodes in majority group didn't think so before they were stopped...
    for pool_node in stopped_majority:
        assert inconsistent_3pc_reports(pool_node) == 0

    # ...and don't think so after the restart either
    for pool_node in majority:
        assert inconsistent_3pc_reports(pool_node) == 0

    # Restart minority group
    restart_nodes(looper, txnPoolNodeSet, minority, tconf, tdir, allPluginsPath,
                  after_restart_timeout=downtime, start_one_by_one=False)

    # Check that all nodes are still functional
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
def test_view_change_with_different_prepare_certificate(looper, txnPoolNodeSet,
                                                        sdk_pool_handle,
                                                        sdk_wallet_client):
    """
    Check that a node without pre-prepare but with quorum of prepares wouldn't
    use this transaction as a last in prepare certificate
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
    slow_node = txnPoolNodeSet[-1]
    slow_stasher = slow_node.nodeIbStasher

    # Delay PrePrepares and also the message responses that carry PrePrepares.
    # NOTE(review): everything below is assumed to run while both delays are
    # active - confirm against the original indentation.
    with delay_rules(slow_stasher, ppDelay(delay=sys.maxsize)), \
            delay_rules(slow_stasher,
                        msg_rep_delay(delay=sys.maxsize, types_to_delay=[PREPREPARE])):
        last_ordered = slow_node.master_replica.last_ordered_3pc
        sdk_send_random_request(looper, sdk_pool_handle, sdk_wallet_client)
        looper.run(eventually(check_prepare_certificate,
                              txnPoolNodeSet[:-1],
                              last_ordered[1] + 1))

        for pool_node in txnPoolNodeSet:
            pool_node.view_changer.on_master_degradation()

        expected_certificate = (0, last_ordered[1])
        assert slow_node.master_replica.last_prepared_certificate_in_view() == \
            expected_certificate
        ensureElectionsDone(looper, txnPoolNodeSet)
def testPrimarySelectionAfterViewChange(  # noqa
        looper, txnPoolNodeSet, primaryReplicas, catchup_complete_count):
    """
    Test that primary replica of a protocol instance shifts to a new node after
    a view change.
    """
    # TODO: This test can fail due to view change.
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # Every node must have completed at least one more catch-up than recorded before
    for pool_node in txnPoolNodeSet:
        caught_up_calls = pool_node.spylog.count(pool_node.allLedgersCaughtUp)
        assert caught_up_calls > catchup_complete_count[pool_node.name]

    # Primary replicas after view change.
    # `nodeCount` is not a parameter; it is looked up in the module scope.
    primaries_after = [getPrimaryReplica(txnPoolNodeSet, inst_id)
                       for inst_id in range(getNoInstances(nodeCount))]

    # Primary replicas have moved to the next node
    for before, after in zip(primaryReplicas, primaries_after):
        assert after.node.rank - before.node.rank == 1

    check_rank_consistent_across_each_node(txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
def test_not_set_H_as_maxsize_for_backup_if_is_primary(looper,
                                                       txnPoolNodeSet,
                                                       sdk_pool_handle,
                                                       sdk_wallet_steward,
                                                       tconf,
                                                       tdir,
                                                       allPluginsPath):
    """
    Restart the node holding the primary of backup instance 1 while the pool
    orders LOG_SIZE requests, then check the watermarks of that backup
    replica on the restarted node are (0, LOG_SIZE).
    """
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    backup_primary_node = txnPoolNodeSet[2]
    assert backup_primary_node.replicas._replicas[1].isPrimary

    # Take the node down and order requests without it
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            backup_primary_node,
                                            stopNode=True)
    looper.removeProdable(backup_primary_node)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, LOG_SIZE)

    # Bring it back and let it join the pool again
    restarted_node = start_stopped_node(backup_primary_node, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet[2] = restarted_node
    ensureElectionsDone(looper, txnPoolNodeSet,
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)

    backup_replica = restarted_node.replicas._replicas[1]
    assert backup_replica.isPrimary
    assert backup_replica.h == 0
    assert backup_replica.H == LOG_SIZE
def testPrimaryElectionCase4(case4Setup, looper):
    """
    Case 4 - A node making multiple primary declarations for a particular node.
    Consider 4 nodes A, B, C and D. Lets say node B is malicious and is
    repeatedly declaring Node D as primary
    """
    A, B, C, D = case4Setup

    looper.run(checkNodesConnected(case4Setup))

    # Node B sends multiple declarations of node D's 0th protocol instance as
    # primary to all nodes
    for _ in range(5):
        B.send(Primary(D.name, 0, B.viewNo))

    # No node from node A, node C, node D(node B is malicious anyway so not
    # considering it) should have more than one primary declaration for node
    # D since node D is slow. The one primary declaration for node D,
    # that nodes A, C and D might have would be because of node B
    def at_most_one_declaration_for_d(observer):
        declared = list(observer.elector.primaryDeclarations[0].values())
        assert declared.count(D.name) <= 1

    for observer in (A, C, D):
        looper.run(eventually(at_most_one_declaration_for_d, observer,
                              retryWait=.5, timeout=2))

    ensureElectionsDone(looper=looper, nodes=case4Setup,
                        retryWait=1, timeout=45)

    # Node D should not have any primary replica
    assert not D.hasPrimary
def testNodeDoesNotParticipateUntilCaughtUp(txnPoolNodeSet,
                                            nodes_slow_to_process_catchup_reqs,
                                            sdk_node_created_after_some_txns):
    """
    A new node that joins after some transactions should stash new transactions
    until it has caught up
    :return:
    """
    (looper, new_node,
     sdk_pool_handle, new_steward_wallet_handle) = sdk_node_created_after_some_txns

    txnPoolNodeSet.append(new_node)
    old_nodes = txnPoolNodeSet[:-1]
    pool_size = len(txnPoolNodeSet)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, 5)
    chk_commits_prepares_recvd(0, old_nodes, new_node)

    for old_node in old_nodes:
        old_node.reset_delays_and_process_delayeds()

    # `catchup_delay` is not a parameter; it is looked up in the module scope
    election_timeout = waits.expectedPoolCatchupTime(pool_size) + \
        catchup_delay + \
        waits.expectedPoolElectionTimeout(pool_size)
    ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=election_timeout)
    waitNodeDataEquality(looper, new_node, *old_nodes)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, 2)

    # Commits and Prepares are received by all old nodes
    with pytest.raises(AssertionError):
        # Since nodes discard 3PC messages for already ordered requests.
        chk_commits_prepares_recvd(0, old_nodes, new_node)

    waitNodeDataEquality(looper, new_node, *old_nodes)
def test_no_view_change_until_synced(txnPoolNodeSet, looper, mode):
    """
    Check that instance changes sent while the nodes are in a non-synced
    ``mode`` do not start a view change until the nodes are switched back to
    ``Mode.participating``.
    """
    # emulate catchup by setting non-synced status
    for pool_node in txnPoolNodeSet:
        pool_node.mode = mode

    check_instance_change_count(txnPoolNodeSet, 0)

    # start View Change
    old_view_no = checkViewNoForNodes(txnPoolNodeSet)
    next_view_no = old_view_no + 1
    saved_methods = do_view_change(txnPoolNodeSet)
    for pool_node in txnPoolNodeSet:
        pool_node.view_changer.sendInstanceChange(next_view_no)

    # make sure View Change is not started
    check_no_view_change(looper, txnPoolNodeSet)
    assert old_view_no == checkViewNoForNodes(txnPoolNodeSet)

    # emulate finishing of catchup by setting Participating status
    revert_do_view_change(txnPoolNodeSet, saved_methods)
    for pool_node in txnPoolNodeSet:
        pool_node.mode = Mode.participating

    # make sure that View Change happened
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=next_view_no)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_multiple_view_change_retries_by_timeouts(
        txnPoolNodeSet, looper, tconf, setup,
        sdk_pool_handle, sdk_wallet_client):
    """
    Verifies that a view change is restarted each time
    when the previous one is timed out
    """
    _, initial_view_no, timeout_callback_stats = setup
    inbound_stashers = [pool_node.nodeIbStasher for pool_node in txnPoolNodeSet]

    with delay_rules(inbound_stashers, vcd_delay()):
        start_view_change(txnPoolNodeSet, initial_view_no + 1)

        # Wait until timeout callback is called 3 times
        watchdog_check = eventually(check_watchdog_called_expected_times,
                                    txnPoolNodeSet, timeout_callback_stats, 3,
                                    retryWait=1,
                                    timeout=3 * VIEW_CHANGE_TIMEOUT + 2)
        looper.run(watchdog_check)

        # View changes should fail
        with pytest.raises(AssertionError):
            ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                                customTimeout=1)

    # This view change must be completed with no problems
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    # 4 view changes must have been initiated (initial one + 3 retries)
    for pool_node in txnPoolNodeSet:
        assert pool_node.viewNo - initial_view_no == 4

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
def test_old_instance_change_discarding(txnPoolNodeSet, looper, tconf):
    """
    Trigger master degradation on the first two nodes, wait until no node
    holds instance changes for the next view, then trigger it on the rest and
    check the pool stays in the same view.
    """
    view_no = txnPoolNodeSet[0].viewNo
    next_view_no = view_no + 1
    early_nodes = txnPoolNodeSet[:2]
    late_nodes = txnPoolNodeSet[2:]

    for early in early_nodes:
        early.view_changer.on_master_degradation()

    def instance_changes_discarded():
        for pool_node in txnPoolNodeSet:
            stored = pool_node.view_changer.instanceChanges
            assert not stored.has_view(next_view_no)
            for sender in early_nodes:
                assert not pool_node.view_changer.instanceChanges.has_inst_chng_from(
                    next_view_no, sender.name)

    discard_timeout = tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL + 10
    looper.run(eventually(instance_changes_discarded, timeout=discard_timeout))

    for late in late_nodes:
        late.view_changer.on_master_degradation()

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    for pool_node in txnPoolNodeSet:
        assert pool_node.viewNo == view_no
def test_removed_replica_restored_on_view_change(
        looper, txnPoolNodeSet, sdk_pool_handle,
        sdk_wallet_client, tconf, tdir, allPluginsPath,
        chkFreqPatched, view_change):
    """
    1. Remove replica on some node which is not master primary
    2. Reconnect the node which was master primary so far
    3. Check that nodes and replicas correctly added
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # Drop the last replica of a node that is not the master primary
    affected_node = get_last_master_non_primary_node(txnPoolNodeSet)
    replicas_at_start = affected_node.replicas.num_replicas
    removed_inst_id = replicas_at_start - 1
    affected_node.replicas.remove_replica(removed_inst_id)
    check_replica_removed(affected_node, replicas_at_start, removed_inst_id)

    # trigger view change on all nodes
    old_primary = get_master_primary_node(txnPoolNodeSet)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, old_primary)
    txnPoolNodeSet.remove(old_primary)
    looper.removeProdable(old_primary)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Bring the former primary back as the last node of the pool
    restarted_node = start_stopped_node(old_primary, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet.append(restarted_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    assert replicas_at_start == affected_node.replicas.num_replicas
def test_view_change_retry_by_timeout(
        txnPoolNodeSet, looper, tconf, setup,
        sdk_pool_handle, sdk_wallet_client):
    """
    Verifies that a view change is restarted if it is not completed in time
    """
    old_primary_node, initial_view_no, timeout_callback_stats = setup
    inbound_stashers = [pool_node.nodeIbStasher for pool_node in txnPoolNodeSet]

    with delay_rules(inbound_stashers, vcd_delay()):
        start_view_change(txnPoolNodeSet, initial_view_no + 1)

        # First view change should fail, because of delayed ViewChangeDone
        # messages. This then leads to new view change that we need.
        with pytest.raises(AssertionError):
            ensureElectionsDone(looper=looper,
                                nodes=txnPoolNodeSet,
                                customTimeout=1.5 * VIEW_CHANGE_TIMEOUT)

    # Now as ViewChangeDone messages are unblocked view changes should finish successfully
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    new_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert old_primary_node.name != new_primary_node.name

    # The timeout method was called one time
    check_watchdog_called_expected_times(txnPoolNodeSet, timeout_callback_stats, 1)

    # 2 view changes have been initiated
    for pool_node in txnPoolNodeSet:
        assert pool_node.viewNo - initial_view_no == 2

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
def test_view_change_after_some_txns(txnPoolNodesLooper, txnPoolNodeSet,
                                     some_txns_done, testNodeClass, viewNo,  # noqa
                                     sdk_pool_handle, sdk_wallet_client,
                                     node_config_helper_class, tconf, tdir,
                                     allPluginsPath, tmpdir_factory):
    """
    Check that view change is done after processing some of txns
    """
    pool_looper = txnPoolNodesLooper

    ensure_view_change(pool_looper, txnPoolNodeSet)
    ensureElectionsDone(looper=pool_looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(pool_looper, nodes=txnPoolNodeSet)

    sdk_send_random_and_check(pool_looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 10)
    ensure_all_nodes_have_same_data(pool_looper, txnPoolNodeSet)

    # Shut the whole pool down before replaying
    for pool_node in txnPoolNodeSet:
        pool_looper.removeProdable(pool_node)
        pool_node.stop()

    replay_config = getConfigOnce()
    reload_modules_for_replay(tconf)
    replay_node_class, replay_base_dir = get_replayable_node_class(
        tmpdir_factory, tdir, testNodeClass, replay_config)

    print('-------------Replaying now---------------------')

    for pool_node in txnPoolNodeSet:
        create_replayable_node_and_check(pool_looper, txnPoolNodeSet,
                                         pool_node, replay_node_class,
                                         node_config_helper_class, tconf,
                                         replay_base_dir, allPluginsPath)
def test_restarted_node_complete_vc_by_current_state(looper,
                                                     txnPoolNodeSet,
                                                     tconf,
                                                     tdir,
                                                     allPluginsPath):
    """
    Stop the last node, complete one view change without it, start a second
    one with ViewChangeDone delayed, then restart the node and check it
    propagates the primary for the last *completed* view.
    """
    node_to_restart = txnPoolNodeSet[-1]
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            node_to_restart,
                                            stopNode=True)
    looper.removeProdable(node_to_restart)

    running_nodes = txnPoolNodeSet[:-1]
    completed_before = get_last_completed_view_no(running_nodes)
    ensure_view_change(looper, running_nodes)
    ensureElectionsDone(looper, running_nodes,
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)
    completed_now = get_last_completed_view_no(running_nodes)
    assert completed_now > completed_before

    # Delay VIEW_CHANGE_DONE messages for all nodes
    for running in running_nodes:
        running.nodeIbStasher.delay(vcd_delay(1000))
    ensure_view_change(looper, running_nodes)

    # Start stopped node until other nodes do view_change
    node_to_restart = start_stopped_node(node_to_restart, looper, tconf,
                                         tdir, allPluginsPath)
    node_to_restart.nodeIbStasher.delay(vcd_delay(1000))

    # check, that restarted node use last completed view no from pool, instead of proposed
    looper.run(eventually(complete_propagate_primary,
                          node_to_restart,
                          completed_now,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))
def testPrimaryElectionCase2(case2Setup, looper, keySharedNodes):
    """
    Case 2 - A node making nominations for a multiple other nodes. Consider 4
    nodes A, B, C, and D. Lets say node B is malicious and nominates node C
    to all nodes. Again node B nominates node D to all nodes.
    """
    A, B, C, D = keySharedNodes.nodes.values()

    looper.run(checkNodesConnected(keySharedNodes))

    # Node B sends multiple NOMINATE msgs but only after A has nominated itself
    looper.run(eventually(checkNomination, A, A.name,
                          retryWait=.25, timeout=1))

    instId = getSelfNominationByNode(A)

    BRep, CRep, DRep = (Replica.generateName(each.name, instId)
                        for each in (B, C, D))

    # Node B first sends NOMINATE msgs for Node C to all nodes,
    # then NOMINATE msgs for Node D to all nodes
    for nominee in (CRep, DRep):
        B.send(Nomination(nominee, instId, B.viewNo))

    # Ensure elections are done
    ensureElectionsDone(looper=looper, nodes=keySharedNodes,
                        retryWait=1, timeout=45)

    # All nodes from node A, node C, node D(node B is malicious anyway so
    # not considering it) should have nomination for node C from node B since
    # node B first nominated node C
    for honest in (A, C, D):
        assert honest.elector.nominations[instId][BRep] == CRep
def testPrimaryElectionCase5(case5Setup, looper, keySharedNodes):
    """
    Case 5 - A node making primary declarations for a multiple other nodes.
    Consider 4 nodes A, B, C, and D. Lets say node B is malicious and
    declares node C as primary to all nodes.
    Again node B declares node D as primary to all nodes.
    """
    A, B, C, D = keySharedNodes.nodes.values()

    looper.run(checkNodesConnected(keySharedNodes))

    BRep, CRep, DRep = (Replica.generateName(each.name, 0)
                        for each in (B, C, D))

    # Node B first sends PRIMARY msgs for Node C to all nodes,
    # then PRIMARY msgs for Node D to all nodes
    for declared in (CRep, DRep):
        B.send(Primary(declared, 0, B.viewNo))

    # Ensure elections are done
    ensureElectionsDone(looper=looper, nodes=keySharedNodes,
                        retryWait=1, timeout=45)

    # All nodes from node A, node C, node D(node B is malicious anyway so not
    # considering it) should have primary declarations for node C from node B
    # since node B first nominated node C
    for honest in (A, C, D):
        logger.debug(
            "node {} should have primary declaration for C from node B".format(honest))
        assert honest.elector.primaryDeclarations[0][BRep] == CRep
def test_order_after_demote_and_restart(looper, txnPoolNodeSet,
                                        sdk_pool_handle, sdk_wallet_client, tdir, tconf, allPluginsPath,
                                        sdk_wallet_stewards):
    """
    Demote one node while another one is stopped, restart the stopped node
    and check that it orders requests and ends up with the same current BLS
    keys as the first pool node.
    """

    def current_bls_keys(pool_node):
        key_register = pool_node.master_replica._bls_bft_replica._bls_bft.bls_key_register
        return key_register._current_bls_keys

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, 3, 3)

    primary_node = txnPoolNodeSet[0]
    stopped_node = txnPoolNodeSet[1]
    demoted_node = txnPoolNodeSet[2]

    txnPoolNodeSet.remove(demoted_node)

    # Stop the second node before the demotion is sent
    stopped_node.cleanupOnStopping = True
    stopped_node.stop()
    looper.removeProdable(stopped_node)
    ensure_node_disconnected(looper, stopped_node, txnPoolNodeSet, timeout=2)

    demote_node(looper, sdk_wallet_stewards[2], sdk_pool_handle, demoted_node)

    # Start a fresh instance on the same addresses
    helper = PNodeConfigHelper(stopped_node.name, tconf, chroot=tdir)
    restarted_node = TestNode(stopped_node.name,
                              config_helper=helper,
                              config=tconf,
                              pluginPaths=allPluginsPath,
                              ha=stopped_node.nodestack.ha,
                              cliha=stopped_node.clientstack.ha)
    looper.add(restarted_node)
    txnPoolNodeSet[1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, 1, 1)

    assert current_bls_keys(restarted_node) == current_bls_keys(primary_node)
def test_no_propagated_future_view_change_until_synced(txnPoolNodeSet, looper, mode):
    """
    Check that a node in a non-synced ``mode`` does not follow a view change
    completed by the rest of the pool until it is switched back to
    ``Mode.participating``.
    """
    # the lagging node is the one which will receive ViewChangeDone messages for future view
    pool_view_no = checkViewNoForNodes(txnPoolNodeSet)
    lagged_node = txnPoolNodeSet[(pool_view_no + 3) % len(txnPoolNodeSet)]
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})

    # emulate catchup by setting non-synced status
    lagged_node.mode = mode
    old_view_no = checkViewNoForNodes([lagged_node])

    check_future_vcd_count(lagged_node, 0)

    # delay INSTANCE CHANGE on lagged nodes, so all nodes except the lagging one finish View Change
    # NOTE(review): the whole remainder of the test is assumed to run under
    # this delay - confirm against the original indentation.
    with delay_rules(lagged_node.nodeIbStasher, icDelay()):
        # make sure that View Change happened on all nodes but the lagging one
        ensure_view_change(looper, other_nodes)
        checkProtocolInstanceSetup(looper=looper, nodes=other_nodes,
                                   instances=range(2))
        ensure_all_nodes_have_same_data(looper, nodes=other_nodes)

        check_no_view_change(looper, lagged_node)
        assert old_view_no == checkViewNoForNodes([lagged_node])

        # emulate finishing of catchup by setting Participating status
        lagged_node.mode = Mode.participating

        # make sure that View Change happened on lagging node
        waitForViewChange(looper, [lagged_node],
                          expectedViewNo=old_view_no + 1,
                          customTimeout=10)
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
def test_view_change_on_empty_ledger(txnPoolNodeSet, looper):
    """
    Check that view change is done when no txns in the ledger
    """
    pool = txnPoolNodeSet

    ensure_view_change(looper, pool)
    # The view change is only complete once elections are done and data matches
    ensureElectionsDone(looper=looper, nodes=pool)
    ensure_all_nodes_have_same_data(looper, nodes=pool)
def test_vc_by_current_state(txnPoolNodeSet,
                             looper,
                             tdir,
                             tconf,
                             allPluginsPath):
    """
    Stop the last node, do a view change without it, restart it and check
    that it ends up with the same last completed view number as the pool.
    """
    stopped_node = txnPoolNodeSet[-1]
    view_no_before = stopped_node.view_changer.last_completed_view_no
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            stopped_node,
                                            stopNode=True)
    looper.removeProdable(stopped_node)

    # View change among the nodes that are still running
    running_nodes = txnPoolNodeSet[:-1]
    ensure_view_change(looper, running_nodes)
    ensureElectionsDone(looper, running_nodes,
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)
    view_no_after = txnPoolNodeSet[0].view_changer.last_completed_view_no
    assert view_no_after > view_no_before

    # Restart the node and put the new instance into the pool
    restarted_node = start_stopped_node(stopped_node, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet[-1] = restarted_node
    ensureElectionsDone(looper, txnPoolNodeSet,
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)
    assert restarted_node.view_changer.last_completed_view_no == view_no_after
def test_view_change_with_different_ic(looper, txnPoolNodeSet,
                                       sdk_pool_handle, sdk_wallet_client,
                                       tconf, tdir, allPluginsPath):
    """
    1. panic_node (Delta) send InstanceChange for all nodes.
    2. Restart nodes_to_restart (Beta, Gamma).
    3. nodes_to_restart send InstanceChanges for all nodes.
    4. Ensure elections done.
    """
    panic_node = txnPoolNodeSet[-1]
    initial_view_no = txnPoolNodeSet[0].viewNo

    panic_node.view_changer.on_master_degradation()

    for stale in txnPoolNodeSet[1:3]:
        _restart_node(looper, txnPoolNodeSet, stale, tconf, tdir, allPluginsPath)

    # Slice again: the pool entries are read afresh after the restarts
    for restarted in txnPoolNodeSet[1:3]:
        restarted.view_changer.on_master_degradation()

    def panic_node_in_view_change():
        assert panic_node.view_change_in_progress

    looper.run(eventually(panic_node_in_view_change))

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    for pool_node in txnPoolNodeSet:
        assert pool_node.viewNo > initial_view_no
def test_all_replicas_hold_request_keys(
        perf_chk_patched,
        looper,
        txnPoolNodeSet,
        sdk_wallet_client,
        sdk_pool_handle):
    """
    All replicas whether primary or non primary hold request keys of forwarded
    requests. Once requests are ordered, they request keys are removed from replica.
    """
    tconf = perf_chk_patched
    pool_size = len(txnPoolNodeSet)
    delay_3pc = 2
    for inst_id in (0, 1):
        delay_3pc_messages(txnPoolNodeSet, inst_id, delay_3pc)

    def chk(count):
        # All replicas have same amount of forwarded request keys and all keys
        # are finalised.
        for pool_node in txnPoolNodeSet:
            for replica in pool_node.replicas.values():
                # isPrimary is compared by identity on purpose: a value that
                # is neither True nor False skips both branches
                if replica.isPrimary is False:
                    queue = replica.requestQueues[DOMAIN_LEDGER_ID]
                    assert len(queue) == count
                    for position in range(count):
                        key = replica.requestQueues[DOMAIN_LEDGER_ID][position]
                        assert replica.requests[key].finalised
                elif replica.isPrimary is True:
                    assert len(replica.requestQueues[DOMAIN_LEDGER_ID]) == 0

    first_batch = sdk_signed_random_requests(looper, sdk_wallet_client,
                                             tconf.Max3PCBatchSize - 1)
    first_resps = sdk_send_signed_requests(sdk_pool_handle, first_batch)
    # Only non primary replicas should have all request keys with them
    looper.run(eventually(chk, tconf.Max3PCBatchSize - 1))
    reply_timeout = sdk_eval_timeout(tconf.Max3PCBatchSize - 1,
                                     pool_size,
                                     add_delay_to_timeout=delay_3pc)
    sdk_get_replies(looper, first_resps, timeout=reply_timeout)
    # Replicas should have no request keys with them since they are ordered.
    # Need to wait since one node might not have processed it.
    looper.run(eventually(chk, 0))

    nomination_delay = 1
    for pool_node in txnPoolNodeSet:
        pool_node.nodeIbStasher.delay(nom_delay(nomination_delay))

    ensure_view_change(looper, txnPoolNodeSet)
    second_batch = sdk_signed_random_requests(looper, sdk_wallet_client,
                                              2 * tconf.Max3PCBatchSize)
    second_resps = sdk_send_signed_requests(sdk_pool_handle, second_batch)
    looper.run(eventually(chk, 2 * tconf.Max3PCBatchSize))

    # Since each nomination is delayed and there will be multiple nominations
    # so adding some extra time
    election_timeout = waits.expectedPoolElectionTimeout(pool_size) + \
        pool_size * nomination_delay
    ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=election_timeout)
    sdk_get_replies(looper, second_resps, timeout=election_timeout)
    looper.run(eventually(chk, 0))
def test_view_change_after_max_catchup_rounds(txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client):
    """
    The node should do only a fixed rounds of catchup. For this delay Prepares
    and Commits for 2 non-primary nodes by a large amount which is equivalent
    to loss of Prepares and Commits. Make sure 2 nodes have a different last
    prepared certificate from other two. Then do a view change, make sure view
    change completes and the pool does not process the request that were
    prepared by only a subset of the nodes
    """

    def last_prepared(nodes):
        certificates = [each.master_replica.last_prepared_certificate_in_view()
                        for each in nodes]
        # All nodes have same last prepared
        assert check_if_all_equal_in_list(certificates)
        return certificates[0]

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    summary_before = txnPoolNodeSet[0].ledger_summary

    # The last two non-primary replicas of instance 0 belong to the slow nodes
    slow_replicas = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-2:]
    slow_nodes = [replica.node for replica in slow_replicas]
    fast_nodes = [each for each in txnPoolNodeSet if each not in slow_nodes]

    # Make node slow to process Prepares and Commits
    for slow in slow_nodes:
        slow.nodeIbStasher.delay(pDelay(120, 0))
        slow.nodeIbStasher.delay(cDelay(120, 0))

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 5)
    looper.runFor(3)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    last_prepared_slow = last_prepared(slow_nodes)
    last_prepared_fast = last_prepared(fast_nodes)

    # Check `slow_nodes` and `fast_nodes` set different last_prepared
    assert last_prepared_fast != last_prepared_slow

    # View change complete
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # The requests which were prepared by only a subset of the nodes were
    # not ordered
    assert txnPoolNodeSet[0].ledger_summary == summary_before

    for slow in slow_nodes:
        slow.nodeIbStasher.reset_delays_and_process_delayeds()

    # Make sure pool is functional
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                         sdk_wallet_client, 10, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    last_prepared(txnPoolNodeSet)
def test_no_propagate_request_on_different_prepares_on_backup_before_vc(looper, txnPoolNodeSet,
                                                                        sdk_pool_handle, sdk_wallet_client):
    """
    Send random request and do view change then fast_nodes (2,3 - with
    primary backup replica) will have prepare or send preprepare on backup
    replicas and slow_nodes are have not and transaction will ordered on all
    master replicas. Check last ordered after view change and after another
    one request.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)

    slow_instance = 1
    slow_nodes = txnPoolNodeSet[1:3]
    fast_nodes = [each for each in txnPoolNodeSet if each not in slow_nodes]
    slow_stashers = [each.nodeIbStasher for each in slow_nodes]
    old_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc

    # NOTE(review): the delays are assumed to be lifted right after the view
    # change completes - confirm against the original indentation.
    with delay_rules(slow_stashers, pDelay(instId=slow_instance)), \
            delay_rules(slow_stashers, ppDelay(instId=slow_instance)):
        # send one request
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
        old_view_no = txnPoolNodeSet[0].viewNo
        looper.run(eventually(is_prepared, fast_nodes, 2, slow_instance))

        # trigger view change on all nodes
        ensure_view_change(looper, txnPoolNodeSet)
        # wait for view change done on all nodes
        ensureElectionsDone(looper, txnPoolNodeSet)

    backup_primary = getPrimaryReplica(txnPoolNodeSet, slow_instance).node
    non_primaries = [each for each in txnPoolNodeSet if each is not backup_primary]
    next_seq_no = old_last_ordered[1] + 1

    check_last_ordered(non_primaries,
                       slow_instance,
                       (old_view_no, next_seq_no))

    # Backup primary replica must not advance last_ordered_3pc
    # up to the master's value
    check_last_ordered([backup_primary],
                       slow_instance,
                       (old_view_no, old_last_ordered[1]))

    check_last_ordered(txnPoolNodeSet,
                       txnPoolNodeSet[0].master_replica.instId,
                       (old_last_ordered[0], next_seq_no))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, 1)
    looper.run(eventually(check_last_ordered,
                          txnPoolNodeSet,
                          slow_instance,
                          (txnPoolNodeSet[0].viewNo, 1)))

    # No node had to request missing propagates
    assert all(0 == each.spylog.count(each.request_propagates)
               for each in txnPoolNodeSet)
def test_node_notified_about_primary_election_result(txnPoolNodeSet, looper):
    """
    Check that the ``primary_selected`` call count of every node grows after
    a view change.
    """
    # Snapshot of the call counts before the view change, keyed by node name
    counts_before = {
        pool_node.name: get_count(pool_node, pool_node.primary_selected)
        for pool_node in txnPoolNodeSet
    }

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    for pool_node in txnPoolNodeSet:
        count_now = get_count(pool_node, pool_node.primary_selected)
        assert count_now > counts_before[pool_node.name]
def changeNodeHa(looper, txnPoolNodeSet, tconf, shouldBePrimary, tdir,
                 sdk_pool_handle, sdk_wallet_stewards, sdk_wallet_client):
    """
    Change the node and client HA of one pool node and restart it on the new
    addresses.

    The first node whose ``has_master_primary`` equals ``shouldBePrimary`` is
    selected. Its entry in ``txnPoolNodeSet`` is replaced in place by the
    restarted instance.

    :param looper: looper the node is removed from / added to
    :param txnPoolNodeSet: pool nodes (mutated in place)
    :param tconf: config passed to the restarted node
    :param shouldBePrimary: required ``has_master_primary`` value of the node
    :param tdir: chroot for the restarted node's config helper
    :param sdk_wallet_stewards: steward wallets, indexed like the pool nodes
    :raises AssertionError: if no node matches ``shouldBePrimary``
    """
    # prepare new ha for node and client stack
    subjectedNode = None
    node_index = None
    for candidate_index, candidate in enumerate(txnPoolNodeSet):
        if shouldBePrimary == candidate.has_master_primary:
            subjectedNode = candidate
            node_index = candidate_index
            break

    # Fail with a clear message instead of an AttributeError on None below
    assert subjectedNode is not None, \
        "no node with has_master_primary == {}".format(shouldBePrimary)

    nodeStackNewHA, clientStackNewHA = genHa(2)
    logger.debug("change HA for node: {} to {}".format(
        subjectedNode.name, (nodeStackNewHA, clientStackNewHA)))

    # change HA
    sdk_wallet_steward = sdk_wallet_stewards[node_index]
    node_dest = hexToFriendly(subjectedNode.nodestack.verhex)
    sdk_send_update_node(looper, sdk_wallet_steward,
                         sdk_pool_handle,
                         node_dest, subjectedNode.name,
                         nodeStackNewHA[0], nodeStackNewHA[1],
                         clientStackNewHA[0], clientStackNewHA[1],
                         services=[VALIDATOR])

    # stop node for which HA will be changed
    subjectedNode.stop()
    looper.removeProdable(subjectedNode)

    # start node with new HA
    config_helper = PNodeConfigHelper(subjectedNode.name, tconf, chroot=tdir)
    restartedNode = TestNode(subjectedNode.name,
                             config_helper=config_helper,
                             config=tconf, ha=nodeStackNewHA,
                             cliha=clientStackNewHA)
    looper.add(restartedNode)
    # Use the stored index rather than the loop variable left over from the search
    txnPoolNodeSet[node_index] = restartedNode
    looper.run(checkNodesConnected(txnPoolNodeSet, customTimeout=70))

    electionTimeout = waits.expectedPoolElectionTimeout(
        nodeCount=len(txnPoolNodeSet),
        numOfReelections=3)
    ensureElectionsDone(looper,
                        txnPoolNodeSet,
                        retryWait=1,
                        customTimeout=electionTimeout)

    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              8)
def all_nodes_view_change(
        looper,
        txnPoolNodeSet,
        sdk_pool_handle,
        sdk_wallet_client):
    """
    Order five rounds of two random requests each, then do a view change on
    all nodes and wait until they agree on primaries and data.
    """
    rounds = 5
    # NOTE(review): only the send is assumed to be inside the loop - confirm
    # against the original indentation.
    for _ in range(rounds):
        sdk_send_random_and_check(looper, txnPoolNodeSet,
                                  sdk_pool_handle, sdk_wallet_client, 2)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_primary_selection_after_primary_demotion_and_pool_restart(looper,
                                                                   txnPoolNodeSet,
                                                                   sdk_pool_handle,
                                                                   sdk_wallet_steward,
                                                                   txnPoolMasterNodes,
                                                                   tdir, tconf):
    """
    Demote primary and restart the pool.
    Pool should select new primary and have viewNo=0 after restart.
    """

    def without_demoted(nodes):
        return [each for each in nodes if each.name != master_node.name]

    logger.info("1. turn off the node which has primary replica for master instanse")
    master_node = txnPoolMasterNodes[0]
    node_dest = hexToFriendly(master_node.nodestack.verhex)
    # An empty services list demotes the node
    sdk_send_update_node(looper, sdk_wallet_steward,
                         sdk_pool_handle,
                         node_dest, master_node.name,
                         None, None,
                         None, None,
                         services=[])

    restNodes = without_demoted(txnPoolNodeSet)
    ensureElectionsDone(looper, restNodes)

    # ensure pool is working properly
    logger.info("2. restart pool")
    # Stopping existing nodes
    for running in txnPoolNodeSet:
        running.stop()
        looper.removeProdable(running)

    # Starting nodes again by creating `Node` objects since that simulates
    # what happens when starting the node with script
    restartedNodes = []
    for stopped in txnPoolNodeSet:
        helper = PNodeConfigHelper(stopped.name, tconf, chroot=tdir)
        fresh = TestNode(stopped.name,
                         config_helper=helper,
                         config=tconf,
                         ha=stopped.nodestack.ha,
                         cliha=stopped.clientstack.ha)
        looper.add(fresh)
        restartedNodes.append(fresh)

    restNodes = without_demoted(restartedNodes)

    looper.run(checkNodesConnected(restNodes))
    ensureElectionsDone(looper, restNodes)
    checkViewNoForNodes(restNodes, 0)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 3)

    primariesIdxs = getPrimaryNodesIdxs(restNodes)
    new_master_primary = restNodes[primariesIdxs[0]]
    assert new_master_primary.name != master_node.name
def test_primary_selection_after_demoted_primary_node_promotion(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward,
        txnPoolMasterNodes):
    """
    Demote the master primary, let the pool elect again, promote the node
    back and verify the primary indexes at every stage.
    """
    assert len(txnPoolNodeSet) == 4

    # Initial layout: two instances, primaries on nodes 0 and 1.
    initial_primaries = getPrimaryNodesIdxs(txnPoolNodeSet)
    assert len(initial_primaries) == 2
    assert initial_primaries[0] == 0
    assert initial_primaries[1] == 1

    demoted = txnPoolMasterNodes[0]

    # Demotion is an update-node request with an empty services list.
    node_dest = hexToFriendly(demoted.nodestack.verhex)
    sdk_send_update_node(looper, sdk_wallet_steward, sdk_pool_handle,
                         node_dest, demoted.name,
                         None, None,
                         None, None,
                         services=[])

    others = []
    for candidate in txnPoolNodeSet:
        if candidate.name != demoted.name:
            others.append(candidate)
    ensureElectionsDone(looper, others)

    # A single instance is left; its primary sits at index 1 of `others`.
    reduced_primaries = getPrimaryNodesIdxs(others)
    assert len(reduced_primaries) == 1
    assert reduced_primaries[0] == 1

    # The pool must still order requests.
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 3)

    # Promotion: the same request, now with the VALIDATOR service.
    sdk_send_update_node(looper, sdk_wallet_steward, sdk_pool_handle,
                         node_dest, demoted.name,
                         None, None,
                         None, None,
                         services=[VALIDATOR])

    # The pool must still order requests.
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 3)

    # Two instances again, with primaries on nodes 2 and 3.
    restored_primaries = getPrimaryNodesIdxs(txnPoolNodeSet)
    assert len(restored_primaries) == 2
    assert restored_primaries[0] == 2
    assert restored_primaries[1] == 3
def changeNodeHa(looper, txnPoolNodeSet, tdirWithPoolTxns,
                 poolTxnData, poolTxnStewardNames, tconf, shouldBePrimary):
    """
    Change the node and client stack HA of one pool node and verify the pool.

    The first node whose `primaryReplicaNo` is 0 (when `shouldBePrimary` is
    truthy) or is not 0 (otherwise) is selected. Its HA is changed through
    its steward, the node is stopped and re-created on the new HA, and the
    entry in `txnPoolNodeSet` is replaced in place with the new node.

    :raises AssertionError: if no node in `txnPoolNodeSet` matches the
        requested role.
    """
    # Pick the node to operate on together with its steward credentials.
    subjectedNode = None
    subjectedIndex = None
    stewardName = None
    stewardsSeed = None
    for nodeIndex, n in enumerate(txnPoolNodeSet):
        if (shouldBePrimary and n.primaryReplicaNo == 0) or \
                (not shouldBePrimary and n.primaryReplicaNo != 0):
            subjectedNode = n
            # Remember the position explicitly instead of relying on the
            # loop variable still being bound after the loop.
            subjectedIndex = nodeIndex
            stewardName = poolTxnStewardNames[nodeIndex]
            stewardsSeed = poolTxnData["seeds"][stewardName].encode()
            break

    # Fail with a clear message rather than an AttributeError on None below.
    assert subjectedNode is not None, \
        "no node in the pool matches shouldBePrimary={}".format(
            shouldBePrimary)

    # Prepare new HA for the node and client stacks.
    nodeStackNewHA, clientStackNewHA = genHa(2)
    logger.debug("change HA for node: {} to {}".
                 format(subjectedNode.name,
                        (nodeStackNewHA, clientStackNewHA)))

    nodeSeed = poolTxnData["seeds"][subjectedNode.name].encode()

    # Send the HA change and wait for sufficient replies.
    stewardClient, req = changeHA(looper, tconf, subjectedNode.name,
                                  nodeSeed, nodeStackNewHA,
                                  stewardName, stewardsSeed)
    f = getMaxFailures(len(stewardClient.nodeReg))
    looper.run(eventually(checkSufficientRepliesRecvd, stewardClient.inBox,
                          req.reqId, f, retryWait=1, timeout=20))

    # Stop the node whose HA was changed.
    subjectedNode.stop()
    looper.removeProdable(subjectedNode)

    # Start a new node object on the new HA and put it into the pool list.
    restartedNode = TestNode(subjectedNode.name,
                             basedirpath=tdirWithPoolTxns,
                             config=tconf,
                             ha=nodeStackNewHA,
                             cliha=clientStackNewHA)
    looper.add(restartedNode)
    txnPoolNodeSet[subjectedIndex] = restartedNode

    looper.run(checkNodesConnected(txnPoolNodeSet, overrideTimeout=70))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=1, timeout=10)

    # Start a second client and check that it connects to the nodes.
    anotherClient, _ = genTestClient(tmpdir=tdirWithPoolTxns,
                                     usePoolLedger=True)
    looper.add(anotherClient)
    looper.run(eventually(anotherClient.ensureConnectedToNodes))

    stewardWallet = Wallet(stewardName)
    stewardWallet.addIdentifier(signer=SimpleSigner(seed=stewardsSeed))
    sendReqsToNodesAndVerifySuffReplies(
        looper, stewardWallet, stewardClient, 8)
    looper.run(eventually(checkIfGenesisPoolTxnFileUpdated,
                          *txnPoolNodeSet, stewardClient, anotherClient,
                          retryWait=1, timeout=10))
    looper.removeProdable(stewardClient)
def testZStackNodeReconnection(tconf, looper, txnPoolNodeSet, client1,
                               wallet1, tdir, client1Connected):
    """
    Stop a node that holds no primary, wait until the others see it as
    disconnected, re-create it on the same addresses and check that the
    pool reconnects, elects and keeps ordering requests.
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 1)

    without_primary = [candidate for candidate in txnPoolNodeSet
                       if not candidate.hasPrimary]
    nodeToCrash = without_primary[0]
    crash_position = txnPoolNodeSet.index(nodeToCrash)
    otherNodes = [peer for peer in txnPoolNodeSet if peer != nodeToCrash]

    def checkFlakyConnected(conn=True):
        # Every other node must (or must not) list the crashed node's stack
        # name among its connections, depending on `conn`.
        for peer in otherNodes:
            is_linked = \
                nodeToCrash.nodestack.name in peer.nodestack.connecteds
            assert is_linked if conn else not is_linked

    checkFlakyConnected(True)

    nodeToCrash.stop()
    logger.debug('Stopped node {}'.format(nodeToCrash))
    looper.removeProdable(nodeToCrash)
    looper.runFor(1)
    stopNodes([nodeToCrash], looper)

    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, False,
                          retryWait=1, timeout=60))
    looper.runFor(1)

    # Build the replacement node from the same directories and addresses.
    helper = PNodeConfigHelper(nodeToCrash.name, tconf, chroot=tdir)
    replacement = TestNode(nodeToCrash.name,
                           ledger_dir=helper.ledger_dir,
                           keys_dir=helper.keys_dir,
                           genesis_dir=helper.genesis_dir,
                           plugins_dir=helper.plugins_dir,
                           config=tconf,
                           ha=nodeToCrash.nodestack.ha,
                           cliha=nodeToCrash.clientstack.ha)
    looper.add(replacement)
    txnPoolNodeSet[crash_position] = replacement

    # TODO Select or create the timeout from 'waits'. Don't use constant.
    looper.run(eventually(checkFlakyConnected, True,
                          retryWait=2, timeout=50))
    ensureElectionsDone(looper, txnPoolNodeSet, retryWait=2)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 10)
def testClientConnectToRestartedNodes(looper, txnPoolNodeSet,
                                      tdirWithPoolTxns, poolTxnClientNames,
                                      poolTxnData, tconf, poolTxnNodeNames,
                                      allPluginsPath):
    """
    Connect a client, stop every node, create a new node for each name in
    `poolTxnNodeNames`, and check that the same client can send requests
    to the new pool.
    """
    client_name = poolTxnClientNames[-1]
    newClient, w = genTestClient(tmpdir=tdirWithPoolTxns,
                                 nodes=txnPoolNodeSet,
                                 name=client_name,
                                 usePoolLedger=True)
    looper.add(newClient)
    ensureClientConnectedToNodesAndPoolLedgerSame(looper, newClient,
                                                  *txnPoolNodeSet)
    sendReqsToNodesAndVerifySuffReplies(looper, w, newClient, 1, 1)

    for running in txnPoolNodeSet:
        running.stop()
        looper.removeProdable(running)

    # looper.run(newClient.ensureDisconnectedToNodes(timeout=60))

    # From here on the name refers to the newly created nodes.
    txnPoolNodeSet = []
    for node_name in poolTxnNodeNames:
        created = TestNode(node_name,
                           basedirpath=tdirWithPoolTxns,
                           base_data_dir=tdirWithPoolTxns,
                           config=tconf,
                           pluginPaths=allPluginsPath)
        looper.add(created)
        txnPoolNodeSet.append(created)

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    def chk():
        # Every new node must report that it is participating.
        for created in txnPoolNodeSet:
            assert created.isParticipating

    ready_timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet))
    looper.run(eventually(chk, retryWait=1, timeout=ready_timeout))

    bootstrapClientKeys(w.defaultId, w.getVerkey(), txnPoolNodeSet)

    request = sendRandomRequest(w, newClient)
    waitForSufficientRepliesForRequests(looper, newClient,
                                        requests=[request])
    ensureClientConnectedToNodesAndPoolLedgerSame(looper, newClient,
                                                  *txnPoolNodeSet)

    sendReqsToNodesAndVerifySuffReplies(looper, w, newClient, 3, 1)
def testNodesConnectsWhenOneNodeIsLate(allPluginsPath, tdir_for_func,
                                       tconf_for_func,
                                       looper_without_nodeset_for_func,
                                       nodeReg):
    """
    Start three nodes, let them elect, then start the fourth one and check
    that elections complete for the whole pool.
    """
    looper = looper_without_nodeset_for_func
    initLocalKeys(tdir_for_func, tconf_for_func, nodeReg)

    nodes = []
    names = list(nodeReg.keys())
    logger.debug("Node names: {}".format(names))

    def create(name):
        # Build one node and register it in `nodes`.
        helper = PNodeConfigHelper(name, tconf_for_func,
                                   chroot=tdir_for_func)
        created = TestNode(name, nodeReg,
                           config_helper=helper,
                           config=tconf_for_func,
                           pluginPaths=allPluginsPath)
        nodes.append(created)
        return created

    for node_name in names:
        create(node_name)

    logger.debug("Creating keys")

    for member in nodes:
        tellKeysToOthers(member, nodes)

    early_nodes = nodes[:3]
    for member in early_nodes:
        looper.add(member)

    looper.run(checkNodesConnected(nodes[:3]))

    # Elections must complete with only the first three nodes running.
    ensureElectionsDone(looper, nodes[:3], instances_list=range(2))

    # The fourth node joins late and has to learn the primaries from the
    # other nodes.
    looper.add(nodes[3])

    # Elections must be done for the updated pool.
    ensureElectionsDone(looper, nodes)
    stopNodes(nodes, looper)
    for member in nodes:
        looper.removeProdable(member)
def test_order_after_demote_and_restart(looper, txnPoolNodeSet,
                                        sdk_pool_handle, sdk_wallet_client,
                                        tdir, tconf, allPluginsPath,
                                        sdk_wallet_stewards):
    """
    Stop one node, demote another while it is down, re-create the stopped
    node and check that it orders requests and holds the same current BLS
    keys as the first node of the pool.
    """

    def get_current_bls_keys(node):
        # Reach into the master replica's BLS key register.
        register = \
            node.master_replica._bls_bft_replica._bls_bft.bls_key_register
        return register._current_bls_keys

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle,
                                         sdk_wallet_client, 3, 3)

    primary_node = txnPoolNodeSet[0]
    node_to_stop = txnPoolNodeSet[1]
    node_to_demote = txnPoolNodeSet[2]

    txnPoolNodeSet.remove(node_to_demote)

    node_to_stop.cleanupOnStopping = True
    node_to_stop.stop()
    looper.removeProdable(node_to_stop)
    ensure_node_disconnected(looper, node_to_stop, txnPoolNodeSet,
                             timeout=2)

    demote_node(looper, sdk_wallet_stewards[2], sdk_pool_handle,
                node_to_demote)

    helper = PNodeConfigHelper(node_to_stop.name, tconf, chroot=tdir)
    restarted_node = TestNode(node_to_stop.name,
                              config_helper=helper,
                              config=tconf,
                              pluginPaths=allPluginsPath,
                              ha=node_to_stop.nodestack.ha,
                              cliha=node_to_stop.clientstack.ha)
    looper.add(restarted_node)
    # The stopped node is still at index 1 after the demoted node's removal.
    txnPoolNodeSet[1] = restarted_node

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                        check_primaries=False)

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle,
                                         sdk_wallet_client, 1, 1)

    restarted_keys = get_current_bls_keys(restarted_node)
    assert restarted_keys == get_current_bls_keys(primary_node)
def check_view_change_adding_new_node(looper, tdir, tconf, allPluginsPath,
                                      txnPoolNodeSet,
                                      sdk_pool_handle,
                                      sdk_wallet_client,
                                      sdk_wallet_steward,
                                      slow_nodes=None,
                                      delay_commit=False,
                                      delay_pre_prepare=False):
    """
    Add a new node while selected nodes have 3PC messages delayed, trigger
    a view change and check that the pool stays functional.

    :param slow_nodes: nodes whose inbound 3PC messages are delayed;
        ``None`` (the default) means no slow nodes
    :param delay_commit: delay COMMIT messages on the slow nodes
    :param delay_pre_prepare: delay PRE-PREPARE messages, and message
        replies carrying PRE-PREPAREs, on the slow nodes
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if slow_nodes is None:
        slow_nodes = []

    # Pre-requisite: walk the pool through views 1, 2 and 3.
    for viewNo in range(1, 4):
        trigger_view_change(txnPoolNodeSet)
        waitForViewChange(looper, txnPoolNodeSet, viewNo)
        ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=30)

    # Collect the delays to apply to the slow nodes.
    fast_nodes = [node for node in txnPoolNodeSet if node not in slow_nodes]
    slow_stashers = [slow_node.nodeIbStasher for slow_node in slow_nodes]
    delayers = []
    if delay_pre_prepare:
        delayers.append(ppDelay())
        delayers.append(msg_rep_delay(types_to_delay=[PREPREPARE]))
    if delay_commit:
        delayers.append(cDelay())

    # NOTE(review): original indentation was lost; the extent of this
    # `with` block is reconstructed -- confirm against history.
    with delay_rules_without_processing(slow_stashers, *delayers):
        # Add the new node through the fast nodes only.
        new_node = add_new_node(looper,
                                fast_nodes,
                                sdk_pool_handle,
                                sdk_wallet_steward,
                                tdir,
                                tconf,
                                allPluginsPath)
        # Keep a snapshot of the pool without the new node.
        old_set = list(txnPoolNodeSet)
        txnPoolNodeSet.append(new_node)

        # Trigger the view change on all nodes, including the new one.
        trigger_view_change(txnPoolNodeSet)

        # The view change must eventually finish on the pre-existing nodes.
        waitForViewChange(looper, old_set, 4)
        ensureElectionsDone(looper, old_set)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client, sdk_pool_handle)
def test_no_propagate_request_on_different_last_ordered_on_master_before_vc(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    """
    Delay COMMITs on two nodes, send a request and change the view while
    the fast nodes have already ordered it on the master instance. Then
    check ordering on all nodes and that no node requested propagates.
    """
    global batches_count

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)
    batches_count += 1

    first_node = txnPoolNodeSet[0]
    master_instance = first_node.master_replica.instId
    slow_nodes = txnPoolNodeSet[1:3]
    fast_nodes = []
    for candidate in txnPoolNodeSet:
        if candidate not in slow_nodes:
            fast_nodes.append(candidate)
    nodes_stashers = [slow.nodeIbStasher for slow in slow_nodes]
    old_last_ordered = txnPoolNodeSet[0].master_replica.last_ordered_3pc
    assert batches_count == old_last_ordered[1]

    # NOTE(review): original indentation was lost; the extent of this
    # `with` block is reconstructed -- confirm against history.
    with delay_rules(nodes_stashers, cDelay()):
        # One request is sent while COMMITs are delayed on the slow nodes.
        requests = sdk_send_random_requests(looper, sdk_pool_handle,
                                            sdk_wallet_client, 1)
        batches_count += 1
        last_ordered_for_slow = slow_nodes[0].master_replica.last_ordered_3pc
        old_view_no = txnPoolNodeSet[0].viewNo
        looper.run(
            eventually(check_last_ordered,
                       fast_nodes,
                       master_instance,
                       (old_view_no, batches_count)))

        # Start a view change on every node ...
        ensure_view_change(looper, txnPoolNodeSet)
        # ... and wait until it is done everywhere.
        ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=60)

    batches_count += 1

    for reply in sdk_get_replies(looper, requests):
        sdk_check_reply(reply)

    # The new primary sends a PrePrepare for the new view.
    looper.run(
        eventually(check_last_ordered,
                   txnPoolNodeSet,
                   master_instance,
                   (old_view_no + 1, batches_count)))

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    assert all(0 == member.spylog.count(member.request_propagates)
               for member in txnPoolNodeSet)
def test_view_change_not_happen_if_ic_is_discarded(looper, txnPoolNodeSet,
                                                   sdk_pool_handle,
                                                   sdk_wallet_client,
                                                   tconf, tdir,
                                                   allPluginsPath):
    """
    1. panic_node (the last node) reports master degradation.
    2. Restart nodes_to_restart (the nodes at indexes 1 and 2).
    3. Wait until no node holds the InstanceChange from panic_node.
    4. nodes_to_restart report master degradation.
    5. Every node holds InstanceChanges from nodes_to_restart.
    6. Elections are done and the view number did not change.
    """
    nodes_to_restart = txnPoolNodeSet[1:3]
    panic_node = txnPoolNodeSet[-1]
    view_no = txnPoolNodeSet[0].viewNo
    next_view = view_no + 1

    panic_node.view_changer.on_master_degradation()
    for stale in nodes_to_restart:
        restart_node(looper, txnPoolNodeSet, stale, tconf, tdir,
                     allPluginsPath)
    # Re-read the slice to pick up the node objects now in the pool list.
    nodes_to_restart = txnPoolNodeSet[1:3]

    def check_old_ic_discarded():
        # No node may still hold panic_node's InstanceChange.
        vct_services = [member.master_replica._view_change_trigger_service
                        for member in txnPoolNodeSet]
        assert all(
            not service._instance_changes.has_inst_chng_from(
                next_view, panic_node.name)
            for service in vct_services)

    looper.run(eventually(
        check_old_ic_discarded,
        timeout=tconf.OUTDATED_INSTANCE_CHANGES_CHECK_INTERVAL + 1))

    for restarted in nodes_to_restart:
        restarted.view_changer.on_master_degradation()

    def check_ic():
        # Every node must hold an InstanceChange from each restarted node.
        for member in txnPoolNodeSet:
            vct_service = member.master_replica._view_change_trigger_service
            assert all(
                vct_service._instance_changes.has_inst_chng_from(
                    next_view, sender.name)
                for sender in nodes_to_restart)

    looper.run(eventually(check_ic))

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    for member in txnPoolNodeSet:
        assert member.viewNo == view_no
def test_replica_removing_with_primary_disconnected(looper,
                                                    txnPoolNodeSet,
                                                    sdk_pool_handle,
                                                    sdk_wallet_client,
                                                    tconf,
                                                    tdir,
                                                    allPluginsPath):
    """
    1. Disconnect the node at index 1 and drop it from the pool list.
    2. Check that the replica of instance 1 was removed on all other nodes.
    3. Start the stopped node again.
    4. Trigger a view change.
    5. Check that the node has the initial number of replicas again.
    """
    start_replicas_count = txnPoolNodeSet[0].replicas.num_replicas
    instance_to_remove = 1
    node = txnPoolNodeSet[instance_to_remove]

    # Take the node out of the pool.
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node)
    txnPoolNodeSet.remove(node)
    looper.removeProdable(node)

    def check_replica_removed_on_all_nodes():
        # Runs over the nodes that are still in the pool list.
        for member in txnPoolNodeSet:
            check_replica_removed(member,
                                  start_replicas_count,
                                  instance_to_remove)

    removal_timeout = tconf.TolerateBackupPrimaryDisconnection * 4
    looper.run(
        eventually(check_replica_removed_on_all_nodes,
                   timeout=removal_timeout))
    assert not node.monitor.isMasterDegraded()
    assert len(node.requests) == 0

    # Bring the node back and re-join it to the pool list.
    node = start_stopped_node(node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Start the view change and wait for view 1.
    trigger_view_change(txnPoolNodeSet)
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.NEW_VIEW_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # The restarted node must have the initial number of replicas.
    assert start_replicas_count == node.replicas.num_replicas
def test_view_change_with_instance_change_lost_due_to_restarts(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        tconf, tdir, allPluginsPath):
    """
    1. some_nodes (indexes 1 and 2) send an InstanceChange.
    2. other_nodes (indexes 2 and 3) are restarted.
    3. The last node of the pool sends an InstanceChange.
    4. The view change happens, elections are done and the pool works.
    """
    current_view_no = txnPoolNodeSet[0].viewNo
    target_view_no = current_view_no + 1
    some_nodes = txnPoolNodeSet[1:3]
    other_nodes = txnPoolNodeSet[2:4]

    for sender in some_nodes:
        send_test_instance_change(sender)

    def check_ic_delivery():
        # Every node must hold an InstanceChange from each sender.
        for member in txnPoolNodeSet:
            vct_service = member.master_replica._view_change_trigger_service
            assert all(
                vct_service._instance_changes.has_inst_chng_from(
                    target_view_no, sender.name)
                for sender in some_nodes)

    looper.run(eventually(check_ic_delivery))

    restart_nodes(looper, txnPoolNodeSet, other_nodes, tconf, tdir,
                  allPluginsPath, start_one_by_one=False)

    last_node = txnPoolNodeSet[-1]
    send_test_instance_change(last_node)
    waitForViewChange(looper, txnPoolNodeSet, current_view_no + 1,
                      customTimeout=3 * FRESHNESS_TIMEOUT)

    ensureElectionsDone(looper, txnPoolNodeSet)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client, sdk_pool_handle)
def txnPoolNodeSet(patchPluginManager, txnPoolNodesLooper,
                   tdirWithPoolTxns, tdirWithDomainTxns, tconf,
                   poolTxnNodeNames, allPluginsPath,
                   tdirWithNodeKeepInited, testNodeClass):
    """
    Create one node per name in `poolTxnNodeNames`, add each to the looper,
    wait until they are connected and elections are done, and return them.
    """
    created = []
    for node_name in poolTxnNodeNames:
        member = testNodeClass(node_name,
                               basedirpath=tdirWithPoolTxns,
                               config=tconf,
                               pluginPaths=allPluginsPath)
        txnPoolNodesLooper.add(member)
        created.append(member)

    txnPoolNodesLooper.run(checkNodesConnected(created))
    ensureElectionsDone(looper=txnPoolNodesLooper, nodes=created,
                        retryWait=1, timeout=20)
    return created
def test_master_primary_different_from_previous_view_for_itself(
        txnPoolNodeSet, looper, client1, wallet1):
    """
    The old master primary is patched to nominate itself for the master
    instance again. After the view change the master primary must be a
    different node, and every other node must have discarded that
    nomination exactly once.
    """
    old_view_no = checkViewNoForNodes(txnPoolNodeSet)
    slowed_primary = slow_primary(txnPoolNodeSet, 0, delay=10)
    old_pr_node = slowed_primary.node

    def _get_undecided_inst_id(self):
        undecideds = [inst_id
                      for inst_id, replica in enumerate(self.replicas)
                      if replica.isPrimary is None]
        # Always point the nomination at the master instance.
        return undecideds, 0

    # Bind the replacement to the old primary's elector so that it
    # nominates itself again in the new view.
    old_pr_node.elector._get_undecided_inst_id = types.MethodType(
        _get_undecided_inst_id, old_pr_node.elector)

    # The view change takes place.
    provoke_and_wait_for_view_change(looper, txnPoolNodeSet,
                                     old_view_no + 1, wallet1, client1)

    # Elections complete.
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # The master primary must have moved to a different node.
    new_primary_name = getPrimaryReplica(txnPoolNodeSet, 0).node.name
    assert new_primary_name != old_pr_node.name

    # Each of the other nodes discarded the old primary's nomination once.
    for member in txnPoolNodeSet:
        if member != old_pr_node:
            assert countDiscarded(member.elector,
                                  'of master in previous view too') == 1

    # Requests are still processed under the new primary.
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)
def testSelfNominationDelay(tdir_for_func):
    """
    Node A is started first with its self nomination delayed by 30 seconds.
    Elections must finish without node A holding a primary, and node A's
    `decidePrimaries` must be called within 30 seconds.
    """
    nodeNames = ["testA", "testB", "testC", "testD"]
    first_name = nodeNames[0]
    with TestNodeSet(names=nodeNames, tmpdir=tdir_for_func) as nodeSet, \
            Looper(nodeSet) as looper:
        prepareNodeSet(looper, nodeSet)

        delay = 30
        # Node A goes in first, with its self nomination postponed.
        nodeA = addNodeBack(nodeSet, looper, first_name)
        nodeA.delaySelfNomination(delay)

        nodesBCD = [addNodeBack(nodeSet, looper, later_name)
                    for later_name in nodeNames[1:]]

        # Node A must be ready before anything else is checked, to show
        # that it is the one delaying its self nomination.
        looper.run(
            eventually(lambda: assertExp(nodeA.isReady()),
                       retryWait=1, timeout=5))

        # Elections should be done.
        ensureElectionsDone(looper=looper,
                            nodes=nodeSet,
                            retryWait=1,
                            timeout=10)

        # Node A should not hold any primary replica.
        looper.run(
            eventually(lambda: assertExp(not nodeA.hasPrimary),
                       retryWait=1, timeout=10))

        # Within 30 seconds node A's `decidePrimaries` must have been
        # recorded in its spylog.
        looper.run(
            eventually(lambda: assertExp(
                len(nodeA.spylog.getAll(
                    Node.decidePrimaries.__name__)) > 0
            ), retryWait=1, timeout=30))
def test_view_change_with_next_primary_stopped(looper, txnPoolNodeSet,
                                               sdk_pool_handle,
                                               sdk_wallet_client):
    """
    Disconnect the node that would be primary in the next view, trigger a
    view change on the rest and check that they end up two views ahead
    with a functional pool.
    """
    old_view_no = checkViewNoForNodes(txnPoolNodeSet)
    next_primary = get_next_primary_name(txnPoolNodeSet, old_view_no + 1)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            next_primary)

    remaining_nodes = []
    for member in txnPoolNodeSet:
        if member.name != next_primary:
            remaining_nodes.append(member)

    trigger_view_change(remaining_nodes, old_view_no + 1)
    ensureElectionsDone(looper, remaining_nodes, instances_list=range(2),
                        customTimeout=15)
    sdk_ensure_pool_functional(looper, remaining_nodes,
                               sdk_wallet_client, sdk_pool_handle)

    reached_view_no = checkViewNoForNodes(remaining_nodes)
    assert reached_view_no == old_view_no + 2
def test_new_primary_lagging_behind(looper, txnPoolNodeSet,
                                    sdk_wallet_client, sdk_pool_handle,
                                    tconf):
    """
    Delay COMMITs and PREPAREs on the node that would be the next primary,
    order CHK_FREQ requests and change the view. The pool must end up two
    views ahead of the initial view and remain functional.
    """
    initial_view_no = checkViewNoForNodes(txnPoolNodeSet)
    next_primary_name = get_next_primary_name(txnPoolNodeSet,
                                              initial_view_no + 1)
    matching = [member for member in txnPoolNodeSet
                if member.name == next_primary_name]
    next_primary = matching[0]
    expected_primary_name = get_next_primary_name(txnPoolNodeSet,
                                                  initial_view_no + 2)

    # NOTE(review): original indentation was lost; the extent of this
    # `with` block is reconstructed -- confirm against history.
    # While delayed, the next primary cannot stabilize a checkpoint.
    with delay_rules_without_processing(next_primary.nodeIbStasher,
                                        cDelay(), pDelay()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, CHK_FREQ)
        ensure_view_change(looper, txnPoolNodeSet)
        ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet,
                            customTimeout=2 * tconf.NEW_VIEW_TIMEOUT)

    assert next_primary_name != expected_primary_name
    assert checkViewNoForNodes(txnPoolNodeSet) == initial_view_no + 2

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client, sdk_pool_handle)
def test_nodes_removes_request_keys_for_ordered(setup, looper,
                                                txnPoolNodeSet,
                                                sdk_pool_handle,
                                                sdk_wallet_client):
    """
    The slow node still has the request digests in its domain request
    queue while the fast nodes do not. After catchup reply delays are
    reset and a view change is done, no node has them queued.
    """
    slow_node, fast_nodes = setup

    def chk(key, nodes, present):
        # `key` must be queued for the domain ledger on every given node
        # exactly when `present` is True.
        for member in nodes:
            queues = member.master_replica._ordering_service.requestQueues
            assert (key in queues[DOMAIN_LEDGER_ID]) == present

    reqs = sdk_json_couples_to_request_list(
        send_reqs_batches_and_get_suff_replies(
            looper, txnPoolNodeSet,
            sdk_pool_handle,
            sdk_wallet_client,
            10,
            5))
    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc != \
        fast_nodes[0].master_replica.last_ordered_3pc

    for request in reqs:
        chk(request.digest, fast_nodes, False)
        chk(request.digest, [slow_node], True)

    # Let the delayed catchup replies through so that catchup can finish.
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        CatchupRep.typename)

    old_last_ordered = fast_nodes[0].master_replica.last_ordered_3pc

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    ensure_all_nodes_have_same_data(looper, fast_nodes)
    assert slow_node.master_replica.last_ordered_3pc == old_last_ordered

    for request in reqs:
        chk(request.digest, txnPoolNodeSet, False)

    # Needed for the next run due to the parametrised fixture.
    slow_node.reset_delays_and_process_delayeds()
def newNodeAdded(looper, nodeSet, tdir, tconf, sdk_pool_handle,
                 sdk_wallet_trustee, allPluginsPath):
    """
    Add a new node via `sdk_node_theta_added`, wait for the view number to
    increase by one and for elections to finish.

    :return: the new steward wallet and the new node
    """
    starting_view = nodeSet[0].viewNo
    added = sdk_node_theta_added(looper,
                                 nodeSet,
                                 tdir,
                                 tconf,
                                 sdk_pool_handle,
                                 sdk_wallet_trustee,
                                 allPluginsPath,
                                 node_config_helper_class=NodeConfigHelper,
                                 testNodeClass=TestNode,
                                 name='')
    new_steward_wallet, new_node = added
    waitForViewChange(looper=looper, txnPoolNodeSet=nodeSet,
                      expectedViewNo=starting_view + 1)
    ensureElectionsDone(looper=looper, nodes=nodeSet)
    return new_steward_wallet, new_node
def test_view_change_restarted_by_timeout_if_next_primary_disconnected(
        txnPoolNodeSet, looper, tconf, setup):
    """
    Start a view change, stop the master primary of the target view and
    check that the alive nodes end up two views ahead, with the timeout
    callback called once.
    """
    _, initial_view_no, timeout_callback_stats = setup
    target_view_no = initial_view_no + 1

    start_view_change(txnPoolNodeSet, target_view_no)

    alive_nodes = stop_master_primary(txnPoolNodeSet, target_view_no)

    ensureElectionsDone(looper=looper, nodes=alive_nodes,
                        instances_list=range(3))

    # Two view changes took place.
    for survivor in alive_nodes:
        assert (survivor.viewNo - initial_view_no) == 2

    # The timeout method was called once.
    check_watchdog_called_expected_times(txnPoolNodeSet,
                                         timeout_callback_stats, 1)
def test_select_primary_after_removed_backup(txnPoolNodeSet, looper,
                                             sdk_pool_handle,
                                             sdk_wallet_client):
    """
    Remove the last replica from the first node, report master degradation
    on every node and check the primary name of each instance afterwards.
    """
    first_node = txnPoolNodeSet[0]
    start_replicas_count = first_node.replicas.num_replicas
    instance_id = start_replicas_count - 1
    first_node.replicas.remove_replica(instance_id)

    for member in txnPoolNodeSet:
        member.view_changer.on_master_degradation()
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    for member in txnPoolNodeSet:
        assert member.requiredNumberOfInstances == \
            member.replicas.num_replicas
        for inst_id in range(member.requiredNumberOfInstances):
            # Instance `inst_id` is led by the node at index `inst_id + 1`.
            expected_primary = \
                txnPoolNodeSet[inst_id + 1].name + ":" + str(inst_id)
            assert member.replicas[inst_id].primaryName == expected_primary
def test_set_H_greater_then_last_ppseqno(looper, txnPoolNodeSet,
                                         sdk_pool_handle,
                                         sdk_wallet_steward, tdir, tconf,
                                         allPluginsPath):
    """
    Order LOG_SIZE requests, add a new node and check the watermarks of
    its replicas before and after a view change.
    """
    # Order LOG_SIZE requests to move the watermarks.
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, LOG_SIZE)

    # Every replica of every node must have moved its watermarks up.
    assert txnPoolNodeSet[0].replicas[1].last_ordered_3pc[1] == LOG_SIZE
    for member in txnPoolNodeSet:
        for replica in member.replicas._replicas.values():
            assert replica.h >= LOG_SIZE
            assert replica.H >= LOG_SIZE + LOG_SIZE

    # Add a new node.
    new_node = add_new_node(looper,
                            txnPoolNodeSet,
                            sdk_pool_handle,
                            sdk_wallet_steward,
                            tdir,
                            tconf,
                            allPluginsPath)
    ensure_all_nodes_have_same_data(
        looper, txnPoolNodeSet,
        exclude_from_check=['check_last_ordered_3pc_backup'])

    # On the new node h follows last_ordered_3pc. The master's H is
    # h + LOG_SIZE, while backups use sys.maxsize.
    for replica in new_node.replicas.values():
        assert replica.h == replica.last_ordered_3pc[1]
        if replica.isMaster:
            assert replica.H == replica.last_ordered_3pc[1] + LOG_SIZE
        else:
            assert replica.H == sys.maxsize

    # One more request must not be stashed as "outside watermarks".
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 1)
    for replica in new_node.replicas.values():
        assert replica.stasher.stash_size(STASH_WATERMARKS) == 0

    # After a view change the backup replicas are back at (0, LOG_SIZE).
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    for replica in new_node.replicas.values():
        if not replica.isMaster:
            assert replica.h == 0
            assert replica.H == LOG_SIZE
def test_old_non_primary_restart_after_view_change(
        new_node_in_correct_view, looper, txnPoolNodeSet, tdir,
        allPluginsPath, tconf, wallet1, client1):
    """
    A non-primary node is stopped, the remaining nodes change the view,
    and the stopped node is started again. It must reach the new view
    and hold the same data as the others.
    """
    node_to_stop = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    old_view_no = node_to_stop.viewNo

    # Take the non-primary down.
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_stop, stopNode=True)
    looper.removeProdable(node_to_stop)
    remaining_nodes = list(set(txnPoolNodeSet) - {node_to_stop})

    # Requests ordered before the view change.
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    ensure_view_change(looper, remaining_nodes)
    ensureElectionsDone(looper, remaining_nodes)

    # Requests ordered after the view change.
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    restarted_node = start_stopped_node(node_to_stop, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [restarted_node]
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet,
                          old_view_no + 1, timeout=10))

    # `_start_view_change_if_possible` returned True at least once on the
    # restarted node.
    successful_starts = getAllReturnVals(
        restarted_node.view_changer,
        restarted_node.view_changer._start_view_change_if_possible,
        compare_val_to=True)
    assert len(successful_starts) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    assert not restarted_node.view_changer._next_view_indications
def test_first_audit_catchup_during_ordering(monkeypatch, looper, tconf,
                                             tdir, allPluginsPath,
                                             txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_client):
    """
    Patch primaries in the audit ledger on every node, order a request,
    restart the nodes from index 2 onwards and check that the pool agrees
    on primaries and can still order.
    """
    # 1. Patch primaries in the audit ledger on all nodes.
    for member in txnPoolNodeSet:
        patch_primaries_in_audit(member, monkeypatch)

    # 2. Order one transaction.
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    # 3. Restart the nodes from index 2 onwards.
    restart_nodes(looper, txnPoolNodeSet, txnPoolNodeSet[2:], tconf, tdir,
                  allPluginsPath, start_one_by_one=False)

    # 4. Apply the same patch to the nodes now at those positions.
    for member in txnPoolNodeSet[2:]:
        patch_primaries_in_audit(member, monkeypatch)

    # 5. All nodes must agree on primaries and be able to order.
    ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=30)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet,
                                    custom_timeout=20)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client, sdk_pool_handle)
def test_set_H_as_maxsize_for_backup_if_is_primary(looper, txnPoolNodeSet,
                                                   sdk_pool_handle,
                                                   sdk_wallet_steward,
                                                   tconf, tdir,
                                                   allPluginsPath):
    """
    Restart the node that is primary on backup instance 1 with 3PC
    messages delayed. Check its watermarks while the delay is active and
    again once the `with` block has been left.
    """
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    primary_on_backup = txnPoolNodeSet[2]
    assert primary_on_backup.replicas._replicas[1].isPrimary

    # Stop the node and take it out of the pool list.
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            primary_on_backup,
                                            stopNode=True)
    txnPoolNodeSet.remove(primary_on_backup)
    looper.removeProdable(primary_on_backup)

    # Start the stopped node again.
    primary_on_backup = start_stopped_node(primary_on_backup, looper,
                                           tconf, tdir, allPluginsPath)

    # NOTE(review): original indentation was lost; the extent of this
    # `with` block is reconstructed -- confirm against history.
    # 3PC messages are delayed so the restarted node has not ordered them.
    with delay_rules(primary_on_backup.nodeIbStasher, delay_3pc()):
        txnPoolNodeSet.append(primary_on_backup)

        ensureElectionsDone(looper, txnPoolNodeSet,
                            customTimeout=tconf.NEW_VIEW_TIMEOUT)

        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_steward, LOG_SIZE)

        # Restored state of backup instance 1.
        restored = primary_on_backup.replicas._replicas[1]
        assert restored.isPrimary
        assert primary_on_backup.replicas._replicas[1].h == 1
        assert primary_on_backup.replicas._replicas[1].H == 1 + LOG_SIZE

    def chk():
        assert primary_on_backup.replicas._replicas[1].h == LOG_SIZE
        assert primary_on_backup.replicas._replicas[1].H == \
            LOG_SIZE + LOG_SIZE

    # Caught-up state.
    looper.run(eventually(chk, retryWait=.2,
                          timeout=tconf.NEW_VIEW_TIMEOUT))
def test_view_change_with_next_primary_stopped_and_one_node_lost_commit(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        limitTestRunningTime):
    """
    With COMMITs delayed on one non-primary node, order requests,
    disconnect the next primary and trigger a view change on the rest.
    The remaining nodes must agree on data and stay functional.
    """
    current_view_no = checkViewNoForNodes(txnPoolNodeSet)
    next_primary = get_next_primary_name(txnPoolNodeSet,
                                         current_view_no + 1)
    candidates = [replica.node
                  for replica in getNonPrimaryReplicas(txnPoolNodeSet)
                  if replica.node.name != next_primary]
    delayed_node = candidates[0]
    other_nodes = []
    for member in txnPoolNodeSet:
        if member.name != next_primary:
            other_nodes.append(member)

    # NOTE(review): original indentation was lost; the extent of this
    # `with` block is reconstructed -- confirm against history.
    with delay_rules_without_processing(delayed_node.nodeIbStasher,
                                        cDelay()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, 2)

        disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                                next_primary)
        trigger_view_change(other_nodes)

    ensureElectionsDone(looper, other_nodes,
                        instances_list=range(2), customTimeout=15)
    ensure_all_nodes_have_same_data(looper, other_nodes)
    sdk_ensure_pool_functional(looper, other_nodes,
                               sdk_wallet_client, sdk_pool_handle)
    ensure_all_nodes_have_same_data(looper, other_nodes)
def test_view_change_on_performance_degraded(looper, txnPoolNodeSet,
                                             viewNo, sdk_pool_handle,
                                             sdk_wallet_steward):
    """
    Simulate a slow master, wait for the view number to increase by one
    and check that the master primary moved to a different node.
    """
    primary_before = get_master_primary_node(list(txnPoolNodeSet))

    simulate_slow_master(looper, txnPoolNodeSet, sdk_pool_handle,
                         sdk_wallet_steward)
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=viewNo + 1)

    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    primary_after = get_master_primary_node(list(txnPoolNodeSet))
    assert primary_before.name != primary_after.name
def test_view_change_retry_by_timeout(txnPoolNodeSet,
                                      looper,
                                      setup,
                                      sdk_pool_handle,
                                      sdk_wallet_client):
    """
    A view change that does not complete in time must be started again.
    """
    m_primary_node, initial_view_no, timeout_callback_stats = setup

    delay_view_change_done_msg(txnPoolNodeSet)
    start_view_change(txnPoolNodeSet, initial_view_no + 1)

    # With ViewChangeDone messages delayed the first attempt cannot finish,
    # so waiting for elections is expected to fail. That failure is what
    # leads to the second view change this test is about.
    with pytest.raises(AssertionError):
        ensureElectionsDone(looper=looper,
                            nodes=txnPoolNodeSet,
                            customTimeout=view_change_timeout + 2)

    # Lift the delays so that the second attempt can go through
    reset_delays_and_process_delayeds(txnPoolNodeSet)

    # The second attempt is expected to complete normally
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    new_m_primary_node = get_master_primary_node(list(txnPoolNodeSet))
    assert m_primary_node.name != new_m_primary_node.name

    # Compared to the numbers recorded by `setup`, the completion check ran
    # exactly once more and returned True exactly once more on every node
    for node in txnPoolNodeSet:
        extra_calls = (get_count(node, node._check_view_change_completed)
                       - timeout_callback_stats[node.name]['called'])
        assert extra_calls == 1

        true_results = getAllReturnVals(node,
                                        node._check_view_change_completed,
                                        compare_val_to=True)
        extra_true = (len(true_results)
                      - timeout_callback_stats[node.name]['returned_true'])
        assert extra_true == 1

    # Every node advanced by two views in total
    for node in txnPoolNodeSet:
        assert node.viewNo - initial_view_no == 2

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_client, sdk_pool_handle)
def test_recover_stop_primaries_no_view_change(looper,
                                               checkpoint_size,
                                               txnPoolNodeSet,
                                               allPluginsPath,
                                               tdir,
                                               tconf,
                                               sdk_pool_handle,
                                               sdk_wallet_steward):
    """
    Test that a 4-node pool recovers when the master primary is stopped
    and started again, without the view number changing:
    - send txns (enough to produce checkpoints)
    - stop current master primary
    - restart current master primary
    - check that the pool is still in the initial view
    - send txns
    """
    active_nodes = list(txnPoolNodeSet)
    assert 4 == len(active_nodes)
    # Remembered so that "no view change" can be verified after the restart
    initial_view_no = active_nodes[0].viewNo

    logger.info("send at least one checkpoint")
    assert nodes_do_not_have_checkpoints(*active_nodes)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2 * checkpoint_size)
    assert nodes_have_checkpoints(*active_nodes)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Stop first node (current Primary)")
    stopped_node, active_nodes = stop_primary(looper, active_nodes)

    logger.info("Restart the primary node")
    restarted_node = start_stopped_node(stopped_node, looper, tconf, tdir,
                                        allPluginsPath)
    # Only the freshly started node is expected to have no checkpoints
    assert nodes_do_not_have_checkpoints(restarted_node)
    assert nodes_have_checkpoints(*active_nodes)
    active_nodes = active_nodes + [restarted_node]

    logger.info("Check that primary selected")
    ensureElectionsDone(looper=looper, nodes=active_nodes,
                        instances_list=range(2), customTimeout=30)
    # The pool must still be in the view it started from; compare against
    # the recorded value instead of a hard-coded 0
    waitForViewChange(looper, active_nodes, expectedViewNo=initial_view_no)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)

    logger.info("Check if the pool is able to process requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 10 * checkpoint_size)
    ensure_all_nodes_have_same_data(looper, nodes=active_nodes)
    assert nodes_have_checkpoints(*active_nodes)
def ensure_several_view_change(looper, nodes, vc_count=1,
                               exclude_from_check=None, custom_timeout=None):
    """
    Force `vc_count` consecutive view changes on the given nodes.

    For every round the monitor's `isMasterDegraded` of each node is
    replaced by a check that reports a degraded master until that monitor
    has counted one more view change. Calling `ensure_view_change`
    repeatedly would make the monkeypatching of `isMasterDegraded` work
    unexpectedly, so the original methods are captured once up front and
    put back only after all requested view changes are done (or one of the
    checks failed).

    :param looper: looper used to run the checks
    :param nodes: nodes that must perform the view changes
    :param vc_count: number of view changes to perform
    :param exclude_from_check: nodes skipped in the view number check and
        passed through to the data equality check
    :param custom_timeout: overrides the computed timeout when truthy
    :return: the view number expected after the last view change, or None
        when `vc_count` is 0
    """
    original_checks = {node.name: node.monitor.isMasterDegraded
                       for node in nodes}
    view_changes = {}
    expected_view_no = None

    def slow_master(self):
        # Only allow one view change per round: report degradation until
        # the monitor's view change counter moves past the recorded value
        rv = self.totalViewChanges == view_changes[self.name]
        if rv:
            logger.info('{} making master look slow'.format(self))
        return rv

    try:
        for __ in range(vc_count):
            old_view_no = checkViewNoForNodes(nodes)
            expected_view_no = old_view_no + 1

            for node in nodes:
                view_changes[node.name] = node.monitor.totalViewChanges
                node.monitor.isMasterDegraded = types.MethodType(
                    slow_master, node.monitor)
                node.checkPerformance()

            perf_check_freq = next(iter(nodes)).config.PerfCheckFreq
            timeout = custom_timeout or \
                waits.expectedPoolViewChangeStartedTimeout(len(nodes)) + \
                perf_check_freq
            if exclude_from_check is None:
                nodes_to_check = nodes
            else:
                nodes_to_check = [n for n in nodes
                                  if n not in exclude_from_check]
            logger.debug('Checking view no for nodes {}'.format(nodes_to_check))
            looper.run(eventually(checkViewNoForNodes, nodes_to_check,
                                  expected_view_no, retryWait=1,
                                  timeout=timeout))
            ensureElectionsDone(looper=looper, nodes=nodes,
                                customTimeout=timeout)
            ensure_all_nodes_have_same_data(
                looper, nodes, custom_timeout=timeout,
                exclude_from_check=exclude_from_check)
    finally:
        # Undo the monkeypatching so that later code sees the real
        # performance check again
        for node in nodes:
            node.monitor.isMasterDegraded = original_checks[node.name]

    return expected_view_no
def scenario_txns_during_view_change(looper, nodes, curr_utxo, send_txns,
                                     send_txns_invalid=None):
    """
    Run a view change while the last node of `nodes` lags behind.

    Transactions are sent before the lag, during the lag (valid and
    invalid ones) and after the view change; data equality is checked
    after every step.

    :param looper: looper used to run the checks
    :param nodes: pool nodes; the last one is the node being delayed
    :param curr_utxo: dict with an 'amount' key, temporarily incremented by
        the default invalid-transaction sender
    :param send_txns: callable without arguments that sends transactions
    :param send_txns_invalid: optional callable that sends invalid
        transactions; when falsy a default based on `send_txns` is used
    """
    slow_node = nodes[-1]
    healthy_nodes = nodes[:-1]

    def send_txns_invalid_default():
        # Bump the amount by one so that the request gets rejected,
        # then put the original amount back
        curr_utxo['amount'] += 1
        with pytest.raises(RequestRejectedException, match='Insufficient funds'):
            send_txns()
        curr_utxo['amount'] -= 1

    # Baseline: every node processes the first portion of transactions
    send_txns()
    ensure_all_nodes_have_same_data(looper, nodes)

    # Hold back Prepare and Commit messages for the slow node
    with delay_rules(slow_node.nodeIbStasher, pDelay(), cDelay()):
        # Valid transactions reach only the healthy nodes
        send_txns()
        ensure_all_nodes_have_same_data(looper, healthy_nodes)

        # Invalid transactions, sent by the caller's sender or the default
        send_invalid = send_txns_invalid or send_txns_invalid_default
        send_invalid()
        ensure_all_nodes_have_same_data(looper, healthy_nodes)

        # Start a view change and wait until it is finished
        ensure_view_change(looper, nodes)
        ensureElectionsDone(looper, nodes)

    # Delays are reset at this point; all nodes must end up with equal
    # state (the slow node is expected to catch up on what it missed)
    ensure_all_nodes_have_same_data(looper, nodes)

    # Make sure the pool is functional
    send_txns()
    ensure_all_nodes_have_same_data(looper, nodes)
def test_view_change_after_some_txns(
        txnPoolNodesLooper, txnPoolNodeSet,
        some_txns_done, testNodeClass, viewNo,  # noqa
        sdk_pool_handle, sdk_wallet_client,
        node_config_helper_class, tconf, tdir,
        allPluginsPath, tmpdir_factory):
    """
    Do a view change once some txns are processed, send more txns, stop
    every node, then create a replayable node for each of them and check it.
    """
    pool_looper = txnPoolNodesLooper

    ensure_view_change(pool_looper, txnPoolNodeSet)
    ensureElectionsDone(looper=pool_looper, nodes=txnPoolNodeSet)
    ensure_all_nodes_have_same_data(pool_looper, nodes=txnPoolNodeSet)

    sdk_send_random_and_check(pool_looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              10)
    ensure_all_nodes_have_same_data(pool_looper, txnPoolNodeSet)

    # Shut the whole pool down before replaying
    for running_node in txnPoolNodeSet:
        pool_looper.removeProdable(running_node)
        running_node.stop()

    config = getConfigOnce()
    reload_modules_for_replay(tconf)
    replayable_node_class, basedirpath = get_replayable_node_class(
        tmpdir_factory, tdir, testNodeClass, config)

    print('-------------Replaying now---------------------')

    for stopped_node in txnPoolNodeSet:
        create_replayable_node_and_check(pool_looper,
                                         txnPoolNodeSet,
                                         stopped_node,
                                         replayable_node_class,
                                         node_config_helper_class,
                                         tconf,
                                         basedirpath,
                                         allPluginsPath)
def test_demote_backup_primary(looper, txnPoolNodeSet, sdk_pool_handle,
                               sdk_wallet_stewards, tdir, tconf,
                               allPluginsPath):
    """
    Demote the node that is primary of backup instance 1 in a 6-node pool,
    wait for the resulting view changes, then restart the last node and
    check that the pool stays consistent and keeps processing requests.
    """
    assert len(txnPoolNodeSet) == 6
    view_no = txnPoolNodeSet[-1].viewNo
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_stewards[0], 1)

    node_to_restart = txnPoolNodeSet[-1]

    # Find the backup primary together with its steward wallet and position
    backup_primary_name = txnPoolNodeSet[0].primaries[1]
    node_to_demote = steward_for_demote_node = demote_node_index = None
    for i, n in enumerate(txnPoolNodeSet):
        if n.name == backup_primary_name:
            node_to_demote = n
            steward_for_demote_node = sdk_wallet_stewards[i]
            demote_node_index = i
            break
    assert node_to_demote is not None, \
        "backup primary {} not found in the pool".format(backup_primary_name)

    demote_node(looper, steward_for_demote_node, sdk_pool_handle,
                node_to_demote)
    del txnPoolNodeSet[demote_node_index]

    # we are expecting 2 view changes here since Beta is selected as a master Primary on view=1
    # (since node reg at the beginning of view 0 is used to select it), but it's not available (demoted),
    # so we do view change to view=2 by timeout
    waitForViewChange(looper, txnPoolNodeSet, view_no + 2)
    ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=30)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # Restart the last node of the pool and put the new instance in its place
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_restart)
    looper.removeProdable(name=node_to_restart.name)
    node_to_restart = start_stopped_node(node_to_restart, looper, tconf,
                                         tdir, allPluginsPath)
    txnPoolNodeSet[-1] = node_to_restart
    looper.run(checkNodesConnected(txnPoolNodeSet))

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_stewards[0], 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet, custom_timeout=20)