def testNodeCatchupAfterDisconnect(
        sdk_new_node_caught_up,
        txnPoolNodeSet,
        sdk_node_set_with_node_added_after_some_txns):
    """
    A node that disconnects after some transactions should eventually get the
    transactions which happened while it was disconnected

    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    logger.debug("Disconnecting node {} with pool ledger size {}".format(
        new_node, new_node.poolManager.txnSeqNo))
    # Drop the node's connections only; the node process keeps running
    # (stopNode=False), so it will see the gap on reconnection and catch up.
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            new_node,
                                            stopNode=False)
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, 5)
    # Make sure new node got out of sync
    waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1])
    logger.debug("Connecting the stopped node, {}".format(new_node))
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, new_node)
    logger.debug("Waiting for the node to catch up, {}".format(new_node))
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
    # Send more requests after catchup to verify the node participates in
    # ordering again, then re-check data equality.
    logger.debug("Sending more requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, 10)
    checkNodeDataForEquality(new_node, *txnPoolNodeSet[:-1])
def testNodeCatchupAfterLostConnection(newNodeCaughtUp, txnPoolNodeSet,
                                       nodeSetWithNodeAddedAfterSomeTxns):
    """
    A node that has poor internet connection and got unsynced after some
    transactions should eventually get the transactions which happened while
    it was not accessible

    :return:
    """
    looper, newNode, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    logger.debug("Disconnecting node {}, ledger size {}".format(
        newNode, newNode.domainLedger.size))
    # Only the connections are dropped (stopNode=False) to simulate a flaky
    # network rather than a node crash.
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, newNode,
                                            stopNode=False)
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    # Make sure new node got out of sync
    waitNodeDataInequality(looper, newNode, *txnPoolNodeSet[:-1])
    # logger.debug("Ensure node {} gets disconnected".format(newNode))
    ensure_node_disconnected(looper, newNode, txnPoolNodeSet[:-1])
    logger.debug("Connecting the node {} back, ledger size {}".format(
        newNode, newNode.domainLedger.size))
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, newNode)
    logger.debug("Waiting for the node to catch up, {}".format(newNode))
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1])
    # More traffic after catchup confirms the node keeps up in real time.
    logger.debug("Sending more requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 10)
    checkNodeDataForEquality(newNode, *txnPoolNodeSet[:-1])
def test_node_requests_missing_three_phase_messages(looper, txnPoolNodeSet,
                                                    wallet1, client1Connected):
    """
    2 of 4 nodes go down, so pool can not process any more incoming requests.
    A new request comes in.
    After a while those 2 nodes come back alive.
    Another request comes in.
    Check that previously disconnected two nodes request missing PREPARES and
    PREPREPARES and the pool successfully handles both transactions after that.
    """
    INIT_REQS_CNT = 10
    MISSING_REQS_CNT = 1
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_nodes = txnPoolNodeSet[2:]
    alive_nodes = txnPoolNodeSet[:2]

    send_reqs_to_nodes_and_verify_all_replies(looper,
                                              wallet1,
                                              client1Connected,
                                              INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                txnPoolNodeSet,
                                                node,
                                                stopNode=False)

    # With 2 of 4 nodes gone there is no quorum, so this request cannot be
    # ordered yet — it only produces a PrePrepare on the alive nodes.
    sendRandomRequests(wallet1, client1Connected, MISSING_REQS_CNT)

    def check_pp_out_of_sync(alive_nodes, disconnected_nodes):
        # Verifies the alive nodes share one lastPrePrepare while the
        # disconnected nodes share a different (older) one.
        def get_last_pp(node):
            return node.replicas._master_replica.lastPrePrepare

        last_3pc_key_alive = get_last_pp(alive_nodes[0])
        for node in alive_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_alive

        # NOTE: 'diconnected' spelling kept as-is; it is a local name only.
        last_3pc_key_diconnected = get_last_pp(disconnected_nodes[0])
        assert last_3pc_key_diconnected != last_3pc_key_alive
        for node in disconnected_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_diconnected

    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes,
                          disconnected_nodes,
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    for node in disconnected_nodes:
        reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, node)

    send_reqs_to_nodes_and_verify_all_replies(looper,
                                              wallet1,
                                              client1Connected,
                                              REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    # Both the request sent during the outage and the post-reconnect one
    # must end up ordered on every node.
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)
def testNodeCatchupAfterDisconnect(sdk_new_node_caught_up, txnPoolNodeSet,
                                   sdk_node_set_with_node_added_after_some_txns):
    """
    A node that disconnects after some transactions should eventually get the
    transactions which happened while it was disconnected

    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    logger.debug("Disconnecting node {} with pool ledger size {}".
                 format(new_node, new_node.poolManager.txnSeqNo))
    # stopNode=False: only sever connections so the node can later detect the
    # ledger gap and catch up rather than restart from scratch.
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, new_node, stopNode=False)
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, new_steward_wallet_handle, 5)
    # Make sure new node got out of sync
    waitNodeDataInequality(looper, new_node, *txnPoolNodeSet[:-1])
    logger.debug("Connecting the stopped node, {}".format(new_node))
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, new_node)
    logger.debug("Waiting for the node to catch up, {}".format(new_node))
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1])
    # Additional traffic verifies the node participates in ordering again.
    logger.debug("Sending more requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, new_steward_wallet_handle, 10)
    checkNodeDataForEquality(new_node, *txnPoolNodeSet[:-1])
def test_idr_cache_update_after_catchup(txnPoolNodeSet,
                                        looper,
                                        sdk_pool_handle,
                                        sdk_wallet_steward):
    """
    A NYM written while one node is disconnected must appear in that node's
    identifier cache and state after it reconnects and catches up.
    """
    wallet_handle, identifier = sdk_wallet_steward
    node_to_disconnect = txnPoolNodeSet[-1]
    req_handler = node_to_disconnect.getDomainReqHandler()
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            node_to_disconnect.name,
                                            stopNode=False)
    looper.runFor(2)

    # Build, sign and submit a NYM request directly via the SDK coroutines
    # while the node is away.
    idr, verkey = createHalfKeyIdentifierAndAbbrevVerkey()
    request = looper.loop.run_until_complete(
        build_nym_request(identifier, idr, verkey, None, None))
    req_signed = looper.loop.run_until_complete(
        sign_request(wallet_handle, identifier, request))
    result = json.loads(looper.loop.run_until_complete(
        submit_request(sdk_pool_handle, req_signed)))

    reconnect_node_and_ensure_connected(looper,
                                        txnPoolNodeSet,
                                        node_to_disconnect.name)
    waitNodeDataEquality(looper, node_to_disconnect, *txnPoolNodeSet)

    # Look the NYM up at the state root that was current at the txn's time;
    # catchup must have populated both the state and the idrCache.
    key = domain.make_state_path_for_nym(idr)
    root_hash = req_handler.ts_store.get_equal_or_prev(
        result['result']['txnTime'])
    from_state = req_handler.state.get_for_root_hash(root_hash=root_hash,
                                                     key=key)
    assert from_state
    deserialized = req_handler.stateSerializer.deserialize(from_state)
    assert deserialized
    items_after = req_handler.idrCache.get(idr)
    assert items_after
def testNodeCatchupAfterDisconnect(newNodeCaughtUp, txnPoolNodeSet,
                                   nodeSetWithNodeAddedAfterSomeTxns):
    """
    A node that disconnects after some transactions should eventually get the
    transactions which happened while it was disconnected

    :return:
    """
    looper, newNode, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns
    logger.debug("Stopping node {} with pool ledger size {}".format(
        newNode, newNode.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, newNode,
                                            stopNode=False)
    # Remove the node from the looper so it is not run while "down";
    # it is added back below before reconnection.
    looper.removeProdable(newNode)
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    # Make sure new node got out of sync
    waitNodeDataInequality(looper, newNode, *txnPoolNodeSet[:-1])
    logger.debug("Starting the stopped node, {}".format(newNode))
    looper.add(newNode)
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, newNode)
    logger.debug("Waiting for the node to catch up, {}".format(newNode))
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1])
    logger.debug("Sending more requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 10)
    checkNodeDataForEquality(newNode, *txnPoolNodeSet[:-1])
def test_6_nodes_pool_cannot_reach_quorum_with_2_disconnected(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client):
    '''
    Check that we can not reach consensus when more than n-f nodes
    are disconnected: disconnect 2 of 6 nodes
    '''
    faulties = nodes_by_rank(txnPoolNodeSet)[-faultyNodes:]
    current_node_set = set(txnPoolNodeSet)
    for node in faulties:
        # The nodes chosen to go down must not hold any primary replica,
        # so their loss does not trigger a view change.
        for r in node.replicas:
            assert not r.isPrimary
        disconnect_node_and_ensure_disconnected(
            looper, current_node_set, node, stopNode=False)
        current_node_set.remove(node)

    # Without quorum the request must time out and never reach the nodes'
    # ordered state.
    reqs = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    with pytest.raises(TimeoutError):
        sdk_send_and_check(reqs, looper, txnPoolNodeSet, sdk_pool_handle)
    check_request_is_not_returned_to_nodes(
        txnPoolNodeSet, sdk_json_to_request_object(json.loads(reqs[0])))

    # The following reconnection of nodes is needed in this test to avoid
    # pytest process hangup
    for node in faulties:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)
def test_6_nodes_pool_cannot_reach_quorum_with_2_disconnected(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client):
    '''
    Check that we can not reach consensus when more than n-f nodes
    are disconnected: disconnect 2 of 6 nodes
    '''
    faulties = nodes_by_rank(txnPoolNodeSet)[-faultyNodes:]
    current_node_set = set(txnPoolNodeSet)
    for node in faulties:
        # Only non-primary nodes are taken down so that no view change is
        # provoked; here replicas is a mapping, hence .values().
        for r in node.replicas.values():
            assert not r.isPrimary
        disconnect_node_and_ensure_disconnected(
            looper, current_node_set, node, stopNode=False)
        current_node_set.remove(node)

    # Without quorum the SDK call must fail with a pool-ledger timeout and
    # the request must not be recorded on the nodes.
    reqs = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    with pytest.raises(PoolLedgerTimeoutException):
        sdk_send_and_check(reqs, looper, txnPoolNodeSet, sdk_pool_handle)
    check_request_is_not_returned_to_nodes(
        txnPoolNodeSet, sdk_json_to_request_object(json.loads(reqs[0])))

    # The following reconnection of nodes is needed in this test to avoid
    # pytest process hangup
    for node in faulties:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)
def test_number_txns_in_catchup_and_vc_queue_valid(looper,
                                                   txnPoolNodeSet,
                                                   tconf,
                                                   sdk_pool_handle,
                                                   sdk_wallet_steward):
    """
    After the master primary is disconnected (forcing a view change) and then
    reconnected (forcing a catchup), its validator-info must report the number
    of txns received in catchup and a consistent view-change status.
    """
    num_txns = 5
    master_node = get_master_primary_node(txnPoolNodeSet)
    old_view = master_node.viewNo
    expected_view_no = old_view + 1
    # Disconnecting the master primary makes the remaining nodes start a
    # view change to expected_view_no.
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            master_node,
                                            stopNode=False)
    looper.run(
        eventually(checkViewNoForNodes, txnPoolNodeSet[1:], expected_view_no,
                   retryWait=1,
                   timeout=tconf.VIEW_CHANGE_TIMEOUT))
    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, num_txns)
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, master_node)
    # NOTE(review): compares only against the last node ([-1:]) rather than
    # the rest of the pool — confirm this narrower check is intentional.
    waitNodeDataEquality(looper, master_node, *txnPoolNodeSet[-1:])

    latest_info = master_node._info_tool.info
    # The txns sent during the outage must all have arrived via catchup.
    assert latest_info['Node_info']['Catchup_status'][
        'Number_txns_in_catchup'][1] == num_txns
    assert latest_info['Node_info']['View_change_status'][
        'View_No'] == expected_view_no
    node_names = [n.name for n in txnPoolNodeSet[1:]]
    for node_name in node_names:
        assert latest_info['Node_info']['View_change_status']['VCDone_queue'][
            node_name][0] == master_node.master_primary_name
        assert latest_info['Node_info']['View_change_status']['VCDone_queue'][
            node_name][1]
    assert latest_info['Node_info']['View_change_status'][
        'Last_complete_view_no'] == expected_view_no
def test_fill_ts_store_after_catchup(txnPoolNodeSet,
                                     looper,
                                     sdk_pool_handle,
                                     sdk_wallet_steward):
    """
    Txns written while a node was disconnected must be queryable on that node
    after catchup via its timestamp store (txnTime -> state root mapping).
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward, 5)
    node_to_disconnect = txnPoolNodeSet[-1]
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            node_to_disconnect,
                                            stopNode=False)
    looper.runFor(2)
    # These replies are ordered without the disconnected node; it must learn
    # about them purely via catchup.
    sdk_replies = sdk_send_random_and_check(looper, txnPoolNodeSet,
                                            sdk_pool_handle,
                                            sdk_wallet_steward, 2)
    reconnect_node_and_ensure_connected(looper,
                                        txnPoolNodeSet,
                                        node_to_disconnect)
    waitNodeDataEquality(looper, node_to_disconnect, *txnPoolNodeSet)
    req_handler = node_to_disconnect.getDomainReqHandler()
    for reply in sdk_replies:
        # For each reply: resolve the state root that was current at txnTime
        # and verify the txn's data is present at that root.
        key = req_handler.prepare_buy_key(reply[1]['result']['identifier'],
                                          reply[1]['result']['reqId'])
        root_hash = req_handler.ts_store.get_equal_or_prev(
            reply[1]['result']['txnTime'])
        assert root_hash
        from_state = req_handler.state.get_for_root_hash(root_hash=root_hash,
                                                         key=key)
        assert req_handler.stateSerializer.deserialize(from_state)['amount'] == \
            reply[1]['result']['amount']
def test_current_state_propagation(newNodeCaughtUp, txnPoolNodeSet, nodeSetWithNodeAddedAfterSomeTxns): """ Checks that nodes send CurrentState to lagged nodes. """ # 1. Start pool looper, new_node, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns # 2. Stop one node lagging_node = new_node disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging_node, stopNode=True) looper.removeProdable(new_node) # 3. Start it again looper.add(new_node) reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, new_node) looper.runFor(5) # 4. Check that all nodes sent CurrentState for node in txnPoolNodeSet[:-1]: sent_times = node.spylog.count( node.send_current_state_to_lagging_node.__name__) assert sent_times != 0, "{} haven't sent CurrentState".format(node) looper.runFor(5) # 5. Check that it received CurrentState messages received_times = lagging_node.spylog.count( lagging_node.process_current_state_message.__name__) assert received_times != 0
def test_large_catchup(looper, txnPoolNodeSet, wallet1, client1, client1Connected, tconf, allPluginsPath, tdirWithPoolTxns): """ Checks that node can catchup large ledgers """ # Prepare nodes lagging_node = txnPoolNodeSet[-1] rest_nodes = txnPoolNodeSet[:-1] all_nodes = txnPoolNodeSet looper.run(checkNodesConnected(txnPoolNodeSet)) # Prepare client client, wallet = client1, wallet1 looper.run(client.ensureConnectedToNodes()) # Check that requests executed well sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, numReqs=10) # Stop one node waitNodeDataEquality(looper, lagging_node, *rest_nodes) disconnect_node_and_ensure_disconnected(looper, rest_nodes, lagging_node, stopNode=True) looper.removeProdable(lagging_node) # Send more requests to active nodes sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, numReqs=100) waitNodeDataEquality(looper, *rest_nodes) # Make message size limit smaller to ensure that catchup response is # larger exceeds the limit for node in rest_nodes: decrease_max_request_size(node) # Restart stopped node and wait for successful catch up looper.add(lagging_node) reconnect_node_and_ensure_connected(looper, all_nodes, lagging_node) waitNodeDataEquality(looper, *all_nodes)
def test_node_catchup_causes_no_desync(looper,
                                       txnPoolNodeSet,
                                       client1,
                                       wallet1,
                                       client1Connected,
                                       monkeypatch):
    """
    Checks that transactions received by catchup do not break performance
    monitoring
    """
    client, wallet = client1, wallet1
    lagging_node = get_any_non_primary_node(txnPoolNodeSet)
    rest_nodes = set(txnPoolNodeSet).difference({lagging_node})

    # Make master replica lagging by hiding all messages sent to it
    make_master_replica_lag(lagging_node)
    # Also disable its 3PC-message re-requesting so it genuinely stays behind.
    monkeypatch.setattr(lagging_node.master_replica,
                        '_request_missing_three_phase_messages',
                        lambda *x, **y: None)

    # Send some requests and check that all replicas except master executed it
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    waitNodeDataInequality(looper, lagging_node, *rest_nodes)
    looper.run(eventually(backup_replicas_run_forward, lagging_node))

    # Disconnect lagging node, send some more requests and start it back
    # After start it should fall in a such state that it needs to make catchup
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=False)
    looper.removeProdable(lagging_node)
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    looper.add(lagging_node)
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging_node)

    # Check that catchup done
    waitNodeDataEquality(looper, lagging_node, *rest_nodes)

    # Send some more requests to ensure that backup and master replicas
    # are in the same state
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)
    looper.run(eventually(replicas_synced, lagging_node))

    # Check that master is not considered to be degraded
    assert not lagging_node.monitor.isMasterDegraded()
def testNodeCatchupFPlusOne(txnPoolNodeSet, poolAfterSomeTxns):
    """
    Check that f+1 nodes is enough for catchup
    """
    looper, client, wallet = poolAfterSomeTxns
    # The scenario relies on a 4-node pool (f == 1): with two nodes down only
    # f+1 == 2 nodes remain as a catchup source.
    assert len(txnPoolNodeSet) == 4

    node1 = txnPoolNodeSet[-1]
    node0 = txnPoolNodeSet[-2]

    logger.debug("Stopping node0 with pool ledger size {}".
                 format(node0.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, node0, stopNode=False)
    looper.removeProdable(node0)

    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)

    logger.debug("Stopping node1 with pool ledger size {}".
                 format(node1.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, node1, stopNode=False)
    looper.removeProdable(node1)

    # Make sure new node got out of sync
    waitNodeDataInequality(looper, node0, *txnPoolNodeSet[:-2])

    # TODO: Check if the node has really stopped processing requests?

    logger.debug("Starting the stopped node0")
    looper.add(node0)
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet[:-1], node0)

    # node0 must catch up from only the 2 (= f+1) remaining nodes.
    logger.debug("Waiting for the node0 to catch up")
    waitNodeDataEquality(looper, node0, *txnPoolNodeSet[:-2])

    logger.debug("Sending more requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 2)
    checkNodeDataForEquality(node0, *txnPoolNodeSet[:-2])
def test_get_last_ordered_timestamp_after_catchup(looper,
                                                  txnPoolNodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward):
    """
    After catchup, the last-ordered timestamp a node derives from its state
    must be the txnTime of the latest txn (the one ordered while the node was
    disconnected), not the one from before the disconnect.
    """
    node_to_disconnect = txnPoolNodeSet[-1]
    reply_before = sdk_send_random_and_check(looper,
                                             txnPoolNodeSet,
                                             sdk_pool_handle,
                                             sdk_wallet_steward,
                                             1)[0][1]
    # Short pause so the two txns get distinct txnTime values.
    looper.runFor(2)
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            node_to_disconnect,
                                            stopNode=False)
    reply = sdk_send_random_and_check(looper,
                                      txnPoolNodeSet,
                                      sdk_pool_handle,
                                      sdk_wallet_steward,
                                      1)[0][1]
    reconnect_node_and_ensure_connected(looper,
                                        txnPoolNodeSet,
                                        node_to_disconnect)
    waitNodeDataEquality(looper, node_to_disconnect, *txnPoolNodeSet[:-1])
    ts_from_state = node_to_disconnect.master_replica._get_last_timestamp_from_state(
        DOMAIN_LEDGER_ID)
    assert ts_from_state == reply['result']['txnTime']
    assert ts_from_state != reply_before['result']['txnTime']
def test_disconnected_node_catchup_plugin_ledger_txns(looper,
                                                      txnPoolNodeSet,
                                                      sdk_wallet_client,
                                                      sdk_pool_handle,
                                                      sdk_new_node_caught_up):
    """
    Disconnect the freshly-added node, run a few demo (plugin ledger) txns
    while it is away, then reconnect it and check it catches those txns up.
    """
    lagging_node = sdk_new_node_caught_up
    other_nodes = txnPoolNodeSet[:-1]

    # Sever connections only; the node process stays alive.
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, lagging_node, stopNode=False)

    # Do some demo txns;
    some_demo_txns(looper, sdk_wallet_client, sdk_pool_handle)

    # The lagging node must now be behind the rest of the pool.
    waitNodeDataInequality(looper, lagging_node, *other_nodes)

    # Bring it back and wait until catchup restores equality.
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging_node)
    waitNodeDataEquality(looper, lagging_node, *other_nodes)
def test_disconnected_node_catchup_config_ledger_txns(
        looper, some_config_txns_done, txnPoolNodeSet, sdk_wallet_client,
        sdk_pool_handle, newNodeCaughtUp, keys):
    """
    Disconnect the freshly-added node, write a few config ledger txns while it
    is away, then reconnect it and check the config ledger is caught up.
    """
    lagging_node = newNodeCaughtUp
    rest_of_pool = txnPoolNodeSet[:-1]

    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=False)

    # Do some config txns; using a fixture as a method, passing some arguments
    # as None as they only make sense for the fixture (pre-requisites)
    send_some_config_txns(looper, sdk_pool_handle, sdk_wallet_client, keys)

    # The lagging node must have fallen behind the rest of the pool.
    waitNodeDataInequality(looper, lagging_node, *rest_of_pool)

    # Reconnect and verify catchup brings it level again.
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging_node)
    waitNodeDataEquality(looper, lagging_node, *rest_of_pool)
def test_number_txns_in_catchup_and_vc_queue_valid(looper,
                                                   txnPoolNodeSet,
                                                   tconf,
                                                   sdk_pool_handle,
                                                   sdk_wallet_steward):
    """
    After the master primary is disconnected (forcing a view change) and then
    reconnected (forcing a catchup), its validator-info must report the number
    of txns received in catchup and a consistent view-change status.
    """
    num_txns = 5
    master_node = get_master_primary_node(txnPoolNodeSet)
    old_view = master_node.viewNo
    expected_view_no = old_view + 1
    # Disconnecting the master primary makes the other nodes view-change.
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            master_node,
                                            stopNode=False)
    looper.run(eventually(checkViewNoForNodes, txnPoolNodeSet[1:],
                          expected_view_no,
                          retryWait=1,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))
    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, num_txns)
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, master_node)
    # NOTE(review): compares only against the last node ([-1:]) rather than
    # the rest of the pool — confirm this narrower check is intentional.
    waitNodeDataEquality(looper, master_node, *txnPoolNodeSet[-1:])
    latest_info = master_node._info_tool.info
    # All txns sent during the outage must have arrived via catchup.
    assert latest_info['Node_info']['Catchup_status']['Number_txns_in_catchup'][1] == num_txns
    assert latest_info['Node_info']['View_change_status']['View_No'] == expected_view_no
    node_names = [n.name for n in txnPoolNodeSet[1:]]
    for node_name in node_names:
        assert latest_info['Node_info']['View_change_status']['VCDone_queue'][node_name][0] == master_node.master_primary_name
        assert latest_info['Node_info']['View_change_status']['VCDone_queue'][node_name][1]
    assert latest_info['Node_info']['View_change_status']['Last_complete_view_no'] == expected_view_no
def test_node_requests_missing_preprepares_and_prepares(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    2 of 4 nodes go down, so pool can not process any more incoming requests.
    A new request comes in.
    After a while those 2 nodes come back alive.
    Another request comes in.
    Check that previously disconnected two nodes request missing PREPREPARES
    and PREPARES and the pool successfully handles both transactions after
    that.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_nodes = txnPoolNodeSet[2:]
    alive_nodes = txnPoolNodeSet[:2]

    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    current_node_set = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                current_node_set,
                                                node,
                                                stopNode=False)
        current_node_set.remove(node)

    # With half the pool gone there is no quorum: these requests produce
    # 3PC messages on the alive nodes but cannot be ordered yet.
    sdk_send_random_requests(looper,
                             sdk_pool_handle,
                             sdk_wallet_client,
                             MISSING_REQS_CNT)

    looper.run(
        eventually(check_pp_out_of_sync,
                   alive_nodes,
                   disconnected_nodes,
                   retryWait=1,
                   timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    for node in disconnected_nodes:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)

    # Right after reconnection nothing has been ordered yet and no node has
    # requested any missing 3PC messages.
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == init_ledger_size

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) == 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_pre_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_prepare) == 0

    # A new request triggers the re-requesting of the missing PREPREPARES
    # and PREPARES on the previously disconnected nodes.
    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) > 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_pre_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_prepare) > 0

    # Finally both the backlog and the fresh request must be ordered on
    # every node.
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)
def test_node_requests_missing_three_phase_messages_after_long_disconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdirWithPoolTxns, allPluginsPath):
    """
    2 of 4 nodes go down, so pool can not process any more incoming requests.
    A new request comes in.
    Test than waits for some time to ensure that PrePrepare was created
    long enough seconds to be dropped by time checker.
    Two stopped nodes come back alive.
    Another request comes in.
    Check that previously disconnected two nodes request missing PREPARES and
    PREPREPARES and the pool successfully handles both transactions.
    """
    INIT_REQS_CNT = 10
    MISSING_REQS_CNT = 1
    REQS_AFTER_RECONNECT_CNT = 1
    alive_nodes = []
    disconnected_nodes = []

    # NOTE(review): if hasPrimary is a plain bool, 'is not None' is always
    # True and disconnected_nodes would stay empty — confirm hasPrimary can
    # actually be None here.
    for node in txnPoolNodeSet:
        if node.hasPrimary is not None:
            alive_nodes.append(node)
        else:
            disconnected_nodes.append(node)

    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    current_node_set = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                current_node_set,
                                                node,
                                                stopNode=False)
        current_node_set.remove(node)

    # No quorum remains, so this request cannot be ordered yet.
    sdk_send_random_requests(looper,
                             sdk_pool_handle,
                             sdk_wallet_client,
                             MISSING_REQS_CNT)

    def check_pp_out_of_sync(alive_nodes, disconnected_nodes):
        # Alive nodes share a new lastPrePrepare; disconnected nodes still
        # share the old one.
        def get_last_pp(node):
            return node.replicas._master_replica.lastPrePrepare

        last_3pc_key_alive = get_last_pp(alive_nodes[0])
        for node in alive_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_alive

        last_3pc_key_diconnected = get_last_pp(disconnected_nodes[0])
        assert last_3pc_key_diconnected != last_3pc_key_alive
        for node in disconnected_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_diconnected

    looper.run(
        eventually(check_pp_out_of_sync,
                   alive_nodes,
                   disconnected_nodes,
                   retryWait=1,
                   timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    # Shrink the acceptable PrePrepare age, then wait long enough that the
    # pending PrePrepare is considered stale by the time checker.
    # NOTE(review): this tconf mutation is never restored — confirm the
    # tconf fixture is isolated per test.
    preprepare_deviation = 4
    tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = preprepare_deviation
    time.sleep(preprepare_deviation * 2)

    for node in disconnected_nodes:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)

    sdk_send_random_and_check(looper,
                              txnPoolNodeSet,
                              sdk_pool_handle,
                              sdk_wallet_client,
                              REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)

    # Both the request sent during the outage and the post-reconnect one
    # must be ordered on every node.
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)
def test_disconnected_node_with_lagged_view_pulls_up_its_view_on_reconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    Verifies that a disconnected node with a lagged view accepts the current
    view from the other nodes on re-connection.
    Steps:
    1. Provoke view change to 1.
    2. Ensure that all the nodes complete view change to 1.
    3. Disconnect one node from the rest of the nodes in the pool.
    4. Provoke view change to 2.
    5. Ensure that that all the nodes except for the disconnected one complete
    view change to 2 and the disconnected node remains in the view 1.
    6. Provoke view change to 3.
    7. Ensure that that all the nodes except for the disconnected one complete
    view change to 3 and the disconnected node remains in the view 1.
    8. Connect the disconnected node to the rest of the nodes in the pool.
    9. Ensure that the re-connected node completes view change to 3.
    10. Ensure that all the nodes participate in consensus.
    """
    checkViewNoForNodes(txnPoolNodeSet, 0)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Steps 1-2: whole pool moves to view 1.
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Step 3: drop a non-primary node's connections.
    lagged_node = getNonPrimaryReplicas(txnPoolNodeSet)[-1].node
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            lagged_node,
                                            stopNode=False)
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Steps 4-5: rest of the pool moves to view 2; lagged node stays at 1.
    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        instances_list=range(
                            getRequiredInstances(len(txnPoolNodeSet))))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 2)
    checkViewNoForNodes([lagged_node], 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Steps 6-7: rest of the pool moves to view 3; lagged node still at 1.
    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        instances_list=range(
                            getRequiredInstances(len(txnPoolNodeSet))))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 3)
    checkViewNoForNodes([lagged_node], 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Steps 8-9: reconnect; lagged node must pull its view up to 3.
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagged_node)
    waitForViewChange(looper, [lagged_node], 3,
                      customTimeout=waits.expectedPoolElectionTimeout(
                          len(txnPoolNodeSet)))
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 3)

    # Step 10: consensus works with the full pool again.
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_reconnect_primary_and_not_primary(looper, txnPoolNodeSet,
                                           sdk_wallet_steward,
                                           sdk_pool_handle, tconf):
    """
    Test steps:
    Pool of 7 nodes. count of instances must be 3
    1. Choose node, that is not primary on all replicas (3 index)
    2. Disconnect them
    3. Ensure, that number of replicas was decreased
    4. Choose current primary node (must be 0)
    5. Disconnect primary
    6. Ensure, that view change complete and primary was selected
    7. Add node back from 1 step
    8. Add node back from 4 step
    9. Check, that count of instance (f+1 = 3) 10. Send some requests and
    check, that pool works.
    """
    restNodes = set(txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)
    assert txnPoolNodeSet[0].master_replica.isPrimary
    node_after_all_primary = txnPoolNodeSet[3]
    # Disconnect node after all primaries (after all backup primaries)
    disconnect_node_and_ensure_disconnected(looper,
                                            restNodes,
                                            node_after_all_primary,
                                            stopNode=False)
    # -------------------------------------------------------
    restNodes.remove(node_after_all_primary)
    looper.run(
        eventually(partial(check_count_connected_node, restNodes, 6),
                   timeout=5,
                   acceptableExceptions=[AssertionError]))
    sdk_send_random_and_check(looper, restNodes, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    # Get primary node for backup replica
    primary_node = txnPoolNodeSet[0]
    assert primary_node.master_replica.isPrimary
    old_view_no = checkViewNoForNodes(restNodes, 0)
    # disconnect primary node
    disconnect_node_and_ensure_disconnected(looper,
                                            restNodes,
                                            primary_node,
                                            stopNode=False)
    # -------------------------------------------------------
    restNodes.remove(primary_node)
    looper.run(
        eventually(partial(check_count_connected_node, restNodes, 5),
                   timeout=5,
                   acceptableExceptions=[AssertionError]))
    # Losing the master primary must complete a view change to view+1.
    looper.run(
        eventually(partial(checkViewNoForNodes, restNodes,
                           expectedViewNo=old_view_no + 1),
                   timeout=tconf.VIEW_CHANGE_TIMEOUT))
    sdk_send_random_and_check(looper, restNodes, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    logger.debug("restNodes: {}".format(restNodes))
    restNodes.add(node_after_all_primary)
    # Return back node after all primary
    reconnect_node_and_ensure_connected(looper,
                                        restNodes,
                                        node_after_all_primary)
    looper.run(
        checkNodesConnected(restNodes,
                            customTimeout=5 * tconf.RETRY_TIMEOUT_RESTRICTED))
    looper.run(
        eventually(partial(check_count_connected_node, restNodes, 6),
                   timeout=5,
                   acceptableExceptions=[AssertionError]))
    # All connected nodes must agree on the replica count.
    assert len(set([len(n.replicas) for n in restNodes])) == 1
    sdk_send_random_and_check(looper, restNodes, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    # Return back primary node
    restNodes.add(primary_node)
    reconnect_node_and_ensure_connected(looper, restNodes, primary_node)
    looper.run(
        checkNodesConnected(restNodes,
                            customTimeout=5 * tconf.RETRY_TIMEOUT_RESTRICTED))
    sdk_send_random_and_check(looper, restNodes, sdk_pool_handle,
                              sdk_wallet_steward, 5)
def test_node_requests_missing_preprepares_prepares_and_commits(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    One of the four nodes goes down. A new request comes in and is ordered
    by the three remaining nodes. Later the previously disconnected node
    comes back alive and another request arrives. Check that the returning
    node requests the missing PREPREPAREs, PREPAREs and COMMITs, orders the
    request it missed, and that all nodes successfully handle the final
    request.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_node = txnPoolNodeSet[3]
    alive_nodes = txnPoolNodeSet[:3]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            disconnected_node,
                                            stopNode=False)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, MISSING_REQS_CNT)
    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes, [disconnected_node],
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet,
                                        disconnected_node)
    # Give the reconnected node time to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    for node in alive_nodes:
        assert node.domainLedger.size == init_ledger_size + MISSING_REQS_CNT
    # The reconnected node must not have caught up yet
    assert disconnected_node.domainLedger.size == init_ledger_size

    spylog = disconnected_node.master_replica.spylog
    # Spied 3PC-message request/response handlers, in the order they are
    # asserted by the test
    watched_methods = (Replica._request_pre_prepare,
                       Replica._request_prepare,
                       Replica._request_commit,
                       Replica.process_requested_pre_prepare,
                       Replica.process_requested_prepare,
                       Replica.process_requested_commit)
    # No missing-message traffic may have happened before the next request
    for method in watched_methods:
        assert spylog.count(method) == 0

    ordered_before = spylog.count(Replica.doOrder)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_node, *alive_nodes)

    # The reconnected node must have requested and processed all missing
    # PREPREPAREs, PREPAREs and COMMITs
    for method in watched_methods:
        assert spylog.count(method) > 0

    ordered_after = spylog.count(Replica.doOrder)
    # Both the missed 3PC-batch and the new 3PC-batch must have been ordered
    assert ordered_after - ordered_before == 2

    expected_size = (init_ledger_size +
                     MISSING_REQS_CNT +
                     REQS_AFTER_RECONNECT_CNT)
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == expected_size
def test_node_requests_missing_preprepares_and_prepares_after_long_disconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdirWithPoolTxns, allPluginsPath):
    """
    Two of the four nodes go down, so the pool cannot process any more
    incoming requests. A new request comes in. The test then waits long
    enough for the created PrePrepare to be old enough to be dropped by the
    time checker. The two stopped nodes come back alive and another request
    arrives. Check that the two previously disconnected nodes request the
    missing PREPREPAREs and PREPAREs and that the pool successfully handles
    both transactions.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    # Primaries stay up; every non-primary node will be disconnected
    alive_nodes = [n for n in txnPoolNodeSet if n.hasPrimary]
    disconnected_nodes = [n for n in txnPoolNodeSet if not n.hasPrimary]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    connected = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper, connected, node,
                                                stopNode=False)
        connected.remove(node)

    # Without a quorum these requests cannot be ordered yet
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)
    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes, disconnected_nodes,
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    # Let the pending PrePrepare age beyond the acceptable deviation so the
    # returning nodes treat it as old
    preprepare_deviation = 4
    tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = preprepare_deviation
    time.sleep(preprepare_deviation * 2)

    for node in disconnected_nodes:
        connected.add(node)
        reconnect_node_and_ensure_connected(looper, connected, node)

    # Nothing has been ordered while the pool lacked a quorum
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == init_ledger_size

    # Spied 3PC-message request/response handlers, in assertion order
    watched_methods = (Replica._request_pre_prepare,
                       Replica._request_prepare,
                       Replica.process_requested_pre_prepare,
                       Replica.process_requested_prepare)
    for node in disconnected_nodes:
        for method in watched_methods:
            assert node.master_replica.spylog.count(method) == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)

    # The returning nodes must have requested and processed the missing
    # PREPREPAREs and PREPAREs
    for node in disconnected_nodes:
        for method in watched_methods:
            assert node.master_replica.spylog.count(method) > 0

    expected_size = (init_ledger_size +
                     MISSING_REQS_CNT +
                     REQS_AFTER_RECONNECT_CNT)
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == expected_size
def test_node_requests_missing_preprepares_prepares_and_commits(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    One of the four nodes goes down. A new request comes in and is ordered
    by the three remaining nodes. After a while the previously disconnected
    node comes back alive and another request comes in. Check that the
    previously disconnected node requests the missing PREPREPAREs, PREPAREs
    and COMMITs, orders the previous request, and that all nodes handle the
    last request successfully.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    lagging_node = txnPoolNodeSet[3]
    rest_of_pool = txnPoolNodeSet[:3]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    ledger_size_before = txnPoolNodeSet[0].domainLedger.size

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            lagging_node,
                                            stopNode=False)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, MISSING_REQS_CNT)
    looper.run(
        eventually(check_pp_out_of_sync,
                   rest_of_pool,
                   [lagging_node],
                   retryWait=1,
                   timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging_node)
    # Allow enough time for the reconnected node to catch up if it chooses to
    looper.runFor(
        waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
        waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    for node in rest_of_pool:
        assert node.domainLedger.size == ledger_size_before + MISSING_REQS_CNT
    # Ensure that the reconnected node has not caught up though
    assert lagging_node.domainLedger.size == ledger_size_before

    # Bind the spy counter once; all assertions below go through it
    spy_count = lagging_node.master_replica.spylog.count
    three_pc_methods = (Replica._request_pre_prepare,
                        Replica._request_prepare,
                        Replica._request_commit,
                        Replica.process_requested_pre_prepare,
                        Replica.process_requested_prepare,
                        Replica.process_requested_commit)
    # No 3PC-message retrieval may have happened so far
    for method in three_pc_methods:
        assert spy_count(method) == 0

    orders_before = spy_count(Replica.doOrder)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, lagging_node, *rest_of_pool)

    # The lagging node must now have requested and processed every kind of
    # missing 3PC message
    for method in three_pc_methods:
        assert spy_count(method) > 0

    orders_after = spy_count(Replica.doOrder)
    # Ensure that the reconnected node has ordered both the missed
    # 3PC-batch and the new 3PC-batch
    assert orders_after - orders_before == 2

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (ledger_size_before +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)
def test_reconnect_primary_and_not_primary(looper, txnPoolNodeSet,
                                           sdk_wallet_steward,
                                           sdk_pool_handle, tconf):
    """
    Test steps:
    Pool of 7 nodes; the count of instances must be 3.
    1. Choose a node that is not primary on any replica (index 3)
    2. Disconnect it
    3. Ensure that the number of connected nodes was decreased
    4. Choose the current primary node (must be index 0)
    5. Disconnect the primary
    6. Ensure that the view change completed and a new primary was selected
    7. Add the node from step 1 back
    8. Add the node from step 4 back
    9. Check that the count of instances is equal on all nodes (f + 1 = 3)
    10. Send some requests and check that the pool works.
    """
    restNodes = set(txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)
    assert txnPoolNodeSet[0].master_replica.isPrimary
    node_after_all_primary = txnPoolNodeSet[3]
    # Disconnect node after all primaries (after all backup primaries)
    disconnect_node_and_ensure_disconnected(
        looper, restNodes, node_after_all_primary, stopNode=False)
    # -------------------------------------------------------
    restNodes.remove(node_after_all_primary)
    looper.run(eventually(partial(check_count_connected_node, restNodes, 6),
                          timeout=5,
                          acceptableExceptions=[AssertionError]))
    sdk_send_random_and_check(looper, restNodes, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    # Get primary node for backup replica
    primary_node = txnPoolNodeSet[0]
    assert primary_node.master_replica.isPrimary
    old_view_no = checkViewNoForNodes(restNodes, 0)
    # Disconnect primary node
    disconnect_node_and_ensure_disconnected(looper, restNodes, primary_node,
                                            stopNode=False)
    # -------------------------------------------------------
    restNodes.remove(primary_node)
    looper.run(eventually(partial(check_count_connected_node, restNodes, 5),
                          timeout=5,
                          acceptableExceptions=[AssertionError]))
    # The remaining nodes must complete a view change after losing the primary
    looper.run(eventually(partial(checkViewNoForNodes, restNodes,
                                  expectedViewNo=old_view_no + 1),
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))
    sdk_send_random_and_check(looper, restNodes, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    logger.debug("restNodes: {}".format(restNodes))
    restNodes.add(node_after_all_primary)
    # Return the node disconnected in step 2 back to the pool
    reconnect_node_and_ensure_connected(looper, restNodes,
                                        node_after_all_primary)
    looper.run(checkNodesConnected(
        restNodes, customTimeout=5 * tconf.RETRY_TIMEOUT_RESTRICTED))
    looper.run(eventually(partial(check_count_connected_node, restNodes, 6),
                          timeout=5,
                          acceptableExceptions=[AssertionError]))
    # All nodes must agree on the replica count; set comprehension instead
    # of set([...]) (flake8-comprehensions C403)
    assert len({len(n.replicas) for n in restNodes}) == 1
    sdk_send_random_and_check(looper, restNodes, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    # Return the old primary node back to the pool
    restNodes.add(primary_node)
    reconnect_node_and_ensure_connected(looper, restNodes, primary_node)
    looper.run(checkNodesConnected(
        restNodes, customTimeout=5 * tconf.RETRY_TIMEOUT_RESTRICTED))
    sdk_send_random_and_check(looper, restNodes, sdk_pool_handle,
                              sdk_wallet_steward, 5)
def test_disconnected_node_with_lagged_view_pulls_up_its_view_on_reconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    Verifies that a disconnected node with a lagged view accepts the current
    view from the other nodes on re-connection.
    Steps:
    1. Provoke view change to 1.
    2. Ensure that all the nodes complete view change to 1.
    3. Disconnect one node from the rest of the nodes in the pool.
    4. Provoke view change to 2.
    5. Ensure that all the nodes except for the disconnected one complete
       view change to 2 and the disconnected node remains in the view 1.
    6. Provoke view change to 3.
    7. Ensure that all the nodes except for the disconnected one complete
       view change to 3 and the disconnected node remains in the view 1.
    8. Connect the disconnected node to the rest of the nodes in the pool.
    9. Ensure that the re-connected node completes view change to 3.
    10. Ensure that all the nodes participate in consensus.
    """
    checkViewNoForNodes(txnPoolNodeSet, 0)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    lagged_node = getNonPrimaryReplicas(txnPoolNodeSet)[-1].node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            lagged_node,
                                            stopNode=False)
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # First view change while the lagged node is away: 1 -> 2
    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        numInstances=getRequiredInstances(
                            len(txnPoolNodeSet)))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 2)
    checkViewNoForNodes([lagged_node], 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Second view change while the lagged node is away: 2 -> 3
    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        numInstances=getRequiredInstances(
                            len(txnPoolNodeSet)))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 3)
    checkViewNoForNodes([lagged_node], 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagged_node)
    # The re-connected node must pull itself up to the pool's current view
    waitForViewChange(looper, [lagged_node], 3,
                      customTimeout=waits.expectedPoolElectionTimeout(
                          len(txnPoolNodeSet)))
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 3)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_disconnected_node_with_lagged_view_pulls_up_its_view_on_reconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf):
    """
    Verifies that a disconnected node with a lagged view accepts the current
    view from the other nodes on re-connection.
    Steps:
    1. Provoke view change to 1.
    2. Ensure that all the nodes complete view change to 1.
    3. Disconnect one node from the rest of the nodes in the pool.
    4. Provoke view change to 2.
    5. Ensure that all the nodes except for the disconnected one complete
       view change to 2 and the disconnected node remains in the view 1.
    6. Provoke view change to 3.
    7. Ensure that all the nodes except for the disconnected one complete
       view change to 3 and the disconnected node remains in the view 1.
    8. Connect the disconnected node to the rest of the nodes in the pool.
    9. Ensure that the re-connected node completes view change to 3.
    10. Ensure that all the nodes participate in consensus.
    """
    checkViewNoForNodes(txnPoolNodeSet, 0)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    lagged_node = getNonPrimaryReplicas(txnPoolNodeSet)[-1].node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            lagged_node,
                                            stopNode=False)
    other_nodes = list(set(txnPoolNodeSet) - {lagged_node})
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # First view change while the lagged node is away: 1 -> 2
    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        instances_list=range(
                            getRequiredInstances(len(txnPoolNodeSet))))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 2)
    checkViewNoForNodes([lagged_node], 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    # Second view change while the lagged node is away: 2 -> 3
    ensure_view_change(looper, other_nodes)
    ensureElectionsDone(looper, other_nodes,
                        instances_list=range(
                            getRequiredInstances(len(txnPoolNodeSet))))
    ensure_all_nodes_have_same_data(looper, other_nodes)
    checkViewNoForNodes(other_nodes, 3)
    checkViewNoForNodes([lagged_node], 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagged_node)
    # The node can start view change, receive NEW_VIEW and start waiting for
    # the first ordered in the new view. But since the node lags behind by
    # more than a checkpoint, it cannot do re-ordering and has to wait until
    # the catchup by checkpoints is started.
    waitForViewChange(looper, [lagged_node], 3,
                      customTimeout=waits.expectedPoolElectionTimeout(
                          len(txnPoolNodeSet)))
    # Send enough batches to trigger catchup by checkpoints
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=2 * tconf.CHK_FREQ)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 3)
    ensureElectionsDone(looper, txnPoolNodeSet)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_node_requests_missing_preprepares_and_prepares(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    Two of the four nodes go down, so the pool cannot process any more
    incoming requests. A new request comes in. After a while those two nodes
    come back alive and another request arrives. Check that the two
    previously disconnected nodes request the missing PREPREPAREs and
    PREPAREs and that the pool then handles both transactions successfully.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_nodes = txnPoolNodeSet[2:]
    alive_nodes = txnPoolNodeSet[:2]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    connected = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper, connected, node,
                                                stopNode=False)
        connected.remove(node)

    # Without a quorum these requests cannot be ordered yet
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)
    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes, disconnected_nodes,
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    for node in disconnected_nodes:
        connected.add(node)
        reconnect_node_and_ensure_connected(looper, connected, node)

    # Nothing has been ordered while the pool lacked a quorum
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == init_ledger_size

    # Spied 3PC-message request/response handlers, in assertion order
    watched_methods = (Replica._request_pre_prepare,
                       Replica._request_prepare,
                       Replica.process_requested_pre_prepare,
                       Replica.process_requested_prepare)
    for node in disconnected_nodes:
        for method in watched_methods:
            assert node.master_replica.spylog.count(method) == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    # The returning nodes must have requested and processed the missing
    # PREPREPAREs and PREPAREs
    for node in disconnected_nodes:
        for method in watched_methods:
            assert node.master_replica.spylog.count(method) > 0

    expected_size = (init_ledger_size +
                     MISSING_REQS_CNT +
                     REQS_AFTER_RECONNECT_CNT)
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == expected_size