def test_restarted_node_catches_up_config_ledger_txns(
        looper, some_config_txns_done, txnPoolNodeSet, sdk_wallet_client,
        sdk_pool_handle, sdk_new_node_caught_up, keys, tconf, tdir,
        allPluginsPath):
    """
    Stop the node supplied by ``sdk_new_node_caught_up``, write some config
    ledger txns while it is down, restart it and wait until its data equals
    the data of the remaining nodes.
    """
    stopped = sdk_new_node_caught_up
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, stopped, stopNode=True)
    looper.removeProdable(stopped)

    # The fixture is reused here as an ordinary function to produce more
    # config txns while the node is offline.
    send_some_config_txns(looper, sdk_pool_handle, sdk_wallet_client, keys)

    # The stopped node must now lag behind every other node.
    assert all(stopped.configLedger.size < peer.configLedger.size
               for peer in txnPoolNodeSet[:-1])

    restarted = start_stopped_node(
        stopped, looper, tconf, tdir, allPluginsPath)
    # The restarted instance takes the last slot of the pool list.
    txnPoolNodeSet[-1] = restarted
    looper.run(checkNodesConnected(txnPoolNodeSet))

    skipped_checks = ['check_last_ordered_3pc_backup']
    waitNodeDataEquality(looper, restarted, *txnPoolNodeSet[:-1],
                         exclude_from_check=skipped_checks)
def test_propagate_primary_after_primary_restart_view_0(
        looper, txnPoolNodeSet, tconf, sdk_pool_handle, sdk_wallet_steward,
        tdir, allPluginsPath):
    """
    Restart the master primary while instance change messages are delayed,
    so that no view change happens during the restart, and check that the
    view number and the primary are unchanged afterwards.

    ppSeqNo must be > 0 before the restart so that restoring of the indices
    can be observed. This is the viewNo == 0 case.
    """
    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_steward, sdk_pool_handle)

    ppseqno_before = _get_ppseqno(txnPoolNodeSet)
    assert ppseqno_before > 0

    view_no_before = checkViewNoForNodes(txnPoolNodeSet)
    primary_before = get_master_primary_node(txnPoolNodeSet)

    delay_instance_change(txnPoolNodeSet, IC_DELAY_SEC)

    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, primary_before, stopNode=True)
    looper.removeProdable(primary_before)

    logger.info("Restart node {}".format(primary_before))
    restarted = start_stopped_node(primary_before, looper, tconf, tdir,
                                   allPluginsPath,
                                   delay_instance_change_msgs=False)

    # Swap the new node object into the slot of the node with the same name.
    matching_slots = [
        pos for pos, candidate in enumerate(txnPoolNodeSet)
        if candidate.name == restarted.name
    ]
    txnPoolNodeSet[matching_slots[0]] = restarted

    restarted.nodeIbStasher.delay(icDelay(IC_DELAY_SEC))

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    view_no_after = checkViewNoForNodes(txnPoolNodeSet)
    assert view_no_after == view_no_before

    primary_after = get_master_primary_node(txnPoolNodeSet)
    assert primary_after.name == primary_before.name

    # check ppSeqNo the same
    # NOTE(review): the result is discarded; presumably the helper itself
    # asserts consistency across nodes - confirm against its definition.
    _get_ppseqno(txnPoolNodeSet)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_steward, sdk_pool_handle)

    ppseqno_after = _get_ppseqno(txnPoolNodeSet)
    assert ppseqno_after > ppseqno_before
def test_resend_instance_change_messages(looper, txnPoolNodeSet, tconf,
                                         sdk_wallet_steward,
                                         sdk_pool_handle):
    """
    Disconnect the primary while INSTANCE_CHANGE messages are delayed on all
    nodes, check that the instance change round counter goes from 0 to 1,
    and that the pool then reaches the next view and keeps ordering.
    """
    primary = txnPoolNodeSet[0]
    view_no_before = checkViewNoForNodes(txnPoolNodeSet, 0)
    assert primary.master_replica.isPrimary

    ic_delay_sec = 3 * tconf.NEW_VIEW_TIMEOUT
    for pool_node in txnPoolNodeSet:
        pool_node.nodeIbStasher.delay(icDelay(ic_delay_sec))

    rounds = {n.view_changer.instance_change_rounds for n in txnPoolNodeSet}
    assert rounds == {0}

    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, primary, stopNode=False)
    # From here on the pool list no longer contains the old primary.
    txnPoolNodeSet.remove(primary)

    connected_check = partial(check_count_connected_node, txnPoolNodeSet, 4)
    looper.run(eventually(connected_check,
                          timeout=5,
                          acceptableExceptions=[AssertionError]))

    looper.runFor(2 * tconf.NEW_VIEW_TIMEOUT)
    rounds = {n.view_changer.instance_change_rounds for n in txnPoolNodeSet}
    assert rounds == {1}

    looper.runFor(tconf.NEW_VIEW_TIMEOUT)
    view_check = partial(checkViewNoForNodes, txnPoolNodeSet,
                         expectedViewNo=view_no_before + 1)
    looper.run(eventually(view_check, timeout=tconf.NEW_VIEW_TIMEOUT))
    ensureElectionsDone(looper, txnPoolNodeSet)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_get_last_ordered_timestamp_after_catchup(looper, txnPoolNodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward,
                                                  tconf, tdir,
                                                  allPluginsPath):
    """
    After the last node is stopped, misses one txn and catches up, the
    timestamp its master replica reads from the domain state must equal the
    time of the missed txn and differ from the time of the earlier txn.
    """
    lagging = txnPoolNodeSet[-1]

    # One txn that every node, including the future lagging one, orders.
    replies_before = sdk_send_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 1)
    reply_before = replies_before[0][1]

    looper.runFor(2)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging)
    looper.removeProdable(name=lagging.name)

    # One txn ordered while the node is down.
    replies_missed = sdk_send_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 1)
    reply = replies_missed[0][1]

    lagging = start_stopped_node(lagging, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet[-1] = lagging
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(
        looper, lagging, *txnPoolNodeSet[:-1],
        exclude_from_check=['check_last_ordered_3pc_backup'])

    replica = lagging.master_replica
    ts_from_state = replica._get_last_timestamp_from_state(DOMAIN_LEDGER_ID)
    assert ts_from_state == get_txn_time(reply['result'])
    assert ts_from_state != get_txn_time(reply_before['result'])
def test_current_state_propagation(newNodeCaughtUp, txnPoolNodeSet,
                                   nodeSetWithNodeAddedAfterSomeTxns,
                                   tconf, tdir, allPluginsPath):
    """
    Restart one node and check, through the spy logs, that every other node
    called its CurrentState sender and that the restarted node processed at
    least one CurrentState message.
    """
    # 1. Pool is started by the fixtures; only looper and the node are used.
    looper, added_node, _client, _wallet, _, _ = \
        nodeSetWithNodeAddedAfterSomeTxns

    # 2. Stop the node.
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, added_node, stopNode=True)
    looper.removeProdable(added_node)

    # 3. Bring it back and let the pool run for a while.
    restarted = start_stopped_node(
        added_node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet[-1] = restarted
    looper.run(checkNodesConnected(txnPoolNodeSet))
    looper.runFor(5)

    # 4. Every other node must have sent CurrentState at least once.
    for sender in txnPoolNodeSet[:-1]:
        method_name = sender.send_current_state_to_lagging_node.__name__
        sent_times = sender.spylog.count(method_name)
        assert sent_times != 0, \
            "{} haven't sent CurrentState".format(sender)
    looper.runFor(5)

    # 5. The restarted node must have processed CurrentState messages.
    handler_name = restarted.process_current_state_message.__name__
    received_times = restarted.spylog.count(handler_name)
    assert received_times != 0
def testNodeCatchupAfterDisconnect(newNodeCaughtUp, txnPoolNodeSet,
                                   nodeSetWithNodeAddedAfterSomeTxns):
    """
    Disconnect the added node (without stopping it), send requests so that
    it falls behind, reconnect it and check that it catches up and stays
    equal to the other nodes after further requests.
    """
    looper, lagging, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns

    logger.debug("Stopping node {} with pool ledger size {}".
                 format(lagging, lagging.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, lagging, stopNode=False)

    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)

    # The disconnected node must now differ from the rest of the pool.
    waitNodeDataInequality(looper, lagging, *txnPoolNodeSet[:-1])

    logger.debug("Starting the stopped node, {}".format(lagging))
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging)

    logger.debug("Waiting for the node to catch up, {}".format(lagging))
    waitNodeDataEquality(looper, lagging, *txnPoolNodeSet[:-1])

    logger.debug("Sending more requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 10)
    checkNodeDataForEquality(lagging, *txnPoolNodeSet[:-1])
def test_6_nodes_pool_cannot_reach_quorum_with_2_disconnected(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client):
    '''
    Disconnect the last `faultyNodes` nodes by rank (2 of 6, per the test
    name) and check that a request then times out and is not returned to
    the nodes.
    '''
    to_disconnect = nodes_by_rank(txnPoolNodeSet)[-faultyNodes:]

    still_connected = set(txnPoolNodeSet)
    for victim in to_disconnect:
        # None of the disconnected nodes may host a primary replica.
        assert not any(replica.isPrimary
                       for replica in victim.replicas.values())
        disconnect_node_and_ensure_disconnected(
            looper, still_connected, victim, stopNode=False)
        still_connected.remove(victim)

    reqs = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    with pytest.raises(PoolLedgerTimeoutException):
        sdk_send_and_check(reqs, looper, txnPoolNodeSet, sdk_pool_handle)
    sent_request = sdk_json_to_request_object(json.loads(reqs[0]))
    check_request_is_not_returned_to_nodes(txnPoolNodeSet, sent_request)

    # The following reconnection of nodes is needed in this test to avoid
    # pytest process hangup
    for victim in to_disconnect:
        still_connected.add(victim)
        reconnect_node_and_ensure_connected(looper, still_connected, victim)
def testNodeCatchupAfterDisconnect(
        sdk_new_node_caught_up, txnPoolNodeSet,
        sdk_node_set_with_node_added_after_some_txns):
    """
    Disconnect the added node (without stopping it), send requests so that
    it falls behind, reconnect it and check that it catches up and stays
    equal to the other nodes after further requests.
    """
    (looper, lagging, sdk_pool_handle,
     steward_wallet) = sdk_node_set_with_node_added_after_some_txns

    logger.debug("Disconnecting node {} with pool ledger size {}".
                 format(lagging, lagging.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, lagging, stopNode=False)

    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              steward_wallet, 5)

    # The disconnected node must now differ from the rest of the pool.
    waitNodeDataInequality(looper, lagging, *txnPoolNodeSet[:-1])

    logger.debug("Connecting the stopped node, {}".format(lagging))
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging)

    logger.debug("Waiting for the node to catch up, {}".format(lagging))
    waitNodeDataEquality(looper, lagging, *txnPoolNodeSet[:-1])

    logger.debug("Sending more requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              steward_wallet, 10)
    checkNodeDataForEquality(lagging, *txnPoolNodeSet[:-1])
def test_removed_replica_restored_on_view_change(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        tconf, tdir, allPluginsPath, chkFreqPatched, view_change):
    """
    1. Remove the last replica on a node that is not the master primary
    2. Restart the node that has been the master primary so far
    3. Check that after the view change the node has its original number
       of replicas again
    """
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    target = get_last_master_non_primary_node(txnPoolNodeSet)
    replicas_at_start = target.replicas.num_replicas
    removed_inst_id = replicas_at_start - 1

    target.replicas.remove_replica(removed_inst_id)
    check_replica_removed(target, replicas_at_start, removed_inst_id)

    # Taking the master primary down triggers a view change on all nodes.
    primary = get_master_primary_node(txnPoolNodeSet)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, primary)
    txnPoolNodeSet.remove(primary)
    looper.removeProdable(primary)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    restarted = start_stopped_node(
        primary, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(restarted)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    waitForViewChange(looper, txnPoolNodeSet,
                      expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    assert replicas_at_start == target.replicas.num_replicas
def test_catchup_with_ledger_statuses_in_old_format_from_one_node(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_steward,
        tconf, tdir, allPluginsPath):
    """
    Restart the last node while the first node answers with ledger statuses
    in the old format (without `protocolVersion`). The restarted node must
    still finish catch-up, discard the old-format statuses and take part in
    ordering of further transactions.
    """
    restarting = txnPoolNodeSet[-1]
    peers = txnPoolNodeSet[:-1]
    legacy_node = txnPoolNodeSet[0]

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward, 5)

    genuine_get_ledger_status = legacy_node.getLedgerStatus

    # Replacement for getLedgerStatus that re-packs the genuine status into
    # the old message type, i.e. without `protocolVersion`.
    def get_ledger_status_without_protocol_version(ledgerId: int):
        status = genuine_get_ledger_status(ledgerId)
        return LedgerStatusInOldFormat(status.ledgerId,
                                       status.txnSeqNo,
                                       status.viewNo,
                                       status.ppSeqNo,
                                       status.merkleRoot)

    legacy_node.getLedgerStatus = get_ledger_status_without_protocol_version

    # Take the node down and order txns it will have to catch up.
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            restarting)
    looper.removeProdable(name=restarting.name)
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward, 2)

    # Bring the node back into the pool.
    restarting = start_stopped_node(restarting, looper, tconf, tdir,
                                    allPluginsPath)
    txnPoolNodeSet[-1] = restarting
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Catch-up must complete despite the old-format statuses.
    waitNodeDataEquality(looper, restarting, *peers)

    # Statuses from `legacy_node` must have been discarded for all ledgers.
    discarded = countDiscarded(restarting,
                               'replied message has invalid structure')
    assert discarded >= 3

    # The restarted node must take part in ordering of further txns.
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward, 5)
    waitNodeDataEquality(looper, restarting, *peers)
def test_resend_instance_change_messages(looper, txnPoolNodeSet, tconf,
                                         sdk_wallet_steward,
                                         sdk_pool_handle):
    """
    Disconnect the primary while INSTANCE_CHANGE messages are delayed on all
    nodes, check that the instance change round counter goes from 0 to 1,
    and that the pool then reaches the next view and keeps ordering.
    """
    primary = txnPoolNodeSet[0]
    view_no_before = checkViewNoForNodes(txnPoolNodeSet, 0)
    assert primary.master_replica.isPrimary

    ic_delay_sec = 3 * tconf.INSTANCE_CHANGE_TIMEOUT
    for pool_node in txnPoolNodeSet:
        pool_node.nodeIbStasher.delay(icDelay(ic_delay_sec))

    rounds = {n.view_changer.instance_change_rounds for n in txnPoolNodeSet}
    assert rounds == {0}

    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, primary, stopNode=False)
    # From here on the pool list no longer contains the old primary.
    txnPoolNodeSet.remove(primary)

    connected_check = partial(check_count_connected_node, txnPoolNodeSet, 4)
    looper.run(eventually(connected_check,
                          timeout=5,
                          acceptableExceptions=[AssertionError]))

    looper.runFor(2 * tconf.INSTANCE_CHANGE_TIMEOUT)
    rounds = {n.view_changer.instance_change_rounds for n in txnPoolNodeSet}
    assert rounds == {1}

    looper.runFor(tconf.INSTANCE_CHANGE_TIMEOUT)
    view_check = partial(checkViewNoForNodes, txnPoolNodeSet,
                         expectedViewNo=view_no_before + 1)
    looper.run(eventually(view_check, timeout=tconf.VIEW_CHANGE_TIMEOUT))

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
def test_restarted_node_catches_up_config_ledger_txns(
        looper, some_config_txns_done, txnPoolNodeSet, sdk_wallet_client,
        sdk_pool_handle, sdk_new_node_caught_up, keys, tconf, tdir,
        allPluginsPath):
    """
    Stop the node supplied by ``sdk_new_node_caught_up``, write some config
    ledger txns while it is down, restart it and wait until its data equals
    the data of the remaining nodes.
    """
    stopped = sdk_new_node_caught_up
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, stopped, stopNode=True)
    looper.removeProdable(stopped)

    # The fixture is reused here as an ordinary function to produce more
    # config txns while the node is offline.
    send_some_config_txns(looper, sdk_pool_handle, sdk_wallet_client, keys)

    # The stopped node must now lag behind every other node.
    assert all(stopped.configLedger.size < peer.configLedger.size
               for peer in txnPoolNodeSet[:-1])

    restarted = start_stopped_node(
        stopped, looper, tconf, tdir, allPluginsPath)
    # The restarted instance takes the last slot of the pool list.
    txnPoolNodeSet[-1] = restarted
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, restarted, *txnPoolNodeSet[:-1])
def test_not_set_H_as_maxsize_for_backup_if_is_primary(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward,
        tconf, tdir, allPluginsPath):
    """
    Restart the node whose replica of instance 1 is primary after LOG_SIZE
    txns were ordered without it, and check that this replica is primary
    again with watermarks h == 0 and H == LOG_SIZE.
    """
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)

    backup_primary_node = txnPoolNodeSet[2]
    assert backup_primary_node.replicas._replicas[1].isPrimary

    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, backup_primary_node, stopNode=True)
    looper.removeProdable(backup_primary_node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, LOG_SIZE)

    restarted = start_stopped_node(backup_primary_node, looper, tconf,
                                   tdir, allPluginsPath)
    txnPoolNodeSet[2] = restarted
    ensureElectionsDone(looper, txnPoolNodeSet,
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)

    backup_replica = restarted.replicas._replicas[1]
    assert backup_replica.isPrimary
    assert backup_replica.h == 0
    assert backup_replica.H == LOG_SIZE
def test_resend_inst_ch_in_progress_v_ch(txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client, tdir, tconf, allPluginsPath): old_view = viewNoForNodes(txnPoolNodeSet) # disconnect two nodes. One of them should be next master primary in case of view change. for node in [txnPoolNodeSet[1], txnPoolNodeSet[-1]]: disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node, stopNode=True) looper.removeProdable(node) txnPoolNodeSet.remove(node) # delay I_CH on every node except last one and initiate view change stashers = [n.nodeIbStasher for n in txnPoolNodeSet[:-1]] with delay_rules_without_processing(stashers, icDelay(viewNo=2)): ensure_view_change(looper, txnPoolNodeSet) looper.runFor(tconf.VIEW_CHANGE_TIMEOUT + 1) # checks def checks(): assert all(not node.view_change_in_progress for node in txnPoolNodeSet) assert all(node.viewNo == old_view + 2 for node in txnPoolNodeSet) looper.run( eventually(checks, timeout=tconf.INSTANCE_CHANGE_RESEND_TIMEOUT * 1.5, retryWait=1)) sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle)
def test_6_nodes_pool_cannot_reach_quorum_with_2_disconnected(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_client):
    '''
    Disconnect the last `faultyNodes` nodes by rank (2 of 6, per the test
    name) and check that a request then times out and is not returned to
    the nodes.
    '''
    to_disconnect = nodes_by_rank(txnPoolNodeSet)[-faultyNodes:]

    still_connected = set(txnPoolNodeSet)
    for victim in to_disconnect:
        # None of the disconnected nodes may host a primary replica.
        assert not any(replica.isPrimary for replica in victim.replicas)
        disconnect_node_and_ensure_disconnected(
            looper, still_connected, victim, stopNode=False)
        still_connected.remove(victim)

    reqs = sdk_signed_random_requests(looper, sdk_wallet_client, 1)
    with pytest.raises(PoolLedgerTimeoutException):
        sdk_send_and_check(reqs, looper, txnPoolNodeSet, sdk_pool_handle)
    sent_request = sdk_json_to_request_object(json.loads(reqs[0]))
    check_request_is_not_returned_to_nodes(txnPoolNodeSet, sent_request)

    # The following reconnection of nodes is needed in this test to avoid
    # pytest process hangup
    for victim in to_disconnect:
        still_connected.add(victim)
        reconnect_node_and_ensure_connected(looper, still_connected, victim)
def test_get_last_ordered_timestamp_after_catchup(looper, txnPoolNodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_steward,
                                                  tconf, tdir,
                                                  allPluginsPath):
    """
    After the last node is stopped, misses one txn and catches up, the
    timestamp its master replica reads from the domain state must equal the
    time of the missed txn and differ from the time of the earlier txn.
    """
    lagging = txnPoolNodeSet[-1]

    # One txn that every node, including the future lagging one, orders.
    replies_before = sdk_send_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 1)
    reply_before = replies_before[0][1]

    looper.runFor(2)
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging)
    looper.removeProdable(name=lagging.name)

    # One txn ordered while the node is down.
    replies_missed = sdk_send_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 1)
    reply = replies_missed[0][1]

    lagging = start_stopped_node(lagging, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet[-1] = lagging
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, lagging, *txnPoolNodeSet[:-1])

    replica = lagging.master_replica
    ts_from_state = replica._get_last_timestamp_from_state(DOMAIN_LEDGER_ID)
    assert ts_from_state == get_txn_time(reply['result'])
    assert ts_from_state != get_txn_time(reply_before['result'])
def test_vc_by_current_state(txnPoolNodeSet, looper, tdir, tconf,
                             allPluginsPath):
    """
    Stop the last node, let the remaining nodes complete a view change,
    restart the node and check that its last completed view number equals
    the one reached by the rest of the pool.
    """
    stopped = txnPoolNodeSet[-1]
    view_no_before = stopped.view_changer.last_completed_view_no

    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, stopped, stopNode=True)
    looper.removeProdable(stopped)

    # View change happens among the remaining nodes only.
    ensure_view_change(looper, txnPoolNodeSet[:-1])
    ensureElectionsDone(looper, txnPoolNodeSet[:-1],
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)
    view_no_after = txnPoolNodeSet[0].view_changer.last_completed_view_no
    assert view_no_after > view_no_before

    restarted = start_stopped_node(
        stopped, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet[-1] = restarted
    ensureElectionsDone(looper, txnPoolNodeSet,
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)
    assert restarted.view_changer.last_completed_view_no == view_no_after
def test_idr_cache_update_after_catchup(txnPoolNodeSet, looper,
                                        sdk_pool_handle, sdk_wallet_steward,
                                        tconf, tdir, allPluginsPath):
    """
    Write a NYM while the last node is stopped; after the node is restarted
    and has caught up, the NYM must be readable from its domain state at the
    txn time and be present in its idr cache.
    """
    wallet_handle, identifier = sdk_wallet_steward
    run_sync = looper.loop.run_until_complete

    stopped = txnPoolNodeSet[-1]
    # The helper is given the node's name here, not the node object.
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, stopped.name, stopNode=True)
    looper.removeProdable(stopped)

    # Build, sign and submit a NYM request for a fresh identifier.
    idr, verkey = createHalfKeyIdentifierAndAbbrevVerkey()
    request = run_sync(build_nym_request(identifier, idr, verkey, None, None))
    req_signed = run_sync(sign_request(wallet_handle, identifier, request))
    raw_response = run_sync(submit_request(sdk_pool_handle, req_signed))
    result = json.loads(raw_response)

    restarted = start_stopped_node(
        stopped, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet[-1] = restarted
    waitNodeDataEquality(looper, restarted, *txnPoolNodeSet[:-1])

    req_handler = restarted.getDomainReqHandler()
    txn_time = get_txn_time(result['result'])
    root_hash = req_handler.ts_store.get_equal_or_prev(txn_time)
    key = domain.make_state_path_for_nym(idr)

    from_state = req_handler.state.get_for_root_hash(root_hash=root_hash,
                                                     key=key)
    assert from_state

    deserialized = req_handler.stateSerializer.deserialize(from_state)
    assert deserialized

    items_after = req_handler.idrCache.get(idr)
    assert items_after
def test_fill_ts_store_after_catchup(txnPoolNodeSet, looper,
                                     sdk_pool_handle, sdk_wallet_steward,
                                     tconf, tdir, allPluginsPath):
    """
    After the last node misses two txns and catches up, its timestamp store
    must hold a root hash for each missed txn, and the state at that root
    hash must contain the txn's 'amount'.
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward, 5)

    lagging = txnPoolNodeSet[-1]
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging)
    looper.removeProdable(name=lagging.name)

    # Txns ordered while the node is down.
    sdk_replies = sdk_send_random_and_check(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_steward, 2)

    lagging = start_stopped_node(lagging, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet[-1] = lagging
    looper.run(checkNodesConnected(txnPoolNodeSet))
    # The full pool list is passed, so the node is compared with itself too.
    waitNodeDataEquality(
        looper, lagging, *txnPoolNodeSet,
        exclude_from_check=['check_last_ordered_3pc_backup'])

    req_handler = lagging.read_manager.request_handlers[GET_BUY]
    ts_store = req_handler.database_manager.ts_store
    for reply in sdk_replies:
        txn = reply[1]['result']
        key = BuyHandler.prepare_buy_key(get_from(txn), get_req_id(txn))
        root_hash = ts_store.get_equal_or_prev(get_txn_time(txn))
        assert root_hash

        from_state = req_handler.state.get_for_root_hash(
            root_hash=root_hash, key=key)
        stored = domain_state_serializer.deserialize(from_state)
        assert stored['amount'] == get_payload_data(txn)['amount']
def test_restarted_node_complete_vc_by_current_state(looper, txnPoolNodeSet,
                                                     tconf, tdir,
                                                     allPluginsPath):
    """
    Stop the last node, complete one view change on the remaining nodes,
    start a second one that is held back by ``nv_delay``, then restart the
    node and check via ``complete_propagate_primary`` that it uses the
    pool's last completed view number rather than the proposed one.
    """
    stopped = txnPoolNodeSet[-1]
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, stopped, stopNode=True)
    looper.removeProdable(stopped)

    completed_before = get_last_completed_view_no(txnPoolNodeSet[:-1])
    print(completed_before)

    ensure_view_change(looper, txnPoolNodeSet[:-1])
    ensureElectionsDone(looper, txnPoolNodeSet[:-1],
                        customTimeout=tconf.NEW_VIEW_TIMEOUT)

    completed_now = get_last_completed_view_no(txnPoolNodeSet[:-1])
    assert completed_now > completed_before
    print(completed_now)

    # Hold back the messages matched by nv_delay on all running nodes, then
    # start another view change.
    # NOTE(review): the original comment said VIEW_CHANGE_DONE, but the
    # delayer used here is nv_delay - confirm which message is meant.
    for running in txnPoolNodeSet[:-1]:
        running.nodeIbStasher.delay(nv_delay(1000))
    ensure_view_change(looper, txnPoolNodeSet[:-1])

    # Restart the stopped node while the other nodes are in view change.
    restarted = start_stopped_node(
        stopped, looper, tconf, tdir, allPluginsPath)
    restarted.nodeIbStasher.delay(nv_delay(1000))

    # The restarted node must use the last completed view no from the pool,
    # not the proposed one.
    looper.run(eventually(complete_propagate_primary,
                          restarted,
                          completed_now,
                          timeout=tconf.NEW_VIEW_TIMEOUT))
def test_restarted_node_complete_vc_by_current_state(looper, txnPoolNodeSet,
                                                     tconf, tdir,
                                                     allPluginsPath):
    """
    Stop the last node, complete one view change on the remaining nodes,
    start a second one that is held back by ``vcd_delay``, then restart the
    node and check via ``complete_propagate_primary`` that it uses the
    pool's last completed view number rather than the proposed one.
    """
    stopped = txnPoolNodeSet[-1]
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, stopped, stopNode=True)
    looper.removeProdable(stopped)

    completed_before = get_last_completed_view_no(txnPoolNodeSet[:-1])

    ensure_view_change(looper, txnPoolNodeSet[:-1])
    ensureElectionsDone(looper, txnPoolNodeSet[:-1],
                        customTimeout=tconf.VIEW_CHANGE_TIMEOUT)

    completed_now = get_last_completed_view_no(txnPoolNodeSet[:-1])
    assert completed_now > completed_before

    # Delay VIEW_CHANGE_DONE messages on all running nodes, then start
    # another view change.
    for running in txnPoolNodeSet[:-1]:
        running.nodeIbStasher.delay(vcd_delay(1000))
    ensure_view_change(looper, txnPoolNodeSet[:-1])

    # Restart the stopped node while the other nodes are in view change.
    restarted = start_stopped_node(
        stopped, looper, tconf, tdir, allPluginsPath)
    restarted.nodeIbStasher.delay(vcd_delay(1000))

    # The restarted node must use the last completed view no from the pool,
    # not the proposed one.
    looper.run(eventually(complete_propagate_primary,
                          restarted,
                          completed_now,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))
def testNodeCatchupAfterLostConnection(newNodeCaughtUp, txnPoolNodeSet,
                                       nodeSetWithNodeAddedAfterSomeTxns):
    """
    Disconnect the added node (without stopping it), send requests so that
    it falls behind, make sure it is seen as disconnected, reconnect it and
    check that it catches up and stays equal after further requests.
    """
    looper, lagging, client, wallet, _, _ = nodeSetWithNodeAddedAfterSomeTxns

    logger.debug("Disconnecting node {}, ledger size {}".format(
        lagging, lagging.domainLedger.size))
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, lagging, stopNode=False)

    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)

    # The disconnected node must now differ from the rest of the pool.
    waitNodeDataInequality(looper, lagging, *txnPoolNodeSet[:-1])

    # NOTE(review): the source was whitespace-collapsed; only a
    # logger.debug("Ensure node {} gets disconnected") line is taken to be
    # commented out, and the call below is treated as live code - confirm.
    ensure_node_disconnected(looper, lagging, txnPoolNodeSet[:-1])

    logger.debug("Connecting the node {} back, ledger size {}".format(
        lagging, lagging.domainLedger.size))
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagging)

    logger.debug("Waiting for the node to catch up, {}".format(lagging))
    waitNodeDataEquality(looper, lagging, *txnPoolNodeSet[:-1])

    logger.debug("Sending more requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 10)
    checkNodeDataForEquality(lagging, *txnPoolNodeSet[:-1])
def test_catchup_with_one_slow_node(tdir, tconf, looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, allPluginsPath, logsearch): ''' 1. Stop the node Delta 2. Order 9 txns. In sending CatchupReq in a first round every node [Alpha, Beta, Gamma] will receive request for 3 txns. 3. Delay CatchupReq messages on Alpha 4. Start Delta 5. Check that all nodes have equality data. 6. Check that Delta re-ask CatchupRep only once. In the second CatchupRep (first re-ask) Delta shouldn't request CatchupRep from Alpha because it didn't answer early. If the behavior is wrong and Delta re-ask txns form all nodes, every node will receive request for 1 txns, Alpha will not answer and Delta will need a new re-ask round. ''' # Prepare nodes lagging_node = txnPoolNodeSet[-1] rest_nodes = txnPoolNodeSet[:-1] # Stop one node waitNodeDataEquality(looper, lagging_node, *rest_nodes) disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, lagging_node, stopNode=True) looper.removeProdable(lagging_node) # Send more requests to active nodes sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client, len(rest_nodes) * 3) waitNodeDataEquality(looper, *rest_nodes) # Restart stopped node and wait for successful catch up lagging_node = start_stopped_node( lagging_node, looper, tconf, tdir, allPluginsPath, start=False, ) log_re_ask, _ = logsearch( msgs=['requesting .* missing transactions after timeout']) old_re_ask_count = len(log_re_ask) # Delay CatchupRep messages on Alpha with delay_rules(rest_nodes[0].nodeIbStasher, cqDelay()): with delay_rules(lagging_node.nodeIbStasher, cs_delay()): looper.add(lagging_node) txnPoolNodeSet[-1] = lagging_node looper.run(checkNodesConnected(txnPoolNodeSet)) waitNodeDataEquality(looper, *txnPoolNodeSet, customTimeout=120) assert len( log_re_ask ) - old_re_ask_count == 2 # for audit and domain ledgers
def test_node_load_after_add_then_disconnect(sdk_new_node_caught_up,
                                             txnPoolNodeSet, tconf, looper,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             tdirWithPoolTxns,
                                             allPluginsPath, capsys):
    """
    Stop the added node, order 80 batches of 10 txns without it, start a
    fresh TestNode on the same addresses with catch-up replies delayed, and
    check that it starts syncing and finally equals the first four nodes.
    """
    stopped = sdk_new_node_caught_up
    with capsys.disabled():
        print("Stopping node {} with pool ledger size {}".
              format(stopped, stopped.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, stopped)
    looper.removeProdable(stopped)

    client_batches = 80
    txns_per_batch = 10
    for batch_no in range(client_batches):
        started_at = perf_counter()
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, txns_per_batch)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.
                  format(batch_no + 1, txns_per_batch,
                         perf_counter() - started_at))

    with capsys.disabled():
        print("Starting the stopped node, {}".format(stopped))

    # Reuse the node and client addresses of the stopped instance.
    node_ha = HA(*stopped.nodestack.ha)
    client_ha = HA(*stopped.clientstack.ha)
    fresh_node = TestNode(
        stopped.name,
        basedirpath=tdirWithPoolTxns,
        base_data_dir=tdirWithPoolTxns,
        config=tconf,
        ha=node_ha,
        cliha=client_ha,
        pluginPaths=allPluginsPath)
    looper.add(fresh_node)
    txnPoolNodeSet[-1] = fresh_node

    # Delay catchup reply processing so LedgerState does not change
    delay_catchup_reply = 5
    fresh_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(eventually(check_ledger_state, fresh_node, DOMAIN_LEDGER_ID,
                          LedgerState.syncing, retryWait=.5, timeout=5))

    # Not accurate timeout but a conservative one
    pool_ready_timeout = waits.expectedPoolGetReadyTimeout(
        len(txnPoolNodeSet))
    timeout = pool_ready_timeout + 2 * delay_catchup_reply
    waitNodeDataEquality(looper, fresh_node, *txnPoolNodeSet[:4],
                         customTimeout=timeout)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)
    waitNodeDataEquality(looper, fresh_node, *txnPoolNodeSet[:4])
def testNodeCatchupFPlusOne(looper, txnPoolNodeSet, sdk_pool_handle,
                            sdk_wallet_steward, tconf, tdir,
                            tdirWithPoolTxns, allPluginsPath, testNodeClass):
    """
    Check that f+1 nodes is enough for catchup: in a pool of 4, stop two
    nodes one after the other, then start the first stopped one again and
    verify it catches up from the two nodes that kept running.
    """
    assert len(txnPoolNodeSet) == 4

    second_stopped = txnPoolNodeSet[-1]
    first_stopped = txnPoolNodeSet[-2]

    logger.debug("Stopping node0 with pool ledger size {}".format(
        first_stopped.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, first_stopped, stopNode=True)
    looper.removeProdable(first_stopped)

    logger.debug("Sending requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    logger.debug("Stopping node1 with pool ledger size {}".format(
        second_stopped.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, second_stopped, stopNode=True)
    looper.removeProdable(second_stopped)

    # Make sure new node got out of sync
    # Excluding state check since the node is stopped hence the state db is
    # closed
    waitNodeDataInequality(looper, first_stopped, *txnPoolNodeSet[:-2],
                           exclude_from_check=['check_state'])

    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Starting the stopped node0")
    node_ha = HA(*first_stopped.nodestack.ha)
    client_ha = HA(*first_stopped.clientstack.ha)
    config_helper = PNodeConfigHelper(first_stopped.name, tconf, chroot=tdir)
    revived = testNodeClass(first_stopped.name,
                            config_helper=config_helper,
                            ha=node_ha, cliha=client_ha,
                            config=tconf, pluginPaths=allPluginsPath)
    looper.add(revived)

    logger.debug("Waiting for the node0 to catch up")
    waitNodeDataEquality(looper, revived, *txnPoolNodeSet[:-2])

    logger.debug("Sending more requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2)
    checkNodeDataForEquality(revived, *txnPoolNodeSet[:-2])
def test_selection_f_plus_one_quorum(looper, txnPoolNodeSet, allPluginsPath,
                                     tdir, tconf, sdk_pool_handle,
                                     sdk_wallet_client):
    """
    Check that quorum f + 1 is used for primary selection
    when initiated by CurrentState messages.

    Assumes that view change quorum is n - f.
    Assumes that primaries selection in round robin fashion.
    """
    # Ensure that we have 4 nodes in total
    all_nodes = list(txnPoolNodeSet)
    assert 4 == len(all_nodes)

    alpha, beta, delta, gamma = all_nodes
    initial_view_no = alpha.viewNo

    # Make one node lagging by switching it off for some time
    lagging_node = gamma
    non_lagging_nodes = [alpha, beta, delta]
    disconnect_node_and_ensure_disconnected(looper, all_nodes, lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Make nodes to perform view change
    ensure_view_change(looper, non_lagging_nodes)
    ensureElectionsDone(looper=looper,
                        nodes=non_lagging_nodes,
                        instances_list=range(2))
    ensure_all_nodes_have_same_data(looper, nodes=non_lagging_nodes)

    # Stop two more of active nodes
    # (but not primary, which is Beta (because of round robin selection))
    stopped_nodes = [alpha]  # TODO: add one more here
    for to_stop in stopped_nodes:
        disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                                to_stop, stopNode=True)
        looper.removeProdable(to_stop)

    # Start lagging node back
    restarted_node = start_stopped_node(lagging_node, looper, tconf, tdir,
                                        allPluginsPath)
    active_nodes = [beta, delta, restarted_node]

    # Check that primary selected
    expected_view_no = initial_view_no + 1
    ensureElectionsDone(looper=looper,
                        nodes=active_nodes,
                        instances_list=range(2),
                        customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
def test_catchup_with_ledger_statuses_in_old_format_from_one_node(
        txnPoolNodeSet, looper, sdk_pool_handle, sdk_wallet_steward,
        tconf, tdir, allPluginsPath):
    """
    One pool node is patched to answer with ledger statuses in the old
    format (without `protocolVersion`). The last node is then restarted;
    it must finish catch-up, discard the old-format statuses and keep
    taking part in ordering of further transactions.
    """
    node_to_restart = txnPoolNodeSet[-1]
    other_nodes = txnPoolNodeSet[:-1]
    old_node = txnPoolNodeSet[0]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    # Keep the real implementation; the replacement below delegates to it
    original_get_ledger_status = old_node.getLedgerStatus

    # Patch the method getLedgerStatus to
    # get_ledger_status_without_protocol_version for sending ledger status
    # in old format (without `protocolVersion`)
    def get_ledger_status_without_protocol_version(ledgerId: int):
        status = original_get_ledger_status(ledgerId)
        return LedgerStatusInOldFormat(status.ledgerId,
                                       status.txnSeqNo,
                                       status.viewNo,
                                       status.ppSeqNo,
                                       status.merkleRoot)

    old_node.getLedgerStatus = get_ledger_status_without_protocol_version

    # restart node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_restart)
    looper.removeProdable(name=node_to_restart.name)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2)

    # add `node_to_restart` to pool
    node_to_restart = start_stopped_node(node_to_restart, looper, tconf,
                                         tdir, allPluginsPath)
    txnPoolNodeSet[-1] = node_to_restart
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Verify that `node_to_restart` successfully completes catch-up
    waitNodeDataEquality(looper, node_to_restart, *other_nodes)

    # check discarding ledger statuses from `old_node` for all ledgers
    discarded = countDiscarded(node_to_restart,
                               'replied message has invalid structure')
    assert discarded >= 3

    # Verify that `node_to_restart` participates in ordering
    # of further transactions
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)
    waitNodeDataEquality(looper, node_to_restart, *other_nodes)
def test_node_requests_missing_three_phase_messages(looper, txnPoolNodeSet,
                                                    sdk_wallet_client,
                                                    sdk_pool_handle):
    """
    2 of 4 nodes go down, so pool can not process any more incoming requests.
    A new request comes in. After a while those 2 nodes come back alive.
    Another request comes in. Check that previously disconnected two nodes
    request missing PREPARES and PREPREPARES and the pool successfully handles
    both transactions after that.
    """
    INIT_REQS_CNT = 10
    MISSING_REQS_CNT = 1
    REQS_AFTER_RECONNECT_CNT = 1

    alive_nodes = txnPoolNodeSet[:2]
    disconnected_nodes = txnPoolNodeSet[2:]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node,
                                                stopNode=False)

    # Sent without waiting for replies: only half of the pool is reachable
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)

    def check_pp_out_of_sync(alive_nodes, disconnected_nodes):
        # Each group must agree internally on the master replica's last
        # PRE-PREPARE, while the two groups must differ from each other.
        def last_pp(node):
            return node.replicas._master_replica.lastPrePrepare

        alive_pps = [last_pp(node) for node in alive_nodes]
        assert all(pp == alive_pps[0] for pp in alive_pps[1:])

        gone_pps = [last_pp(node) for node in disconnected_nodes]
        assert gone_pps[0] != alive_pps[0]
        assert all(pp == gone_pps[0] for pp in gone_pps[1:])

    looper.run(
        eventually(check_pp_out_of_sync,
                   alive_nodes,
                   disconnected_nodes,
                   retryWait=1,
                   timeout=expectedPoolGetReadyTimeout(len(txnPoolNodeSet))))

    for node in disconnected_nodes:
        reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0],
                         *txnPoolNodeSet[:-1])

    # Both the request sent while disconnected and the one sent after
    # reconnection must have landed in every node's domain ledger
    expected_size = (init_ledger_size +
                     MISSING_REQS_CNT +
                     REQS_AFTER_RECONNECT_CNT)
    for node in txnPoolNodeSet:
        assert node.domainLedger.size == expected_size
def stop_primary(looper, active_nodes):
    """
    Stop the first node of `active_nodes` and detach it from the looper.

    The helper does not look up the primary itself; callers are presumably
    expected to pass the primary as the first element.

    :return: tuple of (stopped node, remaining nodes)
    """
    target = active_nodes[0]
    disconnect_node_and_ensure_disconnected(looper, active_nodes, target,
                                            stopNode=True)
    looper.removeProdable(target)
    return target, active_nodes[1:]
def stop_nodes(looper, nodeSet):
    """
    Disconnect the last `faultyNodes` nodes by rank (without stopping them)
    and remove them from the looper.

    Asserts beforehand that none of their replicas is a primary.

    :return: adict with the disconnected nodes under `faulties`
    """
    faulties = nodeSet.nodes_by_rank[-faultyNodes:]
    for faulty in faulties:
        # Only non-primary nodes may be taken out
        for replica in node_replicas(faulty) if False else faulty.replicas:
            assert not replica.isPrimary
        disconnect_node_and_ensure_disconnected(looper, nodeSet, faulty,
                                                stopNode=False)
        looper.removeProdable(faulty)
    return adict(faulties=faulties)
def stop_primary(looper, active_nodes):
    """
    Stop the node at the head of `active_nodes` and take it off the looper.

    Which node is the primary is not checked here; the first element is
    presumably the primary by caller convention.

    :return: (stopped_node, active_nodes without its first element)
    """
    head = active_nodes[0]
    disconnect_node_and_ensure_disconnected(looper, active_nodes, head,
                                            stopNode=True)
    looper.removeProdable(head)
    rest = active_nodes[1:]
    return head, rest
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client,
                                                     tdir, tconf,
                                                     allPluginsPath):
    """
    View change occurs when master's primary is disconnected.

    After the old primary is started again it must reach the new view
    without starting a view change of its own.
    """
    # Setup
    nodes = txnPoolNodeSet
    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)

    remaining_nodes = list(set(nodes) - {old_pr_node})

    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf, tdir,
                                     allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(
        eventually(checkViewNoForNodes,
                   txnPoolNodeSet,
                   old_view_no + 1,
                   timeout=tconf.VIEW_CHANGE_TIMEOUT))

    # After node catches up it set view_no from audit ledger and do not need to do view_change
    started_view_changes = getAllReturnVals(
        old_pr_node.view_changer,
        old_pr_node.view_changer.start_view_change,
        compare_val_to=True)
    assert len(started_view_changes) == 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    assert not old_pr_node.view_changer._next_view_indications
def start_stop_one_node(node_to_restart, pool_of_nodes):
    """
    Restart one node and wait until it has caught up with the pool.

    NOTE: `looper`, `tconf`, `tdir`, `allPluginsPath`, `txnPoolNodeSet`,
    `sdk_pool_handle` and `sdk_wallet_steward` are not parameters; they are
    taken from an enclosing scope that is not visible here.

    :param node_to_restart: node, which would be restarted
    :param pool_of_nodes: current pool
    :return: new pool with restarted node

    Steps:
    1. Stop the node and remove it from the looper
    2. Check the remaining nodes agree on data, then send one request
    3. Start the node again with instance change messages delayed
    4. Check the whole pool is connected and has the same data
    5. Wait until the domain and pool ledgers of the restarted node are
       synced and the `catchuped` check passes
    """
    remaining_nodes = list(set(pool_of_nodes) - {node_to_restart})

    disconnect_node_and_ensure_disconnected(looper, pool_of_nodes,
                                            node_to_restart, stopNode=True)
    looper.removeProdable(node_to_restart)
    ensure_all_nodes_have_same_data(looper, remaining_nodes,
                                    custom_timeout=tconf.NEW_VIEW_TIMEOUT)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 1)

    node_to_restart = start_stopped_node(node_to_restart, looper, tconf,
                                         tdir, allPluginsPath,
                                         delay_instance_change_msgs=True)
    pool_of_nodes = remaining_nodes + [node_to_restart]
    looper.run(checkNodesConnected(pool_of_nodes))
    ensure_all_nodes_have_same_data(
        looper, pool_of_nodes,
        custom_timeout=tconf.NEW_VIEW_TIMEOUT,
        exclude_from_check=['check_last_ordered_3pc_backup'])

    timeout = waits.expectedPoolCatchupTime(nodeCount=len(pool_of_nodes))
    # Domain ledger first, then pool ledger, as two separate waits
    for ledger_id in (DOMAIN_LEDGER_ID, POOL_LEDGER_ID):
        looper.run(
            eventually(check_ledger_state, node_to_restart, ledger_id,
                       LedgerState.synced, retryWait=.5, timeout=timeout))
    looper.run(eventually(catchuped, node_to_restart, timeout=2 * timeout))
    return pool_of_nodes
def test_catchup_with_lost_first_consistency_proofs(txnPoolNodeSet,
                                                    looper,
                                                    sdk_pool_handle,
                                                    sdk_wallet_steward,
                                                    tconf,
                                                    tdir,
                                                    allPluginsPath,
                                                    monkeypatch,
                                                    lost_count):
    """
    Skip processing of first lost_count CONSISTENCY_PROOFs in catchup.

    In this case catchup node has no quorum with f+1 CONSISTENCY_PROOFs for
    the longer transactions list. It need to request CONSISTENCY_PROOFs
    again and finishes catchup.
    Test makes sure that the node eventually finishes catchup.
    """
    node_to_disconnect = txnPoolNodeSet[-1]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    # restart node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_disconnect)
    looper.removeProdable(name=node_to_disconnect.name)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2)

    nodeHa, nodeCHa = HA(*node_to_disconnect.nodestack.ha), HA(
        *node_to_disconnect.clientstack.ha)
    config_helper = PNodeConfigHelper(node_to_disconnect.name, tconf,
                                      chroot=tdir)
    node_to_disconnect = TestNode(node_to_disconnect.name,
                                  config_helper=config_helper,
                                  config=tconf,
                                  ha=nodeHa,
                                  cliha=nodeCHa,
                                  pluginPaths=allPluginsPath)

    # The counter is local to this test run, so a run that never reaches
    # lost_count cannot leak its count into the next parametrized run.
    dropped = 0

    def unpatch_after_call(proof, frm):
        # Drop the message instead of processing it
        nonlocal dropped
        dropped += 1
        if dropped >= lost_count:
            # unpatch processConsistencyProof after lost_count calls
            node_to_disconnect.nodeMsgRouter.add(
                (ConsistencyProof,
                 node_to_disconnect.ledgerManager.processConsistencyProof))

    # patch processConsistencyProof
    node_to_disconnect.nodeMsgRouter.add((ConsistencyProof,
                                          unpatch_after_call))

    # add node_to_disconnect to pool
    looper.add(node_to_disconnect)
    txnPoolNodeSet[-1] = node_to_disconnect
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, node_to_disconnect, *txnPoolNodeSet,
                         exclude_from_check=['check_last_ordered_3pc_backup'])
def test_view_not_changed_when_short_disconnection(txnPoolNodeSet, looper,
                                                   sdk_pool_handle,
                                                   sdk_wallet_client,
                                                   tdir, tconf,
                                                   allPluginsPath):
    """
    When primary is disconnected but not long enough to trigger the timeout,
    view change should not happen.

    The non-primary nodes must propose a view change but must not process
    any new instance change message, and the view number must stay as is.
    """
    pr_node = get_master_primary_node(txnPoolNodeSet)
    view_no = checkViewNoForNodes(txnPoolNodeSet)

    prp_inst_chg_calls = {
        node.name: node.spylog.count(node.propose_view_change.__name__)
        for node in txnPoolNodeSet if node != pr_node
    }

    # Baseline is read from view_changer.spylog, the same log chk2 reads
    # below; the method belongs to view_changer, not to the node.
    recv_inst_chg_calls = {
        node.name: node.view_changer.spylog.count(
            node.view_changer.process_instance_change_msg.__name__)
        for node in txnPoolNodeSet if node != pr_node
    }

    # Disconnect master's primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, pr_node,
                                            timeout=2)
    txnPoolNodeSet.remove(pr_node)
    looper.removeProdable(name=pr_node.name)

    timeout = min(tconf.ToleratePrimaryDisconnection - 1, 1)

    # Reconnect master's primary
    pr_node = start_stopped_node(pr_node, looper, tconf, tdir,
                                 allPluginsPath)
    txnPoolNodeSet.append(pr_node)

    def chk2():
        # Schedule an instance change but do not send it
        # since primary joins again
        for node in txnPoolNodeSet:
            if node != pr_node:
                assert node.spylog.count(
                    node.propose_view_change.__name__
                ) > prp_inst_chg_calls[node.name]
                assert node.view_changer.spylog.count(
                    node.view_changer.process_instance_change_msg.__name__
                ) == recv_inst_chg_calls[node.name]

    looper.run(eventually(chk2, retryWait=.2, timeout=timeout + 1))

    assert checkViewNoForNodes(txnPoolNodeSet) == view_no

    # Send some requests and make sure the request execute
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)
def test_catchup_with_lost_first_consistency_proofs(txnPoolNodeSet,
                                                    looper,
                                                    sdk_pool_handle,
                                                    sdk_wallet_steward,
                                                    tconf,
                                                    tdir,
                                                    allPluginsPath,
                                                    monkeypatch,
                                                    lost_count):
    """
    Skip processing of first lost_count CONSISTENCY_PROOFs in catchup.

    In this case catchup node has no quorum with f+1 CONSISTENCY_PROOFs for
    the longer transactions list. It need to request CONSISTENCY_PROOFs
    again and finishes catchup.
    Test makes sure that the node eventually finishes catchup.
    """
    node_to_disconnect = txnPoolNodeSet[-1]

    # The counter is local to this test run, so a run that never reaches
    # lost_count cannot leak its count into the next parametrized run.
    dropped = 0

    def unpatch_after_call(proof, frm):
        # Drop the message instead of processing it
        nonlocal dropped
        dropped += 1
        if dropped >= lost_count:
            # unpatch processConsistencyProof after lost_count calls
            monkeypatch.undo()

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    # restart node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_disconnect)
    looper.removeProdable(name=node_to_disconnect.name)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2)

    nodeHa, nodeCHa = HA(*node_to_disconnect.nodestack.ha), HA(
        *node_to_disconnect.clientstack.ha)
    config_helper = PNodeConfigHelper(node_to_disconnect.name, tconf,
                                      chroot=tdir)
    node_to_disconnect = TestNode(node_to_disconnect.name,
                                  config_helper=config_helper,
                                  config=tconf,
                                  ha=nodeHa,
                                  cliha=nodeCHa,
                                  pluginPaths=allPluginsPath)

    # patch processConsistencyProof
    monkeypatch.setattr(node_to_disconnect.ledgerManager,
                        'processConsistencyProof',
                        unpatch_after_call)

    # add node_to_disconnect to pool
    looper.add(node_to_disconnect)
    txnPoolNodeSet[-1] = node_to_disconnect
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, node_to_disconnect, *txnPoolNodeSet)
def testViewChangesIfMasterPrimaryDisconnected(txnPoolNodeSet, looper,
                                               wallet1, client1,
                                               client1Connected, tconf,
                                               tdirWithPoolTxns,
                                               allPluginsPath):
    """
    View change occurs when master's primary is disconnected.

    After the old primary is started again it must reach the new view,
    having attempted a view change at least once.
    """
    # Setup
    nodes = txnPoolNodeSet
    old_view_no = checkViewNoForNodes(nodes)
    old_pr_node = get_master_primary_node(nodes)

    # Stop primary
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_pr_node, stopNode=True)
    looper.removeProdable(old_pr_node)

    remaining_nodes = list(set(nodes) - {old_pr_node})

    # Sometimes it takes time for nodes to detect disconnection
    ensure_node_disconnected(looper, old_pr_node, remaining_nodes, timeout=20)
    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # Give some time to detect disconnection and then verify that view has
    # changed and new primary has been elected
    waitForViewChange(looper, remaining_nodes, old_view_no + 1)
    ensure_all_nodes_have_same_data(looper, nodes=remaining_nodes)
    new_pr_node = get_master_primary_node(remaining_nodes)
    assert old_pr_node != new_pr_node

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 5)

    # Check if old primary can join the pool and still functions
    old_pr_node = start_stopped_node(old_pr_node, looper, tconf,
                                     tdirWithPoolTxns, allPluginsPath)
    txnPoolNodeSet = remaining_nodes + [old_pr_node]
    looper.run(
        eventually(checkViewNoForNodes,
                   txnPoolNodeSet,
                   old_view_no + 1,
                   timeout=10))

    view_change_attempts = getAllReturnVals(
        old_pr_node,
        old_pr_node._start_view_change_if_possible,
        compare_val_to=True)
    assert len(view_change_attempts) > 0

    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)
    assert not old_pr_node._next_view_indications
def restart_node(node, looper, txnPoolNodeSet, tdir, tconf, allPluginsPath,
                 timeout):
    """
    Take `node` out of the pool, let the looper run for `timeout`, then
    start the node again and append it to `txnPoolNodeSet`.

    The pool list is modified in place: the old instance is removed and
    the restarted one is appended at the end.

    :return: the restarted node instance
    """
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, node)
    txnPoolNodeSet.remove(node)
    looper.removeProdable(name=node.name)

    # Keep the node down for the requested time
    looper.runFor(timeout)

    restarted = start_stopped_node(node, looper, tconf, tdir, allPluginsPath)
    txnPoolNodeSet.append(restarted)
    return restarted
def testNodeCatchupFPlusOne(looper, txnPoolNodeSet, sdk_pool_handle,
                            sdk_wallet_steward, tconf, tdir,
                            tdirWithPoolTxns, allPluginsPath, testNodeClass):
    """
    Check that f+1 nodes is enough for catchup.

    The last two of four nodes are stopped, with requests ordered between
    the two stops. The node stopped first is then re-created and has to
    catch up while only the first two nodes are running.
    """
    assert len(txnPoolNodeSet) == 4

    second_down = txnPoolNodeSet[-1]
    first_down = txnPoolNodeSet[-2]

    logger.debug("Stopping node0 with pool ledger size {}".
                 format(first_down.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, first_down, stopNode=True)
    looper.removeProdable(first_down)

    logger.debug("Sending requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    logger.debug("Stopping node1 with pool ledger size {}".
                 format(second_down.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(
        looper, txnPoolNodeSet, second_down, stopNode=True)
    looper.removeProdable(second_down)

    # Make sure new node got out of sync
    # Excluding state check since the node is stopped hence the state db is closed
    waitNodeDataInequality(looper, first_down, *txnPoolNodeSet[:-2],
                           exclude_from_check=['check_state'])

    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Starting the stopped node0")
    node_ha = HA(*first_down.nodestack.ha)
    client_ha = HA(*first_down.clientstack.ha)
    config_helper = PNodeConfigHelper(first_down.name, tconf, chroot=tdir)
    back_up = testNodeClass(first_down.name,
                            config_helper=config_helper,
                            ha=node_ha,
                            cliha=client_ha,
                            config=tconf,
                            pluginPaths=allPluginsPath)
    looper.add(back_up)

    logger.debug("Waiting for the node0 to catch up")
    waitNodeDataEquality(looper, back_up, *txnPoolNodeSet[:-2])

    logger.debug("Sending more requests")
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2)
    checkNodeDataForEquality(back_up, *txnPoolNodeSet[:-2])
def test_catchup_with_lost_ledger_status(txnPoolNodeSet,
                                         looper,
                                         sdk_pool_handle,
                                         sdk_wallet_steward,
                                         tconf,
                                         tdir,
                                         allPluginsPath,
                                         monkeypatch,
                                         lost_count):
    """
    Skip processing of lost_count Message Responses with LEDGER STATUS
    in catchup; test makes sure that the node eventually finishes catchup.
    """
    node_to_disconnect = txnPoolNodeSet[-1]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    # restart node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_disconnect)
    looper.removeProdable(name=node_to_disconnect.name)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2)

    nodeHa, nodeCHa = HA(*node_to_disconnect.nodestack.ha), HA(
        *node_to_disconnect.clientstack.ha)
    config_helper = PNodeConfigHelper(node_to_disconnect.name, tconf,
                                      chroot=tdir)
    node_to_disconnect = TestNode(node_to_disconnect.name,
                                  config_helper=config_helper,
                                  config=tconf,
                                  ha=nodeHa,
                                  cliha=nodeCHa,
                                  pluginPaths=allPluginsPath)

    # The counter is local to this test run, so a run that never reaches
    # lost_count cannot leak its count into the next parametrized run.
    dropped = 0

    def unpatch_after_call(status, frm):
        # Drop the message instead of processing it
        nonlocal dropped
        dropped += 1
        if dropped >= lost_count:
            # unpatch processLedgerStatus after lost_count calls
            node_to_disconnect.nodeMsgRouter.add(
                (LedgerStatus,
                 node_to_disconnect.ledgerManager.processLedgerStatus))

    # patch processLedgerStatus
    node_to_disconnect.nodeMsgRouter.add((LedgerStatus, unpatch_after_call))

    # add node_to_disconnect to pool
    looper.add(node_to_disconnect)
    txnPoolNodeSet[-1] = node_to_disconnect
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, node_to_disconnect, *txnPoolNodeSet,
                         exclude_from_check=['check_last_ordered_3pc_backup'])
def testNodeCatchupFPlusOne(txnPoolNodeSet, poolAfterSomeTxns, tconf, tdir,
                            tdirWithPoolTxns, allPluginsPath, testNodeClass):
    """
    Check that f+1 nodes is enough for catchup.

    The last two of four nodes are stopped, with requests sent between the
    two stops. The node stopped first is then re-created and has to reach
    the same data as the first two nodes.
    """
    looper, client, wallet = poolAfterSomeTxns

    assert len(txnPoolNodeSet) == 4

    second_down = txnPoolNodeSet[-1]
    first_down = txnPoolNodeSet[-2]

    logger.debug("Stopping node0 with pool ledger size {}".format(
        first_down.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            first_down, stopNode=True)
    looper.removeProdable(first_down)

    logger.debug("Sending requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 5)

    logger.debug("Stopping node1 with pool ledger size {}".format(
        second_down.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            second_down, stopNode=True)
    looper.removeProdable(second_down)

    # Make sure new node got out of sync
    waitNodeDataInequality(looper, first_down, *txnPoolNodeSet[:-2])

    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Starting the stopped node0")
    node_ha = HA(*first_down.nodestack.ha)
    client_ha = HA(*first_down.clientstack.ha)
    config_helper = PNodeConfigHelper(first_down.name, tconf, chroot=tdir)
    back_up = testNodeClass(first_down.name,
                            config_helper=config_helper,
                            ha=node_ha,
                            cliha=client_ha,
                            config=tconf,
                            pluginPaths=allPluginsPath)
    looper.add(back_up)

    logger.debug("Waiting for the node0 to catch up")
    waitNodeDataEquality(looper, back_up, *txnPoolNodeSet[:-2])

    logger.debug("Sending more requests")
    sendReqsToNodesAndVerifySuffReplies(looper, wallet, client, 2)
    checkNodeDataForEquality(back_up, *txnPoolNodeSet[:-2])
def scenario_txns_during_catchup(
    looper, tconf, tdir, allPluginsPath, do_post_node_creation, nodes,
    send_txns
):
    """
    Run a scenario where transactions are sent while the last node of
    `nodes` is down, while it is catching up, and after it has caught up.

    `send_txns` is a no-argument callable that sends the transactions;
    `do_post_node_creation` is called with the re-created node before it
    is added to the looper. `nodes[-1]` is replaced in place by the
    re-created node.
    """
    lagging_node = nodes[-1]
    rest_nodes = nodes[:-1]

    # Stop NodeX
    lagging_node.cleanupOnStopping = False
    disconnect_node_and_ensure_disconnected(looper, nodes, lagging_node.name,
                                            stopNode=True)
    looper.removeProdable(name=lagging_node.name)

    # Send transactions
    send_txns()
    ensure_all_nodes_have_same_data(looper, rest_nodes)

    # Start NodeX (created but not yet added to the looper)
    lagging_node = start_stopped_node(
        lagging_node, looper, tconf, tdir, allPluginsPath, start=False
    )
    do_post_node_creation(lagging_node)
    for txn_type in (XFER_PUBLIC, NYM):
        HelperNode.fill_auth_map_for_node(lagging_node, txn_type)
    nodes[-1] = lagging_node

    # Delay CathupRep for DOMAIN ledger for NodeX
    domain_catchup_delay = cr_delay(ledger_filter=DOMAIN_LEDGER_ID)
    with delay_rules(lagging_node.nodeIbStasher, domain_catchup_delay):
        # allow started node to receive looper events
        looper.add(lagging_node)
        # ensure it connected to others
        looper.run(checkNodesConnected(nodes))
        # Send transactions
        send_txns()
        ensure_all_nodes_have_same_data(looper, rest_nodes)

    # Reset delays
    # Make sure that all nodes have equal state
    ensure_all_nodes_have_same_data(looper, nodes)

    # Send transactions
    send_txns()
    ensure_all_nodes_have_same_data(looper, rest_nodes)
def test_propagate_primary_after_primary_restart_view_1(
        looper, txnPoolNodeSet, tconf, sdk_pool_handle, sdk_wallet_steward,
        tdir, allPluginsPath):
    """
    Restart the master primary while instance change messages are delayed,
    so that no view change happens during the restart, and check that the
    view number and the primary stay the same afterwards.

    ppSeqNo should be > 0 to be able to check that propagate primary
    restores all indices correctly.
    Case viewNo > 0.
    """
    ensure_view_change(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, expectedViewNo=1)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)

    old_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert old_ppseqno > 0

    old_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    old_primary = get_master_primary_node(txnPoolNodeSet)

    delay_instance_change(txnPoolNodeSet, IC_DELAY_SEC)

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            old_primary, stopNode=True)
    looper.removeProdable(old_primary)

    logger.info("Restart node {}".format(old_primary))

    restartedNode = start_stopped_node(old_primary, looper, tconf, tdir,
                                       allPluginsPath,
                                       delay_instance_change_msgs=False)

    # Put the new instance into the slot of the node with the same name
    matching_positions = [pos for pos, member in enumerate(txnPoolNodeSet)
                          if member.name == restartedNode.name]
    txnPoolNodeSet[matching_positions[0]] = restartedNode

    restartedNode.nodeIbStasher.delay(icDelay(IC_DELAY_SEC))

    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    new_viewNo = checkViewNoForNodes(txnPoolNodeSet)
    assert new_viewNo == old_viewNo

    new_primary = get_master_primary_node(txnPoolNodeSet)
    assert new_primary.name == old_primary.name

    # check ppSeqNo the same
    _get_ppseqno(txnPoolNodeSet)

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_steward,
                               sdk_pool_handle)

    new_ppseqno = _get_ppseqno(txnPoolNodeSet)
    assert new_ppseqno > old_ppseqno
def test_catchup_with_lost_ledger_status(txnPoolNodeSet,
                                         looper,
                                         sdk_pool_handle,
                                         sdk_wallet_steward,
                                         tconf,
                                         tdir,
                                         allPluginsPath,
                                         monkeypatch,
                                         lost_count):
    """
    Skip processing of lost_count Message Responses with LEDGER STATUS
    in catchup; test makes sure that the node eventually finishes catchup.
    """
    node_to_disconnect = txnPoolNodeSet[-1]

    # The counter is local to this test run, so a run that never reaches
    # lost_count cannot leak its count into the next parametrized run.
    dropped = 0

    def unpatch_after_call(status, frm):
        # Drop the message instead of processing it
        nonlocal dropped
        dropped += 1
        if dropped >= lost_count:
            # unpatch processLedgerStatus after lost_count calls
            monkeypatch.undo()

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 5)

    # restart node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_disconnect)
    looper.removeProdable(name=node_to_disconnect.name)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_steward, 2)

    nodeHa, nodeCHa = HA(*node_to_disconnect.nodestack.ha), HA(
        *node_to_disconnect.clientstack.ha)
    config_helper = PNodeConfigHelper(node_to_disconnect.name, tconf,
                                      chroot=tdir)
    node_to_disconnect = TestNode(node_to_disconnect.name,
                                  config_helper=config_helper,
                                  config=tconf,
                                  ha=nodeHa,
                                  cliha=nodeCHa,
                                  pluginPaths=allPluginsPath)

    # patch processLedgerStatus
    monkeypatch.setattr(node_to_disconnect.ledgerManager,
                        'processLedgerStatus',
                        unpatch_after_call)

    # add node_to_disconnect to pool
    looper.add(node_to_disconnect)
    txnPoolNodeSet[-1] = node_to_disconnect
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, node_to_disconnect, *txnPoolNodeSet)
def ensure_view_change_by_primary_restart(looper, nodes, tconf,
                                          tdirWithPoolTxns, allPluginsPath,
                                          customTimeout=None,
                                          exclude_from_check=None):
    """
    Force a view change by stopping the current master primary, waiting
    until the remaining nodes reach the next view, and starting the
    ex-primary again.

    :param customTimeout: timeout for the view number check; when falsy,
        the expected view change start timeout plus the configured
        ToleratePrimaryDisconnection is used
    :param exclude_from_check: passed through to the final data equality
        check
    :return: new list of nodes (remaining nodes followed by the restarted
        node)
    """
    old_view_no = checkViewNoForNodes(nodes)
    master_primaries = [node for node in nodes if node.has_master_primary]
    primaryNode = master_primaries[0]

    logger.debug("Disconnect current primary node {} from others, "
                 "current viewNo {}".format(primaryNode, old_view_no))
    disconnect_node_and_ensure_disconnected(looper, nodes, primaryNode,
                                            stopNode=True)
    looper.removeProdable(primaryNode)
    remainingNodes = list(set(nodes) - {primaryNode})

    next_view_no = old_view_no + 1
    logger.debug("Waiting for viewNo {} for nodes {}"
                 "".format(next_view_no, remainingNodes))
    timeout = customTimeout
    if not timeout:
        timeout = (
            waits.expectedPoolViewChangeStartedTimeout(len(remainingNodes))
            + nodes[0].config.ToleratePrimaryDisconnection)
    looper.run(
        eventually(checkViewNoForNodes, remainingNodes, next_view_no,
                   retryWait=1, timeout=timeout))

    logger.debug("Starting stopped ex-primary {}".format(primaryNode))
    restartedNode = start_stopped_node(primaryNode, looper, tconf,
                                       tdirWithPoolTxns, allPluginsPath,
                                       delay_instance_change_msgs=False)
    nodes = remainingNodes + [restartedNode]

    logger.debug("Ensure all nodes are connected")
    looper.run(checkNodesConnected(nodes))

    logger.debug("Ensure all nodes have the same data")
    ensure_all_nodes_have_same_data(looper, nodes=nodes,
                                    exclude_from_check=exclude_from_check)
    return nodes
def restart_node(looper, txnPoolNodeSet, node_to_disconnect, tconf, tdir,
                 allPluginsPath):
    """
    Restart `node_to_disconnect` and wait until it has the same data as
    the pool.

    The restarted instance takes the position of the old one in
    `txnPoolNodeSet` (the list is modified in place). Nothing is returned.
    """
    position = txnPoolNodeSet.index(node_to_disconnect)

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_disconnect)
    looper.removeProdable(name=node_to_disconnect.name)

    # add node_to_disconnect to pool
    restarted = start_stopped_node(node_to_disconnect, looper, tconf, tdir,
                                   allPluginsPath)
    txnPoolNodeSet[position] = restarted
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, restarted, *txnPoolNodeSet)
def _restart_node(looper, txnPoolNodeSet, node_to_disconnect, tconf, tdir,
                  allPluginsPath):
    """
    Disconnect `node_to_disconnect`, start it again, put the new instance
    into the same slot of `txnPoolNodeSet` and wait for the pool to be
    connected and for the node's data to equal the pool's.
    """
    slot = txnPoolNodeSet.index(node_to_disconnect)

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            node_to_disconnect)
    looper.removeProdable(name=node_to_disconnect.name)

    # add node_to_disconnect to pool
    fresh_node = start_stopped_node(node_to_disconnect, looper, tconf, tdir,
                                    allPluginsPath)
    txnPoolNodeSet[slot] = fresh_node
    looper.run(checkNodesConnected(txnPoolNodeSet))
    waitNodeDataEquality(looper, fresh_node, *txnPoolNodeSet)
def testNodesComingUpAtDifferentTimes(allPluginsPath, tconf, tdir,
                                      tdir_for_func, tconf_for_func,
                                      looper, txnPoolNodeSetNotStarted):
    """
    Start the nodes one by one with a random pause after each, check they
    all connect, stop them all, then restart them one by one with random
    pauses and check they all connect again.
    """
    console = getConsole()
    console.reinit(flushy=True, verbosity=console.Wordage.verbose)

    nodes = txnPoolNodeSetNotStarted

    names = list(node.name for node in nodes)
    shuffle(names)

    # Random pauses (seconds) for the first start and for the restart
    start_pauses = [randint(1, 10) for _ in names]
    restart_pauses = [randint(1, 10) for _ in names]

    for node in nodes:
        tellKeysToOthers(node, nodes)

    for position, node in enumerate(nodes):
        looper.add(node)
        looper.runFor(start_pauses[position])
    looper.run(checkNodesConnected(nodes))
    logger.debug("connects")
    logger.debug("node order: {}".format(names))
    logger.debug("waits: {}".format(start_pauses))

    # Take every node down, shrinking the set of running nodes as we go
    current_node_set = set(nodes)
    for node in nodes:
        disconnect_node_and_ensure_disconnected(looper, current_node_set,
                                                node, timeout=len(nodes),
                                                stopNode=True)
        looper.removeProdable(node)
        current_node_set.remove(node)

    for position, node in enumerate(nodes):
        restarted_node = start_stopped_node(node, looper, tconf, tdir,
                                            allPluginsPath)
        current_node_set.add(restarted_node)
        looper.runFor(restart_pauses[position])

    looper.runFor(3)
    looper.run(checkNodesConnected(current_node_set))
    stopNodes(current_node_set, looper)
    logger.debug("reconnects")
    logger.debug("node order: {}".format(names))
    logger.debug("rwaits: {}".format(restart_pauses))
    for node in current_node_set:
        looper.removeProdable(node)
def test_selection_f_plus_one_quorum(looper, txnPoolNodeSet, allPluginsPath,
                                     tdir, tconf, sdk_pool_handle,
                                     sdk_wallet_client):
    """
    Check that quorum f + 1 is used for primary selection
    when initiated by CurrentState messages.

    Assumes that view change quorum is n - f.
    Assumes that primaries selection in round robin fashion.
    """
    # Ensure that we have 4 nodes in total
    all_nodes = list(txnPoolNodeSet)
    assert 4 == len(all_nodes)

    alpha, beta, delta, gamma = all_nodes
    initial_view_no = alpha.viewNo

    # Make one node lagging by switching it off for some time
    lagging_node = gamma
    non_lagging_nodes = [alpha, beta, delta]
    disconnect_node_and_ensure_disconnected(looper, all_nodes, lagging_node,
                                            stopNode=True)
    looper.removeProdable(lagging_node)

    # Make nodes to perform view change
    ensure_view_change(looper, non_lagging_nodes)
    ensureElectionsDone(looper=looper, nodes=non_lagging_nodes,
                        instances_list=range(2))
    ensure_all_nodes_have_same_data(looper, nodes=non_lagging_nodes)

    # Stop two more of active nodes
    # (but not primary, which is Beta (because of round robin selection))
    stopped_nodes = [alpha]  # TODO: add one more here
    for node_to_stop in stopped_nodes:
        disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                                node_to_stop, stopNode=True)
        looper.removeProdable(node_to_stop)

    # Start lagging node back
    restarted_node = start_stopped_node(
        lagging_node, looper, tconf, tdir, allPluginsPath)
    active_nodes = [beta, delta, restarted_node]

    # Check that primary selected
    expected_view_no = initial_view_no + 1
    ensureElectionsDone(looper=looper, nodes=active_nodes,
                        instances_list=range(2), customTimeout=30)
    waitForViewChange(looper, active_nodes, expectedViewNo=expected_view_no)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 1)
def test_validator_info_file_pool_fields_valid(looper, info,
                                               txnPoolNodesLooper,
                                               txnPoolNodeSet, node):
    """
    Check the 'Pool_info' section of the validator info twice: first on the
    `info` fixture while every node is reachable, then on a fresh read of
    `node._info_tool.info` after the last node of the pool has been
    disconnected.
    """
    # All nodes reachable
    pool_info = info['Pool_info']
    assert pool_info['Reachable_nodes_count'] == nodeCount
    assert pool_info['Reachable_nodes'] == [("Alpha", 0), ("Beta", 1),
                                            ("Delta", None),
                                            ("Epsilon", None),
                                            ("Gamma", None)]
    assert pool_info['Unreachable_nodes_count'] == 0
    assert pool_info['Unreachable_nodes'] == []
    assert pool_info['Total_nodes_count'] == nodeCount

    # Disconnect the last node of the pool and read the info again
    disconnected = txnPoolNodeSet[-1]
    disconnect_node_and_ensure_disconnected(txnPoolNodesLooper,
                                            txnPoolNodeSet,
                                            disconnected)
    latest_pool_info = node._info_tool.info['Pool_info']

    # One node fewer is reachable; the total is unchanged
    assert latest_pool_info['Reachable_nodes_count'] == nodeCount - 1
    assert latest_pool_info['Reachable_nodes'] == [("Alpha", 0),
                                                   ("Beta", 1),
                                                   ("Delta", None),
                                                   ("Gamma", None)]
    assert latest_pool_info['Unreachable_nodes_count'] == 1
    assert latest_pool_info['Unreachable_nodes'] == [("Epsilon", None)]
    assert latest_pool_info['Total_nodes_count'] == nodeCount
def test_view_changes_if_master_primary_disconnected(txnPoolNodeSet, looper,
                                                     sdk_pool_handle,
                                                     sdk_wallet_client,
                                                     tdir, tconf,
                                                     allPluginsPath):
    """
    Disconnecting the master's primary must lead to a view change; the old
    primary must then be able to rejoin the pool in the new view.
    """
    # Setup: remember the current view and who the master primary is
    view_before = checkViewNoForNodes(txnPoolNodeSet)
    stopped_primary = get_master_primary_node(txnPoolNodeSet)

    # Stop the primary
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            stopped_primary,
                                            stopNode=True)
    looper.removeProdable(stopped_primary)

    survivors = list(set(txnPoolNodeSet) - {stopped_primary})
    # Detection of the disconnection can take a while
    ensure_node_disconnected(looper, stopped_primary, survivors, timeout=20)

    looper.runFor(tconf.ToleratePrimaryDisconnection + 2)

    # After that grace period the view must have moved on by one and a
    # different primary must have been elected
    view_after = view_before + 1
    waitForViewChange(looper, survivors, view_after)
    ensure_all_nodes_have_same_data(looper, nodes=survivors)
    elected_primary = get_master_primary_node(survivors)
    assert stopped_primary != elected_primary

    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 5)

    # The old primary must be able to rejoin the pool and keep functioning
    rejoined = start_stopped_node(stopped_primary, looper, tconf,
                                  tdir, allPluginsPath)
    full_pool = survivors + [rejoined]

    looper.run(eventually(checkViewNoForNodes,
                          full_pool, view_after,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))

    # At least one recorded call of _start_view_change_if_possible on the
    # rejoined node returned True
    successful_starts = getAllReturnVals(
        rejoined.view_changer,
        rejoined.view_changer._start_view_change_if_possible,
        compare_val_to=True)
    assert len(successful_starts) > 0

    ensure_all_nodes_have_same_data(looper, nodes=full_pool)
    assert not rejoined.view_changer._next_view_indications
def test_replica_removing_with_primary_disconnected(looper,
                                                    txnPoolNodeSet,
                                                    sdk_pool_handle,
                                                    sdk_wallet_client,
                                                    tconf,
                                                    tdir,
                                                    allPluginsPath):
    """
    Scenario:
    1. Take the backup primary node out of the pool.
    2. Verify that the replicas whose primary got disconnected are removed.
    3. Bring the removed node back.
    4. Trigger a View Change.
    5. Verify that all replicas are back.
    """
    instance_to_remove = 1
    replicas_at_start = txnPoolNodeSet[0].replicas.num_replicas
    removed_node = txnPoolNodeSet[instance_to_remove]

    # 1. take the backup primary node out
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            removed_node)
    txnPoolNodeSet.remove(removed_node)
    looper.removeProdable(removed_node)

    # 2. every node still in the pool must have dropped the replica
    def check_replica_removed_on_all_nodes():
        for remaining in txnPoolNodeSet:
            check_replica_removed(remaining,
                                  replicas_at_start,
                                  instance_to_remove)

    removal_timeout = tconf.TolerateBackupPrimaryDisconnection * 4
    looper.run(eventually(check_replica_removed_on_all_nodes,
                          timeout=removal_timeout))
    assert not removed_node.monitor.isMasterDegraded()
    assert len(removed_node.requests) == 0

    # 3. bring the removed node back; it becomes the last pool entry
    recovered_node = start_stopped_node(removed_node, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet.append(recovered_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # 4. trigger a View Change on every node
    for member in txnPoolNodeSet:
        member.view_changer.on_master_degradation()
    waitForViewChange(looper, txnPoolNodeSet, expectedViewNo=1,
                      customTimeout=2 * tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    # 5. the replica count is checked on the recovered node, i.e. the last
    # pool entry
    assert replicas_at_start == recovered_node.replicas.num_replicas
def test_node_catchup_after_restart_no_txns(
        sdk_new_node_caught_up,
        txnPoolNodeSet,
        tdir,
        tconf,
        sdk_node_set_with_node_added_after_some_txns,
        tdirWithPoolTxns,
        allPluginsPath):
    """
    Restart a node while no transactions are sent in the meantime. The node
    is then expected to catch up using `LedgerStatus` only, i.e. without
    processing any consistency proofs.
    """
    looper, stopped_node, _, _ = sdk_node_set_with_node_added_after_some_txns
    waitNodeDataEquality(looper, stopped_node, *txnPoolNodeSet[:-1])

    logger.debug("Stopping node {} with pool ledger size {}".format(
        stopped_node, stopped_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            stopped_node)
    looper.removeProdable(name=stopped_node.name)

    logger.debug("Starting the stopped node, {}".format(stopped_node))
    # Build a fresh TestNode with the name and both addresses of the
    # stopped one
    node_ha = HA(*stopped_node.nodestack.ha)
    client_ha = HA(*stopped_node.clientstack.ha)
    config_helper = PNodeConfigHelper(stopped_node.name, tconf, chroot=tdir)
    restarted_node = TestNode(stopped_node.name,
                              config_helper=config_helper,
                              config=tconf,
                              ha=node_ha,
                              cliha=client_ha,
                              pluginPaths=allPluginsPath)
    looper.add(restarted_node)
    txnPoolNodeSet[-1] = restarted_node
    looper.run(checkNodesConnected(txnPoolNodeSet))

    def chk():
        # Compare the restarted node's last ordered 3PC with every other
        # node's
        for peer in txnPoolNodeSet[:-1]:
            check_last_ordered_3pc(restarted_node, peer)

    looper.run(eventually(chk, retryWait=1))

    waitNodeDataEquality(looper, restarted_node, *txnPoolNodeSet[:-1])

    # No consistency proof was processed by the restarted node
    ledger_manager = restarted_node.ledgerManager
    assert get_count(ledger_manager,
                     ledger_manager.processConsistencyProof) == 0
def test_send_pool_config_2_nodes_can_force_writes_false_force_true(
        nodeSet, looper, sdk_pool_handle, sdk_wallet_trustee, poolConfigWFFT):
    """
    With the last two of four nodes disconnected (but not stopped), send the
    pool config from the `poolConfigWFFT` fixture and wait until
    `check_pool_config_writable_set` passes with False for the first two
    nodes.
    """
    assert len(nodeSet) == 4

    # Picked before anything else happens; the fourth node is disconnected
    # first, then the third.
    to_disconnect = (nodeSet[3], nodeSet[2])

    check_pool_config_writable_set(nodeSet, True)

    for target in to_disconnect:
        disconnect_node_and_ensure_disconnected(
            looper, nodeSet, target.name, stopNode=False)

    sdk_pool_config_sent(looper, sdk_pool_handle,
                         sdk_wallet_trustee, poolConfigWFFT)

    looper.run(eventually(check_pool_config_writable_set,
                          nodeSet[:2], False,
                          retryWait=1, timeout=10))
def test_old_non_primary_restart_after_view_change(new_node_in_correct_view,
                                                   looper, txnPoolNodeSet,
                                                   tdir,
                                                   allPluginsPath, tconf,
                                                   sdk_pool_handle,
                                                   sdk_wallet_client):
    """
    A non-primary node that was already part of the pool goes down, a view
    change takes place without it, and the node is started again afterwards.
    """
    stopped_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    view_before = stopped_node.viewNo

    # Take the non-primary down
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            stopped_node, stopNode=True)
    looper.removeProdable(stopped_node)
    survivors = list(set(txnPoolNodeSet) - {stopped_node})

    # Requests sent before the view change
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 5)

    ensure_view_change(looper, survivors,
                       custom_timeout=tconf.VIEW_CHANGE_TIMEOUT)
    ensureElectionsDone(looper, survivors)

    # Requests sent after the view change
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 5)

    restarted_node = start_stopped_node(stopped_node, looper, tconf,
                                        tdir, allPluginsPath)
    full_pool = survivors + [restarted_node]

    # Every node, the restarted one included, must reach the next view
    looper.run(eventually(checkViewNoForNodes,
                          full_pool, view_before + 1,
                          timeout=30))

    # At least one recorded call of _start_view_change_if_possible on the
    # restarted node returned True
    successful_starts = getAllReturnVals(
        restarted_node.view_changer,
        restarted_node.view_changer._start_view_change_if_possible,
        compare_val_to=True)
    assert len(successful_starts) > 0

    ensure_all_nodes_have_same_data(looper, nodes=full_pool)
    ensureElectionsDone(looper, full_pool)
    assert not restarted_node.view_changer._next_view_indications
def test_cancel_request_cp_and_ls_after_catchup(txnPoolNodeSet,
                                                looper,
                                                sdk_pool_handle,
                                                sdk_wallet_steward,
                                                tconf,
                                                tdir,
                                                allPluginsPath):
    """
    Test cancel of schedule with requesting ledger statuses and consistency
    proofs after catchup.

    The last node of the pool is taken down, two requests are sent while it
    is away, and the node is started again. Once its data equals that of the
    other nodes, its ledger manager must hold no pending ledger-status or
    consistency-proof request action ids, and nothing may remain scheduled
    for the two re-ask actions.
    """
    stopped_node = txnPoolNodeSet[-1]
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward, 5)

    # take the node down and send requests it will have to catch up on
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            stopped_node)
    looper.removeProdable(name=stopped_node.name)
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward, 2)

    # start it again and put it back into the pool
    restarted_node = start_stopped_node(stopped_node, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet[-1] = restarted_node
    looper.run(checkNodesConnected(txnPoolNodeSet))
    # Compare against the other nodes only: the restarted node is the last
    # pool entry, so passing the whole pool would compare it with itself.
    waitNodeDataEquality(looper, restarted_node, *txnPoolNodeSet[:-1])

    # check cancel of schedule with requesting ledger statuses and
    # consistency proofs
    ledger_manager = restarted_node.ledgerManager
    assert len(ledger_manager.request_ledger_status_action_ids) == 0
    assert len(ledger_manager.request_consistency_proof_action_ids) == 0

    reask_actions = ('reask_for_ledger_status',
                     'reask_for_last_consistency_proof')
    for action, aids in ledger_manager.scheduled.items():
        action_name = getCallableName(action)
        if action_name in reask_actions:
            assert len(aids) == 0, \
                "{} is still scheduled: {}".format(action_name, aids)
def test_number_txns_in_catchup_and_vc_queue_valid(looper,
                                                   txnPoolNodeSet,
                                                   tconf,
                                                   sdk_pool_handle,
                                                   sdk_wallet_steward):
    """
    Disconnect the master primary (without stopping it), let the other nodes
    change view and order `num_txns` requests, reconnect the node and check
    the 'Catchup_status' and 'View_change_status' sections of its validator
    info.
    """
    num_txns = 5
    master_node = get_master_primary_node(txnPoolNodeSet)
    expected_view_no = master_node.viewNo + 1

    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            master_node, stopNode=False)
    # NOTE(review): the [1:] slices here and below presume that the master
    # primary is the first entry of txnPoolNodeSet - confirm.
    looper.run(eventually(checkViewNoForNodes,
                          txnPoolNodeSet[1:], expected_view_no,
                          retryWait=1,
                          timeout=tconf.VIEW_CHANGE_TIMEOUT))

    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_steward, num_txns)

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, master_node)
    # Only the last node of the pool is used as the comparison peer here
    waitNodeDataEquality(looper, master_node, *txnPoolNodeSet[-1:])

    node_info = master_node._info_tool.info['Node_info']
    assert node_info['Catchup_status']['Number_txns_in_catchup'][1] == num_txns

    view_change_status = node_info['View_change_status']
    assert view_change_status['View_No'] == expected_view_no

    # One VCDone_queue entry per other node: the first item must name the
    # current master primary and the second item must be truthy
    vc_done_queue = view_change_status['VCDone_queue']
    for peer in txnPoolNodeSet[1:]:
        vc_done = vc_done_queue[peer.name]
        assert vc_done[0] == master_node.master_primary_name
        assert vc_done[1]

    assert view_change_status['Last_complete_view_no'] == expected_view_no
def test_current_state_propagation(sdk_new_node_caught_up,
                                   txnPoolNodeSet,
                                   sdk_node_set_with_node_added_after_some_txns,
                                   tconf, tdir, allPluginsPath):
    """
    A node that was stopped and started again must be sent CurrentState by
    every other node, and must process CurrentState messages itself.
    """
    # 1. The pool comes from the fixture
    looper, lagging_node, _, _ = sdk_node_set_with_node_added_after_some_txns

    # 2. Take one node down
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            lagging_node, stopNode=True)
    looper.removeProdable(lagging_node)

    # 3. Bring it back as the last pool entry
    restarted_node = start_stopped_node(lagging_node, looper, tconf,
                                        tdir, allPluginsPath)
    txnPoolNodeSet[-1] = restarted_node
    looper.run(checkNodesConnected(txnPoolNodeSet))
    looper.runFor(5)

    # 4. Each of the other nodes must have a spylog entry for sending
    #    CurrentState
    for peer in txnPoolNodeSet[:-1]:
        sender_name = peer.send_current_state_to_lagging_node.__name__
        sent_times = peer.spylog.count(sender_name)
        assert sent_times != 0, "{} haven't sent CurrentState".format(peer)
    looper.runFor(5)

    # 5. The restarted node must have a spylog entry for processing
    #    CurrentState
    handler_name = restarted_node.process_current_state_message.__name__
    received_times = restarted_node.spylog.count(handler_name)
    assert received_times != 0
def stop_node(node_to_stop, looper, pool_nodes):
    """
    Disconnect `node_to_stop` from `pool_nodes` via
    `disconnect_node_and_ensure_disconnected` and remove it from the
    looper's prodables.
    """
    disconnect_node_and_ensure_disconnected(looper, pool_nodes, node_to_stop)
    looper.removeProdable(node_to_stop)
def test_disconnected_node_with_lagged_view_pulls_up_its_view_on_reconnection(
        looper,
        txnPoolNodeSet,
        sdk_wallet_client,
        sdk_pool_handle):
    """
    A node that was disconnected while the rest of the pool moved on to
    later views must adopt the current view once it is connected again.

    Steps:
    1. Provoke view change to 1.
    2. Ensure that all the nodes complete view change to 1.
    3. Disconnect one node from the rest of the nodes in the pool.
    4. Provoke view change to 2.
    5. Ensure that all the nodes except for the disconnected one complete
    view change to 2 and the disconnected node remains in the view 1.
    6. Provoke view change to 3.
    7. Ensure that all the nodes except for the disconnected one complete
    view change to 3 and the disconnected node remains in the view 1.
    8. Connect the disconnected node to the rest of the nodes in the pool.
    9. Ensure that the re-connected node completes view change to 3.
    10. Ensure that all the nodes participate in consensus.
    """
    pool_size = len(txnPoolNodeSet)

    checkViewNoForNodes(txnPoolNodeSet, 0)
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    # Whole pool moves to view 1
    ensure_view_change(looper, txnPoolNodeSet)
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 1)
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    # Disconnect (but do not stop) the last non-primary node
    lagged_node = getNonPrimaryReplicas(txnPoolNodeSet)[-1].node
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet,
                                            lagged_node, stopNode=False)
    connected_nodes = list(set(txnPoolNodeSet) - {lagged_node})
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)

    # Two further view changes without the lagged node: the connected nodes
    # reach views 2 and 3 in turn while the lagged node stays in view 1
    for reached_view in (2, 3):
        ensure_view_change(looper, connected_nodes)
        ensureElectionsDone(
            looper, connected_nodes,
            instances_list=range(getRequiredInstances(pool_size)))
        ensure_all_nodes_have_same_data(looper, connected_nodes)
        checkViewNoForNodes(connected_nodes, reached_view)
        checkViewNoForNodes([lagged_node], 1)
        sdk_send_random_and_check(looper, txnPoolNodeSet,
                                  sdk_pool_handle, sdk_wallet_client, 1)

    # Reconnect; the lagged node has to pull itself up to view 3
    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, lagged_node)
    waitForViewChange(
        looper, [lagged_node], 3,
        customTimeout=waits.expectedPoolElectionTimeout(pool_size))
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    checkViewNoForNodes(txnPoolNodeSet, 3)

    # The complete pool orders one more request and ends up with equal data
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)