def test_node_requests_missing_three_phase_messages(looper, txnPoolNodeSet,
                                                    wallet1, client1Connected):
    """
    2 of 4 nodes go down, so the pool cannot process any more incoming
    requests. A new request comes in. After a while those 2 nodes come back
    alive. Another request comes in. Check that the previously disconnected
    two nodes request the missing PREPARES and PREPREPARES and that the pool
    successfully handles both transactions after that.
    """
    INIT_REQS_CNT = 10
    MISSING_REQS_CNT = 1
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_nodes = txnPoolNodeSet[2:]
    alive_nodes = txnPoolNodeSet[:2]

    send_reqs_to_nodes_and_verify_all_replies(looper,
                                              wallet1,
                                              client1Connected,
                                              INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                txnPoolNodeSet,
                                                node,
                                                stopNode=False)

    sendRandomRequests(wallet1, client1Connected, MISSING_REQS_CNT)

    def check_pp_out_of_sync(alive_nodes, disconnected_nodes):

        def get_last_pp(node):
            return node.replicas._master_replica.lastPrePrepare

        last_3pc_key_alive = get_last_pp(alive_nodes[0])
        for node in alive_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_alive

        last_3pc_key_disconnected = get_last_pp(disconnected_nodes[0])
        assert last_3pc_key_disconnected != last_3pc_key_alive
        for node in disconnected_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_disconnected

    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes,
                          disconnected_nodes,
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    for node in disconnected_nodes:
        reconnect_node_and_ensure_connected(looper, txnPoolNodeSet, node)

    send_reqs_to_nodes_and_verify_all_replies(looper,
                                              wallet1,
                                              client1Connected,
                                              REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)


def testClientConnectToRestartedNodes(looper, txnPoolNodeSet, tdir, tconf,
                                      poolTxnNodeNames, allPluginsPath,
                                      sdk_wallet_new_client, sdk_pool_handle):
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_new_client, 1)
    for node in txnPoolNodeSet:
        node.stop()
        looper.removeProdable(node)

    txnPoolNodeSet = []
    for nm in poolTxnNodeNames:
        config_helper = PNodeConfigHelper(nm, tconf, chroot=tdir)
        node = TestNode(nm,
                        config_helper=config_helper,
                        config=tconf,
                        pluginPaths=allPluginsPath)
        looper.add(node)
        txnPoolNodeSet.append(node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    def chk():
        for node in txnPoolNodeSet:
            assert node.isParticipating

    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet))
    looper.run(eventually(chk, retryWait=1, timeout=timeout))

    sdk_pool_refresh(looper, sdk_pool_handle)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet,
                               sdk_wallet_new_client, sdk_pool_handle)


def test_node_load_after_add_then_disconnect(sdk_new_node_caught_up,
                                             txnPoolNodeSet, tconf, looper,
                                             sdk_pool_handle,
                                             sdk_wallet_client,
                                             tdirWithPoolTxns,
                                             allPluginsPath, capsys):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    :return:
    """
    new_node = sdk_new_node_caught_up
    with capsys.disabled():
        print("Stopping node {} with pool ledger size {}".
              format(new_node, new_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            new_node)
    looper.removeProdable(new_node)

    client_batches = 80
    txns_per_batch = 10
    for i in range(client_batches):
        s = perf_counter()
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, txns_per_batch)
        with capsys.disabled():
            print('{} executed {} client txns in {:.2f} seconds'.
                  format(i + 1, txns_per_batch, perf_counter() - s))

    with capsys.disabled():
        print("Starting the stopped node, {}".format(new_node))
    nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha)
    new_node = TestNode(
        new_node.name,
        basedirpath=tdirWithPoolTxns,
        base_data_dir=tdirWithPoolTxns,
        config=tconf,
        ha=nodeHa,
        cliha=nodeCHa,
        pluginPaths=allPluginsPath)
    looper.add(new_node)
    txnPoolNodeSet[-1] = new_node

    # Delay catchup reply processing so LedgerState does not change
    delay_catchup_reply = 5
    new_node.nodeIbStasher.delay(cr_delay(delay_catchup_reply))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(eventually(check_ledger_state, new_node, DOMAIN_LEDGER_ID,
                          LedgerState.syncing, retryWait=.5, timeout=5))

    # Not an accurate timeout but a conservative one
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
        2 * delay_catchup_reply

    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4],
                         customTimeout=timeout)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 5)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:4])


def testOrderingCase2(looper, txnPoolNodeSet, sdk_pool_handle,
                      sdk_wallet_client):
    """
    Scenario -> A client sends requests, some nodes delay COMMITs to a few
    specific nodes so that those nodes achieve commit quorum later for those
    requests than the other nodes. But all nodes `ORDER` the requests in the
    same order of ppSeqNos
    https://www.pivotaltracker.com/n/projects/1889887/stories/133655009
    """
    pr, replicas = getPrimaryReplica(txnPoolNodeSet, instId=0), \
        getNonPrimaryReplicas(txnPoolNodeSet, instId=0)
    assert len(replicas) == 6

    rep0 = pr
    rep1 = replicas[0]
    rep2 = replicas[1]
    rep3 = replicas[2]
    rep4 = replicas[3]
    rep5 = replicas[4]
    rep6 = replicas[5]

    node0 = rep0.node
    node1 = rep1.node
    node2 = rep2.node
    node3 = rep3.node
    node4 = rep4.node
    node5 = rep5.node
    node6 = rep6.node

    ppSeqsToDelay = 5
    commitDelay = 3  # delay each COMMIT by this number of seconds
    delayedPpSeqNos = set()

    requestCount = 10

    def specificCommits(wrappedMsg):
        nonlocal node3, node4, node5
        msg, sender = wrappedMsg
        if isinstance(msg, PrePrepare):
            if len(delayedPpSeqNos) < ppSeqsToDelay:
                delayedPpSeqNos.add(msg.ppSeqNo)
                logger.debug('ppSeqNo {} will be delayed'.format(msg.ppSeqNo))
        if isinstance(msg, Commit) and msg.instId == 0 and \
                sender in (n.name for n in (node3, node4, node5)) and \
                msg.ppSeqNo in delayedPpSeqNos:
            return commitDelay

    for node in (node1, node2):
        logger.debug('{} would be delaying commits'.format(node))
        node.nodeIbStasher.delay(specificCommits)

    sdk_reqs = sdk_send_random_requests(looper,
                                        sdk_pool_handle,
                                        sdk_wallet_client,
                                        requestCount)

    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet))
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet,
                                    custom_timeout=timeout)
    sdk_get_and_check_replies(looper, sdk_reqs)


def testNonPrimaryRecvs3PhaseMessageOutsideWatermarks(chkFreqPatched, looper,
                                                      txnPoolNodeSet, client1,
                                                      wallet1,
                                                      client1Connected,
                                                      reqs_for_logsize):
    """
    A node is slow in processing PRE-PREPAREs and PREPAREs such that a lot of
    requests happen and the slow node starts getting 3 phase messages outside
    of its watermarks. Check that it queues up requests outside watermarks
    and, once it has received a stable checkpoint, it processes more
    requests. It sends other nodes 3 phase messages older than their stable
    checkpoint, so they should discard them.
    """
    delay = 15
    instId = 1
    reqsToSend = reqs_for_logsize + 2
    npr = getNonPrimaryReplicas(txnPoolNodeSet, instId)
    slowReplica = npr[0]
    slowNode = slowReplica.node
    slowNode.nodeIbStasher.delay(ppDelay(delay, instId))
    slowNode.nodeIbStasher.delay(pDelay(delay, instId))

    def discardCounts(replicas, pat):
        counts = {}
        for r in replicas:
            counts[r.name] = countDiscarded(r, pat)
        return counts

    oldStashCount = slowReplica.spylog.count(
        TestReplica.stashOutsideWatermarks.__name__)
    oldDiscardCounts = discardCounts(
        [n.replicas[instId] for n in txnPoolNodeSet if n != slowNode],
        'achieved stable checkpoint')

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        reqsToSend, 1)
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet))
    looper.run(eventually(checkNodeDataForEquality, slowNode,
                          *[_ for _ in txnPoolNodeSet if _ != slowNode],
                          retryWait=1, timeout=timeout))
    newStashCount = slowReplica.spylog.count(
        TestReplica.stashOutsideWatermarks.__name__)
    assert newStashCount > oldStashCount

    def chk():
        counts = discardCounts(
            [n.replicas[instId] for n in txnPoolNodeSet if n != slowNode],
            'achieved stable checkpoint')
        for nm, count in counts.items():
            assert count > oldDiscardCounts[nm]

    timeout = waits.expectedNodeToNodeMessageDeliveryTime() * \
        len(txnPoolNodeSet) + delay
    looper.run(eventually(chk, retryWait=1, timeout=timeout))


def testNodeRequestingTxns(reduced_catchup_timeout_conf, txnPoolNodeSet,
                           sdk_node_created_after_some_txns,
                           sdk_wallet_client):
    """
    A newly joined node is catching up and sends catchup requests to other
    nodes but one of the nodes does not reply, so the newly joined node
    cannot complete the process till the timeout and then requests the
    missing transactions.
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_created_after_some_txns
    new_node_ledger = new_node.ledgerManager.ledgerRegistry[DOMAIN_LEDGER_ID]
    old_size = len(new_node_ledger.ledger)
    old_size_others = txnPoolNodeSet[0].ledgerManager.ledgerRegistry[
        DOMAIN_LEDGER_ID].ledger.size

    # So nodes won't tell the clients about the newly joined node and the
    # clients don't send any request to the newly joined node
    for node in txnPoolNodeSet:
        node.sendPoolInfoToClients = types.MethodType(lambda x, y: None, node)

    def ignoreCatchupReq(self, req, frm):
        logger.info("{} being malicious and ignoring catchup request {} "
                    "from {}".format(self, req, frm))

    # One of the nodes does not process catchup requests.
    npr = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    badReplica = npr[0]
    badNode = badReplica.node
    txnPoolNodeSet.append(new_node)

    badNode.nodeMsgRouter.routes[CatchupReq] = types.MethodType(
        ignoreCatchupReq, badNode.ledgerManager)
    more_requests = 10
    sdk_send_random_requests(looper, sdk_pool_handle,
                             sdk_wallet_client, more_requests)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Since one of the nodes does not reply, this new node will experience a
    # timeout and retry catchup requests, hence a long test timeout.
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
        reduced_catchup_timeout_conf.CatchupTransactionsTimeout
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1],
                         customTimeout=timeout)
    new_size = len(new_node_ledger.ledger)

    # The new node ledger might catch up some transactions from the batch of
    # `more_requests` transactions
    assert old_size_others - \
        old_size <= new_node_ledger.num_txns_caught_up <= new_size - old_size
    sdk_send_random_requests(looper, sdk_pool_handle,
                             sdk_wallet_client, 2)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1],
                         customTimeout=timeout)


def ensureClientConnectedToNodesAndPoolLedgerSame(looper,
                                                  client: TestClient,
                                                  *nodes: TestNode):
    looper.run(client.ensureConnectedToNodes())
    timeout = waits.expectedPoolGetReadyTimeout(len(nodes))
    looper.run(eventually(checkClientPoolLedgerSameAsNodes,
                          client,
                          *nodes,
                          retryWait=.5,
                          timeout=timeout))


def testNodeRequestingTxns(reduced_catchup_timeout_conf, txnPoolNodeSet,
                           looper, tdir, tconf, allPluginsPath,
                           sdk_pool_handle, sdk_wallet_steward,
                           sdk_wallet_client):
    """
    A newly joined node is catching up and sends catchup requests to other
    nodes but one of the nodes does not reply, so the newly joined node
    cannot complete the process till the timeout and then requests the
    missing transactions.
    """

    def ignoreCatchupReq(self, req, frm):
        logger.info("{} being malicious and ignoring catchup request {} "
                    "from {}".format(self, req, frm))

    # One of the nodes does not process catchup requests.
    npr = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    badReplica = npr[0]
    badNode = badReplica.node
    badNode.nodeMsgRouter.routes[CatchupReq] = types.MethodType(
        ignoreCatchupReq, badNode.ledgerManager)
    more_requests = 10
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, more_requests)

    _, new_node = sdk_add_new_steward_and_node(looper,
                                               sdk_pool_handle,
                                               sdk_wallet_steward,
                                               'EpsilonSteward',
                                               'Epsilon',
                                               tdir,
                                               tconf,
                                               allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Since one of the nodes does not reply, this new node will experience a
    # timeout and retry catchup requests, hence a long test timeout.
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
        reduced_catchup_timeout_conf.CatchupTransactionsTimeout
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1],
                         customTimeout=timeout,
                         exclude_from_check=['check_last_ordered_3pc_backup'])

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 2)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1],
                         customTimeout=timeout,
                         exclude_from_check=['check_last_ordered_3pc_backup'])


def testClientConnectToRestartedNodes(looper, txnPoolNodeSet,
                                      tdirWithPoolTxns, tdir,
                                      tdirWithClientPoolTxns,
                                      poolTxnClientNames, poolTxnData, tconf,
                                      poolTxnNodeNames, allPluginsPath):
    name = poolTxnClientNames[-1]
    newClient, w = genTestClient(tmpdir=tdirWithClientPoolTxns,
                                 nodes=txnPoolNodeSet,
                                 name=name, usePoolLedger=True)
    looper.add(newClient)
    ensureClientConnectedToNodesAndPoolLedgerSame(looper, newClient,
                                                  *txnPoolNodeSet)
    sendReqsToNodesAndVerifySuffReplies(looper, w, newClient, 1, 1)
    for node in txnPoolNodeSet:
        node.stop()
        looper.removeProdable(node)

    # looper.run(newClient.ensureDisconnectedToNodes(timeout=60))
    txnPoolNodeSet = []
    for nm in poolTxnNodeNames:
        config_helper = PNodeConfigHelper(nm, tconf, chroot=tdir)
        node = TestNode(nm,
                        ledger_dir=config_helper.ledger_dir,
                        keys_dir=config_helper.keys_dir,
                        genesis_dir=config_helper.genesis_dir,
                        plugins_dir=config_helper.plugins_dir,
                        config=tconf,
                        pluginPaths=allPluginsPath)
        looper.add(node)
        txnPoolNodeSet.append(node)
    looper.run(checkNodesConnected(txnPoolNodeSet))
    ensureElectionsDone(looper=looper, nodes=txnPoolNodeSet)

    def chk():
        for node in txnPoolNodeSet:
            assert node.isParticipating

    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet))
    looper.run(eventually(chk, retryWait=1, timeout=timeout))

    bootstrapClientKeys(w.defaultId, w.getVerkey(), txnPoolNodeSet)

    req = sendRandomRequest(w, newClient)
    waitForSufficientRepliesForRequests(looper, newClient, requests=[req])

    ensureClientConnectedToNodesAndPoolLedgerSame(looper, newClient,
                                                  *txnPoolNodeSet)

    sendReqsToNodesAndVerifySuffReplies(looper, w, newClient, 3, 1)


def waitNodeDataInequality(looper,
                           referenceNode: TestNode,
                           *otherNodes: TestNode,
                           customTimeout=None):
    """
    Wait for node ledgers to become unequal

    :param referenceNode: node whose ledger is used as a reference
    """
    numOfNodes = len(otherNodes) + 1
    timeout = customTimeout or waits.expectedPoolGetReadyTimeout(numOfNodes)
    looper.run(eventually(checkNodeDataForInequality,
                          referenceNode,
                          *otherNodes,
                          retryWait=1, timeout=timeout))


def waitNodeDataEquality(looper,
                         referenceNode: TestNode,
                         *otherNodes: TestNode,
                         customTimeout=None,
                         exclude_from_check=None):
    """
    Wait for node ledgers to become equal

    :param referenceNode: node whose ledger is used as a reference
    """
    numOfNodes = len(otherNodes) + 1
    timeout = customTimeout or waits.expectedPoolGetReadyTimeout(numOfNodes)
    kwargs = {'exclude_from_check': exclude_from_check}
    looper.run(eventually(partial(checkNodeDataForEquality, **kwargs),
                          referenceNode,
                          *otherNodes,
                          retryWait=1, timeout=timeout))


def waitNodeDataInequality(looper,
                           referenceNode: TestNode,
                           *otherNodes: TestNode,
                           exclude_from_check=None,
                           customTimeout=None):
    """
    Wait for node ledgers to become unequal

    :param referenceNode: node whose ledger is used as a reference
    """
    numOfNodes = len(otherNodes) + 1
    timeout = customTimeout or waits.expectedPoolGetReadyTimeout(numOfNodes)
    kwargs = {'exclude_from_check': exclude_from_check}
    looper.run(eventually(partial(checkNodeDataForInequality, **kwargs),
                          referenceNode,
                          *otherNodes,
                          retryWait=1, timeout=timeout))


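# The equality/inequality checkers above take `exclude_from_check` as a
# keyword argument, while `eventually` forwards only the positional arguments
# it is given (its own keywords, retryWait/timeout, are consumed by
# `eventually` itself), so the keyword is pre-bound with `functools.partial`.
# A minimal, self-contained sketch of that pattern; `check_equality` below is
# a hypothetical stand-in for `checkNodeDataForEquality`, not the real
# implementation.
from functools import partial


def check_equality(reference, *others, exclude_from_check=None):
    # Compare dict snapshots, skipping any keys named in exclude_from_check
    excluded = set(exclude_from_check or [])
    for other in others:
        assert {k: v for k, v in reference.items() if k not in excluded} == \
               {k: v for k, v in other.items() if k not in excluded}


# Pre-bind the keyword exactly as the waiters above do:
chk = partial(check_equality,
              exclude_from_check=['check_last_ordered_3pc_backup'])
chk({'size': 10, 'check_last_ordered_3pc_backup': (0, 7)},
    {'size': 10, 'check_last_ordered_3pc_backup': (0, 5)})  # passes: key excluded

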
def testNodeRequestingTxns(reduced_catchup_timeout_conf, txnPoolNodeSet,
                           looper, tdir, tconf, allPluginsPath,
                           sdk_pool_handle, sdk_wallet_steward,
                           sdk_wallet_client):
    """
    A newly joined node is catching up and sends catchup requests to other
    nodes but one of the nodes does not reply, so the newly joined node
    cannot complete the process till the timeout and then requests the
    missing transactions.
    """

    def ignoreCatchupReq(self, req, frm):
        logger.info("{} being malicious and ignoring catchup request {} "
                    "from {}".format(self, req, frm))

    # One of the nodes does not process catchup requests.
    npr = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    badReplica = npr[0]
    badNode = badReplica.node
    badNode.nodeMsgRouter.routes[CatchupReq] = types.MethodType(
        ignoreCatchupReq, badNode.ledgerManager)
    more_requests = 10
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, more_requests)

    _, new_node = sdk_add_new_steward_and_node(
        looper, sdk_pool_handle, sdk_wallet_steward,
        'EpsilonSteward', 'Epsilon', tdir, tconf,
        allPluginsPath=allPluginsPath)
    txnPoolNodeSet.append(new_node)
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Since one of the nodes does not reply, this new node will experience a
    # timeout and retry catchup requests, hence a long test timeout.
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
        reduced_catchup_timeout_conf.CatchupTransactionsTimeout
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1],
                         customTimeout=timeout)

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 2)
    waitNodeDataEquality(looper, new_node, *txnPoolNodeSet[:-1],
                         customTimeout=timeout)


def test_node_catchup_after_restart_with_txns(
        sdk_new_node_caught_up,
        txnPoolNodeSet,
        tdir,
        tconf,
        sdk_node_set_with_node_added_after_some_txns,
        allPluginsPath):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    logger.debug("Stopping node {} with pool ledger size {}".format(
        new_node, new_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node)
    looper.removeProdable(new_node)
    # for n in txnPoolNodeSet[:4]:
    #     for r in n.nodestack.remotes.values():
    #         if r.name == newNode.name:
    #             r.removeStaleCorrespondents()
    # looper.run(eventually(checkNodeDisconnectedFrom, newNode.name,
    #                       txnPoolNodeSet[:4], retryWait=1, timeout=5))
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    more_requests = 5
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, more_requests)
    logger.debug("Starting the stopped node, {}".format(new_node))
    nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha)
    config_helper = PNodeConfigHelper(new_node.name, tconf, chroot=tdir)
    newNode = TestNode(new_node.name,
                       config_helper=config_helper,
                       config=tconf,
                       ha=nodeHa, cliha=nodeCHa,
                       pluginPaths=allPluginsPath)
    looper.add(newNode)
    txnPoolNodeSet[-1] = newNode

    # Make sure ledger is not synced initially
    check_ledger_state(newNode, DOMAIN_LEDGER_ID, LedgerState.not_synced)

    # Delay catchup reply processing so LedgerState does not change
    # TODO fix delay, sometimes it's not enough and the 'check_ledger_state'
    # below fails because newNode's domain ledger state is 'synced'
    delay_catchup_reply = 10
    newNode.nodeIbStasher.delay(cr_delay(delay_catchup_reply))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(eventually(check_ledger_state, newNode, DOMAIN_LEDGER_ID,
                          LedgerState.syncing, retryWait=.5, timeout=5))

    confused_node = txnPoolNodeSet[0]
    new_node_ledger = newNode.ledgerManager.ledgerRegistry[DOMAIN_LEDGER_ID]
    cp = new_node_ledger.catchUpTill
    start, end = cp.seqNoStart, cp.seqNoEnd
    cons_proof = confused_node.ledgerManager._buildConsistencyProof(
        DOMAIN_LEDGER_ID, start, end)

    bad_send_time = None

    def chk():
        nonlocal bad_send_time
        entries = newNode.ledgerManager.spylog.getAll(
            newNode.ledgerManager.canProcessConsistencyProof.__name__)
        for entry in entries:
            # `canProcessConsistencyProof` should have returned False for a
            # proof sent after `bad_send_time`
            if entry.result is False and entry.starttime > bad_send_time:
                return
        assert False

    def send_and_chk(ledger_state):
        nonlocal bad_send_time, cons_proof
        bad_send_time = perf_counter()
        confused_node.ledgerManager.sendTo(cons_proof, newNode.name)
        # Check that the ConsistencyProof messages are rejected
        looper.run(eventually(chk, retryWait=.5, timeout=5))
        check_ledger_state(newNode, DOMAIN_LEDGER_ID, ledger_state)

    send_and_chk(LedgerState.syncing)

    # Not an accurate timeout but a conservative one
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
        2 * delay_catchup_reply
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1],
                         customTimeout=timeout)
    assert new_node_ledger.num_txns_caught_up == more_requests
    send_and_chk(LedgerState.synced)


def test_node_catchup_after_restart_with_txns(
        sdk_new_node_caught_up,
        txnPoolNodeSet,
        tdir,
        tconf,
        sdk_node_set_with_node_added_after_some_txns,
        allPluginsPath):
    """
    A node that restarts after some transactions should eventually get the
    transactions which happened while it was down
    :return:
    """
    looper, new_node, sdk_pool_handle, new_steward_wallet_handle = \
        sdk_node_set_with_node_added_after_some_txns
    logger.debug("Stopping node {} with pool ledger size {}".
                 format(new_node, new_node.poolManager.txnSeqNo))
    disconnect_node_and_ensure_disconnected(looper, txnPoolNodeSet, new_node)
    looper.removeProdable(new_node)
    # for n in txnPoolNodeSet[:4]:
    #     for r in n.nodestack.remotes.values():
    #         if r.name == newNode.name:
    #             r.removeStaleCorrespondents()
    # looper.run(eventually(checkNodeDisconnectedFrom, newNode.name,
    #                       txnPoolNodeSet[:4], retryWait=1, timeout=5))
    # TODO: Check if the node has really stopped processing requests?
    logger.debug("Sending requests")
    more_requests = 5
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              new_steward_wallet_handle, more_requests)
    logger.debug("Starting the stopped node, {}".format(new_node))
    nodeHa, nodeCHa = HA(*new_node.nodestack.ha), HA(*new_node.clientstack.ha)
    config_helper = PNodeConfigHelper(new_node.name, tconf, chroot=tdir)
    newNode = TestNode(
        new_node.name,
        config_helper=config_helper,
        config=tconf,
        ha=nodeHa, cliha=nodeCHa, pluginPaths=allPluginsPath)
    looper.add(newNode)
    txnPoolNodeSet[-1] = newNode

    # Make sure ledger is not synced initially
    check_ledger_state(newNode, DOMAIN_LEDGER_ID, LedgerState.not_synced)

    # Delay catchup reply processing so LedgerState does not change
    # TODO fix delay, sometimes it's not enough and the 'check_ledger_state'
    # below fails because newNode's domain ledger state is 'synced'
    delay_catchup_reply = 10
    newNode.nodeIbStasher.delay(cr_delay(delay_catchup_reply))
    looper.run(checkNodesConnected(txnPoolNodeSet))

    # Make sure ledger starts syncing (sufficient consistency proofs received)
    looper.run(eventually(check_ledger_state, newNode, DOMAIN_LEDGER_ID,
                          LedgerState.syncing, retryWait=.5, timeout=5))

    confused_node = txnPoolNodeSet[0]
    new_node_ledger = newNode.ledgerManager.ledgerRegistry[DOMAIN_LEDGER_ID]
    cp = new_node_ledger.catchUpTill
    start, end = cp.seqNoStart, cp.seqNoEnd
    cons_proof = confused_node.ledgerManager._buildConsistencyProof(
        DOMAIN_LEDGER_ID, start, end)

    bad_send_time = None

    def chk():
        nonlocal bad_send_time
        entries = newNode.ledgerManager.spylog.getAll(
            newNode.ledgerManager.canProcessConsistencyProof.__name__)
        for entry in entries:
            # `canProcessConsistencyProof` should have returned False for a
            # proof sent after `bad_send_time`
            if entry.result is False and entry.starttime > bad_send_time:
                return
        assert False

    def send_and_chk(ledger_state):
        nonlocal bad_send_time, cons_proof
        bad_send_time = perf_counter()
        confused_node.ledgerManager.sendTo(cons_proof, newNode.name)
        # Check that the ConsistencyProof messages are rejected
        looper.run(eventually(chk, retryWait=.5, timeout=5))
        check_ledger_state(newNode, DOMAIN_LEDGER_ID, ledger_state)

    send_and_chk(LedgerState.syncing)

    # Not an accurate timeout but a conservative one
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet)) + \
        2 * delay_catchup_reply
    waitNodeDataEquality(looper, newNode, *txnPoolNodeSet[:-1],
                         customTimeout=timeout)
    send_and_chk(LedgerState.synced)


def test_node_requests_missing_preprepares_and_prepares(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    2 of 4 nodes go down, so the pool cannot process any more incoming
    requests. A new request comes in. After a while those 2 nodes come back
    alive. Another request comes in. Check that the previously disconnected
    two nodes request the missing PREPREPARES and PREPARES and the pool
    successfully handles both transactions after that.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_nodes = txnPoolNodeSet[2:]
    alive_nodes = txnPoolNodeSet[:2]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    current_node_set = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                current_node_set,
                                                node,
                                                stopNode=False)
        current_node_set.remove(node)

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)

    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes,
                          disconnected_nodes,
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    for node in disconnected_nodes:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == init_ledger_size

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) == 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_pre_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_prepare) == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) > 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_pre_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_prepare) > 0

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)


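# The module-level `check_pp_out_of_sync` called by the test above (and by
# several tests below) is not defined in this section. A minimal sketch that
# mirrors the inline definition in
# `test_node_requests_missing_three_phase_messages` at the top of this
# section: all alive nodes must share one last PrePrepare, and all
# disconnected nodes must share another, different one.
def check_pp_out_of_sync(alive_nodes, disconnected_nodes):

    def get_last_pp(node):
        return node.replicas._master_replica.lastPrePrepare

    last_3pc_key_alive = get_last_pp(alive_nodes[0])
    for node in alive_nodes[1:]:
        assert get_last_pp(node) == last_3pc_key_alive

    last_3pc_key_disconnected = get_last_pp(disconnected_nodes[0])
    assert last_3pc_key_disconnected != last_3pc_key_alive
    for node in disconnected_nodes[1:]:
        assert get_last_pp(node) == last_3pc_key_disconnected

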
def testOrderingCase2(looper, nodeSet, up, client1, wallet1):
    """
    Scenario -> A client sends requests, some nodes delay COMMITs to a few
    specific nodes so that those nodes achieve commit quorum later for those
    requests than the other nodes. But all nodes `ORDER` the requests in the
    same order of ppSeqNos
    https://www.pivotaltracker.com/n/projects/1889887/stories/133655009
    """
    pr, replicas = getPrimaryReplica(nodeSet, instId=0), \
        getNonPrimaryReplicas(nodeSet, instId=0)
    assert len(replicas) == 6

    rep0 = pr
    rep1 = replicas[0]
    rep2 = replicas[1]
    rep3 = replicas[2]
    rep4 = replicas[3]
    rep5 = replicas[4]
    rep6 = replicas[5]

    node0 = rep0.node
    node1 = rep1.node
    node2 = rep2.node
    node3 = rep3.node
    node4 = rep4.node
    node5 = rep5.node
    node6 = rep6.node

    ppSeqsToDelay = 5
    commitDelay = 3  # delay each COMMIT by this number of seconds
    delayedPpSeqNos = set()

    requestCount = 10

    def specificCommits(wrappedMsg):
        nonlocal node3, node4, node5
        msg, sender = wrappedMsg
        if isinstance(msg, PrePrepare):
            if len(delayedPpSeqNos) < ppSeqsToDelay:
                delayedPpSeqNos.add(msg.ppSeqNo)
                logger.debug('ppSeqNo {} will be delayed'.format(msg.ppSeqNo))
        if isinstance(msg, Commit) and msg.instId == 0 and \
                sender in (n.name for n in (node3, node4, node5)) and \
                msg.ppSeqNo in delayedPpSeqNos:
            return commitDelay

    for node in (node1, node2):
        logger.debug('{} would be delaying commits'.format(node))
        node.nodeIbStasher.delay(specificCommits)

    requests = sendRandomRequests(wallet1, client1, requestCount)

    waitForSufficientRepliesForRequests(looper, client1, requests=requests)

    def ensureSlowNodesHaveAllTxns():
        nonlocal node1, node2
        for node in node1, node2:
            assert len(node.domainLedger) == requestCount

    timeout = waits.expectedPoolGetReadyTimeout(len(nodeSet))
    looper.run(eventually(ensureSlowNodesHaveAllTxns,
                          retryWait=1, timeout=timeout))

    checkAllLedgersEqual((n.domainLedger for n in
                          (node0, node3, node4, node5, node6)))

    for node in (node1, node2):
        for n in nodeSet:
            if n != node:
                checkLedgerEquality(node.domainLedger, n.domainLedger)

    checkAllLedgersEqual((n.domainLedger for n in nodeSet))


def test_node_requests_missing_preprepares_and_prepares_after_long_disconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdirWithPoolTxns, allPluginsPath):
    """
    2 of 4 nodes go down, so the pool cannot process any more incoming
    requests. A new request comes in. The test then waits for some time to
    ensure that the PrePrepare was created long enough ago to be dropped by
    the time checker. The two stopped nodes come back alive. Another request
    comes in. Check that the previously disconnected two nodes request the
    missing PREPREPARES and PREPARES and the pool successfully handles both
    transactions.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    alive_nodes = []
    disconnected_nodes = []

    for node in txnPoolNodeSet:
        if node.hasPrimary:
            alive_nodes.append(node)
        else:
            disconnected_nodes.append(node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    current_node_set = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                current_node_set,
                                                node,
                                                stopNode=False)
        current_node_set.remove(node)

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)

    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes,
                          disconnected_nodes,
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    preprepare_deviation = 4
    tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = preprepare_deviation
    time.sleep(preprepare_deviation * 2)

    for node in disconnected_nodes:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == init_ledger_size

    for node in disconnected_nodes:
        assert node.master_replica.spylog.count(
            Replica._request_pre_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica._request_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_pre_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_prepare) == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)

    for node in disconnected_nodes:
        assert node.master_replica.spylog.count(
            Replica._request_pre_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica._request_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_pre_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_prepare) > 0

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)


def test_node_requests_missing_three_phase_messages_after_long_disconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdirWithPoolTxns, allPluginsPath):
    """
    2 of 4 nodes go down, so the pool cannot process any more incoming
    requests. A new request comes in. The test then waits for some time to
    ensure that the PrePrepare was created long enough ago to be dropped by
    the time checker. The two stopped nodes come back alive. Another request
    comes in. Check that the previously disconnected two nodes request the
    missing PREPARES and PREPREPARES and the pool successfully handles both
    transactions.
    """
    INIT_REQS_CNT = 10
    MISSING_REQS_CNT = 1
    REQS_AFTER_RECONNECT_CNT = 1
    alive_nodes = []
    disconnected_nodes = []

    for node in txnPoolNodeSet:
        if node.hasPrimary:
            alive_nodes.append(node)
        else:
            disconnected_nodes.append(node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    current_node_set = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                current_node_set,
                                                node,
                                                stopNode=False)
        current_node_set.remove(node)

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)

    def check_pp_out_of_sync(alive_nodes, disconnected_nodes):

        def get_last_pp(node):
            return node.replicas._master_replica.lastPrePrepare

        last_3pc_key_alive = get_last_pp(alive_nodes[0])
        for node in alive_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_alive

        last_3pc_key_disconnected = get_last_pp(disconnected_nodes[0])
        assert last_3pc_key_disconnected != last_3pc_key_alive
        for node in disconnected_nodes[1:]:
            assert get_last_pp(node) == last_3pc_key_disconnected

    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes,
                          disconnected_nodes,
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    preprepare_deviation = 4
    tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = preprepare_deviation
    time.sleep(preprepare_deviation * 2)

    for node in disconnected_nodes:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)


def test_node_requests_missing_preprepares_prepares_and_commits(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    1 of 4 nodes goes down. A new request comes in and is ordered by the
    3 remaining nodes. After a while the previously disconnected node comes
    back alive. Another request comes in. Check that the previously
    disconnected node requests the missing PREPREPARES, PREPARES and COMMITS,
    orders the previous request, and all the nodes successfully handle the
    last request.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_node = txnPoolNodeSet[3]
    alive_nodes = txnPoolNodeSet[:3]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    disconnect_node_and_ensure_disconnected(looper,
                                            txnPoolNodeSet,
                                            disconnected_node,
                                            stopNode=False)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, MISSING_REQS_CNT)

    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes,
                          [disconnected_node],
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    reconnect_node_and_ensure_connected(looper, txnPoolNodeSet,
                                        disconnected_node)

    # Give time for the reconnected node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    for node in alive_nodes:
        assert node.domainLedger.size == init_ledger_size + MISSING_REQS_CNT
    # Ensure that the reconnected node has not caught up though
    assert disconnected_node.domainLedger.size == init_ledger_size

    assert disconnected_node.master_replica.spylog.count(
        Replica._request_pre_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica._request_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica._request_commit) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_pre_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_commit) == 0
    doOrderTimesBefore = disconnected_node.master_replica.spylog.count(
        Replica.doOrder)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_node, *alive_nodes)

    assert disconnected_node.master_replica.spylog.count(
        Replica._request_pre_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica._request_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica._request_commit) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_pre_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_commit) > 0
    doOrderTimesAfter = disconnected_node.master_replica.spylog.count(
        Replica.doOrder)
    # Ensure that the reconnected node has ordered both the missed 3PC-batch
    # and the new 3PC-batch
    assert doOrderTimesAfter - doOrderTimesBefore == 2

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)


def test_node_requests_missing_preprepares_prepares_and_commits(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tdir,
        allPluginsPath):
    """
    1 of 4 nodes goes down (simulate this by dropping requests). A new
    request comes in and is ordered by the 3 remaining nodes. After a while
    the previously disconnected node comes back alive. Another request comes
    in. Check that the previously disconnected node requests the missing
    PREPREPARES, PREPARES and COMMITS, orders the previous request, and all
    the nodes successfully handle the last request.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_node = txnPoolNodeSet[3]
    alive_nodes = txnPoolNodeSet[:3]
    disconnected_node_stashers = disconnected_node.nodeIbStasher

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    with delay_rules_without_processing(disconnected_node_stashers,
                                        delay_3pc()):
        last_ordered_key = txnPoolNodeSet[0].master_replica.last_ordered_3pc
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, MISSING_REQS_CNT)
        looper.run(eventually(check_pp_out_of_sync,
                              alive_nodes,
                              [disconnected_node],
                              last_ordered_key,
                              retryWait=1,
                              timeout=expectedPoolGetReadyTimeout(
                                  len(txnPoolNodeSet))))

    for node in alive_nodes:
        assert node.domainLedger.size == init_ledger_size + MISSING_REQS_CNT
    # Ensure that the reconnected node has not caught up though
    assert disconnected_node.domainLedger.size == init_ledger_size

    ordering_service = disconnected_node.master_replica._ordering_service
    assert ordering_service.spylog.count(
        OrderingService._request_pre_prepare) == 0
    assert ordering_service.spylog.count(
        OrderingService._request_prepare) == 0
    assert ordering_service.spylog.count(
        OrderingService._request_commit) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_pre_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_prepare) == 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_commit) == 0
    doOrderTimesBefore = ordering_service.spylog.count(
        OrderingService._do_order)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_node, *alive_nodes)

    assert ordering_service.spylog.count(
        OrderingService._request_pre_prepare) > 0
    assert ordering_service.spylog.count(
        OrderingService._request_prepare) > 0
    assert ordering_service.spylog.count(
        OrderingService._request_commit) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_pre_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_prepare) > 0
    assert disconnected_node.master_replica.spylog.count(
        Replica.process_requested_commit) > 0
    doOrderTimesAfter = ordering_service.spylog.count(
        OrderingService._do_order)
    # Ensure that the reconnected node has ordered both the missed 3PC-batch
    # and the new 3PC-batch
    assert doOrderTimesAfter - doOrderTimesBefore == 2

    def check_all_ordered():
        for node in txnPoolNodeSet:
            assert node.domainLedger.size == (init_ledger_size +
                                              MISSING_REQS_CNT +
                                              REQS_AFTER_RECONNECT_CNT)

    looper.run(eventually(check_all_ordered, timeout=20))


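# The tests that use `delay_rules_without_processing` pass a third argument,
# `last_ordered_key`, to `check_pp_out_of_sync`; that overload is also not
# shown in this section. A hedged sketch of what it presumably verifies,
# given that the key is captured from an alive node before the delay rules
# take effect: the attribute access follows the inline two-argument version
# above, but the exact comparisons are an assumption, not the real
# implementation.
def check_pp_out_of_sync(alive_nodes, disconnected_nodes, last_ordered_key):

    def get_last_pp_key(node):
        pp = node.replicas._master_replica.lastPrePrepare
        return (pp.viewNo, pp.ppSeqNo)

    # Nodes still receiving 3PC traffic have seen a PrePrepare newer than
    # the 3PC key recorded before the delay rules were applied...
    for node in alive_nodes:
        assert get_last_pp_key(node) > last_ordered_key
    # ...while nodes whose 3PC messages are being dropped are stuck at it.
    for node in disconnected_nodes:
        assert node.master_replica.last_ordered_3pc == last_ordered_key

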
def test_node_requests_missing_preprepares_and_prepares_after_long_disconnection(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdirWithPoolTxns, tdir, allPluginsPath):
    """
    2 of 4 nodes go down (simulate this by dropping requests), so the pool
    cannot process any more incoming requests. A new request comes in. The
    test then waits for some time to ensure that the PrePrepare was created
    long enough ago to be dropped by the time checker. The two stopped nodes
    come back alive. Another request comes in. Check that the previously
    disconnected two nodes request the missing PREPREPARES and PREPARES and
    the pool successfully handles both transactions.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    alive_nodes = []
    disconnected_nodes = []

    for node in txnPoolNodeSet:
        if node.hasPrimary:
            alive_nodes.append(node)
        else:
            disconnected_nodes.append(node)
    disconnected_nodes_stashers = [n.nodeIbStasher
                                   for n in disconnected_nodes]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    with delay_rules_without_processing(disconnected_nodes_stashers,
                                        delay_3pc()):
        sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                                 MISSING_REQS_CNT)
        last_ordered_key = txnPoolNodeSet[0].master_replica.last_ordered_3pc
        looper.run(eventually(check_pp_out_of_sync,
                              alive_nodes,
                              disconnected_nodes,
                              last_ordered_key,
                              retryWait=1,
                              timeout=expectedPoolGetReadyTimeout(
                                  len(txnPoolNodeSet))))

        preprepare_deviation = 4
        tconf.ACCEPTABLE_DEVIATION_PREPREPARE_SECS = preprepare_deviation
        time.sleep(preprepare_deviation * 2)

    for node in disconnected_nodes:
        assert node.domainLedger.size == init_ledger_size

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) == 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) == 0
        assert node.master_replica._message_req_service.spylog.count(
            MessageReqService.process_message_rep) == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet)

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) > 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) > 0
        assert node.master_replica._message_req_service.spylog.count(
            MessageReqService.process_message_rep) > 0

    def check_all_ordered():
        for node in txnPoolNodeSet:
            assert node.domainLedger.size == (init_ledger_size +
                                              MISSING_REQS_CNT +
                                              REQS_AFTER_RECONNECT_CNT)

    looper.run(eventually(check_all_ordered, timeout=20))


def test_node_requests_missing_preprepares_and_prepares(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle):
    """
    2 of 4 nodes go down, so the pool cannot process any more incoming
    requests. A new request comes in. After a while those 2 nodes come back
    alive. Another request comes in. Check that the previously disconnected
    two nodes request the missing PREPREPARES and PREPARES and the pool
    successfully handles both transactions after that.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_nodes = txnPoolNodeSet[2:]
    alive_nodes = txnPoolNodeSet[:2]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    current_node_set = set(txnPoolNodeSet)
    for node in disconnected_nodes:
        disconnect_node_and_ensure_disconnected(looper,
                                                current_node_set,
                                                node,
                                                stopNode=False)
        current_node_set.remove(node)

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             MISSING_REQS_CNT)

    looper.run(eventually(check_pp_out_of_sync,
                          alive_nodes,
                          disconnected_nodes,
                          retryWait=1,
                          timeout=expectedPoolGetReadyTimeout(
                              len(txnPoolNodeSet))))

    for node in disconnected_nodes:
        current_node_set.add(node)
        reconnect_node_and_ensure_connected(looper, current_node_set, node)

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == init_ledger_size

    for node in disconnected_nodes:
        assert node.master_replica.spylog.count(
            Replica._request_pre_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica._request_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_pre_prepare) == 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_prepare) == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    for node in disconnected_nodes:
        assert node.master_replica.spylog.count(
            Replica._request_pre_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica._request_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_pre_prepare) > 0
        assert node.master_replica.spylog.count(
            Replica.process_requested_prepare) > 0

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == (init_ledger_size +
                                          MISSING_REQS_CNT +
                                          REQS_AFTER_RECONNECT_CNT)


def test_node_requests_missing_preprepares_and_prepares(
        looper, txnPoolNodeSet, sdk_wallet_client, sdk_pool_handle, tconf,
        tdir, allPluginsPath):
    """
    2 of 4 nodes go down (simulate this by dropping requests), so the pool
    cannot process any more incoming requests. A new request comes in. After
    a while those 2 nodes come back alive. Another request comes in. Check
    that the previously disconnected two nodes request the missing
    PREPREPARES and PREPARES and the pool successfully handles both
    transactions after that.
    """
    INIT_REQS_CNT = 5
    MISSING_REQS_CNT = 4
    REQS_AFTER_RECONNECT_CNT = 1
    disconnected_nodes = txnPoolNodeSet[2:]
    alive_nodes = txnPoolNodeSet[:2]
    disconnected_nodes_stashers = [n.nodeIbStasher
                                   for n in disconnected_nodes]

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, INIT_REQS_CNT)
    init_ledger_size = txnPoolNodeSet[0].domainLedger.size

    with delay_rules_without_processing(disconnected_nodes_stashers,
                                        delay_3pc()):
        sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                                 MISSING_REQS_CNT)
        last_ordered_key = txnPoolNodeSet[0].master_replica.last_ordered_3pc
        looper.run(eventually(check_pp_out_of_sync,
                              alive_nodes,
                              disconnected_nodes,
                              last_ordered_key,
                              retryWait=1,
                              timeout=expectedPoolGetReadyTimeout(
                                  len(txnPoolNodeSet))))

    for node in txnPoolNodeSet:
        assert node.domainLedger.size == init_ledger_size

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) == 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) == 0
        assert node.master_replica._message_req_service.spylog.count(
            MessageReqService.process_message_rep) == 0

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, REQS_AFTER_RECONNECT_CNT)
    waitNodeDataEquality(looper, disconnected_nodes[0], *txnPoolNodeSet[:-1])

    for node in disconnected_nodes:
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_pre_prepare) > 0
        assert node.master_replica._ordering_service.spylog.count(
            OrderingService._request_prepare) > 0
        assert node.master_replica._message_req_service.spylog.count(
            MessageReqService.process_message_rep) > 0

    def check_all_ordered():
        for node in txnPoolNodeSet:
            assert node.domainLedger.size == (init_ledger_size +
                                              MISSING_REQS_CNT +
                                              REQS_AFTER_RECONNECT_CNT)

    looper.run(eventually(check_all_ordered, timeout=20))