def testNonPrimarySendsAPrePrepare(looper, txnPoolNodeSet, setup, propagated1):
    nonPrimaryReplicas = getNonPrimaryReplicas(txnPoolNodeSet, instId)
    firstNpr = nonPrimaryReplicas[0]
    remainingNpr = nonPrimaryReplicas[1:]

    def sendPrePrepareFromNonPrimary():
        firstNpr.requestQueues[DOMAIN_LEDGER_ID].add(propagated1.key)
        ppReq = firstNpr.create3PCBatch(DOMAIN_LEDGER_ID)
        firstNpr.sendPrePrepare(ppReq)
        return ppReq

    ppr = sendPrePrepareFromNonPrimary()

    def chk():
        for r in remainingNpr:
            recvdPps = recvd_pre_prepares(r)
            assert len(recvdPps) == 1
            assert compareNamedTuple(recvdPps[0], ppr,
                                     f.DIGEST.nm, f.STATE_ROOT.nm,
                                     f.TXN_ROOT.nm)
            nodeSuspicions = len(getNodeSuspicions(
                r.node, Suspicions.PPR_FRM_NON_PRIMARY.code))
            assert nodeSuspicions == 1

    timeout = waits.expectedClientRequestPropagationTime(len(txnPoolNodeSet))
    looper.run(eventually(chk, retryWait=.5, timeout=timeout))

def chk():
    for r in getNonPrimaryReplicas(nodeSet, instId):
        l = len([param for param in getAllArgs(r, r.processPrepare)
                 if param['sender'] == primary.name])
        assert l == 1

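# Hedged note (not from the original source): checkers like `chk` above rely
# on `nodeSet`, `instId` and `primary` being defined in the enclosing test and
# are typically polled with `eventually`, along the lines of
#
#     looper.run(eventually(chk, retryWait=1, timeout=10))
#
# where `looper` is the usual test fixture and the timeout here is arbitrary.
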
def testPrimarySendsAPrepareAndMarkedSuspicious(looper, txnPoolNodeSet,
                                                delay_commits, preprepared1):
    def sendPrepareFromPrimary(instId):
        primary = getPrimaryReplica(txnPoolNodeSet, instId)
        viewNo, ppSeqNo = next(
            iter(primary._ordering_service.sent_preprepares.keys()))
        ppReq = primary._ordering_service.sent_preprepares[viewNo, ppSeqNo]
        primary._ordering_service._do_prepare(ppReq)

        def chk():
            for r in getNonPrimaryReplicas(txnPoolNodeSet, instId):
                l = len([param for param in getAllArgs(
                    r._ordering_service, r._ordering_service.process_prepare)
                    if param['sender'] == primary.name])
                assert l == 1

        looper.run(eventually(chk))

    sendPrepareFromPrimary(0)

    for node in txnPoolNodeSet:
        if node in getNonPrimaryReplicas(txnPoolNodeSet, 0):
            frm, reason, code = getAllArgs(node, TestNode.reportSuspiciousNode)
            assert frm == getPrimaryReplica(txnPoolNodeSet, 0).node.name
            assert isinstance(reason, SuspiciousNode)
            assert len(getNodeSuspicions(node,
                                         Suspicions.PR_FRM_PRIMARY.code)) == 10

def test_primary_recvs_3phase_message_outside_watermarks(perf_chk_patched,
                                                         chkFreqPatched,
                                                         looper,
                                                         txnPoolNodeSet,
                                                         sdk_pool_handle,
                                                         sdk_wallet_client,
                                                         reqs_for_logsize):
    """
    The primary of one instance starts receiving more requests than its log
    size and queues them up since they would go beyond its watermarks. This
    happens because the other nodes are slow in processing its PRE-PREPAREs.
    Eventually this primary sends PRE-PREPAREs for all requests and those
    requests complete
    """
    tconf = perf_chk_patched
    delay = 5
    instId = 0
    reqs_to_send = 2 * reqs_for_logsize + 1
    logger.debug('Will send {} requests'.format(reqs_to_send))

    npr = getNonPrimaryReplicas(txnPoolNodeSet, instId)
    pr = getPrimaryReplica(txnPoolNodeSet, instId)
    from plenum.server.replica import TPCStat
    orderedCount = pr.stats.get(TPCStat.OrderSent)

    for r in npr:
        r.node.nodeIbStasher.delay(ppDelay(delay, instId))
        r.node.nodeIbStasher.delay(pDelay(delay, instId))

    tm_exec_1_batch = waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    batch_count = math.ceil(reqs_to_send / tconf.Max3PCBatchSize)
    total_timeout = (tm_exec_1_batch + delay) * batch_count

    def chk():
        assert orderedCount + batch_count == pr.stats.get(TPCStat.OrderSent)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqs_to_send)
    looper.run(eventually(chk, retryWait=1, timeout=total_timeout))

def testOrderingWhenPrePrepareNotReceived(looper, nodeSet, up, client1,
                                          wallet1):
    """
    Send commits and prepares but delay the pre-prepare such that enough
    prepares and commits are received. The request should not be ordered until
    the pre-prepare is received, and ordering should happen only once.
    """
    nonPrimReps = getNonPrimaryReplicas(nodeSet, 0)
    slowRep = nonPrimReps[0]
    slowNode = slowRep.node
    slowNode.nodeIbStasher.delay(ppDelay(10, 0))
    sendRandomRequest(wallet1, client1)

    stash = []
    origMethod = slowRep.processReqDigest

    def patched(self, msg):
        stash.append(msg)

    patchedMethod = types.MethodType(patched, slowRep)
    slowRep.processReqDigest = patchedMethod

    def chk1():
        assert len(slowRep.commitsWaitingForPrepare) > 0

    looper.run(eventually(chk1, timeout=4))

    for item in stash:
        origMethod(item)

    def chk2():
        assert len(slowRep.commitsWaitingForPrepare) == 0
        assert slowRep.spylog.count(slowRep.doOrder.__name__) == 1

    looper.run(eventually(chk2, timeout=12))

def testOrderingCase1(looper, nodeSet, up, client1, wallet1):
    """
    Scenario -> PRE-PREPARE not received by the replica, request not received
    for ordering by the replica, but enough COMMITs received to start
    ordering. The replica queues up the request so that when a PRE-PREPARE is
    received or the request is received for ordering, an order can be
    triggered
    https://www.pivotaltracker.com/story/show/125239401

    Reproducing by - Pick a node with no primary replica; the replica ignores
    the request forwarded to it, and reception of the PRE-PREPARE is delayed
    sufficiently so that enough COMMITs reach it to trigger ordering.
    """
    delay = 10
    replica = getNonPrimaryReplicas(nodeSet, instId=0)[0]
    delaysPrePrepareProcessing(replica.node, delay=delay, instId=0)

    def doNotProcessReqDigest(self, rd: ReqDigest):
        pass

    patchedMethod = types.MethodType(doNotProcessReqDigest, replica)
    replica.processRequest = patchedMethod

    def chk(n):
        assert replica.spylog.count(replica.doOrder.__name__) == n

    sendRandomRequest(wallet1, client1)
    timeout = delay - 5
    looper.run(eventually(chk, 0, retryWait=1, timeout=timeout))
    timeout = delay + 5
    looper.run(eventually(chk, 1, retryWait=1, timeout=timeout))

def test_checkpoints_removed_on_master_non_primary_replica_after_catchup(
        chkFreqPatched, txnPoolNodeSet, view_setup, clear_checkpoints):
    replica = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1]
    others = set(getAllReplicas(txnPoolNodeSet, 0)) - {replica}
    node = replica.node

    node.master_replica.last_ordered_3pc = (2, 12)

    replica.checkpoints[(6, 10)] = CheckpointState(
        seqNo=10,
        digests=[],
        digest='digest-6-10',
        receivedDigests={r.name: 'digest-6-10' for r in others},
        isStable=True)

    replica.checkpoints[(11, 15)] = CheckpointState(
        seqNo=12,
        digests=['digest-11', 'digest-12'],
        digest=None,
        receivedDigests={},
        isStable=False)

    replica.stashedRecvdCheckpoints[2] = {}

    replica.stashedRecvdCheckpoints[2][(11, 15)] = {}
    for r in others:
        replica.stashedRecvdCheckpoints[2][(11, 15)][r.name] = \
            Checkpoint(instId=0, viewNo=2, seqNoStart=11, seqNoEnd=15,
                       digest='digest-11-15')

    replica.stashedRecvdCheckpoints[2][(16, 20)] = {}
    for r in others:
        replica.stashedRecvdCheckpoints[2][(16, 20)][r.name] = \
            Checkpoint(instId=0, viewNo=2, seqNoStart=16, seqNoEnd=20,
                       digest='digest-16-20')

    replica.stashedRecvdCheckpoints[2][(21, 25)] = {}
    replica.stashedRecvdCheckpoints[2][(21, 25)][next(iter(others)).name] = \
        Checkpoint(instId=0, viewNo=2, seqNoStart=21, seqNoEnd=25,
                   digest='digest-21-25')

    # Simulate catch-up completion
    node.ledgerManager.last_caught_up_3PC = (2, 20)
    node.allLedgersCaughtUp()

    assert len(replica.checkpoints) == 0

    assert len(replica.stashedRecvdCheckpoints) == 1
    assert 2 in replica.stashedRecvdCheckpoints
    assert len(replica.stashedRecvdCheckpoints[2]) == 1
    assert (21, 25) in replica.stashedRecvdCheckpoints[2]
    assert len(replica.stashedRecvdCheckpoints[2][(21, 25)]) == 1

def test_slow_node_has_warn_unordered_log_msg(looper,
                                              txnPoolNodeSet,
                                              wallet1, client1,
                                              patch_monitors):
    npr = getNonPrimaryReplicas(txnPoolNodeSet, 0)[0]
    slow_node = npr.node
    monitor = txnPoolNodeSet[0].monitor
    delay = monitor.WARN_NOT_PARTICIPATING_MIN_DIFF_SEC * \
        monitor.WARN_NOT_PARTICIPATING_UNORDERED_NUM + 10
    delaysCommitProcessing(slow_node, delay=delay)

    assert no_any_warn(*txnPoolNodeSet), \
        'no node has the warning before the test'

    for i in range(monitor.WARN_NOT_PARTICIPATING_UNORDERED_NUM):
        req = sendRandomRequest(wallet1, client1)
        waitForSufficientRepliesForRequests(looper, client1, requests=[req])
        looper.runFor(monitor.WARN_NOT_PARTICIPATING_MIN_DIFF_SEC)

    others = [node for node in txnPoolNodeSet if node.name != slow_node.name]
    assert no_any_warn(*others), \
        'the other nodes do not have the warning after the test'
    assert has_some_warn(slow_node), \
        'the slow node has the warning'

    ordered_requests_keys_len_before = len(monitor.ordered_requests_keys)
    # wait at least the window time
    looper.runFor(monitor.WARN_NOT_PARTICIPATING_WINDOW_MINS * 60)
    req = sendRandomRequest(wallet1, client1)
    waitForSufficientRepliesForRequests(looper, client1, requests=[req])

    assert no_any_warn(*others), 'the other nodes do not have the warning'
    assert no_last_warn(slow_node), \
        'the last call of warn_has_lot_unordered_requests returned False, ' \
        'so the slow node does not have the warning now'
    assert len(monitor.ordered_requests_keys) < ordered_requests_keys_len_before, \
        "ordered_requests_keys was cleaned up"

def test_commits_recvd_first(looper, txnPoolNodeSet, sdk_wallet_client,
                             sdk_pool_handle):
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 50
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(delay, 0))

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=20, num_batches=4)

    assert not slow_node.master_replica.prePrepares
    assert not slow_node.master_replica.prepares
    assert not slow_node.master_replica.commits
    assert len(slow_node.master_replica.commitsWaitingForPrepare) > 0

    slow_node.reset_delays_and_process_delayeds()
    waitNodeDataEquality(looper, slow_node, *other_nodes)
    assert check_if_all_equal_in_list([n.master_replica.ordered
                                       for n in txnPoolNodeSet])

    assert slow_node.master_replica.prePrepares
    assert slow_node.master_replica.prepares
    assert slow_node.master_replica.commits
    assert not slow_node.master_replica.commitsWaitingForPrepare

def test_forced_upgrade_handled_once_if_ordered_and_then_request_received(
        looper, nodeSet, sdk_pool_handle, sdk_wallet_trustee,
        validUpgradeExpForceTrue):
    """
    Verifies that a POOL_UPGRADE force=true request is handled only once when
    the node commits the transaction to the ledger and only after that
    receives the request directly from the client
    """
    slow_node = getNonPrimaryReplicas(nodeSet, instId=0)[-1].node
    slow_node.clientIbStasher.delay(req_delay())

    sdk_ensure_upgrade_sent(looper, sdk_pool_handle, sdk_wallet_trustee,
                            validUpgradeExpForceTrue)
    looper.run(
        eventually(checkUpgradeScheduled,
                   [slow_node],
                   validUpgradeExpForceTrue[VERSION],
                   retryWait=1,
                   timeout=waits.expectedUpgradeScheduled()))

    slow_node.clientIbStasher.reset_delays_and_process_delayeds()
    looper.runFor(waits.expectedUpgradeScheduled())

    checkUpgradeScheduled([slow_node], validUpgradeExpForceTrue[VERSION])
    assert len(list(slow_node.upgrader._actionLog)) == 1
    assert slow_node.upgrader._actionLog.lastEvent[1] == \
        UpgradeLog.SCHEDULED

def testOrderingCase1(looper, txnPoolNodeSet, sdk_wallet_client,
                      sdk_pool_handle):
    """
    Scenario -> PRE-PREPARE not received by the replica, request not received
    for ordering by the replica, but enough COMMITs received to start
    ordering. The replica queues up the request so that when a PRE-PREPARE is
    received or the request is received for ordering, an order can be
    triggered
    https://www.pivotaltracker.com/story/show/125239401

    Reproducing by - Pick a node with no primary replica; the replica ignores
    the request forwarded to it, and reception of the PRE-PREPARE is delayed
    sufficiently so that enough COMMITs reach it to trigger ordering.
    """
    delay = 10
    replica = getNonPrimaryReplicas(txnPoolNodeSet, instId=0)[0]
    delaysPrePrepareProcessing(replica.node, delay=delay, instId=0)

    def doNotProcessReqDigest(self, _):
        pass

    patchedMethod = types.MethodType(doNotProcessReqDigest, replica)
    replica.processRequest = patchedMethod

    def chk(n):
        assert replica.spylog.count(replica.doOrder.__name__) == n

    sdk_send_random_request(looper, sdk_pool_handle, sdk_wallet_client)
    timeout = delay - 5
    looper.run(eventually(chk, 0, retryWait=1, timeout=timeout))
    timeout = delay + 5
    looper.run(eventually(chk, 1, retryWait=1, timeout=timeout))

def split_nodes(nodes):
    primary_node = get_master_primary_node(nodes)
    slow_node = getNonPrimaryReplicas(nodes, 0)[-1].node
    other_nodes = [n for n in nodes if n != slow_node]
    other_non_primary_nodes = [n for n in nodes
                               if n not in (slow_node, primary_node)]
    return slow_node, other_nodes, primary_node, other_non_primary_nodes

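# Hedged usage sketch for `split_nodes` above (not from the original source):
# a test unpacks the tuple in the order it is returned; the `txnPoolNodeSet`
# fixture name is assumed from the other tests in this section.
def example_split_nodes_usage(txnPoolNodeSet):
    slow_node, other_nodes, primary_node, other_non_primary_nodes = \
        split_nodes(txnPoolNodeSet)
    assert primary_node in other_nodes  # only the slow node is excluded here
    assert slow_node not in other_non_primary_nodes
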
def test_node_handles_forced_upgrade_on_propagate(looper, nodeSet,
                                                  sdk_pool_handle,
                                                  sdk_wallet_trustee,
                                                  validUpgradeExpForceTrue):
    """
    Verifies that a POOL_UPGRADE force=true request is handled immediately
    when the node receives it in a PROPAGATE from any other node
    """
    slow_node = getNonPrimaryReplicas(nodeSet, instId=0)[-1].node

    # Stash all except PROPAGATEs from Gamma
    slow_node.clientIbStasher.delay(req_delay())
    slow_node.nodeIbStasher.delay(ppgDelay(sender_filter='Alpha'))
    slow_node.nodeIbStasher.delay(ppgDelay(sender_filter='Beta'))
    slow_node.nodeIbStasher.delay(ppDelay())
    slow_node.nodeIbStasher.delay(pDelay())
    slow_node.nodeIbStasher.delay(cDelay())

    sdk_send_upgrade(looper, sdk_pool_handle, sdk_wallet_trustee,
                     validUpgradeExpForceTrue)

    looper.run(
        eventually(checkUpgradeScheduled,
                   [slow_node],
                   validUpgradeExpForceTrue[VERSION],
                   retryWait=1,
                   timeout=waits.expectedUpgradeScheduled()))

def test_commits_recvd_first(looper, txnPoolNodeSet, sdk_wallet_client,
                             sdk_pool_handle):
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 50
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(delay, 0))

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=20, num_batches=4)

    assert not slow_node.master_replica._ordering_service.prePrepares
    assert not slow_node.master_replica._ordering_service.prepares
    assert not slow_node.master_replica._ordering_service.commits
    assert len(slow_node.master_replica._ordering_service.
               commitsWaitingForPrepare) > 0

    slow_node.reset_delays_and_process_delayeds()
    waitNodeDataEquality(looper, slow_node, *other_nodes)
    assert check_if_all_equal_in_list(
        [n.master_replica._ordering_service.ordered for n in txnPoolNodeSet])

    assert slow_node.master_replica._ordering_service.prePrepares
    assert slow_node.master_replica._ordering_service.prepares
    assert slow_node.master_replica._ordering_service.commits
    assert not slow_node.master_replica._ordering_service.commitsWaitingForPrepare

def test_setup_last_ordered_for_non_master_without_catchup(txnPoolNodeSet,
                                                           sdk_wallet_client):
    inst_id = 1
    last_ordered_3pc = (0, 12)
    timestamp = time.time()
    ppSeqNo = 16
    replica = getNonPrimaryReplicas(txnPoolNodeSet, inst_id)[-1]
    replica.last_ordered_3pc = last_ordered_3pc
    replica.preparesWaitingForPrePrepare.clear()
    replica.prePreparesPendingPrevPP.clear()
    preprepare, prepare = \
        _create_prepare_and_preprepare(inst_id, replica.viewNo, ppSeqNo,
                                       timestamp, sdk_wallet_client)

    replica.prePreparesPendingPrevPP[replica.viewNo, ppSeqNo] = deque()
    replica.prePreparesPendingPrevPP[replica.viewNo, ppSeqNo] \
        .append((preprepare, replica.primaryName))
    replica.preparesWaitingForPrePrepare[replica.viewNo, ppSeqNo] = deque()
    for node in txnPoolNodeSet:
        replica.preparesWaitingForPrePrepare[replica.viewNo, ppSeqNo] \
            .append((prepare, node.name))

    replica._setup_last_ordered_for_non_master()
    assert replica.last_ordered_3pc == last_ordered_3pc

def test_delay_commits_for_one_node(looper, txnPoolNodeSet,
                                    sdk_pool_handle,
                                    sdk_wallet_client,
                                    slow_node_is_next_primary,
                                    vc_counts):
    current_view_no = checkViewNoForNodes(txnPoolNodeSet)
    expected_view_no = current_view_no + 1 if vc_counts == 'once' \
        else current_view_no + 2
    next_primary = get_next_primary_name(txnPoolNodeSet, expected_view_no)
    pretenders = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet)
                  if not r.isPrimary]
    if slow_node_is_next_primary:
        delayed_node = [n for n in pretenders if n.name == next_primary][0]
    else:
        delayed_node = [n for n in pretenders if n.name != next_primary][0]

    with delay_rules_without_processing(delayed_node.nodeIbStasher, cDelay()):
        sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, 2)
        trigger_view_change(txnPoolNodeSet, expected_view_no)
        if vc_counts == 'twice':
            trigger_view_change(txnPoolNodeSet, expected_view_no)

    ensureElectionsDone(looper, txnPoolNodeSet, customTimeout=30)
    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

def test_primary_recvs_3phase_message_outside_watermarks(perf_chk_patched,
                                                         chkFreqPatched,
                                                         looper,
                                                         txnPoolNodeSet,
                                                         sdk_pool_handle,
                                                         sdk_wallet_client,
                                                         reqs_for_logsize):
    """
    The primary of one instance starts receiving more requests than its log
    size and queues them up since they would go beyond its watermarks. This
    happens because the other nodes are slow in processing its PRE-PREPAREs.
    Eventually this primary sends PRE-PREPAREs for all requests and those
    requests complete
    """
    tconf = perf_chk_patched
    delay = 2
    instId = 0
    reqs_to_send = 2 * reqs_for_logsize + 1
    logger.debug('Will send {} requests'.format(reqs_to_send))

    npr = getNonPrimaryReplicas(txnPoolNodeSet, instId)
    pr = getPrimaryReplica(txnPoolNodeSet, instId)
    orderedCount = pr.stats.get(TPCStat.OrderSent)

    for r in npr:
        r.node.nodeIbStasher.delay(ppDelay(delay, instId))
        r.node.nodeIbStasher.delay(pDelay(delay, instId))

    tm_exec_1_batch = waits.expectedTransactionExecutionTime(len(txnPoolNodeSet))
    batch_count = math.ceil(reqs_to_send / tconf.Max3PCBatchSize)
    total_timeout = (tm_exec_1_batch + delay) * batch_count

    def chk():
        assert orderedCount + batch_count == pr.stats.get(TPCStat.OrderSent)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, reqs_to_send)
    looper.run(eventually(chk, retryWait=1, timeout=total_timeout))

def testNonPrimarySendsAPrePrepare(looper, nodeSet, setup, propagated1):
    primaryReplica = getPrimaryReplica(nodeSet, instId)
    nonPrimaryReplicas = getNonPrimaryReplicas(nodeSet, instId)
    firstNpr = nonPrimaryReplicas[0]
    remainingNpr = nonPrimaryReplicas[1:]

    def sendPrePrepareFromNonPrimary(replica):
        firstNpr.doPrePrepare(propagated1.reqDigest)
        return PrePrepare(replica.instId,
                          firstNpr.viewNo,
                          firstNpr.lastPrePrepareSeqNo,
                          propagated1.identifier,
                          propagated1.reqId,
                          propagated1.digest,
                          time.time())

    ppr = sendPrePrepareFromNonPrimary(firstNpr)

    def chk():
        for r in (primaryReplica, *remainingNpr):
            recvdPps = recvdPrePrepare(r)
            assert len(recvdPps) == 1
            assert recvdPps[0]['pp'][:-1] == ppr[:-1]
            nodeSuspicions = len(getNodeSuspicions(
                r.node, Suspicions.PPR_FRM_NON_PRIMARY.code))
            assert nodeSuspicions == 1

    looper.run(eventually(chk, retryWait=.5, timeout=5))

def setup(tconf, looper, txnPoolNodeSet, client, wallet1):
    # Patch the method that creates 3-phase batches so it sends an incorrect
    # digest exactly once
    pr, otherR = getPrimaryReplica(txnPoolNodeSet, instId=0), \
        getNonPrimaryReplicas(txnPoolNodeSet, instId=0)
    reqs = sendRandomRequests(wallet1, client, tconf.Max3PCBatchSize)
    waitForSufficientRepliesForRequests(looper, client, requests=reqs,
                                        customTimeoutPerReq=tconf.Max3PCBatchWait)
    stateRoot = pr.stateRootHash(DOMAIN_LEDGER_ID, to_str=False)

    origMethod = pr.create3PCBatch
    malignedOnce = None

    def badMethod(self, ledgerId):
        nonlocal malignedOnce
        pp = origMethod(ledgerId)
        if not malignedOnce:
            pp = updateNamedTuple(pp, digest=pp.digest + '123')
            malignedOnce = True
        return pp

    pr.create3PCBatch = types.MethodType(badMethod, pr)
    sendRandomRequests(wallet1, client, tconf.Max3PCBatchSize)
    return pr, otherR, stateRoot

def testElectionsAfterViewChange(delayedPerf, looper: Looper,
                                 nodeSet: TestNodeSet, up, wallet1, client1):
    """
    Test that a primary election does happen after a view change
    """

    # Delay processing of PRE-PREPARE from all non primary replicas of master
    # so master's throughput falls and view changes
    nonPrimReps = getNonPrimaryReplicas(nodeSet, 0)
    for r in nonPrimReps:
        r.node.nodeIbStasher.delay(ppDelay(10, 0))

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, 4)

    # Ensure view change happened for both node and its primary elector
    for node in nodeSet:
        looper.run(eventually(partial(checkViewChangeInitiatedForNode, node, 1),
                              retryWait=1, timeout=20))

    # Ensure elections are done again and pool is setup again with appropriate
    # protocol instances and each protocol instance is setup properly too
    checkProtocolInstanceSetup(looper, nodeSet, retryWait=1, timeout=30)

def test_setup_last_ordered_for_non_master_without_quorum_of_prepares(
        txnPoolNodeSet, sdk_wallet_client):
    inst_id = 1
    replica = getNonPrimaryReplicas(txnPoolNodeSet, inst_id)[-1]
    replica._ordering_service.preparesWaitingForPrePrepare.clear()
    replica._ordering_service.prePreparesPendingPrevPP.clear()
    replica.last_ordered_3pc = (0, 0)
    timestamp = time.time()
    ppSeqNo = 5
    preprepare, prepare = \
        _create_prepare_and_preprepare(inst_id, replica.viewNo, ppSeqNo,
                                       timestamp, sdk_wallet_client)

    replica._ordering_service.prePreparesPendingPrevPP[
        replica.viewNo, ppSeqNo] = deque()
    replica._ordering_service.prePreparesPendingPrevPP[replica.viewNo, ppSeqNo] \
        .append((preprepare, replica.primaryName))
    replica._ordering_service.preparesWaitingForPrePrepare[
        replica.viewNo, ppSeqNo] = deque()
    replica._ordering_service.preparesWaitingForPrePrepare[replica.viewNo, ppSeqNo] \
        .append((prepare, txnPoolNodeSet[-1].name))

    replica._ordering_service.l_setup_last_ordered_for_non_master()
    assert replica.last_ordered_3pc == (0, 0)

def testNonPrimarySendsAPrePrepare(looper, nodeSet, setup, propagated1):
    nonPrimaryReplicas = getNonPrimaryReplicas(nodeSet, instId)
    firstNpr = nonPrimaryReplicas[0]
    remainingNpr = nonPrimaryReplicas[1:]

    def sendPrePrepareFromNonPrimary():
        firstNpr.requestQueues[DOMAIN_LEDGER_ID].add(propagated1.key)
        ppReq = firstNpr.create3PCBatch(DOMAIN_LEDGER_ID)
        firstNpr.sendPrePrepare(ppReq)
        return ppReq

    ppr = sendPrePrepareFromNonPrimary()

    def chk():
        for r in remainingNpr:
            recvdPps = recvd_pre_prepares(r)
            assert len(recvdPps) == 1
            assert compareNamedTuple(recvdPps[0], ppr,
                                     f.DIGEST.nm, f.STATE_ROOT.nm,
                                     f.TXN_ROOT.nm)
            nodeSuspicions = len(getNodeSuspicions(
                r.node, Suspicions.PPR_FRM_NON_PRIMARY.code))
            assert nodeSuspicions == 1

    timeout = waits.expectedClientRequestPropagationTime(len(nodeSet))
    looper.run(eventually(chk, retryWait=.5, timeout=timeout))

def testPrimarySendsAPrepareAndMarkedSuspicious(looper, nodeSet, delay_commits,
                                                preprepared1):
    def sendPrepareFromPrimary(instId):
        primary = getPrimaryReplica(nodeSet, instId)
        viewNo, ppSeqNo = next(iter(primary.sentPrePrepares.keys()))
        ppReq = primary.sentPrePrepares[viewNo, ppSeqNo]
        prepare = Prepare(instId,
                          viewNo,
                          ppSeqNo,
                          ppReq.digest,
                          ppReq.stateRootHash,
                          ppReq.txnRootHash)
        primary.doPrepare(prepare)

        def chk():
            for r in getNonPrimaryReplicas(nodeSet, instId):
                l = len([param for param in getAllArgs(r, r.processPrepare)
                         if param['sender'] == primary.name])
                assert l == 1

        looper.run(eventually(chk))

    sendPrepareFromPrimary(0)

    for node in nodeSet:
        if node in getNonPrimaryReplicas(nodeSet, 0):
            frm, reason, code = getAllArgs(node, TestNode.reportSuspiciousNode)
            assert frm == getPrimaryReplica(nodeSet, 0).node.name
            assert isinstance(reason, SuspiciousNode)
            assert len(getNodeSuspicions(node,
                                         Suspicions.PR_FRM_PRIMARY.code)) == 10

def setup(txnPoolNodeSet):
    primaryRep, nonPrimaryReps = getPrimaryReplica(txnPoolNodeSet, 0), \
        getNonPrimaryReplicas(txnPoolNodeSet, 0)
    faultyRep = nonPrimaryReps[0]
    makeNodeFaulty(faultyRep.node, partial(sendDuplicate3PhaseMsg,
                                           msgType=Commit, count=3, instId=0))

    # The node of the primary replica above should not be blacklisted by any
    # other node since we are simulating multiple COMMIT messages and
    # want to check for a particular suspicion
    whitelistNode(faultyRep.node.name,
                  [node for node in txnPoolNodeSet if node != faultyRep.node],
                  Suspicions.DUPLICATE_CM_SENT.code)

    # If the request is ordered then COMMIT will be rejected much earlier
    for r in [primaryRep, *nonPrimaryReps]:
        def do_nothing(self, commit):
            pass

        r.doOrder = types.MethodType(do_nothing, r)

    return adict(primaryRep=primaryRep, nonPrimaryReps=nonPrimaryReps,
                 faultyRep=faultyRep)

def test_setup_last_ordered_for_non_master_without_catchup(
        txnPoolNodeSet, sdk_wallet_client):
    inst_id = 1
    last_ordered_3pc = (0, 12)
    timestamp = time.time()
    ppSeqNo = 16
    replica = getNonPrimaryReplicas(txnPoolNodeSet, inst_id)[-1]
    replica.last_ordered_3pc = last_ordered_3pc
    replica.preparesWaitingForPrePrepare.clear()
    replica.prePreparesPendingPrevPP.clear()
    preprepare, prepare = \
        _create_prepare_and_preprepare(inst_id, replica.viewNo, ppSeqNo,
                                       timestamp, sdk_wallet_client)

    replica.prePreparesPendingPrevPP[replica.viewNo, ppSeqNo] = deque()
    replica.prePreparesPendingPrevPP[replica.viewNo, ppSeqNo] \
        .append((preprepare, replica.primaryName))
    replica.preparesWaitingForPrePrepare[replica.viewNo, ppSeqNo] = deque()
    for node in txnPoolNodeSet:
        replica.preparesWaitingForPrePrepare[replica.viewNo, ppSeqNo] \
            .append((prepare, node.name))

    replica._setup_last_ordered_for_non_master()
    assert replica.last_ordered_3pc == last_ordered_3pc

def setup(txnPoolNodeSet):
    primaryRep, nonPrimaryReps = getPrimaryReplica(txnPoolNodeSet, 0), \
        getNonPrimaryReplicas(txnPoolNodeSet, 0)
    faultyRep = nonPrimaryReps[0]
    makeNodeFaulty(faultyRep.node, partial(sendDuplicate3PhaseMsg,
                                           msgType=Commit, count=3, instId=0))

    # The node of the primary replica above should not be blacklisted by any
    # other node since we are simulating multiple COMMIT messages and
    # want to check for a particular suspicion
    whitelistNode(faultyRep.node.name,
                  [node for node in txnPoolNodeSet if node != faultyRep.node],
                  Suspicions.DUPLICATE_CM_SENT.code)

    # If the request is ordered then COMMIT will be rejected much earlier
    for r in [primaryRep, *nonPrimaryReps]:
        def do_nothing(self, commit):
            pass

        r.doOrder = types.MethodType(do_nothing, r)

    return adict(primaryRep=primaryRep, nonPrimaryReps=nonPrimaryReps,
                 faultyRep=faultyRep)

def testPrePrepareWithHighSeqNo(looper, txnPoolNodeSet, propagated1):
    def chk():
        for r in getNonPrimaryReplicas(txnPoolNodeSet, instId):
            nodeSuspicions = len(getNodeSuspicions(
                r.node, Suspicions.WRONG_PPSEQ_NO.code))
            assert nodeSuspicions == 1

    def checkPreprepare(replica, viewNo, ppSeqNo, req, numOfPrePrepares):
        assert (replica.prePrepares[viewNo, ppSeqNo][0]) == \
            (req.identifier, req.reqId, req.digest)

    primary = getPrimaryReplica(txnPoolNodeSet, instId)
    nonPrimaryReplicas = getNonPrimaryReplicas(txnPoolNodeSet, instId)
    req = propagated1.reqDigest
    primary.doPrePrepare(req)
    timeout = waits.expectedPrePrepareTime(len(txnPoolNodeSet))
    for np in nonPrimaryReplicas:
        looper.run(
            eventually(checkPreprepare, np, primary.viewNo,
                       primary.lastPrePrepareSeqNo - 1, req, 1,
                       retryWait=.5, timeout=timeout))

    newReqDigest = (req.identifier, req.reqId + 1, req.digest)
    incorrectPrePrepareReq = PrePrepare(instId,
                                        primary.viewNo,
                                        primary.lastPrePrepareSeqNo + 2,
                                        *newReqDigest,
                                        get_utc_epoch())
    primary.send(incorrectPrePrepareReq, TPCStat.PrePrepareSent)

    timeout = waits.expectedPrePrepareTime(len(txnPoolNodeSet))
    looper.run(eventually(chk, retryWait=1, timeout=timeout))

def testPrePrepareWithHighSeqNo(looper, nodeSet, propagated1):
    def chk():
        for r in getNonPrimaryReplicas(nodeSet, instId):
            nodeSuspicions = len(getNodeSuspicions(
                r.node, Suspicions.WRONG_PPSEQ_NO.code))
            assert nodeSuspicions == 1

    def checkPreprepare(replica, viewNo, ppSeqNo, req, numOfPrePrepares):
        assert (replica.prePrepares[viewNo, ppSeqNo][0]) == \
            (req.identifier, req.reqId, req.digest)

    primary = getPrimaryReplica(nodeSet, instId)
    nonPrimaryReplicas = getNonPrimaryReplicas(nodeSet, instId)
    req = propagated1.reqDigest
    primary.doPrePrepare(req)
    for np in nonPrimaryReplicas:
        looper.run(
            eventually(checkPreprepare, np, primary.viewNo,
                       primary.lastPrePrepareSeqNo - 1, req, 1,
                       retryWait=0.5, timeout=10))

    newReqDigest = ReqDigest(req.identifier, req.reqId + 1, req.digest)
    incorrectPrePrepareReq = PrePrepare(instId,
                                        primary.viewNo,
                                        primary.lastPrePrepareSeqNo + 2,
                                        *newReqDigest,
                                        time.time())
    primary.send(incorrectPrePrepareReq, TPCStat.PrePrepareSent)
    looper.run(eventually(chk, retryWait=1, timeout=50))

def test_non_primary_accepts_pre_prepare_time(looper, txnPoolNodeSet,
                                              sdk_wallet_client,
                                              sdk_pool_handle):
    """
    One of the non-primary replicas has an incorrect clock, so it thinks the
    PRE-PREPARE has an incorrect time
    """
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, count=2)
    # send_reqs_to_nodes_and_verify_all_replies(looper, wallet1, client1, 2)

    # The replica having the bad clock
    confused_npr = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1]

    make_clock_faulty(confused_npr.node)

    old_acceptable_rvs = getAllReturnVals(
        confused_npr, confused_npr.is_pre_prepare_time_acceptable)
    old_susp_count = get_timestamp_suspicion_count(confused_npr.node)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, count=2)

    assert get_timestamp_suspicion_count(confused_npr.node) > old_susp_count

    new_acceptable_rvs = getAllReturnVals(
        confused_npr, confused_npr.is_pre_prepare_time_acceptable)

    # `is_pre_prepare_time_acceptable` first returned False and then True
    assert [True, False, *old_acceptable_rvs] == new_acceptable_rvs

def setup(tconf, looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client):
    # Patch the method that creates 3-phase batches so it sends an incorrect
    # digest exactly once
    pr, otherR = getPrimaryReplica(txnPoolNodeSet, instId=0), \
        getNonPrimaryReplicas(txnPoolNodeSet, instId=0)
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, tconf.Max3PCBatchSize)
    stateRoot = pr._ordering_service.get_state_root_hash(DOMAIN_LEDGER_ID,
                                                         to_str=False)

    origMethod = pr._ordering_service.create_3pc_batch
    malignedOnce = None

    def badMethod(self, ledgerId):
        nonlocal malignedOnce
        pp = origMethod(ledgerId)
        if not malignedOnce:
            pp = updateNamedTuple(pp, digest=pp.digest + '123')
            malignedOnce = True
        return pp

    pr._ordering_service.create_3pc_batch = types.MethodType(
        badMethod, pr._ordering_service)
    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client,
                             tconf.Max3PCBatchSize)
    return pr, otherR, stateRoot

def testReplicasRejectSamePrePrepareMsg(looper, nodeSet, client1, wallet1):
    """
    Replicas should not accept a PRE-PREPARE for view "v" and prepare sequence
    number "n" if they have already accepted a request with view number "v"
    and sequence number "n"
    """
    numOfNodes = 4
    fValue = getMaxFailures(numOfNodes)
    request1 = sendRandomRequest(wallet1, client1)
    result1 = looper.run(
        eventually(checkSufficientRepliesRecvd, client1.inBox,
                   request1.reqId, fValue, retryWait=1, timeout=5))
    logger.debug("request {} gives result {}".format(request1, result1))
    primaryRepl = getPrimaryReplica(nodeSet)
    logger.debug("Primary Replica: {}".format(primaryRepl))
    logger.debug("Decrementing the primary replica's pre-prepare sequence "
                 "number by one...")
    primaryRepl.lastPrePrepareSeqNo -= 1
    request2 = sendRandomRequest(wallet1, client1)
    looper.run(eventually(checkPrePrepareReqSent, primaryRepl, request2,
                          retryWait=1, timeout=10))

    nonPrimaryReplicas = getNonPrimaryReplicas(nodeSet)
    logger.debug("Non Primary Replicas: " + str(nonPrimaryReplicas))
    prePrepareReq = PrePrepare(primaryRepl.instId,
                               primaryRepl.viewNo,
                               primaryRepl.lastPrePrepareSeqNo,
                               wallet1.defaultId,
                               request2.reqId,
                               request2.digest,
                               time.time())

    logger.debug("Checking whether all the non primary replicas have received "
                 "the pre-prepare request with the same sequence number")
    looper.run(eventually(checkPrePrepareReqRecvd, nonPrimaryReplicas,
                          prePrepareReq, retryWait=1, timeout=10))

    logger.debug("Checking that none of the non primary replicas sent a "
                 "prepare message in response to the pre-prepare message")
    for npr in nonPrimaryReplicas:
        with pytest.raises(AssertionError):
            looper.run(eventually(checkPrepareReqSent, npr, wallet1.defaultId,
                                  request2.reqId, retryWait=1, timeout=10))

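# Hedged, standalone illustration (not plenum's implementation) of the rule
# the test above exercises: a replica accepts at most one PRE-PREPARE per
# (viewNo, ppSeqNo) pair and rejects any repeat for the same pair.
def example_pre_prepare_rejection_rule():
    accepted = {}  # (viewNo, ppSeqNo) -> digest of the accepted PRE-PREPARE

    def accept_pre_prepare(view_no, pp_seq_no, digest):
        key = (view_no, pp_seq_no)
        if key in accepted:
            return False  # a PRE-PREPARE with this (v, n) was already accepted
        accepted[key] = digest
        return True

    assert accept_pre_prepare(0, 1, 'digest-a') is True
    assert accept_pre_prepare(0, 1, 'digest-b') is False
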
def test_caught_up_for_current_view_check(looper, txnPoolNodeSet, client1,
                                          wallet1, client1Connected):
    """
    One of the nodes experiences a poor network connection and loses 3PC
    messages. It has to do multiple rounds of catchup to be caught up
    """
    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        3 * Max3PCBatchSize)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    nprs = getNonPrimaryReplicas(txnPoolNodeSet, 0)
    bad_node = nprs[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != bad_node]
    orig_method = bad_node.master_replica.dispatchThreePhaseMsg

    # Bad node does not process any 3 phase messages, equivalent to messages
    # being lost
    def bad_method(self, m, s):
        pass

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        bad_method, bad_node.master_replica)

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1,
                                        6 * Max3PCBatchSize)
    waitNodeDataInequality(looper, bad_node, *other_nodes)

    # Patch all nodes to return ConsistencyProof of a smaller ledger to the
    # bad node but only once, so that the bad_node needs to do catchup again.
    make_a_node_catchup_twice(bad_node, other_nodes, DOMAIN_LEDGER_ID,
                              Max3PCBatchSize)

    def is_catchup_needed_count():
        return len(getAllReturnVals(bad_node, bad_node.is_catchup_needed,
                                    compare_val_to=True))

    def caught_up_for_current_view_count():
        return len(getAllReturnVals(bad_node,
                                    bad_node.caught_up_for_current_view,
                                    compare_val_to=True))

    old_count_1 = is_catchup_needed_count()
    old_count_2 = caught_up_for_current_view_count()
    ensure_view_change(looper, txnPoolNodeSet)
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)
    ensure_all_nodes_have_same_data(looper, nodes=txnPoolNodeSet)

    assert is_catchup_needed_count() > old_count_1
    # The bad_node caught up due to receiving sufficient ViewChangeDone
    # messages
    assert caught_up_for_current_view_count() > old_count_2

    bad_node.master_replica.dispatchThreePhaseMsg = types.MethodType(
        orig_method, bad_node.master_replica)

def setup(request, looper, txnPoolNodeSet, client1, wallet1, client1Connected):
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    slow_node.nodeIbStasher.delay(pDelay(100, 0))
    slow_node.nodeIbStasher.delay(cDelay(100, 0))
    if request.param == 'all':
        slow_node.nodeIbStasher.delay(ppDelay(100, 0))
    return slow_node, fast_nodes

def split_nodes(nodes):
    primary_node = get_master_primary_node(nodes)
    slow_node = getNonPrimaryReplicas(nodes, 0)[-1].node
    other_nodes = [n for n in nodes if n != slow_node]
    other_non_primary_nodes = [n for n in nodes
                               if n not in (slow_node, primary_node)]
    return slow_node, other_nodes, primary_node, other_non_primary_nodes

def testOrderingCase2(looper, txnPoolNodeSet, sdk_pool_handle,
                      sdk_wallet_client):
    """
    Scenario -> A client sends requests, and some nodes delay COMMITs to a few
    specific nodes such that those nodes achieve the commit quorum for those
    requests later than the other nodes. Still, all nodes ORDER the requests
    in the same order of ppSeqNos
    https://www.pivotaltracker.com/n/projects/1889887/stories/133655009
    """
    pr, replicas = getPrimaryReplica(txnPoolNodeSet, instId=0), \
        getNonPrimaryReplicas(txnPoolNodeSet, instId=0)
    assert len(replicas) == 6

    rep0 = pr
    rep1 = replicas[0]
    rep2 = replicas[1]
    rep3 = replicas[2]
    rep4 = replicas[3]
    rep5 = replicas[4]
    rep6 = replicas[5]

    node0 = rep0.node
    node1 = rep1.node
    node2 = rep2.node
    node3 = rep3.node
    node4 = rep4.node
    node5 = rep5.node
    node6 = rep6.node

    ppSeqsToDelay = 5
    commitDelay = 3  # delay each COMMIT by this number of seconds
    delayedPpSeqNos = set()

    requestCount = 10

    def specificCommits(wrappedMsg):
        nonlocal node3, node4, node5
        msg, sender = wrappedMsg
        if isinstance(msg, PrePrepare):
            if len(delayedPpSeqNos) < ppSeqsToDelay:
                delayedPpSeqNos.add(msg.ppSeqNo)
                logger.debug('ppSeqNo {} will be delayed'.format(msg.ppSeqNo))
        if isinstance(msg, Commit) and msg.instId == 0 and \
                sender in (n.name for n in (node3, node4, node5)) and \
                msg.ppSeqNo in delayedPpSeqNos:
            return commitDelay

    for node in (node1, node2):
        logger.debug('{} would be delaying commits'.format(node))
        node.nodeIbStasher.delay(specificCommits)

    sdk_reqs = sdk_send_random_requests(looper, sdk_pool_handle,
                                        sdk_wallet_client, requestCount)

    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet))

    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet,
                                    custom_timeout=timeout)
    sdk_get_and_check_replies(looper, sdk_reqs)

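# Hedged generalisation (not from the original source) of the stasher-delay
# contract that `specificCommits` above follows: the callable receives a
# (message, sender) pair and returns a delay in seconds for messages it wants
# to hold back, and None (implicitly) for everything else.
def delay_commits_from(sender_names, seconds=3):
    def predicate(wrapped_msg):
        msg, sender = wrapped_msg
        if isinstance(msg, Commit) and sender in sender_names:
            return seconds
    return predicate
# e.g. some_node.nodeIbStasher.delay(delay_commits_from({'Gamma', 'Delta'}))
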
def test_checkpoints_removed_on_backup_non_primary_replica_after_catchup(
        chkFreqPatched, txnPoolNodeSet, view_setup, clear_checkpoints):
    replica = getNonPrimaryReplicas(txnPoolNodeSet, 1)[-1]
    others = set(getAllReplicas(txnPoolNodeSet, 1)) - {replica}
    node = replica.node

    node.master_replica.last_ordered_3pc = (2, 12)

    replica.checkpoints[(6, 10)] = CheckpointState(
        seqNo=10,
        digests=[],
        digest='digest-6-10',
        receivedDigests={r.name: 'digest-6-10' for r in others},
        isStable=True)

    replica.checkpoints[(11, 15)] = CheckpointState(
        seqNo=13,
        digests=['digest-11', 'digest-12', 'digest-13'],
        digest=None,
        receivedDigests={},
        isStable=False)

    replica.stashedRecvdCheckpoints[2] = {}

    replica.stashedRecvdCheckpoints[2][(11, 15)] = {}
    for r in others:
        replica.stashedRecvdCheckpoints[2][(11, 15)][r.name] = \
            Checkpoint(instId=1, viewNo=2, seqNoStart=11, seqNoEnd=15,
                       digest='digest-11-15')

    replica.stashedRecvdCheckpoints[2][(16, 20)] = {}
    for r in others:
        replica.stashedRecvdCheckpoints[2][(16, 20)][r.name] = \
            Checkpoint(instId=1, viewNo=2, seqNoStart=16, seqNoEnd=20,
                       digest='digest-16-20')

    replica.stashedRecvdCheckpoints[2][(21, 25)] = {}
    replica.stashedRecvdCheckpoints[2][(21, 25)][next(iter(others)).name] = \
        Checkpoint(instId=1, viewNo=2, seqNoStart=21, seqNoEnd=25,
                   digest='digest-21-25')

    # Simulate catch-up completion
    node.ledgerManager.last_caught_up_3PC = (2, 20)
    node.allLedgersCaughtUp()

    assert len(replica.checkpoints) == 0
    assert len(replica.stashedRecvdCheckpoints) == 0

def test_view_change_done_delayed(txnPoolNodeSet, looper, sdk_pool_handle,
                                  sdk_wallet_client):
    """
    A node is slow and thus behind the other nodes. After a view change it
    catches up, but its ViewChangeDone messages are also delayed; the node
    should start participating only once it has caught up and received a
    ViewChangeDone quorum.
    """
    nprs = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)]
    slow_node = nprs[-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay_3pc = 10
    delay_vcd = 25
    delay_3pc_messages([slow_node], 0, delay_3pc)
    slow_node.nodeIbStasher.delay(vcd_delay(delay_vcd))

    def chk(node):
        assert node.view_changer.has_acceptable_view_change_quorum
        assert node.view_changer._primary_verified
        assert node.isParticipating
        assert None not in {r.isPrimary for r in node.replicas.values()}

    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         5 * 4, 4)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the slow node successfully completes catchup
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Other nodes complete view change, select primary and participate
    for node in other_nodes:
        looper.run(eventually(chk, node, retryWait=1))

    # Since `ViewChangeDone` messages are delayed, slow_node is not able to
    # select a primary and participate
    assert not slow_node.view_changer.has_acceptable_view_change_quorum
    assert not slow_node.view_changer._primary_verified
    assert not slow_node.isParticipating
    assert {r.isPrimary for r in slow_node.replicas.values()} == {None}

    # Send requests to make sure pool is functional
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 5)

    # Repair network
    slow_node.reset_delays_and_process_delayeds()

    # `slow_node` selects a primary and participates
    looper.run(eventually(chk, slow_node, retryWait=1))

    # Processes requests received during lack of primary
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Send more requests and compare data of all nodes
    sdk_send_random_and_check(looper, txnPoolNodeSet,
                              sdk_pool_handle, sdk_wallet_client, 5)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

def test_no_catchup_if_got_from_3pc(looper, txnPoolNodeSet, sdk_pool_handle,
                                    sdk_wallet_client):
    """
    A node is slow to receive COMMIT messages, so after a view change it
    starts catchup. But before it can start requesting txns, the COMMIT
    messages are received and ordered. The node should not request any
    transactions.
    :return:
    """
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    delay_cm = 30
    delay_cp = 100
    slow_node.nodeIbStasher.delay(cDelay(delay_cm))
    # The slow node receives consistency proofs after some delay, this delay
    # gives the opportunity to deliver all 3PC messages
    slow_node.nodeIbStasher.delay(cpDelay(delay_cp))

    # Count of `getCatchupReqs` which is called to construct the `CatchupReq`
    # to be sent
    def domain_cr_count():
        return sum(1 for entry in slow_node.ledgerManager.spylog.getAll(
            slow_node.ledgerManager.getCatchupReqs)
            if entry.params['consProof'].ledgerId == DOMAIN_LEDGER_ID)

    old_count = domain_cr_count()
    sent_batches = 10
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           2 * sent_batches, sent_batches)
    ensure_view_change(looper, nodes=txnPoolNodeSet)

    # After view change, the `slow_node` is behind
    waitNodeDataInequality(looper, slow_node, *other_nodes)

    # Unstash only COMMIT messages
    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(Commit.typename)

    looper.runFor(2)

    slow_node.nodeIbStasher.reset_delays_and_process_delayeds(
        ConsistencyProof.typename)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # No `CatchupReq`s constructed, hence no `CatchupReq`s could have
    # been sent
    assert domain_cr_count() == old_count
    # Some stashed ordered requests have been processed
    rv = getAllReturnVals(slow_node, slow_node.processStashedOrderedReqs)
    assert sent_batches in rv

    sdk_ensure_pool_functional(looper, txnPoolNodeSet, sdk_wallet_client,
                               sdk_pool_handle)

def test_view_change_after_max_catchup_rounds(txnPoolNodeSet, looper, wallet1,
                                              client1, client1Connected):
    """
    The node should do only a fixed number of catchup rounds. For this, delay
    Prepares and Commits for 2 non-primary nodes by a large amount, which is
    equivalent to the loss of Prepares and Commits. Make sure 2 nodes have a
    different last prepared certificate from the other two. Then do a view
    change, make sure the view change completes and the pool does not process
    the requests that were prepared by only a subset of the nodes
    """
    send_reqs_batches_and_get_suff_replies(looper, wallet1, client1, 2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ledger_summary = txnPoolNodeSet[0].elector.ledger_summary

    slow_nodes = [r.node for r in
                  getNonPrimaryReplicas(txnPoolNodeSet, 0)[-2:]]
    fast_nodes = [n for n in txnPoolNodeSet if n not in slow_nodes]

    # Make nodes slow to process Prepares and Commits
    for node in slow_nodes:
        node.nodeIbStasher.delay(pDelay(120, 0))
        node.nodeIbStasher.delay(cDelay(120, 0))

    sendRandomRequests(wallet1, client1, 5)
    looper.runFor(3)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    def last_prepared(nodes):
        lst = [n.master_replica.last_prepared_certificate_in_view()
               for n in nodes]
        # All nodes have same last prepared
        assert check_if_all_equal_in_list(lst)
        return lst[0]

    last_prepared_slow = last_prepared(slow_nodes)
    last_prepared_fast = last_prepared(fast_nodes)

    # Check `slow_nodes` and `fast_nodes` set different last_prepared
    assert last_prepared_fast != last_prepared_slow

    # View change complete
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # The requests which were prepared by only a subset of the nodes were
    # not ordered
    assert txnPoolNodeSet[0].elector.ledger_summary == ledger_summary

    for node in slow_nodes:
        node.nodeIbStasher.reset_delays_and_process_delayeds()

    # Make sure pool is functional
    ensure_pool_functional(looper, txnPoolNodeSet, wallet1, client1)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    last_prepared(txnPoolNodeSet)

def test_view_change_after_max_catchup_rounds(txnPoolNodeSet, looper,
                                              sdk_pool_handle,
                                              sdk_wallet_client):
    """
    The node should do only a fixed number of catchup rounds. For this, delay
    Prepares and Commits for 2 non-primary nodes by a large amount, which is
    equivalent to the loss of Prepares and Commits. Make sure 2 nodes have a
    different last prepared certificate from the other two. Then do a view
    change, make sure the view change completes and the pool does not process
    the requests that were prepared by only a subset of the nodes
    """
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         2 * 3, 3)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    ledger_summary = txnPoolNodeSet[0].ledger_summary

    slow_nodes = [r.node for r in
                  getNonPrimaryReplicas(txnPoolNodeSet, 0)[-2:]]
    fast_nodes = [n for n in txnPoolNodeSet if n not in slow_nodes]

    # Make nodes slow to process Prepares and Commits
    for node in slow_nodes:
        node.nodeIbStasher.delay(pDelay(120, 0))
        node.nodeIbStasher.delay(cDelay(120, 0))

    sdk_send_random_requests(looper, sdk_pool_handle, sdk_wallet_client, 5)
    looper.runFor(3)

    ensure_view_change(looper, nodes=txnPoolNodeSet)

    def last_prepared(nodes):
        lst = [n.master_replica.last_prepared_certificate_in_view()
               for n in nodes]
        # All nodes have same last prepared
        assert check_if_all_equal_in_list(lst)
        return lst[0]

    last_prepared_slow = last_prepared(slow_nodes)
    last_prepared_fast = last_prepared(fast_nodes)

    # Check `slow_nodes` and `fast_nodes` set different last_prepared
    assert last_prepared_fast != last_prepared_slow

    # View change complete
    ensureElectionsDone(looper, txnPoolNodeSet)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)

    # The requests which were prepared by only a subset of the nodes were
    # not ordered
    assert txnPoolNodeSet[0].ledger_summary == ledger_summary

    for node in slow_nodes:
        node.nodeIbStasher.reset_delays_and_process_delayeds()

    # Make sure pool is functional
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         10, 2)
    ensure_all_nodes_have_same_data(looper, txnPoolNodeSet)
    last_prepared(txnPoolNodeSet)

def setup(nodeSet, up):
    primaryRep, nonPrimaryReps = getPrimaryReplica(nodeSet, 0), \
        getNonPrimaryReplicas(nodeSet, 0)

    # The primary replica would send PRE-PREPARE messages with incorrect digest
    makeNodeFaulty(primaryRep.node, partial(send3PhaseMsgWithIncorrectDigest,
                                            msgType=PrePrepare))
    return adict(primaryRep=primaryRep, nonPrimaryReps=nonPrimaryReps)

def testNonPrimaryRecvs3PhaseMessageOutsideWatermarks(chkFreqPatched, looper,
                                                      txnPoolNodeSet, client1,
                                                      wallet1, client1Connected,
                                                      reqs_for_logsize):
    """
    A node is slow in processing PRE-PREPAREs and PREPAREs such that many
    requests happen and the slow node starts getting 3 phase messages outside
    of its watermarks. Check that it queues up requests outside its watermarks
    and, once it has received a stable checkpoint, processes more requests. It
    sends other nodes 3 phase messages older than their stable checkpoint, so
    they should discard them.
    """
    delay = 15
    instId = 1
    reqsToSend = reqs_for_logsize + 2
    npr = getNonPrimaryReplicas(txnPoolNodeSet, instId)
    slowReplica = npr[0]
    slowNode = slowReplica.node
    slowNode.nodeIbStasher.delay(ppDelay(delay, instId))
    slowNode.nodeIbStasher.delay(pDelay(delay, instId))

    def discardCounts(replicas, pat):
        counts = {}
        for r in replicas:
            counts[r.name] = countDiscarded(r, pat)
        return counts

    oldStashCount = slowReplica.spylog.count(
        TestReplica.stashOutsideWatermarks.__name__)
    oldDiscardCounts = discardCounts(
        [n.replicas[instId] for n in txnPoolNodeSet if n != slowNode],
        'achieved stable checkpoint')

    sendReqsToNodesAndVerifySuffReplies(looper, wallet1, client1, reqsToSend, 1)
    timeout = waits.expectedPoolGetReadyTimeout(len(txnPoolNodeSet))
    looper.run(eventually(checkNodeDataForEquality, slowNode,
                          *[_ for _ in txnPoolNodeSet if _ != slowNode],
                          retryWait=1, timeout=timeout))
    newStashCount = slowReplica.spylog.count(
        TestReplica.stashOutsideWatermarks.__name__)
    assert newStashCount > oldStashCount

    def chk():
        counts = discardCounts(
            [n.replicas[instId] for n in txnPoolNodeSet if n != slowNode],
            'achieved stable checkpoint')
        for nm, count in counts.items():
            assert count > oldDiscardCounts[nm]

    timeout = waits.expectedNodeToNodeMessageDeliveryTime() * \
        len(txnPoolNodeSet) + delay
    looper.run(eventually(chk, retryWait=1, timeout=timeout))

def test_discard_3PC_messages_for_already_ordered(looper, txnPoolNodeSet,
                                                  sdk_wallet_client,
                                                  sdk_pool_handle):
    """
    Nodes discard any 3PC messages for already ordered 3PC keys
    (view_no, pp_seq_no). Delay all 3PC messages to a node so it cannot
    respond to them until the other nodes have ordered them; when the slow
    node finally gets them it responds, but the other nodes do not process its
    messages and discard them
    """
    slow_node = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)][-1]
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    delay = 20
    delay_3pc_messages([slow_node], 0, delay)
    delay_3pc_messages([slow_node], 1, delay)

    sent_batches = 3
    sdk_send_batches_of_random_and_check(looper, txnPoolNodeSet,
                                         sdk_pool_handle, sdk_wallet_client,
                                         num_reqs=2 * sent_batches,
                                         num_batches=sent_batches)

    def chk(node, inst_id, p_count, c_count):
        # A node will still record PREPAREs even if more than n-f-1, till the
        # request is not ordered
        assert len(node.replicas[inst_id].prepares) >= p_count
        assert len(node.replicas[inst_id].commits) == c_count

    def count_discarded(inst_id, count):
        for node in other_nodes:
            assert countDiscarded(node.replicas[inst_id],
                                  'already ordered 3 phase message') == count

    # `slow_node` did not receive any PREPAREs or COMMITs
    chk(slow_node, 0, 0, 0)

    # `other_nodes` have not discarded any 3PC message
    count_discarded(0, 0)

    # `other_nodes` have not recorded any PREPAREs or COMMITs from `slow_node`
    chk_commits_prepares_recvd(0, other_nodes, slow_node)

    slow_node.reset_delays_and_process_delayeds()
    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # `slow_node` did receive correct number of PREPAREs and COMMITs
    looper.run(eventually(chk, slow_node, 0, sent_batches - 1, sent_batches,
                          retryWait=1))

    # `other_nodes` have not recorded any PREPAREs or COMMITs from `slow_node`
    chk_commits_prepares_recvd(0, other_nodes, slow_node)

    # `other_nodes` have discarded PREPAREs and COMMITs for all batches
    count_discarded(0, 2 * sent_batches)

def test_slow_nodes_catchup_before_selecting_primary_in_new_view(
        tconf, looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        one_node_added):
    """
    Delay 3PC messages to one node and view change messages to some others
    (including primary) so the node that does not receive enough 3PC messages
    is behind but learns of the view change quickly and starts catchup.
    Other nodes learn of the view change late and thus keep on processing
    requests
    """
    new_node = one_node_added
    nprs = [r.node for r in getNonPrimaryReplicas(txnPoolNodeSet, 0)]
    primary_node = getPrimaryReplica(txnPoolNodeSet, 0).node
    slow_node = nprs[-1]
    # nodes_slow_to_inst_chg = [primary_node] + nprs[:2]
    nodes_slow_to_inst_chg = [n for n in txnPoolNodeSet if n != slow_node]
    delay_3pc = 100
    delay_ic = 5

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)

    delay_3pc_messages([slow_node], 0, delay_3pc)

    for n in nodes_slow_to_inst_chg:
        n.nodeIbStasher.delay(icDelay(delay_ic))

    def start_count():
        return sum([1 for e in slow_node.ledgerManager.spylog.getAll(
            slow_node.ledgerManager.startCatchUpProcess.__name__)
            if e.params['ledgerId'] == DOMAIN_LEDGER_ID])

    s = start_count()
    requests = sdk_send_random_requests(looper, sdk_pool_handle,
                                        sdk_wallet_client,
                                        10 * Max3PCBatchSize)

    ensure_view_change(looper, nodes=txnPoolNodeSet,
                       exclude_from_check=nodes_slow_to_inst_chg)

    sdk_get_and_check_replies(looper, requests)

    waitNodeDataEquality(looper, slow_node, *txnPoolNodeSet[:-1])

    e = start_count()
    assert e - s >= 2

    looper.run(eventually(checkViewNoForNodes, slow_node.viewNo))
    checkProtocolInstanceSetup(looper, txnPoolNodeSet, retryWait=1)

    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2 * Max3PCBatchSize)

    waitNodeDataEquality(looper, new_node, *nodes_slow_to_inst_chg)

def setup(nodeSet, up):
    primaryRep = getPrimaryReplica(nodeSet, 0)
    nonPrimaryReps = getNonPrimaryReplicas(nodeSet, 0)
    faultyRep = nonPrimaryReps[0]
    makeNodeFaulty(faultyRep.node, partial(send3PhaseMsgWithIncorrectDigest,
                                           msgType=Commit, instId=0))
    return adict(primaryRep=primaryRep, nonPrimaryReps=nonPrimaryReps,
                 faultyRep=faultyRep)

def testReplicasRejectSamePrePrepareMsg(looper, nodeSet, client1, wallet1):
    """
    Replicas should not accept a PRE-PREPARE for view "v" and prepare sequence
    number "n" if they have already accepted a request with view number "v"
    and sequence number "n"
    """
    numOfNodes = 4
    fValue = getMaxFailures(numOfNodes)
    request1 = sendRandomRequest(wallet1, client1)
    result1 = looper.run(
        eventually(checkSufficientRepliesRecvd, client1.inBox,
                   request1.reqId, fValue, retryWait=1, timeout=5))
    logger.debug("request {} gives result {}".format(request1, result1))
    primaryRepl = getPrimaryReplica(nodeSet)
    logger.debug("Primary Replica: {}".format(primaryRepl))
    logger.debug("Decrementing the primary replica's pre-prepare sequence "
                 "number by one...")
    primaryRepl.lastPrePrepareSeqNo -= 1
    request2 = sendRandomRequest(wallet1, client1)
    looper.run(eventually(checkPrePrepareReqSent, primaryRepl, request2,
                          retryWait=1, timeout=10))

    nonPrimaryReplicas = getNonPrimaryReplicas(nodeSet)
    logger.debug("Non Primary Replicas: " + str(nonPrimaryReplicas))
    prePrepareReq = PrePrepare(primaryRepl.instId,
                               primaryRepl.viewNo,
                               primaryRepl.lastPrePrepareSeqNo,
                               wallet1.defaultId,
                               request2.reqId,
                               request2.digest,
                               time.time())

    logger.debug("Checking whether all the non primary replicas have received "
                 "the pre-prepare request with the same sequence number")
    looper.run(eventually(checkPrePrepareReqRecvd, nonPrimaryReplicas,
                          prePrepareReq, retryWait=1, timeout=10))

    logger.debug("Checking that none of the non primary replicas sent a "
                 "prepare message in response to the pre-prepare message")
    for npr in nonPrimaryReplicas:
        with pytest.raises(AssertionError):
            looper.run(eventually(checkPrepareReqSent, npr, wallet1.defaultId,
                                  request2.reqId, retryWait=1, timeout=10))

def test_cannot_restore_last_sent_pp_seq_no_if_replica_not_primary(
        txnPoolNodeSet, view_no_set, setup):
    replica = getNonPrimaryReplicas(txnPoolNodeSet, instId=1)[0]
    node = replica.node
    assert node.viewNo == 2

    can = node.last_sent_pp_store_helper._can_restore_last_sent_pp_seq_no(
        PrePrepareKey(inst_id=1, view_no=2, pp_seq_no=5))
    assert can is False

def setup(request, looper, txnPoolNodeSet):
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[1].node
    fast_nodes = [n for n in txnPoolNodeSet if n != slow_node]
    # Delay catchup reply so that the test gets time to make the check,
    # this delay is reset after the check
    slow_node.nodeIbStasher.delay(cr_delay(100))
    slow_node.nodeIbStasher.delay(pDelay(100, 0))
    slow_node.nodeIbStasher.delay(cDelay(100, 0))
    if request.param == 'all':
        slow_node.nodeIbStasher.delay(ppDelay(100, 0))
    return slow_node, fast_nodes

def setup(txnPoolNodeSet):
    primaryRep, nonPrimaryReps = getPrimaryReplica(txnPoolNodeSet, 0), \
        getNonPrimaryReplicas(txnPoolNodeSet, 0)

    # A non primary replica sends PREPARE messages with incorrect digest
    faultyRep = nonPrimaryReps[0]
    makeNodeFaulty(faultyRep.node, partial(send3PhaseMsgWithIncorrectDigest,
                                           msgType=Prepare, instId=0))
    return adict(primaryRep=primaryRep, nonPrimaryReps=nonPrimaryReps,
                 faultyRep=faultyRep)

def test_lag_size_for_catchup(looper, chkFreqPatched, reqs_for_checkpoint,
                              txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client):
    """
    Verifies that if the stored own checkpoints have aligned bounds then the
    master replica lag that makes the node perform catch-up is
    Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed received
    checkpoints.
    """
    slow_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other_nodes = [n for n in txnPoolNodeSet if n != slow_node]

    # The master replica of the slow node stops receiving 3PC-messages
    slow_node.master_replica.threePhaseRouter.extend(
        (
            (PrePrepare, lambda *x, **y: None),
            (Prepare, lambda *x, **y: None),
            (Commit, lambda *x, **y: None),
        )
    )

    completed_catchups_before_reqs = get_number_of_completed_catchups(slow_node)

    # Send requests for the slow node's master replica to get
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP quorumed stashed checkpoints
    # from others
    send_reqs_batches_and_get_suff_replies(
        looper, txnPoolNodeSet, sdk_pool_handle, sdk_wallet_client,
        Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP * reqs_for_checkpoint)

    # Give time for the slow node to catch up if it is going to do it
    looper.runFor(waits.expectedPoolConsistencyProof(len(txnPoolNodeSet)) +
                  waits.expectedPoolCatchupTime(len(txnPoolNodeSet)))

    checkNodeDataForInequality(slow_node, *other_nodes)

    # Verify that the slow node has not caught up
    assert get_number_of_completed_catchups(slow_node) == \
        completed_catchups_before_reqs

    # Send more requests for the slow node's master replica to reach
    # Replica.STASHED_CHECKPOINTS_BEFORE_CATCHUP + 1 quorumed stashed
    # checkpoints from others
    send_reqs_batches_and_get_suff_replies(looper, txnPoolNodeSet,
                                           sdk_pool_handle, sdk_wallet_client,
                                           reqs_for_checkpoint)

    waitNodeDataEquality(looper, slow_node, *other_nodes)

    # Verify that the slow node has caught up
    assert get_number_of_completed_catchups(slow_node) > \
        completed_catchups_before_reqs

def one_replica_and_others_in_backup_instance(request, txnPoolNodeSet,
                                              view_change_done):
    # NOTICE: This parametrized fixture triggers view change as pre-condition
    backup_inst_id = 1

    primary = getPrimaryReplica(txnPoolNodeSet, backup_inst_id)
    non_primaries = getNonPrimaryReplicas(txnPoolNodeSet, backup_inst_id)

    if request.param == 'primary':
        return primary, non_primaries
    else:
        return non_primaries[0], [primary] + non_primaries[1:]

def setup(txnPoolNodeSet):
    primaryRep, nonPrimaryReps = getPrimaryReplica(txnPoolNodeSet, 0), \
        getNonPrimaryReplicas(txnPoolNodeSet, 0)

    # The primary replica would send 3 duplicate PRE-PREPARE requests to
    # non primary replicas
    makeNodeFaulty(primaryRep.node, partial(sendDuplicate3PhaseMsg,
                                            msgType=PrePrepare, count=3))

    # The node of the primary replica above should not be blacklisted by any
    # other node since we are simulating multiple PRE-PREPARE messages and
    # want to check for a particular suspicion

    return adict(primaryRep=primaryRep, nonPrimaryReps=nonPrimaryReps)

def testOrderingWhenPrePrepareNotReceived(looper, txnPoolNodeSet,
                                          sdk_wallet_client, sdk_pool_handle):
    """
    Send commits but delay the pre-prepare and prepares such that enough
    commits are received. The request should not be ordered until the
    pre-prepare is received, and ordering should happen only once.
    """
    delay = 10
    non_prim_reps = getNonPrimaryReplicas(txnPoolNodeSet, 0)

    slow_rep = non_prim_reps[0]
    slow_node = slow_rep.node
    slow_node.nodeIbStasher.delay(ppDelay(delay, 0))
    slow_node.nodeIbStasher.delay(pDelay(delay, 0))

    stash_pp = []
    stash_p = []
    orig_pp_method = slow_rep.processPrePrepare
    orig_p_method = slow_rep.processPrepare

    def patched_pp(self, msg, sender):
        stash_pp.append((msg, sender))

    def patched_p(self, msg, sender):
        stash_p.append((msg, sender))

    slow_rep.processPrePrepare = types.MethodType(patched_pp, slow_rep)
    slow_rep.processPrepare = types.MethodType(patched_p, slow_rep)

    def chk1():
        assert len(slow_rep.commitsWaitingForPrepare) > 0

    sdk_send_random_request(looper, sdk_pool_handle, sdk_wallet_client)
    timeout = waits.expectedPrePrepareTime(len(txnPoolNodeSet)) + delay
    looper.run(eventually(chk1, retryWait=1, timeout=timeout))

    for m, s in stash_pp:
        orig_pp_method(m, s)

    for m, s in stash_p:
        orig_p_method(m, s)

    def chk2():
        assert len(slow_rep.commitsWaitingForPrepare) == 0
        assert slow_rep.spylog.count(slow_rep.doOrder.__name__) == 1

    timeout = waits.expectedOrderingTime(len(non_prim_reps) + 1) + 2 * delay
    looper.run(eventually(chk2, retryWait=1, timeout=timeout))

def setup(request, txnPoolNodeSet):
    # Test once when the client request is received and once when it is not.
    # Choosing a faulty node which is primary in neither instance; this helps
    # in that the same PROPAGATEs are not requested again by the node
    faulty_node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[1].node
    if request.param == 'client_requests':
        # Long delay in PROPAGATEs
        faulty_node.nodeIbStasher.delay(ppgDelay(90))
        return faulty_node, True
    if request.param == 'no_client_requests':
        # Long delay in PROPAGATEs
        faulty_node.nodeIbStasher.delay(ppgDelay(90))
        # Long delay in Client Requests
        faulty_node.clientIbStasher.delay(req_delay(90))
        return faulty_node, False

def break_backup_replica(txnPoolNodeSet):
    node = getNonPrimaryReplicas(txnPoolNodeSet, inst_id)[-1].node
    broken_replica = node.replicas[inst_id]
    non_broken_replica = node.replicas[0]

    def fakeProcessPrePrepare(pre_prepare, sender):
        logger.warning("{} is broken. 'processPrePrepare' does nothing"
                       .format(broken_replica.name))

    broken_replica.threePhaseRouter.extend(
        (
            (PrePrepare, fakeProcessPrePrepare),
        )
    )

    return broken_replica, non_broken_replica

def broken_node_and_others(txnPoolNodeSet):
    node = getNonPrimaryReplicas(txnPoolNodeSet, 0)[-1].node
    other = [n for n in txnPoolNodeSet if n != node]

    def brokenSendToReplica(msg, frm):
        logger.warning("{} is broken. 'sendToReplica' does nothing"
                       .format(node.name))

    node.nodeMsgRouter.extend(
        (
            (PrePrepare, brokenSendToReplica),
            (Prepare, brokenSendToReplica),
            (Commit, brokenSendToReplica),
            (Checkpoint, brokenSendToReplica),
        )
    )

    return node, other

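# Hedged, hypothetical consumer of the fixture above (not from the original
# source); the sdk_ and wait helpers are assumed from the other tests in this
# section. Since the broken node never routes 3PC messages to its replicas,
# its data should diverge from the rest of the pool while consensus proceeds
# on the remaining nodes.
def test_broken_node_falls_behind(looper, txnPoolNodeSet, sdk_pool_handle,
                                  sdk_wallet_client, broken_node_and_others):
    broken_node, other_nodes = broken_node_and_others
    sdk_send_random_and_check(looper, txnPoolNodeSet, sdk_pool_handle,
                              sdk_wallet_client, 2)
    waitNodeDataInequality(looper, broken_node, *other_nodes)
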