import http
import pprint
import time

import infra.ccf
import infra.checker
from loguru import logger as LOG

# TxStatus, SentTxs, check_can_progress and check_does_not_progress are
# provided by the surrounding test infrastructure.


def wait_for_seqno_to_commit(seqno, view, nodes):
    """
    Wait for a specific seqno at a specific view to be committed on all nodes.
    """
    up_to_date_f = []
    for _ in range(infra.ccf.Network.replication_delay * 10):
        up_to_date_f = []
        for f in nodes:
            with f.node_client() as c:
                r = c.get("tx", {"view": view, "seqno": seqno})
                assert (
                    r.status == http.HTTPStatus.OK
                ), f"tx request returned HTTP status {r.status}"
                status = TxStatus(r.result["status"])
                if status == TxStatus.Committed:
                    up_to_date_f.append(f.node_id)
                elif status == TxStatus.Invalid:
                    raise RuntimeError(
                        f"Node {f.node_id} reports transaction ID {view}.{seqno} is invalid and will never be committed"
                    )
                else:
                    # Unknown or Pending - retry
                    pass
        if len(up_to_date_f) == len(nodes):
            break
        time.sleep(0.1)
    assert len(up_to_date_f) == len(
        nodes
    ), f"Only {len(up_to_date_f)} out of {len(nodes)} nodes are up to date"

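
# The helpers in this file branch on a TxStatus enum defined elsewhere in the
# test infrastructure. A minimal compatible sketch is shown below: the member
# names are taken from the code above, but the string values are an assumption
# inferred from the statuses the tx endpoint reports, not the original
# definition.
from enum import Enum


class TxStatusSketch(Enum):
    Unknown = "UNKNOWN"  # Assumed: node has no record of this transaction ID
    Pending = "PENDING"  # Assumed: replicated, but not yet globally committed
    Committed = "COMMITTED"  # Assumed: durable, will never be rolled back
    Invalid = "INVALID"  # Assumed: rolled back, will never be committed
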
def wait_for_global_commit(node_client, seqno, view, mksign=False, timeout=3):
    """
    Given a client to a CCF network and a seqno/view pair, this function
    waits for this specific commit index to be globally committed by the
    network in this view.
    A TimeoutError exception is raised if the commit index is not globally
    committed within the given timeout.
    """
    # Waiting for a global commit can significantly slow down tests as
    # signatures take some time to be emitted and globally committed.
    # Forcing a signature accelerates this process for common operations
    # (e.g. governance proposals)
    if mksign:
        r = node_client.rpc("mkSign")
        if r.error is not None:
            raise RuntimeError(f"mkSign returned an error: {r.error}")

    end_time = time.time() + timeout
    while time.time() < end_time:
        r = node_client.get("tx", {"view": view, "seqno": seqno})
        assert (
            r.status == http.HTTPStatus.OK
        ), f"tx request returned HTTP status {r.status}"
        status = TxStatus(r.result["status"])
        if status == TxStatus.Committed:
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        else:
            time.sleep(0.1)
    raise TimeoutError("Timed out waiting for commit")

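
# A minimal usage sketch for wait_for_global_commit, assuming the caller holds
# a user client to the primary and uses the log/private write endpoint seen
# elsewhere in this file; this example function is illustrative and not part
# of the original suite.
def example_write_then_wait_for_commit(primary):
    with primary.user_client() as c:
        r = c.rpc("log/private", {"id": 42, "msg": "hello"})
        # Force a signature so the write commits promptly, rather than
        # waiting for the next periodic signature
        wait_for_global_commit(c, r.seqno, r.view, mksign=True)
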
def test_isolate_and_reconnect_primary(network, args, **kwargs):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        lost_tx_resp = check_does_not_progress(primary)
        new_primary, _ = network.wait_for_new_primary(
            primary, nodes=backups, timeout_multiplier=6
        )
        new_tx_resp = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        try:
            # There will be at least one full election cycle for nothing, where
            # the re-joining node fails to get elected but causes others to rev
            # up their term. After that, a successful election needs to take
            # place, and we arbitrarily allow 3 time periods to avoid being too
            # brittle when raft timeouts line up badly.
            c.wait_for_commit(new_tx_resp, timeout=(network.election_duration * 4))
        except TimeoutError:
            details = c.get("/node/consensus").body.json()
            assert (
                False
            ), f"Stuck before {new_tx_resp.view}.{new_tx_resp.seqno}: {pprint.pformat(details)}"

        # Check it has dropped anything submitted while partitioned
        r = c.get(f"/node/tx?transaction_id={lost_tx_resp.view}.{lost_tx_resp.seqno}")
        status = TxStatus(r.body.json()["status"])
        assert status == TxStatus.Invalid, r

def wait_for_all_nodes_to_catch_up(self, primary, timeout=3):
    """
    Wait for all nodes to have joined the network and globally replicated
    all transactions executed on the primary (including the transactions
    which added the nodes).
    """
    commit_leader = 0
    term_leader = None
    end_time = time.time() + timeout
    while time.time() < end_time:
        with primary.node_client() as c:
            resp = c.get("getCommit")
            commit_leader = resp.result["commit"]
            term_leader = resp.result["term"]
            if commit_leader != 0:
                break
        time.sleep(0.1)
    assert (
        commit_leader != 0
    ), f"Primary {primary.node_id} has not made any progress yet (term: {term_leader}, commit: {commit_leader})"

    caught_up_nodes = []
    while time.time() < end_time:
        caught_up_nodes = []
        for node in self.get_joined_nodes():
            with node.node_client() as c:
                resp = c.get("tx", {"view": term_leader, "seqno": commit_leader})
                if resp.error is not None:
                    # Node may not have joined the network yet, try again
                    break
                status = TxStatus(resp.result["status"])
                if status == TxStatus.Committed:
                    caught_up_nodes.append(node)
                elif status == TxStatus.Invalid:
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {term_leader}.{commit_leader} is invalid and will never be committed"
                    )
                else:
                    # Unknown or Pending - retry
                    pass
        if len(caught_up_nodes) == len(self.get_joined_nodes()):
            break
        time.sleep(0.1)
    assert len(caught_up_nodes) == len(
        self.get_joined_nodes()
    ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"

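
# Illustrative sketch of a typical call site for wait_for_all_nodes_to_catch_up:
# after growing the network, block until every joined node has replicated up to
# the primary's commit point. The node-addition helper named below is an
# assumption for illustration and is not defined in this file.
def example_catch_up_after_adding_node(network, args):
    new_node = network.create_and_trust_node(args.package, "localhost", args)  # assumed helper
    primary, _ = network.find_primary()
    network.wait_for_all_nodes_to_catch_up(primary, timeout=10)
    return new_node
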
def wait_for_pending(client, view, seqno, timeout=3):
    """
    Wait for a transaction to be reported as Pending, i.e. replicated but not
    yet globally committed.
    """
    end_time = time.time() + timeout
    while time.time() < end_time:
        r = client.get(f"/node/tx?transaction_id={view}.{seqno}")
        assert (
            r.status_code == http.HTTPStatus.OK
        ), f"tx request returned HTTP status {r.status_code}"
        status = TxStatus(r.body.json()["status"])
        if status == TxStatus.Pending:
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        elif status == TxStatus.Committed:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is unexpectedly marked committed"
            )
        else:
            time.sleep(0.1)
    raise TimeoutError(
        f"Timed out waiting for transaction ID {view}.{seqno} to be pending"
    )

def test_tx_statuses(network, args):
    primary, _ = network.find_primary()

    with primary.user_client() as c:
        check = infra.checker.Checker()
        r = c.rpc("log/private", {"id": 0, "msg": "Ignored"})
        check(r)

        # Until this tx is globally committed, poll for the status of this and
        # some other related transactions around it (and also any historical
        # transactions we're tracking)
        target_view = r.view
        target_seqno = r.seqno
        SentTxs.update_status(target_view, target_seqno)
        SentTxs.update_status(target_view, target_seqno + 1)
        SentTxs.update_status(target_view - 1, target_seqno, TxStatus.Invalid)

        end_time = time.time() + 10
        while True:
            if time.time() > end_time:
                raise TimeoutError(
                    f"Took too long waiting for global commit of {target_view}.{target_seqno}"
                )

            done = False
            for view, seqno in SentTxs.get_all_tx_ids():
                r = c.get("tx", {"view": view, "seqno": seqno})
                check(r)
                status = TxStatus(r.result["status"])
                SentTxs.update_status(view, seqno, status)
                if (
                    status == TxStatus.Committed
                    and target_view == view
                    and target_seqno == seqno
                ):
                    done = True

            if done:
                break
            time.sleep(0.1)

    return network

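
# test_tx_statuses drives a SentTxs helper that is defined elsewhere. A
# minimal sketch of a compatible tracker is shown below, kept to the two entry
# points the test uses; it is an assumption for illustration (the real helper
# may also validate that status transitions are legal, e.g. that a Committed
# transaction never changes status).
class SentTxsSketch:
    # (view, seqno) -> last observed status, so every transaction ever seen
    # can be re-polled on the next iteration
    statuses = {}

    @staticmethod
    def update_status(view, seqno, status=None):
        SentTxsSketch.statuses[(view, seqno)] = (
            status if status is not None else TxStatus.Unknown
        )

    @staticmethod
    def get_all_tx_ids():
        return list(SentTxsSketch.statuses.keys())
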
def test_view_history(network, args):
    if args.consensus == "pbft":
        # This appears to work in PBFT, but it is unacceptably slow:
        # - Each /tx request is a write, with a non-trivial roundtrip response time
        # - Since each read (e.g. /tx and /commit) has produced writes and a
        #   unique tx ID, there are too many IDs to test exhaustively
        # We could rectify this by making this test non-exhaustive (bisecting
        # for view changes, sampling within a view), but for now it is
        # exhaustive and Raft-only
        LOG.warning("Skipping view reconstruction in PBFT")
        return network

    check = infra.checker.Checker()

    for node in network.get_joined_nodes():
        with node.user_client() as c:
            r = c.get("commit")
            check(r)
            commit_view = r.view
            commit_seqno = r.global_commit

            # Temporarily disable logging of RPCs for readability
            rpc_loggers = c.rpc_loggers
            c.rpc_loggers = ()
            LOG.warning("RPC logging temporarily suppressed")

            # Retrieve status for all possible Tx IDs
            seqno_to_views = {}
            for seqno in range(1, commit_seqno + 1):
                views = []
                for view in range(1, commit_view + 1):
                    r = c.get("tx", {"view": view, "seqno": seqno})
                    check(r)
                    status = TxStatus(r.result["status"])
                    if status == TxStatus.Committed:
                        views.append(view)
                seqno_to_views[seqno] = views

            c.rpc_loggers = rpc_loggers
            LOG.warning("RPC logging restored")

            # Check we have exactly one committed Tx ID for each seqno
            txs_ok = True
            for seqno, views in seqno_to_views.items():
                if len(views) != 1:
                    txs_ok = False
                    LOG.error(
                        f"Node {node.node_id}: Found {len(views)} committed Tx IDs for seqno {seqno}"
                    )

            tx_ids_condensed = ", ".join(
                " OR ".join(f"{view}.{seqno}" for view in views or ["UNKNOWN"])
                for seqno, views in seqno_to_views.items()
            )

            if txs_ok:
                LOG.success(
                    f"Node {node.node_id}: Found a valid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
            else:
                LOG.error(
                    f"Node {node.node_id}: Invalid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
                raise RuntimeError(
                    f"Node {node.node_id}: Incomplete or inconsistent view history"
                )

    return network