def wait_for_seqno_to_commit(seqno, view, nodes):
    """
    Wait for a specific seqno at a specific view to be committed on all nodes.

    Polls each node's /node/tx endpoint until every node reports the
    transaction as Committed. Raises RuntimeError if any node reports it
    Invalid; asserts if not all nodes commit within the polling budget.
    """
    for _ in range(infra.network.Network.replication_delay * 10):
        up_to_date_f = []
        logs = {}
        for node in nodes:
            with node.client() as c:
                node_log = []
                logs[node.node_id] = node_log
                r = c.get(
                    f"/node/tx?view={view}&seqno={seqno}", log_capture=node_log
                )
                assert (
                    r.status_code == http.HTTPStatus.OK
                ), f"tx request returned HTTP status {r.status_code}"
                status = TxStatus(r.body.json()["status"])
                if status == TxStatus.Invalid:
                    # Surface this node's captured request logs before failing
                    flush_info(node_log, None, 0)
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {view}.{seqno} is invalid and will never be committed"
                    )
                if status == TxStatus.Committed:
                    up_to_date_f.append(node.node_id)
        if len(up_to_date_f) == len(nodes):
            break
        time.sleep(0.1)
    # Emit the last round of captured logs to aid diagnosis
    for lines in logs.values():
        flush_info(lines, None, 0)
    assert len(up_to_date_f) == len(
        nodes
    ), "Only {} out of {} nodes are up to date".format(len(up_to_date_f), len(nodes))
def wait_for_commit(client, seqno: int, view: int, timeout: int = 3) -> None:
    """
    Waits for a specific seqno/view pair to be committed by the network,
    as per the node to which client is connected to.

    :param client: Instance of :py:class:`ccf.clients.CCFClient`
    :param int seqno: Transaction sequence number.
    :param int view: Consensus view.
    :param int timeout: Maximum time to wait for this seqno/view pair to be
        committed before giving up.

    A TimeoutError exception is raised if the commit index is not committed
    within the given timeout.
    """
    end_time = time.time() + timeout
    while time.time() < end_time:
        r = client.get(f"/node/tx?view={view}&seqno={seqno}")
        assert (
            r.status_code == http.HTTPStatus.OK
        ), f"tx request returned HTTP status {r.status_code}"
        # This variant expects the response body already parsed to a dict
        assert isinstance(r.body, dict), "/node/tx should return a JSON object"
        status = TxStatus(r.body["status"])
        if status == TxStatus.Committed:
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        else:
            # Still pending/unknown: back off briefly and poll again
            time.sleep(0.1)
    raise TimeoutError("Timed out waiting for commit")
def test_isolate_and_reconnect_primary(network, args, **kwargs):
    """
    Partition the primary away from the backups, confirm a new primary is
    elected and makes progress, then verify the reconnected former primary
    catches up and invalidates the transaction it accepted while isolated.
    """
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        lost_tx_resp = check_does_not_progress(primary)
        new_primary, _ = network.wait_for_new_primary(
            primary, nodes=backups, timeout_multiplier=6
        )
        new_tx_resp = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        try:
            # There will be at least one full election cycle for nothing, where the
            # re-joining node fails to get elected but causes others to rev up their
            # term. After that, a successful election needs to take place, and we
            # arbitrarily allow 3 time periods to avoid being too brittle when
            # raft timeouts line up badly.
            c.wait_for_commit(new_tx_resp, timeout=(network.election_duration * 4))
        except TimeoutError:
            details = c.get("/node/consensus").body.json()
            assert (
                False
            ), f"Stuck before {new_tx_resp.view}.{new_tx_resp.seqno}: {pprint.pformat(details)}"

        # Check it has dropped anything submitted while partitioned
        dropped = c.get(
            f"/node/tx?transaction_id={lost_tx_resp.view}.{lost_tx_resp.seqno}"
        )
        dropped_status = TxStatus(dropped.body.json()["status"])
        assert dropped_status == TxStatus.Invalid, dropped
def test_view_history(network, args):
    """
    Exhaustively query /node/tx for every (view, seqno) pair up to the commit
    point on each node, and check each seqno maps to exactly one committed view.
    """
    if args.consensus == "pbft":
        # This appears to work in PBFT, but it is unacceptably slow:
        # - Each /tx request is a write, with a non-trivial roundtrip response time
        # - Since each read (eg - /tx and /commit) has produced writes and a unique tx ID,
        #   there are too many IDs to test exhaustively
        # We could rectify this by making this test non-exhaustive (bisecting for view changes,
        # sampling within a view), but for now it is exhaustive and Raft-only
        LOG.warning("Skipping view reconstruction in PBFT")
        return network

    check = infra.checker.Checker()

    for node in network.get_joined_nodes():
        with node.client("user0") as c:
            r = c.get("/node/commit")
            check(c)
            commit_view = r.body["view"]
            commit_seqno = r.body["seqno"]

            # Retrieve status for all possible Tx IDs
            views_by_seqno = {}
            for seqno in range(1, commit_seqno + 1):
                committed_views = []
                for view in range(1, commit_view + 1):
                    r = c.get(f"/node/tx?view={view}&seqno={seqno}")
                    check(r)
                    if TxStatus(r.body["status"]) == TxStatus.Committed:
                        committed_views.append(view)
                views_by_seqno[seqno] = committed_views

            # Each seqno must have exactly one committed Tx ID
            txs_ok = True
            for seqno, views in views_by_seqno.items():
                if len(views) != 1:
                    txs_ok = False
                    LOG.error(
                        f"Node {node.node_id}: Found {len(views)} committed Tx IDs for seqno {seqno}"
                    )

            tx_ids_condensed = ", ".join(
                " OR ".join(f"{view}.{seqno}" for view in views or ["UNKNOWN"])
                for seqno, views in views_by_seqno.items()
            )

            if not txs_ok:
                LOG.error(
                    f"Node {node.node_id}: Invalid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
                raise RuntimeError(
                    f"Node {node.node_id}: Incomplete or inconsistent view history"
                )
            LOG.success(
                f"Node {node.node_id}: Found a valid sequence of Tx IDs:\n{tx_ids_condensed}"
            )

    return network
def wait_for_all_nodes_to_commit(self, primary=None, tx_id=None, timeout=10):
    """
    Wait for all nodes to have joined the network and committed all transactions
    executed on the primary.

    :param primary: Primary node, used to discover the latest readable TxID when
        tx_id is not given.
    :param tx_id: Specific TxID to wait for on every joined node (optional).
    :param int timeout: Maximum time in seconds to wait before giving up.

    Raises RuntimeError if any node reports the transaction invalid; asserts if
    not all joined nodes have committed it within the timeout.
    """
    if not (primary or tx_id):
        raise ValueError("Either a valid TxID or primary node should be specified")

    end_time = time.time() + timeout

    # If no TxID is specified, retrieve latest readable one
    # (fixed idiom: identity comparison with None, was `tx_id == None`)
    if tx_id is None:
        while time.time() < end_time:
            with primary.client() as c:
                resp = c.get(
                    "/node/network/nodes/self"
                )  # Well-known read-only endpoint
                tx_id = TxID(resp.view, resp.seqno)
                if tx_id.valid():
                    break
            time.sleep(0.1)
        assert (
            tx_id.valid()
        ), f"Primary {primary.node_id} has not made any progress yet ({tx_id})"

    caught_up_nodes = []
    logs = {}
    while time.time() < end_time:
        caught_up_nodes = []
        for node in self.get_joined_nodes():
            with node.client() as c:
                logs[node.node_id] = []
                resp = c.get(
                    f"/node/local_tx?transaction_id={tx_id}",
                    log_capture=logs[node.node_id],
                )
                if resp.status_code != 200:
                    # Node may not have joined the network yet, try again
                    break
                status = TxStatus(resp.body.json()["status"])
                if status == TxStatus.Committed:
                    caught_up_nodes.append(node)
                elif status == TxStatus.Invalid:
                    # Emit this node's captured logs before failing
                    flush_info(logs[node.node_id], None, 0)
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {tx_id} is invalid and will never be committed"
                    )
        if len(caught_up_nodes) == len(self.get_joined_nodes()):
            break
        time.sleep(0.1)
    # Emit the last round of captured logs to aid diagnosis
    for lines in logs.values():
        flush_info(lines, None, 0)
    assert len(caught_up_nodes) == len(
        self.get_joined_nodes()
    ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
def wait_for_all_nodes_to_catch_up(self, primary, timeout=10):
    """
    Wait for all nodes to have joined the network and globally replicated all
    transactions executed on the primary (including transactions which added
    the nodes).

    :param primary: Node whose commit point every joined node must reach.
    :param int timeout: Maximum time in seconds to wait before giving up.

    Raises RuntimeError if any node reports the commit point invalid; asserts
    if not all joined nodes catch up within the timeout.
    """
    end_time = time.time() + timeout

    # Discover the primary's current commit point.
    # Initialised here so the assert below fires with its intended message
    # (rather than a NameError) if the deadline elapses before the first poll.
    seqno = 0
    view = 0
    while time.time() < end_time:
        with primary.client() as c:
            resp = c.get("/node/commit")
            body = resp.body.json()
            seqno = body["seqno"]
            view = body["view"]
            if seqno != 0:
                break
        time.sleep(0.1)
    assert (
        seqno != 0
    ), f"Primary {primary.node_id} has not made any progress yet (view: {view}, seqno: {seqno})"

    caught_up_nodes = []
    while time.time() < end_time:
        caught_up_nodes = []
        for node in self.get_joined_nodes():
            with node.client() as c:
                # NOTE(review): response deliberately ignored — presumably a
                # probe/warm-up request; confirm before removing
                c.get("/node/commit")
                resp = c.get(f"/node/local_tx?view={view}&seqno={seqno}")
                if resp.status_code != 200:
                    # Node may not have joined the network yet, try again
                    break
                status = TxStatus(resp.body.json()["status"])
                if status == TxStatus.Committed:
                    caught_up_nodes.append(node)
                elif status == TxStatus.Invalid:
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {view}.{seqno} is invalid and will never be committed"
                    )
        if len(caught_up_nodes) == len(self.get_joined_nodes()):
            break
        time.sleep(0.1)
    assert len(caught_up_nodes) == len(
        self.get_joined_nodes()
    ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
def wait_for_commit(
    client,
    seqno: int,
    view: int,
    timeout: int = 3,
    log_capture: Optional[list] = None,
) -> None:
    """
    Waits for a specific seqno/view pair to be committed by the network,
    as per the node to which client is connected to.

    :param client: Instance of :py:class:`ccf.clients.CCFClient`
    :param int seqno: Transaction sequence number.
    :param int view: Consensus view.
    :param int timeout: Maximum time to wait for this seqno/view pair to be
        committed before giving up.
    :param list log_capture: Rather than emit to default handler, capture log
        lines to list (optional).

    A TimeoutError exception is raised if the commit index is not committed
    within the given timeout.
    """
    if view is None or seqno is None:
        raise ValueError(f"{view}.{seqno} is not a valid transaction ID")

    logs: List[str] = []

    end_time = time.time() + timeout
    while time.time() < end_time:
        # Reset captured logs each attempt so only the final round is emitted
        logs = []
        r = client.get(f"/node/tx?transaction_id={view}.{seqno}", log_capture=logs)
        assert (
            r.status_code == http.HTTPStatus.OK
        ), f"tx request returned HTTP status {r.status_code}"
        status = TxStatus(r.body.json()["status"])
        if status == TxStatus.Committed:
            flush_info(logs, log_capture, 1)
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        else:
            # Still pending/unknown: back off briefly and poll again
            time.sleep(0.1)
    flush_info(logs, log_capture, 1)
    raise TimeoutError(
        f'Timed out waiting for commit: {pprint.pformat(client.get("/node/consensus").body.json())}'
    )
def wait_for_pending(client, view, seqno, timeout=3):
    """
    Wait until transaction view.seqno is reported as Pending by the node
    client is connected to.

    :param client: Instance of :py:class:`ccf.clients.CCFClient`
    :param int view: Consensus view.
    :param int seqno: Transaction sequence number.
    :param int timeout: Maximum time in seconds to wait before giving up.

    Raises RuntimeError if the transaction is reported Invalid or (unexpectedly)
    Committed, and TimeoutError if it does not become Pending in time.
    """
    end_time = time.time() + timeout
    while time.time() < end_time:
        r = client.get(f"/node/tx?transaction_id={view}.{seqno}")
        assert (
            r.status_code == http.HTTPStatus.OK
        ), f"tx request returned HTTP status {r.status_code}"
        status = TxStatus(r.body.json()["status"])
        if status == TxStatus.Pending:
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        elif status == TxStatus.Committed:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is unexpectedly marked committed"
            )
        else:
            time.sleep(0.1)
    # Fixed message: this helper waits for Pending, not commit
    raise TimeoutError("Timed out waiting for pending")
def test_tx_statuses(network, args):
    """
    Submit a write, then poll the status of that transaction plus neighbouring
    and historical Tx IDs until the target transaction is globally committed.
    """
    primary, _ = network.find_primary()

    with primary.client("user0") as c:
        check = infra.checker.Checker()
        r = c.post("/app/log/private", {"id": 0, "msg": "Ignored"})
        check(r)
        # Until this tx is globally committed, poll for the status of this and some other
        # related transactions around it (and also any historical transactions we're tracking)
        target_view = r.view
        target_seqno = r.seqno
        SentTxs.update_status(target_view, target_seqno)
        SentTxs.update_status(target_view, target_seqno + 1)
        SentTxs.update_status(target_view - 1, target_seqno)

        deadline = time.time() + 10
        committed_target = False
        while not committed_target:
            if time.time() > deadline:
                raise TimeoutError(
                    f"Took too long waiting for global commit of {target_view}.{target_seqno}"
                )
            for view, seqno in SentTxs.get_all_tx_ids():
                resp = c.get(f"/node/tx?view={view}&seqno={seqno}")
                check(resp)
                status = TxStatus(resp.body.json()["status"])
                SentTxs.update_status(view, seqno, status)
                if status == TxStatus.Committed and (view, seqno) == (
                    target_view,
                    target_seqno,
                ):
                    committed_target = True
            if not committed_target:
                time.sleep(0.1)

    return network
def test_view_history(network, args):
    """
    Exhaustively query /node/tx for every (view, seqno) pair up to each node's
    commit point, check each seqno has exactly one committed view, and compare
    the reconstructed view history across nodes.
    """
    if args.consensus == "bft":
        # This appears to work in BFT, but it is unacceptably slow:
        # - Each /tx request is a write, with a non-trivial roundtrip response time
        # - Since each read (eg - /tx and /commit) has produced writes and a unique tx ID,
        #   there are too many IDs to test exhaustively
        # We could rectify this by making this test non-exhaustive (bisecting for view changes,
        # sampling within a view), but for now it is exhaustive and Raft-only
        LOG.warning("Skipping view reconstruction in BFT")
        return network

    check = infra.checker.Checker()

    previous_node = None
    previous_tx_ids = ""
    for node in network.get_joined_nodes():
        with node.client("user0") as c:
            r = c.get("/node/commit")
            check(c)
            commit_tx_id = TxID.from_str(r.body.json()["transaction_id"])

            # Retrieve status for all possible Tx IDs
            seqno_to_views = {}
            for seqno in range(1, commit_tx_id.seqno + 1):
                views = []
                for view in range(1, commit_tx_id.view + 1):
                    # log_capture=[] silences per-request logging for this bulk scan
                    r = c.get(f"/node/tx?transaction_id={view}.{seqno}", log_capture=[])
                    check(r)
                    status = TxStatus(r.body.json()["status"])
                    if status == TxStatus.Committed:
                        views.append(view)
                seqno_to_views[seqno] = views

            # Check we have exactly one Tx ID for each seqno
            txs_ok = True
            for seqno, views in seqno_to_views.items():
                if len(views) != 1:
                    txs_ok = False
                    LOG.error(
                        f"Node {node.node_id}: Found {len(views)} committed Tx IDs for seqno {seqno}"
                    )

            # Render e.g. "2.1, 2.2, 3.3" — seqnos with no committed view show as UNKNOWN
            tx_ids_condensed = ", ".join(
                " OR ".join(f"{view}.{seqno}" for view in views or ["UNKNOWN"])
                for seqno, views in seqno_to_views.items()
            )

            if txs_ok:
                LOG.success(
                    f"Node {node.node_id}: Found a valid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
            else:
                LOG.error(
                    f"Node {node.node_id}: Invalid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
                raise RuntimeError(
                    f"Node {node.node_id}: Incomplete or inconsistent view history"
                )

            # Compare view history between nodes
            if previous_tx_ids:
                # Some nodes may have a slightly longer view history so only compare the common prefix
                min_tx_ids_len = min(len(previous_tx_ids), len(tx_ids_condensed))
                assert (
                    tx_ids_condensed[:min_tx_ids_len]
                    == previous_tx_ids[:min_tx_ids_len]
                ), f"Tx IDs don't match between node {node.node_id} and node {previous_node.node_id}: {tx_ids_condensed[:min_tx_ids_len]} and {previous_tx_ids[:min_tx_ids_len]}"

            previous_tx_ids = tx_ids_condensed
            previous_node = node

    return network