def wait_for_all_nodes_to_commit(self, primary=None, tx_id=None, timeout=10):
    """
    Wait for all nodes to have joined the network and committed all transactions
    executed on the primary.
    """
    if not (primary or tx_id):
        raise ValueError("Either a valid TxID or primary node should be specified")

    end_time = time.time() + timeout

    # If no TxID is specified, retrieve latest readable one
    if tx_id is None:
        while time.time() < end_time:
            with primary.client() as c:
                resp = c.get(
                    "/node/network/nodes/self"
                )  # Well-known read-only endpoint
                tx_id = TxID(resp.view, resp.seqno)
                if tx_id.valid():
                    break
                time.sleep(0.1)
        assert (
            tx_id.valid()
        ), f"Primary {primary.node_id} has not made any progress yet ({tx_id})"

    caught_up_nodes = []
    logs = {}
    while time.time() < end_time:
        caught_up_nodes = []
        for node in self.get_joined_nodes():
            with node.client() as c:
                logs[node.node_id] = []
                resp = c.get(
                    f"/node/local_tx?transaction_id={tx_id}",
                    log_capture=logs[node.node_id],
                )
                if resp.status_code != 200:
                    # Node may not have joined the network yet, try again
                    break
                status = TxStatus(resp.body.json()["status"])
                if status == TxStatus.Committed:
                    caught_up_nodes.append(node)
                elif status == TxStatus.Invalid:
                    flush_info(logs[node.node_id], None, 0)
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {tx_id} is invalid and will never be committed"
                    )
                else:
                    # Transaction is not yet committed on this node; poll again
                    pass
        if len(caught_up_nodes) == len(self.get_joined_nodes()):
            break
        time.sleep(0.1)

    for lines in logs.values():
        flush_info(lines, None, 0)
    assert len(caught_up_nodes) == len(
        self.get_joined_nodes()
    ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
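# Illustrative usage sketch (not part of the source): how a test might drive
# wait_for_all_nodes_to_commit after a write on the primary. The "user0"
# identity and the /app/log/private payload follow patterns used elsewhere in
# this suite; the surrounding network fixture and imports are assumed.
from ccf.tx_id import TxID


def example_wait_for_commit_on_all_nodes(network):
    primary, _ = network.find_primary()
    with primary.client("user0") as c:
        res = c.post("/app/log/private", {"id": 0, "msg": "hello"})
        assert res.status_code == 200, res
    # Block until every joined node reports the transaction as committed
    network.wait_for_all_nodes_to_commit(tx_id=TxID(res.view, res.seqno))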
def check_can_progress(node, timeout=3):
    # Issue a write on the node and wait for its commit index to advance,
    # returning the new commit TxID
    with node.client() as c:
        r = c.get("/node/commit")
        original_tx = TxID.from_str(r.body.json()["transaction_id"])
        with node.client("user0") as uc:
            uc.post("/app/log/private", {"id": 42, "msg": "Hello world"})
        end_time = time.time() + timeout
        while time.time() < end_time:
            current_tx = TxID.from_str(
                c.get("/node/commit").body.json()["transaction_id"]
            )
            if current_tx.seqno > original_tx.seqno:
                return current_tx
            time.sleep(0.1)
        assert False, f"Stuck at {r}"
def get_latest_ledger_public_state(self, timeout=5):
    primary, _ = self.find_primary()
    with primary.client() as nc:
        resp = nc.get("/node/commit")
        body = resp.body.json()
        tx_id = TxID.from_str(body["transaction_id"])
    return self._get_ledger_public_view_at(
        primary, primary.get_ledger_public_state_at, tx_id.seqno, timeout
    )
def process_next(self):
    # Return True if the primary's commit index has advanced since the last
    # call, and record the latest committed seqno
    with self.primary.client() as client:
        rv = client.get("/node/commit")
        tx_id = TxID.from_str(rv.body.json()["transaction_id"])
    more_to_process = self.commit != tx_id.seqno
    self.commit = tx_id.seqno
    return more_to_process
def from_requests_response(rr):
    tx_id = TxID.from_str(rr.headers.get(CCF_TX_ID_HEADER))
    return Response(
        status_code=rr.status_code,
        body=RequestsResponseBody(rr),
        seqno=tx_id.seqno,
        view=tx_id.view,
        headers=rr.headers,
    )
def run(args):
    with infra.service_load.load() as load:
        with infra.network.network(
            args.nodes,
            args.binary_dir,
            args.debug_nodes,
            args.perf_nodes,
            pdb=args.pdb,
            service_load=load,
        ) as network:
            check = infra.checker.Checker()
            network.start_and_open(args)
            current_view = None
            primary, current_view = network.find_primary()

            # Number of nodes F to stop until network cannot make progress
            nodes_to_stop = math.ceil(len(args.nodes) / 2)
            if args.consensus == "BFT":
                nodes_to_stop = math.ceil(len(args.nodes) / 3)

            primary_is_known = True
            for node_to_stop in range(nodes_to_stop):
                primary, current_view = network.find_primary()

                LOG.debug(
                    "Commit new transactions, primary:{}, current_view:{}".format(
                        primary.local_node_id, current_view
                    )
                )
                with primary.client("user0") as c:
                    res = c.post(
                        "/app/log/private",
                        {
                            "id": current_view,
                            "msg": "This log is committed in view {}".format(
                                current_view
                            ),
                        },
                    )
                    check(res, result=True)

                LOG.debug("Waiting for transaction to be committed by all nodes")
                network.wait_for_all_nodes_to_commit(tx_id=TxID(res.view, res.seqno))

                try:
                    test_kill_primary_no_reqs(network, args)
                except PrimaryNotFound:
                    if node_to_stop < nodes_to_stop - 1:
                        raise
                    else:
                        primary_is_known = False

            assert not primary_is_known, "Primary is still known"
            LOG.success("Test ended successfully.")
def last_verified_txid(self) -> TxID:
    """
    Return the :py:class:`ccf.tx_id.TxID` of the last verified signature
    transaction in the *parsed* ledger.

    Note: The ledger should first be parsed before calling this function.

    :return: :py:class:`ccf.tx_id.TxID`
    """
    return TxID(
        self._ledger_validator.last_verified_view,
        self._ledger_validator.last_verified_seqno,
    )
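# Illustrative sketch (not part of the source): the docstring above requires
# the ledger to be parsed before last_verified_txid is called. Assuming this
# method lives on ccf.ledger.Ledger, that a Ledger can be constructed from a
# list of ledger directories, and that iterating over it drives parsing and
# signature verification, usage might look like the following.
import ccf.ledger


def example_last_verified(ledger_dirs):
    ledger = ccf.ledger.Ledger(ledger_dirs)
    for chunk in ledger:
        for _transaction in chunk:
            pass  # Walking the ledger parses it and verifies signature transactions
    tx_id = ledger.last_verified_txid()
    print(f"Last verified signature transaction: {tx_id}")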
def run(args):
    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        check = infra.checker.Checker()
        network.start_and_join(args)
        current_view = None

        # Number of nodes F to stop until network cannot make progress
        nodes_to_stop = math.ceil(len(args.nodes) / 2)
        if args.consensus == "bft":
            nodes_to_stop = math.ceil(len(args.nodes) / 3)

        primary_is_known = True
        for node_to_stop in range(nodes_to_stop):
            # Note that for the first iteration, the primary is known in advance anyway
            LOG.debug("Find freshly elected primary")
            # After a view change in bft, finding the new primary takes longer
            primary, current_view = network.find_primary(
                timeout=(30 if args.consensus == "bft" else 3)
            )

            LOG.debug(
                "Commit new transactions, primary:{}, current_view:{}".format(
                    primary.node_id, current_view
                )
            )
            with primary.client("user0") as c:
                res = c.post(
                    "/app/log/private",
                    {
                        "id": current_view,
                        "msg": "This log is committed in view {}".format(current_view),
                    },
                )
                check(res, result=True)

            LOG.debug("Waiting for transaction to be committed by all nodes")
            network.wait_for_all_nodes_to_commit(tx_id=TxID(res.view, res.seqno))

            try:
                test_kill_primary(network, args)
            except PrimaryNotFound:
                if node_to_stop < nodes_to_stop - 1:
                    raise
                else:
                    primary_is_known = False

        assert not primary_is_known, "Primary is still known"
        LOG.success("Test ended successfully.")
def test_nobuiltins_endpoints(network, args):
    primary, backups = network.find_nodes()
    with primary.client() as c:
        r = c.get("/app/commit")
        assert r.status_code == HTTPStatus.OK
        body_j = r.body.json()
        tx_id = TxID.from_str(body_j["transaction_id"])

        r = c.get("/app/node_summary")
        assert r.status_code == HTTPStatus.OK
        body_j = r.body.json()
        assert body_j["committed_view"] == tx_id.view
        assert body_j["committed_seqno"] == tx_id.seqno
        assert body_j["quote_format"] == "OE_SGX_v1"
        assert body_j["node_id"] == primary.node_id

        r = c.get("/app/api")
        assert r.status_code == HTTPStatus.OK
        openapi_spec_validator.validate_spec(r.body.json())

        r = c.get(f"/app/tx_id?seqno={tx_id.seqno}")
        assert r.status_code == HTTPStatus.OK
        body_j = r.body.json()
        assert body_j["transaction_id"] == f"{tx_id}"

        for i in range(3):
            if i != 0:
                time.sleep(1.5)
            r = c.get("/app/current_time")
            local_time = datetime.now(timezone.utc)
            assert r.status_code == HTTPStatus.OK
            body_j = r.body.json()
            service_time = datetime.fromisoformat(body_j["timestamp"])
            diff = (local_time - service_time).total_seconds()
            # This intends to test that the reported time is "close enough"
            # to the real current time. This is dependent on the skew between
            # clocks on this executor and the target node, and the request
            # latency (including Python IO and parsing). It may need to be
            # more lenient
            assert abs(diff) < 1, diff

        r = c.get("/app/all_nodes")
        assert r.status_code == HTTPStatus.OK
        body_j = r.body.json()
        known_node_ids = [node.node_id for node in (primary, *backups)]
        for node_id, node_info in body_j["nodes"].items():
            assert (
                node_id in known_node_ids
            ), f"Response contains '{node_id}', which is not in known IDs: {known_node_ids}"
            assert node_info["quote_format"] == "OE_SGX_v1"
def wait_for_all_nodes_to_catch_up(self, primary, timeout=10):
    """
    Wait for all nodes to have joined the network and globally replicated all
    transactions executed on the primary (including the transactions which
    added the nodes).
    """
    end_time = time.time() + timeout
    while time.time() < end_time:
        with primary.client() as c:
            resp = c.get("/node/commit")
            body = resp.body.json()
            tx_id = TxID.from_str(body["transaction_id"])
            if tx_id.valid():
                break
        time.sleep(0.1)
    assert (
        tx_id.valid()
    ), f"Primary {primary.node_id} has not made any progress yet ({tx_id})"

    caught_up_nodes = []
    while time.time() < end_time:
        caught_up_nodes = []
        for node in self.get_joined_nodes():
            with node.client() as c:
                resp = c.get(f"/node/local_tx?transaction_id={tx_id}")
                if resp.status_code != 200:
                    # Node may not have joined the network yet, try again
                    break
                status = TxStatus(resp.body.json()["status"])
                if status == TxStatus.Committed:
                    caught_up_nodes.append(node)
                elif status == TxStatus.Invalid:
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {tx_id} is invalid and will never be committed"
                    )
                else:
                    # Transaction is not yet committed on this node; poll again
                    pass
        if len(caught_up_nodes) == len(self.get_joined_nodes()):
            break
        time.sleep(0.1)
    assert len(caught_up_nodes) == len(
        self.get_joined_nodes()
    ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
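# Illustrative sketch (not part of the source): unlike wait_for_all_nodes_to_commit,
# this variant derives its target from the primary's current commit index, which
# makes it convenient right after topology changes since that commit already
# covers the transactions which added new nodes. The node-addition helpers and
# their signatures below are assumptions, not taken from the source.
def example_catch_up_after_join(network, args):
    primary, _ = network.find_primary()
    new_node = network.create_node("local://localhost")  # hypothetical target spec
    network.join_node(new_node, args.package, args)
    network.trust_node(new_node, args)
    # Every joined node, including new_node, must reach the primary's commit point
    network.wait_for_all_nodes_to_catch_up(primary)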
def wait_for_commit_proof(self, node, seqno, timeout=3):
    # Wait until the target seqno has a commit proof on a specific node.
    # This is achieved by first waiting for a commit over seqno, issuing
    # a write request and then waiting for a commit over that write
    end_time = time.time() + timeout
    while time.time() < end_time:
        with node.client(self.consortium.get_any_active_member().local_id) as c:
            r = c.get("/node/commit")
            current_tx = TxID.from_str(r.body.json()["transaction_id"])
            if current_tx.seqno >= seqno:
                # Using update_state_digest here as a convenient write tx
                # that is app agnostic
                r = c.post("/gov/ack/update_state_digest")
                assert (
                    r.status_code == http.HTTPStatus.OK.value
                ), f"Error ack/update_state_digest: {r}"
                c.wait_for_commit(r)
                return True
        time.sleep(0.1)
    raise TimeoutError(f"seqno {seqno} did not have commit proof after {timeout}s")
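# Illustrative sketch (not part of the source): a test that needs a commit
# proof over one of its own writes could pass the write's seqno to
# wait_for_commit_proof. The logging write below reuses the /app/log/private
# pattern from this suite; the fixture wiring is assumed.
def example_commit_proof(network):
    primary, _ = network.find_primary()
    with primary.client("user0") as c:
        r = c.post("/app/log/private", {"id": 1, "msg": "needs a proof"})
        assert r.status_code == 200, r
    backup = network.find_any_backup()
    # Returns True once r.seqno is covered by a commit proof on the backup
    network.wait_for_commit_proof(backup, r.seqno)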
def test_isolate_and_reconnect_primary(network, args):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        new_primary, _ = network.wait_for_new_primary(
            primary, nodes=backups, timeout_multiplier=6
        )
        new_tx = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        r = c.get("/node/commit")
        timeout = 5
        end_time = time.time() + timeout
        while time.time() < end_time:
            current_tx = TxID.from_str(
                c.get("/node/commit").body.json()["transaction_id"]
            )
            if current_tx.seqno >= new_tx.seqno:
                return network
            time.sleep(0.1)
        assert False, f"Stuck at {r}"
def test_nobuiltins_endpoints(network, args):
    primary, _ = network.find_primary()
    with primary.client() as c:
        r = c.get("/app/commit")
        assert r.status_code == HTTPStatus.OK
        body_j = r.body.json()
        tx_id = TxID.from_str(body_j["transaction_id"])

        r = c.get("/app/node_summary")
        assert r.status_code == HTTPStatus.OK
        body_j = r.body.json()
        assert body_j["committed_view"] == tx_id.view
        assert body_j["committed_seqno"] == tx_id.seqno
        assert body_j["quote_format"] == "OE_SGX_v1"

        r = c.get("/app/api")
        assert r.status_code == HTTPStatus.OK
        openapi_spec_validator.validate_spec(r.body.json())

        r = c.get(f"/app/tx_id?seqno={tx_id.seqno}")
        assert r.status_code == HTTPStatus.OK
        body_j = r.body.json()
        assert body_j["transaction_id"] == f"{tx_id}"
def from_raw(raw):
    # Raw is the output of curl, which is a full HTTP response.
    # But in the case of a redirect, it is multiple concatenated responses.
    # We want the final response, so we keep constructing new responses from
    # this stream until we have reached the end
    while True:
        sock = FakeSocket(raw)
        response = HTTPResponse(sock)
        response.begin()
        response_len = sock.file.tell() + response.length
        raw_len = len(raw)
        if raw_len == response_len:
            break
        raw = raw[response_len:]

    raw_body = response.read()

    tx_id = TxID.from_str(response.getheader(CCF_TX_ID_HEADER))
    return Response(
        response.status,
        body=RawResponseBody(raw_body),
        seqno=tx_id.seqno,
        view=tx_id.view,
        headers=response.headers,
    )
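# Illustrative sketch (not part of the source): from_raw consumes the raw bytes
# of a curl invocation, which may contain several concatenated HTTP responses
# when the request was redirected. Shelling out to curl with -i (include
# headers) and -L (follow redirects) as below, and exposing from_raw as a
# static constructor on Response, are both assumptions.
import subprocess


def example_from_raw(url, cacert):
    raw = subprocess.run(
        ["curl", "-sS", "-i", "-L", "--cacert", cacert, url],
        check=True,
        capture_output=True,
    ).stdout
    response = Response.from_raw(raw)
    print(response.status_code, response.view, response.seqno)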
def test_view_history(network, args):
    if args.consensus == "bft":
        # This appears to work in BFT, but it is unacceptably slow:
        # - Each /tx request is a write, with a non-trivial roundtrip response time
        # - Since each read (e.g. /tx and /commit) has produced writes and a unique tx ID,
        #   there are too many IDs to test exhaustively
        # We could rectify this by making this test non-exhaustive (bisecting for view changes,
        # sampling within a view), but for now it is exhaustive and Raft-only
        LOG.warning("Skipping view reconstruction in BFT")
        return network

    check = infra.checker.Checker()

    previous_node = None
    previous_tx_ids = ""
    for node in network.get_joined_nodes():
        with node.client("user0") as c:
            r = c.get("/node/commit")
            check(c)
            commit_tx_id = TxID.from_str(r.body.json()["transaction_id"])

            # Retrieve status for all possible Tx IDs
            seqno_to_views = {}
            for seqno in range(1, commit_tx_id.seqno + 1):
                views = []
                for view in range(1, commit_tx_id.view + 1):
                    r = c.get(f"/node/tx?transaction_id={view}.{seqno}", log_capture=[])
                    check(r)
                    status = TxStatus(r.body.json()["status"])
                    if status == TxStatus.Committed:
                        views.append(view)
                seqno_to_views[seqno] = views

            # Check we have exactly one Tx ID for each seqno
            txs_ok = True
            for seqno, views in seqno_to_views.items():
                if len(views) != 1:
                    txs_ok = False
                    LOG.error(
                        f"Node {node.node_id}: Found {len(views)} committed Tx IDs for seqno {seqno}"
                    )

            tx_ids_condensed = ", ".join(
                " OR ".join(f"{view}.{seqno}" for view in views or ["UNKNOWN"])
                for seqno, views in seqno_to_views.items()
            )

            if txs_ok:
                LOG.success(
                    f"Node {node.node_id}: Found a valid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
            else:
                LOG.error(
                    f"Node {node.node_id}: Invalid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
                raise RuntimeError(
                    f"Node {node.node_id}: Incomplete or inconsistent view history"
                )

            # Compare view history between nodes
            if previous_tx_ids:
                # Some nodes may have a slightly longer view history so only compare the common prefix
                min_tx_ids_len = min(len(previous_tx_ids), len(tx_ids_condensed))
                assert (
                    tx_ids_condensed[:min_tx_ids_len]
                    == previous_tx_ids[:min_tx_ids_len]
                ), f"Tx IDs don't match between node {node.node_id} and node {previous_node.node_id}: {tx_ids_condensed[:min_tx_ids_len]} and {previous_tx_ids[:min_tx_ids_len]}"

            previous_tx_ids = tx_ids_condensed
            previous_node = node

    return network
def last_verified_txid(self) -> TxID:
    return TxID(self.last_verified_view, self.last_verified_seqno)
def issue(
    self,
    network,
    number_txs=1,
    on_backup=False,
    repeat=False,
    idx=None,
    wait_for_sync=True,
    log_capture=None,
):
    self.network = network
    remote_node, _ = network.find_primary(log_capture=log_capture)
    if on_backup:
        remote_node = network.find_any_backup()

    LOG.info(
        f"Applying {number_txs} logging txs to node {remote_node.local_node_id}"
    )

    with remote_node.client(self.user) as c:
        check_commit = infra.checker.Checker(c)

        for _ in range(number_txs):
            if not repeat and idx is None:
                self.idx += 1

            target_idx = idx
            if target_idx is None:
                target_idx = self.idx

            priv_msg = (
                f"Private message at idx {target_idx} [{len(self.priv[target_idx])}]"
            )
            rep_priv = c.post(
                "/app/log/private",
                {
                    "id": target_idx,
                    "msg": priv_msg,
                },
                headers=self._get_headers_base(),
                log_capture=log_capture,
            )
            self.priv[target_idx].append(
                {"msg": priv_msg, "seqno": rep_priv.seqno, "view": rep_priv.view}
            )

            pub_msg = (
                f"Public message at idx {target_idx} [{len(self.pub[target_idx])}]"
            )
            rep_pub = c.post(
                "/app/log/public",
                {
                    "id": target_idx,
                    "msg": pub_msg,
                },
                headers=self._get_headers_base(),
                log_capture=log_capture,
            )
            self.pub[target_idx].append(
                {"msg": pub_msg, "seqno": rep_pub.seqno, "view": rep_pub.view}
            )

        if number_txs and wait_for_sync:
            check_commit(rep_pub, result=True)

    if wait_for_sync:
        network.wait_for_all_nodes_to_commit(tx_id=TxID(rep_pub.view, rep_pub.seqno))
def issue(
    self,
    network,
    number_txs=1,
    on_backup=False,
    repeat=False,
    idx=None,
    wait_for_sync=True,
    log_capture=None,
    send_private=True,
    send_public=True,
    record_claim=False,
    msg=None,
    user=None,
    url_suffix=None,
):
    self.network = network
    remote_node, _ = network.find_primary(log_capture=log_capture)
    if on_backup:
        remote_node = network.find_any_backup()

    LOG.info(
        f"Applying {number_txs} logging txs to node {remote_node.local_node_id}"
    )

    headers = None
    if not user:
        headers = self._get_headers_base()

    with remote_node.client(user or self.user) as c:
        check_commit = infra.checker.Checker(c)

        for _ in range(number_txs):
            if not repeat and idx is None:
                self.idx += 1

            target_idx = idx
            if target_idx is None:
                target_idx = self.idx

            if send_private:
                if msg:
                    priv_msg = msg
                else:
                    priv_msg = f"Private message at idx {target_idx} [{len(self.priv[target_idx])}]"
                args = {"id": target_idx, "msg": priv_msg}
                if self.scope is not None:
                    args["scope"] = self.scope
                url = "/app/log/private"
                if url_suffix:
                    url += "/" + url_suffix
                if self.scope is not None:
                    url += "?scope=" + self.scope
                rep_priv = c.post(
                    url,
                    args,
                    headers=headers,
                    log_capture=log_capture,
                )
                assert rep_priv.status_code == http.HTTPStatus.OK, rep_priv
                self.priv[target_idx].append(
                    {
                        "msg": priv_msg,
                        "seqno": rep_priv.seqno,
                        "view": rep_priv.view,
                        "scope": self.scope,
                    }
                )
                wait_point = rep_priv

            if send_public:
                if msg:
                    pub_msg = msg
                else:
                    pub_msg = f"Public message at idx {target_idx} [{len(self.pub[target_idx])}]"
                payload = {
                    "id": target_idx,
                    "msg": pub_msg,
                }
                url = "/app/log/public"
                if url_suffix:
                    url += "/" + url_suffix
                if self.scope is not None:
                    url += "?scope=" + self.scope
                if record_claim:
                    payload["record_claim"] = True
                rep_pub = c.post(
                    url,
                    payload,
                    headers=headers,
                    log_capture=log_capture,
                )
                assert rep_pub.status_code == http.HTTPStatus.OK, rep_pub
                self.pub[target_idx].append(
                    {
                        "msg": pub_msg,
                        "seqno": rep_pub.seqno,
                        "view": rep_pub.view,
                        "scope": self.scope,
                    }
                )
                wait_point = rep_pub

        if number_txs and wait_for_sync:
            check_commit(wait_point, result=True)

    if wait_for_sync:
        network.wait_for_all_nodes_to_commit(
            tx_id=TxID(wait_point.view, wait_point.seqno)
        )

    return TxID(wait_point.view, wait_point.seqno)