def find_primary(self, timeout=3):
    """
    Find the identity of the primary in the network and return its identity
    and the current view.

    Polls every joined node until one reports a known primary, or until
    `timeout` seconds have elapsed.

    :param timeout: Maximum number of seconds to keep polling.
    :return: Tuple of (primary node object, current view).
    :raises PrimaryNotFound: if no node reported a primary before the timeout.
    """
    primary_id = None
    view = None
    end_time = time.time() + timeout
    while time.time() < end_time:
        for node in self.get_joined_nodes():
            with node.client() as c:
                try:
                    res = c.get("/node/primary_info")
                    if res.status_code == 200:
                        primary_id = res.body["primary_id"]
                        view = res.body["current_view"]
                        break
                    else:
                        # Not an error: the node may simply not know the
                        # primary yet (e.g. during an election)
                        assert "Primary unknown" in res.body, res
                        LOG.warning("Primary unknown. Retrying...")
                except CCFConnectionException:
                    # Grammar fix: "successful connect" -> "successfully connect"
                    LOG.warning(
                        f"Could not successfully connect to node {node.node_id}. Retrying..."
                    )
        if primary_id is not None:
            break
        time.sleep(0.1)

    if primary_id is None:
        raise PrimaryNotFound
    return (self._get_node_by_id(primary_id), view)
def find_recovery_tx_seqno(node):
    """
    Return the seqno at which the service was re-opened ("Open" status in the
    service info table) after a recovery, scanning the node's ledger from the
    last recovered seqno onwards.

    Returns None if the node does not report a last recovered seqno, or if no
    such re-opening transaction is found in the ledger.
    """
    with node.client() as c:
        node_state = c.get("/node/state").body.json()
    if "last_recovered_seqno" not in node_state:
        return None
    recovered_seqno = node_state["last_recovered_seqno"]

    ledger = ccf.ledger.Ledger(node.remote.ledger_paths(), committed_only=False)
    for chunk in ledger:
        # Chunks that end before the recovery point cannot contain the
        # re-opening transaction, so skip them entirely
        if chunk.get_seqnos()[1] < recovered_seqno:
            continue
        for tx in chunk:
            public = tx.get_public_domain()
            tables = public.get_tables()
            if ccf.ledger.SERVICE_INFO_TABLE_NAME not in tables:
                continue
            raw_service_info = tables[ccf.ledger.SERVICE_INFO_TABLE_NAME][
                ccf.ledger.WELL_KNOWN_SINGLETON_TABLE_KEY
            ]
            if json.loads(raw_service_info)["status"] == "Open":
                return public.get_seqno()
    return None
def wait_for_all_nodes_to_commit(self, primary=None, tx_id=None, timeout=10):
    """
    Wait for all nodes to have joined the network and committed all
    transactions executed on the primary.

    :param primary: Node queried for the latest readable TxID when `tx_id`
        is not given.
    :param tx_id: Specific transaction to wait for; if None, the latest
        readable TxID on `primary` is used.
    :param timeout: Overall deadline in seconds.
    :raises ValueError: if neither `primary` nor `tx_id` is specified.
    :raises RuntimeError: if any node reports the transaction as invalid.
    """
    if not (primary or tx_id):
        raise ValueError("Either a valid TxID or primary node should be specified")

    end_time = time.time() + timeout

    # If no TxID is specified, retrieve latest readable one
    if tx_id is None:  # fixed: "== None" -> identity check per PEP 8
        while time.time() < end_time:
            with primary.client() as c:
                resp = c.get(
                    "/node/network/nodes/self"
                )  # Well-known read-only endpoint
                tx_id = TxID(resp.view, resp.seqno)
                if tx_id.valid():
                    break
            time.sleep(0.1)
        assert (
            tx_id.valid()
        ), f"Primary {primary.node_id} has not made any progress yet ({tx_id})"

    caught_up_nodes = []
    logs = {}
    while time.time() < end_time:
        caught_up_nodes = []
        for node in self.get_joined_nodes():
            with node.client() as c:
                logs[node.node_id] = []
                resp = c.get(
                    f"/node/local_tx?transaction_id={tx_id}",
                    log_capture=logs[node.node_id],
                )
                if resp.status_code != 200:
                    # Node may not have joined the network yet, try again
                    break
                status = TxStatus(resp.body.json()["status"])
                if status == TxStatus.Committed:
                    caught_up_nodes.append(node)
                elif status == TxStatus.Invalid:
                    flush_info(logs[node.node_id], None, 0)
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {tx_id} is invalid and will never be committed"
                    )
                else:
                    # Pending: keep polling until committed or timeout
                    pass
        if len(caught_up_nodes) == len(self.get_joined_nodes()):
            break
        time.sleep(0.1)
    for lines in logs.values():
        flush_info(lines, None, 0)
    assert len(caught_up_nodes) == len(
        self.get_joined_nodes()
    ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
def _wait_for_app_open(self, node, timeout=3): end_time = time.time() + timeout while time.time() < end_time: # As an operator, query a well-known /app endpoint to find out # if the app has been opened to users with node.client() as c: r = c.get("/app/commit") if not (r.status_code == http.HTTPStatus.NOT_FOUND.value): return time.sleep(0.1) raise TimeoutError(f"Application frontend was not open after {timeout}s")
def wait_for_commit_proof(self, node, seqno, timeout=3):
    """
    Wait until the target seqno has a commit proof on a specific node.

    This is achieved by first waiting for a commit over seqno, issuing a
    write request and then waiting for a commit over that write.

    :raises TimeoutError: if the proof is not obtained within `timeout`s.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        with node.client() as client:
            commit_tx = TxID.from_str(
                client.get("/node/commit").body.json()["transaction_id"]
            )
            if commit_tx.seqno >= seqno:
                member_id = self.consortium.get_any_active_member().local_id
                with node.client(member_id) as member_client:
                    # Using update_state_digest here as a convenient write tx
                    # that is app agnostic
                    ack = member_client.post("/gov/ack/update_state_digest")
                    assert (
                        ack.status_code == http.HTTPStatus.OK.value
                    ), f"Error ack/update_state_digest: {ack}"
                client.wait_for_commit(ack)
                return True
        time.sleep(0.1)
    raise TimeoutError(f"seqno {seqno} did not have commit proof after {timeout}s")
def wait_for_node_commit_sync(self, timeout=3):
    """
    Wait for commit level to get in sync on all nodes. This is expected
    to happen once CFTR has been established, in the absence of new
    transactions.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        commits = []
        for node in self.get_joined_nodes():
            with node.client() as client:
                response = client.get("/node/commit")
                assert response.status_code == http.HTTPStatus.OK.value
                commits.append(response.body.json()["transaction_id"])
        if all(tx == commits[0] for tx in commits):
            break
        time.sleep(0.1)
    expected = [commits[0]] * len(commits)
    if expected != commits:
        # Dump each node's consensus state to aid debugging before failing
        for node in self.get_joined_nodes():
            with node.client() as client:
                pprint.pprint(client.get("/node/consensus").body.json())
    assert expected == commits, f"Multiple commit values: {commits}"
def wait_for_state(self, node, state, timeout=3):
    """
    Wait for `node` to report the given state via /node/state.

    Connection refusals are tolerated while waiting (the node may not be
    listening yet). When the awaited state is "partOfNetwork", the service
    status is marked OPEN.

    :raises TimeoutError: if the state is not reached within `timeout`s.
    """
    deadline = time.time() + timeout
    reached = False
    while time.time() < deadline:
        try:
            with node.client(connection_timeout=timeout) as c:
                if c.get("/node/state").body.json()["state"] == state:
                    reached = True
                    break
        except ConnectionRefusedError:
            # Node is not accepting connections yet; keep polling
            pass
        time.sleep(0.1)
    if not reached:
        raise TimeoutError(
            f"Timed out waiting for state {state} on node {node.node_id}"
        )
    if state == "partOfNetwork":
        self.status = ServiceStatus.OPEN
def wait_for_all_nodes_to_catch_up(self, primary, timeout=10):
    """
    Wait for all nodes to have joined the network and globally replicated
    all transactions executed on the primary (including transactions which
    added the nodes).
    """
    # Phase 1: wait for the primary itself to report a non-zero commit
    end_time = time.time() + timeout
    while time.time() < end_time:
        with primary.client() as c:
            resp = c.get("/node/commit")
            body = resp.body.json()
            seqno = body["seqno"]
            view = body["view"]
            if seqno != 0:
                break
        time.sleep(0.1)
    assert (
        seqno != 0
    ), f"Primary {primary.node_id} has not made any progress yet (view: {view}, seqno: {seqno})"

    # Phase 2: poll every joined node until it reports the primary's
    # transaction as Committed, within the same overall deadline
    caught_up_nodes = []
    while time.time() < end_time:
        caught_up_nodes = []
        for node in self.get_joined_nodes():
            with node.client() as c:
                # NOTE(review): result is unused; presumably issued to refresh
                # the node's view before querying local_tx — confirm before
                # removing
                c.get("/node/commit")
                resp = c.get(f"/node/local_tx?view={view}&seqno={seqno}")
                if resp.status_code != 200:
                    # Node may not have joined the network yet, try again
                    break
                status = TxStatus(resp.body.json()["status"])
                if status == TxStatus.Committed:
                    caught_up_nodes.append(node)
                elif status == TxStatus.Invalid:
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {view}.{seqno} is invalid and will never be committed"
                    )
                else:
                    # Pending: keep polling until committed or timeout
                    pass
        if len(caught_up_nodes) == len(self.get_joined_nodes()):
            break
        time.sleep(0.1)
    assert len(caught_up_nodes) == len(
        self.get_joined_nodes()
    ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
def wait_for_node_commit_sync(self, timeout=3):
    """
    Wait for commit level to get in sync on all nodes. This is expected
    to happen once CFTR has been established, in the absence of new
    transactions.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        commits = []
        for node in self.get_joined_nodes():
            with node.client() as client:
                response = client.get("/node/commit")
                commits.append(f"{response.view}.{response.seqno}")
        if commits == [commits[0]] * len(commits):
            break
        time.sleep(0.1)
    expected = [commits[0]] * len(commits)
    assert expected == commits, f"{commits} != {expected}"
def find_primary(self, nodes=None, timeout=3, log_capture=None):
    """
    Find the identity of the primary in the network and return its identity
    and the current view.

    :param nodes: Optional subset of nodes to query; defaults to all joined
        nodes.
    :param log_capture: Optional sink receiving the captured client logs.
    :raises PrimaryNotFound: if no primary is known before the timeout.
    """
    primary_id = None
    view = None
    logs = []
    candidates = nodes or self.get_joined_nodes()
    deadline = time.time() + timeout
    while time.time() < deadline:
        for node in candidates:
            with node.client() as c:
                try:
                    # Only keep the capture from the most recent attempt
                    logs = []
                    res = c.get("/node/network", timeout=1, log_capture=logs)
                    assert res.status_code == http.HTTPStatus.OK.value, res
                    body = res.body.json()
                    view = body["current_view"]
                    primary_id = body["primary_id"]
                    if primary_id is not None:
                        break
                except Exception:
                    LOG.warning(
                        f"Could not successfully connect to node {node.local_node_id}. Retrying..."
                    )
        if primary_id is not None:
            break
        time.sleep(0.1)

    # Flush the last capture whether or not a primary was found
    flush_info(logs, log_capture, 0)
    if primary_id is None:
        raise PrimaryNotFound
    return (self._get_node_by_service_id(primary_id), view)
def wait_for_primary_unanimity(
    self, timeout_multiplier=DEFAULT_TIMEOUT_MULTIPLIER, min_view=None):
    """
    Wait until every joined node reports the same primary, and return that
    primary node.

    :param timeout_multiplier: Multiplied by the observed election duration
        to derive the overall deadline.
    :param min_view: If set, only accept a primary reported in a view
        strictly greater than this value.
    :raises AssertionError: if the nodes still disagree at the deadline.
    """
    timeout = self.observed_election_duration * timeout_multiplier
    LOG.info(
        f"Waiting up to {timeout}s for all nodes to agree on the primary")
    start_time = time.time()
    end_time = start_time + timeout
    primaries = []
    while time.time() < end_time:
        primaries = []
        for node in self.get_joined_nodes():
            logs = []
            try:
                # Ask each node individually for its view of the primary
                primary, view = self.find_primary(nodes=[node], log_capture=logs)
                if min_view is None or view > min_view:
                    primaries.append(primary)
            except PrimaryNotFound:
                # This node does not know a primary yet; retry below
                pass
        # Stop checking once all primaries are the same
        if (len(self.get_joined_nodes()) == len(primaries)
                and len(set(primaries)) <= 1):
            break
        time.sleep(0.1)
    all_good = (len(self.get_joined_nodes()) == len(primaries)
                and len(set(primaries)) <= 1)
    if not all_good:
        # NOTE(review): `logs` only holds the capture from the last node
        # queried in the final iteration — confirm whether that is intended
        flush_info(logs)
        # Dump each node's consensus state to aid debugging before failing
        for node in self.get_joined_nodes():
            with node.client() as c:
                r = c.get("/node/consensus")
                pprint.pprint(r.body.json())
    assert all_good, f"Multiple primaries: {primaries}"
    delay = time.time() - start_time
    LOG.info(
        f"Primary unanimity after {delay}s: {primaries[0].local_node_id} ({primaries[0].node_id})"
    )
    return primaries[0]
def find_primary(self, timeout=3, log_capture=None):
    """
    Find the identity of the primary in the network and return its identity
    and the current view.

    :param log_capture: Optional sink receiving the captured client logs.
    :raises PrimaryNotFound: if no primary is known before the timeout, or a
        view change is in progress.
    """
    primary_id = None
    view = None
    # Bug fix: initialise here so the post-loop check cannot raise NameError
    # when no node responds successfully before the timeout
    view_change_in_progress = None
    logs = []
    end_time = time.time() + timeout
    while time.time() < end_time:
        for node in self.get_joined_nodes():
            with node.client() as c:
                try:
                    logs = []
                    res = c.get("/node/network", log_capture=logs)
                    assert res.status_code == 200, res
                    body = res.body.json()
                    primary_id = body["primary_id"]
                    view = body["current_view"]
                    view_change_in_progress = body["view_change_in_progress"]
                    if primary_id is not None:
                        break
                except CCFConnectionException:
                    LOG.warning(
                        f"Could not successfully connect to node {node.node_id}. Retrying..."
                    )
        if primary_id is not None:
            break
        time.sleep(0.1)

    if primary_id is None or view_change_in_progress:
        flush_info(logs, log_capture, 0)
        raise PrimaryNotFound
    flush_info(logs, log_capture, 0)
    return (self._get_node_by_id(primary_id), view)