def wait_for_new_primary(self, old_primary, nodes=None, timeout_multiplier=2):
    """
    Poll the network until a primary different from old_primary has been
    elected, returning (new_primary, new_term).

    Raises TimeoutError (or PrimaryNotFound, if no primary was ever found
    during polling) when no new primary appears within the timeout.
    """
    # We arbitrarily pick twice the election duration to protect ourselves against the somewhat
    # but not that rare cases when the first round of election fails (short timeout are particularly susceptible to this)
    timeout = self.election_duration * timeout_multiplier
    LOG.info(
        f"Waiting up to {timeout}s for a new primary different from {old_primary.local_node_id} ({old_primary.node_id}) to be elected..."
    )
    deadline = time.time() + timeout
    # Exception class raised on timeout; upgraded to PrimaryNotFound if we
    # ever observed a primary-less network while polling.
    error = TimeoutError
    logs = []
    while time.time() < deadline:
        try:
            logs = []
            new_primary, new_term = self.find_primary(nodes=nodes, log_capture=logs)
        except PrimaryNotFound:
            error = PrimaryNotFound
        except Exception:
            # Transient failures (e.g. connection errors) are expected
            # while an election is in progress - keep polling.
            pass
        else:
            if new_primary.node_id != old_primary.node_id:
                flush_info(logs, None)
                LOG.info(
                    f"New primary is {new_primary.local_node_id} ({new_primary.node_id}) in term {new_term}"
                )
                return (new_primary, new_term)
        time.sleep(0.1)
    flush_info(logs, None)
    raise error(f"A new primary was not elected after {timeout} seconds")
def wait_for_new_primary_in(
    self,
    expected_node_ids,
    nodes=None,
    timeout_multiplier=DEFAULT_TIMEOUT_MULTIPLIER,
):
    """
    Poll the network until the elected primary is one of expected_node_ids,
    returning (new_primary, new_term).

    Raises TimeoutError (or PrimaryNotFound, if no primary was ever found
    during polling) when no such primary appears within the timeout.
    """
    # We arbitrarily pick twice the election duration to protect ourselves against the somewhat
    # but not that rare cases when the first round of election fails (short timeout are particularly susceptible to this)
    timeout = self.observed_election_duration * timeout_multiplier
    LOG.info(
        f"Waiting up to {timeout}s for a new primary in {expected_node_ids} to be elected..."
    )
    start_time = time.time()
    # Exception class raised on timeout; upgraded to PrimaryNotFound if we
    # ever observed a primary-less network while polling.
    error = TimeoutError
    logs = []
    while time.time() < start_time + timeout:
        try:
            logs = []
            new_primary, new_term = self.find_primary(nodes=nodes, log_capture=logs)
        except PrimaryNotFound:
            error = PrimaryNotFound
        except Exception:
            # Transient failures are expected mid-election - keep polling.
            pass
        else:
            if new_primary.node_id in expected_node_ids:
                flush_info(logs, None)
                delay = time.time() - start_time
                LOG.info(
                    f"New primary after {delay}s is {new_primary.local_node_id} ({new_primary.node_id}) in term {new_term}"
                )
                return (new_primary, new_term)
        time.sleep(0.1)
    flush_info(logs, None)
    raise error(f"A new primary was not elected after {timeout} seconds")
def wait_for_all_nodes_to_commit(self, primary=None, tx_id=None, timeout=10):
    """
    Wait for all nodes to have joined the network and committed all
    transactions executed on the primary.

    :param primary: node used to discover the latest readable TxID when
        tx_id is not supplied.
    :param tx_id: transaction ID that every node must report as Committed.
    :param timeout: overall deadline in seconds for both TxID discovery
        and commit confirmation.
    :raises ValueError: if neither primary nor tx_id is given.
    :raises RuntimeError: if any node reports the transaction as Invalid.
    """
    if not (primary or tx_id):
        raise ValueError("Either a valid TxID or primary node should be specified")

    end_time = time.time() + timeout

    # If no TxID is specified, retrieve latest readable one
    # Fix: compare against None with `is`, not `==` (PEP 8; `==` invokes
    # TxID.__eq__ which may not handle None).
    if tx_id is None:
        while time.time() < end_time:
            with primary.client() as c:
                resp = c.get(
                    "/node/network/nodes/self"
                )  # Well-known read-only endpoint
                tx_id = TxID(resp.view, resp.seqno)
                if tx_id.valid():
                    break
            time.sleep(0.1)
        assert (
            tx_id.valid()
        ), f"Primary {primary.node_id} has not made any progress yet ({tx_id})"

    caught_up_nodes = []
    logs = {}
    while time.time() < end_time:
        caught_up_nodes = []
        for node in self.get_joined_nodes():
            with node.client() as c:
                logs[node.node_id] = []
                resp = c.get(
                    f"/node/local_tx?transaction_id={tx_id}",
                    log_capture=logs[node.node_id],
                )
                if resp.status_code != 200:
                    # Node may not have joined the network yet, try again
                    break
                status = TxStatus(resp.body.json()["status"])
                if status == TxStatus.Committed:
                    caught_up_nodes.append(node)
                elif status == TxStatus.Invalid:
                    flush_info(logs[node.node_id], None, 0)
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {tx_id} is invalid and will never be committed"
                    )
                # Pending/Unknown: keep polling until the deadline.
        if len(caught_up_nodes) == len(self.get_joined_nodes()):
            break
        time.sleep(0.1)

    for lines in logs.values():
        flush_info(lines, None, 0)
    assert len(caught_up_nodes) == len(
        self.get_joined_nodes()
    ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
def wait_for_new_primary(self, old_primary, nodes=None, timeout_multiplier=DEFAULT_TIMEOUT_MULTIPLIER):
    """
    Poll the network until a primary different from old_primary has been
    elected, returning (new_primary, new_term).

    On BFT networks, first submits a request through a backup to nudge a
    view change along, since BFT only switches views on client traffic.

    Raises TimeoutError (or PrimaryNotFound, if no primary was ever found
    during polling) when no new primary appears within the timeout.
    """
    # We arbitrarily pick twice the election duration to protect ourselves against the somewhat
    # but not that rare cases when the first round of election fails (short timeout are particularly susceptible to this)
    timeout = self.observed_election_duration * timeout_multiplier
    LOG.info(
        f"Waiting up to {timeout}s for a new primary different from {old_primary.local_node_id} ({old_primary.node_id}) to be elected..."
    )
    start_time = time.time()
    error = TimeoutError
    logs = []

    backup = self.find_any_backup(old_primary)
    if backup.get_consensus() == "bft":
        # BFT view changes are driven by client requests, so force one.
        try:
            with backup.client("user0") as c:
                _ = c.post(
                    "/app/log/private",
                    {
                        "id": -1,
                        "msg": "This is submitted to force a view change"
                    },
                )
            time.sleep(1)
        except CCFConnectionException:
            LOG.warning(
                f"Could not successfully connect to node {backup.node_id}."
            )

    while time.time() < start_time + timeout:
        try:
            logs = []
            candidate, term = self.find_primary(nodes=nodes, log_capture=logs)
        except PrimaryNotFound:
            error = PrimaryNotFound
        except Exception:
            # Transient failures are expected mid-election - keep polling.
            pass
        else:
            if candidate.node_id != old_primary.node_id:
                flush_info(logs, None)
                delay = time.time() - start_time
                LOG.info(
                    f"New primary after {delay}s is {candidate.local_node_id} ({candidate.node_id}) in term {term}"
                )
                return (candidate, term)
        time.sleep(0.1)
    flush_info(logs, None)
    raise error(f"A new primary was not elected after {timeout} seconds")
def find_primary(self, nodes=None, timeout=3, log_capture=None):
    """
    Find the identity of the primary in the network and return its identity
    and the current view.
    """
    primary_id = None
    view = None
    logs = []

    # Ask the supplied nodes if given (and non-empty), else all joined nodes.
    asked_nodes = nodes or self.get_joined_nodes()
    deadline = time.time() + timeout
    while time.time() < deadline:
        for node in asked_nodes:
            with node.client() as c:
                try:
                    logs = []
                    res = c.get("/node/network", timeout=1, log_capture=logs)
                    assert res.status_code == http.HTTPStatus.OK.value, res
                    body = res.body.json()
                    view = body["current_view"]
                    primary_id = body["primary_id"]
                except Exception:
                    # Node unreachable or responded abnormally - try the next one.
                    LOG.warning(
                        f"Could not successfully connect to node {node.local_node_id}. Retrying..."
                    )
                    continue
            if primary_id is not None:
                break
        if primary_id is not None:
            break
        time.sleep(0.1)

    # Flush captured logs on both the success and failure paths.
    flush_info(logs, log_capture, 0)
    if primary_id is None:
        raise PrimaryNotFound
    return (self._get_node_by_service_id(primary_id), view)
def wait_for_primary_unanimity(
        self, timeout_multiplier=DEFAULT_TIMEOUT_MULTIPLIER, min_view=None):
    """
    Wait until every joined node reports the same primary (optionally in a
    view strictly greater than min_view) and return that primary.

    :param timeout_multiplier: multiplier applied to the observed election
        duration to derive the overall deadline.
    :param min_view: if set, a node's answer only counts when its reported
        view is > min_view.
    :raises AssertionError: if the nodes do not agree before the deadline.
    """
    timeout = self.observed_election_duration * timeout_multiplier
    LOG.info(
        f"Waiting up to {timeout}s for all nodes to agree on the primary")
    start_time = time.time()
    end_time = start_time + timeout
    primaries = []
    # Fix: initialise logs before the loop. Previously it was only bound
    # inside the inner for loop, so flush_info(logs) below raised a
    # NameError if the loop bodies never executed (e.g. immediate timeout).
    # Note: logs only ever holds the capture of the last node queried.
    logs = []
    while time.time() < end_time:
        primaries = []
        for node in self.get_joined_nodes():
            logs = []
            try:
                primary, view = self.find_primary(nodes=[node], log_capture=logs)
                if min_view is None or view > min_view:
                    primaries.append(primary)
            except PrimaryNotFound:
                # This node currently sees no primary; retry next round.
                pass
        # Stop checking once all primaries are the same
        if (len(self.get_joined_nodes()) == len(primaries)
                and len(set(primaries)) <= 1):
            break
        time.sleep(0.1)
    all_good = (len(self.get_joined_nodes()) == len(primaries)
                and len(set(primaries)) <= 1)
    if not all_good:
        # Dump captured logs and each node's consensus state for diagnosis.
        flush_info(logs)
        for node in self.get_joined_nodes():
            with node.client() as c:
                r = c.get("/node/consensus")
                pprint.pprint(r.body.json())
    assert all_good, f"Multiple primaries: {primaries}"
    delay = time.time() - start_time
    LOG.info(
        f"Primary unanimity after {delay}s: {primaries[0].local_node_id} ({primaries[0].node_id})"
    )
    return primaries[0]
def find_primary(self, timeout=3, log_capture=None):
    """
    Find the identity of the primary in the network and return its identity
    and the current view.

    Polls every joined node's /node/network endpoint until one reports a
    non-None primary_id or the timeout elapses. Raises PrimaryNotFound if
    no primary was found, or if the reporting node indicated a view change
    is in progress.
    """
    primary_id = None
    view = None
    logs = []
    end_time = time.time() + timeout
    while time.time() < end_time:
        for node in self.get_joined_nodes():
            with node.client() as c:
                try:
                    logs = []
                    res = c.get("/node/network", log_capture=logs)
                    assert res.status_code == 200, res
                    body = res.body.json()
                    primary_id = body["primary_id"]
                    view = body["current_view"]
                    view_change_in_progress = body[
                        "view_change_in_progress"]
                    if primary_id is not None:
                        break
                except CCFConnectionException:
                    # Unreachable node: ask the next one. NOTE(review): a
                    # failing status-code assert above raises AssertionError,
                    # which is NOT caught here and will propagate.
                    LOG.warning(
                        f"Could not successfully connect to node {node.node_id}. Retrying..."
                    )
        if primary_id is not None:
            break
        time.sleep(0.1)
    # Subtle: view_change_in_progress is only bound on an iteration that also
    # set primary_id (both assigned in the same try block before the break).
    # If primary_id is still None, the `or` short-circuits before evaluating
    # view_change_in_progress, avoiding a NameError - do not reorder.
    if primary_id is None or view_change_in_progress:
        flush_info(logs, log_capture, 0)
        raise PrimaryNotFound
    flush_info(logs, log_capture, 0)
    return (self._get_node_by_id(primary_id), view)
def stop_all_nodes(self, skip_verification=False, verbose_verification=False):
    """
    Stop every node in the network, optionally verifying beforehand that all
    committed transactions are readable, and afterwards that the committed
    ledger files of all stopped nodes are consistent with the node holding
    the longest ledger.

    :param skip_verification: skip both the pre-shutdown tx verification and
        the post-shutdown ledger consistency check.
    :param verbose_verification: flush the tx verification log capture.
    :raises Exception: on missing or mismatching ledger files.
    :raises NodeShutdownError: if a node reported a fatal error on shutdown
        and the network is not ignoring shutdown errors.
    """
    if not skip_verification:
        if self.txs is not None:
            LOG.info(
                "Verifying that all committed txs can be read before shutdown"
            )
            log_capture = []
            self.txs.verify(self, log_capture=log_capture)
            if verbose_verification:
                flush_info(log_capture, None)

    fatal_error_found = False
    for node in self.nodes:
        _, fatal_errors = node.stop()
        if fatal_errors:
            fatal_error_found = True
    LOG.info("All nodes stopped")

    if not skip_verification:
        longest_ledger_seqno = 0
        most_up_to_date_node = None
        # Maps local_node_id -> [committed ledger dir, last committed seqno]
        committed_ledger_dirs = {}

        for node in self.nodes:
            # Find stopped node with longest ledger
            _, committed_ledger_dir = node.get_ledger(
                include_read_only_dirs=True)
            ledger_end_seqno = 0
            for ledger_file in os.listdir(committed_ledger_dir):
                end_seqno = infra.node.get_committed_ledger_end_seqno(
                    ledger_file)
                if end_seqno > ledger_end_seqno:
                    ledger_end_seqno = end_seqno

            if ledger_end_seqno > longest_ledger_seqno:
                longest_ledger_seqno = ledger_end_seqno
                most_up_to_date_node = node
            committed_ledger_dirs[node.local_node_id] = [
                committed_ledger_dir,
                ledger_end_seqno,
            ]

        # Verify that all ledger files on stopped nodes exist on most up-to-date node
        # and are identical
        if most_up_to_date_node:
            longest_ledger_dir, _ = committed_ledger_dirs[
                most_up_to_date_node.local_node_id]
            # Fix: the dict is keyed by local_node_id, but the filter used to
            # compare keys against most_up_to_date_node.node_id (the service
            # identity), so the most up-to-date node was never excluded from
            # the comparison loop.
            for node_id, (committed_ledger_dir, _) in (
                    l for l in committed_ledger_dirs.items()
                    if l[0] != most_up_to_date_node.local_node_id):
                for ledger_file in os.listdir(committed_ledger_dir):
                    if ledger_file not in os.listdir(longest_ledger_dir):
                        raise Exception(
                            f"Ledger file on node {node_id} does not exist on most up-to-date node {most_up_to_date_node.local_node_id}: {ledger_file}"
                        )
                    if infra.path.compute_file_checksum(
                        os.path.join(longest_ledger_dir, ledger_file)
                    ) != infra.path.compute_file_checksum(
                        os.path.join(committed_ledger_dir, ledger_file)):
                        raise Exception(
                            f"Ledger file checksums between node {node_id} and most up-to-date node {most_up_to_date_node.node_id} did not match: {ledger_file}"
                        )

            LOG.success(
                f"Verified ledger files consistency on all {len(self.nodes)} stopped nodes"
            )

    if fatal_error_found:
        if self.ignoring_shutdown_errors:
            LOG.warning("Ignoring shutdown errors")
        else:
            raise NodeShutdownError(
                "Fatal error found during node shutdown")