def join_node(self, node, lib_name, args, target_node=None, timeout=JOIN_TIMEOUT, **kwargs):
    """Add ``node`` to the network and wait until it is recorded in the store.

    The node is first added via ``self._add_node``. The consortium is then
    asked to wait (up to ``timeout``) for the node to appear in the store as
    seen from the current primary, with status ``PENDING`` when the service
    is open and ``TRUSTED`` otherwise.

    Raises:
        CodeIdNotFound: the wait timed out and the stopped node reported
            an unknown enclave measurement.
        StartupSnapshotIsOld: the wait timed out and the stopped node
            reported ``StartupSnapshotIsOld``.
        TimeoutError: the wait timed out for any other reason.
    """
    self._add_node(node, lib_name, args, target_node, **kwargs)

    primary, _ = self.find_primary()
    try:
        if self.status == ServiceStatus.OPEN:
            expected_status = NodeStatus.PENDING
        else:
            expected_status = NodeStatus.TRUSTED
        self.consortium.wait_for_node_to_exist_in_store(
            primary,
            node.node_id,
            timeout=timeout,
            node_status=expected_status,
        )
    except TimeoutError as e:
        LOG.error(f"New pending node {node.node_id} failed to join the network")
        errors, _ = node.stop()
        self.nodes.remove(node)

        # Translate known error messages reported by the stopped node into
        # more accurate exceptions, checked in this order for each message.
        known_failures = (
            ("Quote does not contain known enclave measurement", CodeIdNotFound),
            ("StartupSnapshotIsOld", StartupSnapshotIsOld),
        )
        for error in errors or ():
            for marker, specific_exception in known_failures:
                if marker in error:
                    raise specific_exception from e

        # No known cause found: propagate the original timeout.
        raise
def test_recover_service(network, args, from_snapshot=False):
    """Stop the service node by node, then restart it in recovery mode.

    Returns the recovered network, which is started from the ledger of the
    primary that was in place before the nodes were stopped, and optionally
    (``from_snapshot=True``) from that primary's committed snapshots.
    """
    network.save_service_identity(args)
    previous_primary, _ = network.find_primary()

    committed_snapshots = (
        network.get_committed_snapshots(previous_primary) if from_snapshot else None
    )

    # Start health watcher and stop nodes one by one until a recovery has to be staged
    health_watcher = infra.health_watcher.NetworkHealthWatcher(
        network, args, verbose=True
    )
    health_watcher.start()

    for joined_node in network.get_joined_nodes():
        # Pause between stops; the interval is args.election_timeout_ms
        # divided by 1000.
        time.sleep(args.election_timeout_ms / 1000)
        joined_node.stop()

    health_watcher.wait_for_recovery()

    # Stop remaining nodes
    network.stop_all_nodes()

    ledger_dir, committed_dirs = previous_primary.get_ledger()

    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=ledger_dir,
        committed_ledger_dirs=committed_dirs,
        snapshots_dir=committed_snapshots,
    )
    recovered_network.recover(args)
    return recovered_network
def stop_all_nodes(self):
    """Stop every node in ``self.nodes`` and report fatal shutdown errors.

    Raises:
        NodeShutdownError: at least one node reported fatal errors while
            stopping and ``self.ignoring_shutdown_errors`` is falsy.
    """
    any_fatal = False
    for node in self.nodes:
        # Every node is stopped, even after a fatal error has been seen.
        _, fatal_errors = node.stop()
        any_fatal = bool(fatal_errors) or any_fatal

    LOG.info("All nodes stopped...")

    if not any_fatal:
        return
    if self.ignoring_shutdown_errors:
        LOG.warning("Ignoring shutdown errors")
        return
    raise NodeShutdownError("Fatal error found during node shutdown")
def trust_node(self, node, args, valid_from=None, validity_period_days=None, no_wait=False):
    """Trust ``node`` through the consortium and mark it as joined.

    When the service is open, the consortium is asked to trust the node,
    with ``valid_from`` defaulting to the current time converted by
    ``infra.crypto.datetime_to_X509time``. Unless ``no_wait`` is set, the
    call then waits for the node to join and, at the end, for all nodes to
    commit.

    Raises:
        ValueError, TimeoutError: re-raised after the node has been stopped
            if trusting the node or waiting for it to join fails.
    """

    def join_timeout():
        # Twice the join timer, scaled down by 1000 and rounded up
        # (presumably milliseconds to whole seconds; confirm against callers).
        return ceil(args.join_timer * 2 / 1000)

    primary, _ = self.find_primary()
    try:
        if self.status is ServiceStatus.OPEN:
            if not valid_from:
                valid_from = str(infra.crypto.datetime_to_X509time(datetime.now()))
            self.consortium.trust_node(
                primary,
                node.node_id,
                valid_from=valid_from,
                validity_period_days=validity_period_days,
                timeout=join_timeout(),
            )
        if not no_wait:
            # Here, quote verification has already been run when the node
            # was added as pending. Only wait for the join timer for the
            # joining node to retrieve network secrets.
            node.wait_for_node_to_join(timeout=join_timeout())
    except (ValueError, TimeoutError):
        LOG.error(f"New trusted node {node.node_id} failed to join the network")
        node.stop()
        raise

    node.network_state = infra.node.NodeNetworkState.joined
    effective_validity_days = (
        validity_period_days or args.max_allowed_node_cert_validity_days
    )
    node.set_certificate_validity_period(valid_from, effective_validity_days)

    if no_wait:
        return
    self.wait_for_all_nodes_to_commit(primary=primary)
def stop_all_nodes(self):
    """Stop every node and verify that their committed ledgers are consistent.

    Steps:
      1. If ``self.txs`` is set, verify the recorded transactions against
         this network.
      2. Stop each node, remembering whether any reported fatal errors, and
         record its committed ledger directory together with the highest
         committed end seqno found among the files in that directory.
      3. Check that every committed ledger file on each other node also
         exists, with an identical checksum, on the node with the longest
         committed ledger.

    Raises:
        Exception: a committed ledger file is missing from, or differs
            from, the copy on the most up-to-date node.
        NodeShutdownError: a node reported fatal errors while stopping and
            ``self.ignoring_shutdown_errors`` is falsy.
    """
    # Verify that all txs committed on the service can be read
    if self.txs is not None:
        self.txs.verify(self)

    fatal_error_found = False
    longest_ledger_seqno = 0
    most_up_to_date_node = None
    committed_ledger_dirs = {}

    for node in self.nodes:
        _, fatal_errors = node.stop()
        if fatal_errors:
            fatal_error_found = True

        # Find stopped node with longest ledger
        _, committed_ledger_dir = node.get_ledger(include_read_only_dirs=True)
        ledger_end_seqno = 0
        for ledger_file in os.listdir(committed_ledger_dir):
            ledger_end_seqno = max(
                ledger_end_seqno,
                infra.node.get_committed_ledger_end_seqno(ledger_file),
            )

        if ledger_end_seqno > longest_ledger_seqno:
            longest_ledger_seqno = ledger_end_seqno
            most_up_to_date_node = node
        committed_ledger_dirs[node.node_id] = [
            committed_ledger_dir,
            ledger_end_seqno,
        ]

    LOG.info("All nodes stopped")

    # Verify that all ledger files on stopped nodes exist on most up-to-date node
    # and are identical
    if most_up_to_date_node:
        reference_node_id = most_up_to_date_node.node_id
        longest_ledger_dir, _ = committed_ledger_dirs[reference_node_id]

        # List the reference directory once; the nodes are stopped, so its
        # contents do not need to be re-read for every file that is checked.
        reference_ledger_files = set(os.listdir(longest_ledger_dir))

        for node_id, (committed_ledger_dir, _) in committed_ledger_dirs.items():
            if node_id == reference_node_id:
                continue
            for ledger_file in os.listdir(committed_ledger_dir):
                if ledger_file not in reference_ledger_files:
                    raise Exception(
                        f"Ledger file on node {node_id} does not exist on most up-to-date node {reference_node_id}: {ledger_file}"
                    )
                reference_checksum = infra.path.compute_file_checksum(
                    os.path.join(longest_ledger_dir, ledger_file)
                )
                node_checksum = infra.path.compute_file_checksum(
                    os.path.join(committed_ledger_dir, ledger_file)
                )
                if reference_checksum != node_checksum:
                    raise Exception(
                        f"Ledger file checksums between node {node_id} and most up-to-date node {reference_node_id} did not match: {ledger_file}"
                    )

        LOG.success(
            f"Verified ledger files consistency on all {len(self.nodes)} stopped nodes"
        )

    if fatal_error_found:
        if self.ignoring_shutdown_errors:
            LOG.warning("Ignoring shutdown errors")
        else:
            raise NodeShutdownError("Fatal error found during node shutdown")
def stop_all_nodes(self):
    """Stop each node in ``self.nodes`` in order, then log completion."""
    for remote_node in self.nodes:
        remote_node.stop()

    LOG.info("All remotes stopped...")