def test_all_members(network, args):
    """Check /gov/members against the test harness' view of the consortium.

    Runs the comparison twice: once on the live service, then again on a
    network restarted in recovery (but not yet recovered), to check the
    endpoint also works mid-recovery.

    Returns the recovered network so the caller can continue using it.
    """

    def run_test_all_members(network):
        primary, _ = network.find_primary()
        with primary.client() as c:
            r = c.get("/gov/members")
            assert r.status_code == http.HTTPStatus.OK.value
            response_members = r.body.json()

        network_members = network.get_members()
        # Same member count, and every known member present in the response
        assert len(network_members) == len(response_members)

        for member in network_members:
            assert member.service_id in response_members
            response_details = response_members[member.service_id]
            assert response_details["cert"] == member.cert
            assert (
                infra.member.MemberStatus(response_details["status"])
                == member.status_code
            )
            assert response_details["member_data"] == member.member_data
            if member.is_recovery_member:
                # Fix: open the key file with a context manager so the handle
                # is closed promptly (the original leaked the file object)
                with open(
                    member.member_info["encryption_public_key_file"],
                    encoding="utf-8",
                ) as f:
                    recovery_enc_key = f.read()
                assert response_details["public_encryption_key"] == recovery_enc_key
            else:
                # Non-recovery members hold no share-encryption key
                assert response_details["public_encryption_key"] is None

    # Test on current network
    run_test_all_members(network)

    # Test on mid-recovery network
    network.save_service_identity(args)
    primary, _ = network.find_primary()
    network.stop_all_nodes()
    current_ledger_dir, committed_ledger_dirs = primary.get_ledger()
    # NB: Don't try to get snapshots, since there may not be any committed,
    # and we cannot wait for commit now that the node is stopped
    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
    )
    run_test_all_members(recovered_network)
    recovered_network.recover(args)
    return recovered_network
def test_recover_service(network, args, from_snapshot=False):
    """Kill the service node by node, then perform a full disaster recovery.

    A health watcher observes the service while nodes are stopped one at a
    time; once it reports that a recovery is required, the remaining nodes
    are shut down and a new network is recovered from the old ledger
    (optionally seeded from the latest committed snapshot).

    Returns the recovered network.
    """
    network.save_service_identity(args)
    prior_primary, _ = network.find_primary()

    # Optionally recover from the latest committed snapshot instead of
    # replaying the entire ledger
    snapshots_dir = (
        network.get_committed_snapshots(prior_primary) if from_snapshot else None
    )

    # Watch service health while nodes are taken down one by one, until the
    # watcher concludes that a recovery must be staged
    health_watcher = infra.health_watcher.NetworkHealthWatcher(
        network, args, verbose=True
    )
    health_watcher.start()
    for joined_node in network.get_joined_nodes():
        time.sleep(args.election_timeout_ms / 1000)
        joined_node.stop()
    health_watcher.wait_for_recovery()

    # Shut down whatever is left of the old service
    network.stop_all_nodes()

    ledger_dir, committed_dirs = prior_primary.get_ledger()
    recovered = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered.start_in_recovery(
        args,
        ledger_dir=ledger_dir,
        committed_ledger_dirs=committed_dirs,
        snapshots_dir=snapshots_dir,
    )
    recovered.recover(args)
    return recovered
def run_ledger_compatibility_since_first(args, local_branch, use_snapshot):
    """
    Tests that a service from the very first LTS can be recovered
    to the next LTS, and so forth, until the version of the local checkout.
    The recovery process uses a snapshot if `use_snapshot` is True. Otherwise,
    the entire historical ledger is used.

    Returns the list of LTS versions that were installed and exercised.
    """
    LOG.info("Use snapshot: {}", use_snapshot)
    repo = infra.github.Repository()
    lts_releases = repo.get_lts_releases(local_branch)
    # Tracks whether any service in the chain so far produced a pre-2.0-rc7
    # ledger (needed to decide when chunk-file diffs are acceptable below)
    has_pre_2_rc7_ledger = False

    LOG.info(f"LTS releases: {[r[1] for r in lts_releases.items()]}")

    lts_versions = []

    # Add an empty entry to release to indicate local checkout
    # Note: dicts are ordered from Python3.7
    lts_releases[None] = None

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
    )
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        # Walk the LTS chain in order; each iteration recovers the previous
        # iteration's ledger (ledger_dir/committed_ledger_dirs/snapshots_dir
        # are carried over from the bottom of the loop body)
        for idx, (_, lts_release) in enumerate(lts_releases.items()):
            if lts_release:
                # Install the published LTS release under test
                version, install_path = repo.install_release(lts_release)
                lts_versions.append(version)
                set_js_args(args, install_path)
            else:
                # The sentinel None entry means: use the local checkout
                version = args.ccf_version
                install_path = LOCAL_CHECKOUT_DIRECTORY
                get_new_constitution_for_install(args, install_path)

            binary_dir, library_dir = get_bin_and_lib_dirs_for_install_path(
                install_path
            )

            if not args.dry_run:
                network_args = {
                    "hosts": args.nodes,
                    "binary_dir": binary_dir,
                    "library_dir": library_dir,
                    "txs": txs,
                    "jwt_issuer": jwt_issuer,
                    "version": version,
                }
                if idx == 0:
                    # First iteration: brand new service
                    LOG.info(f"Starting new service (version: {version})")
                    network = infra.network.Network(**network_args)
                    network.start_and_open(args)
                else:
                    # Subsequent iterations: recover the previous service's
                    # ledger with the newer version
                    LOG.info(f"Recovering service (new version: {version})")
                    network = infra.network.Network(
                        **network_args, existing_network=network
                    )
                    network.start_in_recovery(
                        args,
                        ledger_dir,
                        committed_ledger_dirs,
                        snapshots_dir=snapshots_dir,
                    )
                    network.recover(args)

                nodes = network.get_joined_nodes()
                primary, _ = network.find_primary()

                # Verify that all nodes run the expected CCF version
                for node in nodes:
                    # Note: /node/version endpoint and custom certificate validity
                    # were added in 2.x
                    if not node.major_version or node.major_version > 1:
                        with node.client() as c:
                            r = c.get("/node/version")
                            expected_version = node.version or args.ccf_version
                            # NOTE(review): this rebinds `version` (the loop's
                            # install version) to the node-reported version,
                            # which is later passed to test_new_service —
                            # confirm this shadowing is intended
                            version = r.body.json()["ccf_version"]
                            assert (
                                r.body.json()["ccf_version"] == expected_version
                            ), f"Node version is not {expected_version}"
                        node.verify_certificate_validity_period()

                # Rollover JWKS so that new primary must read historical CA bundle table
                # and retrieve new keys via auto refresh
                jwt_issuer.refresh_keys()
                # Note: /gov/jwt_keys/all endpoint was added in 2.x
                primary, _ = network.find_nodes()
                if not primary.major_version or primary.major_version > 1:
                    jwt_issuer.wait_for_refresh(network)
                else:
                    # 1.x nodes: no refresh endpoint to poll, just wait
                    time.sleep(3)

                if idx > 0:
                    # Exercise the freshly-recovered service
                    test_new_service(
                        network,
                        args,
                        install_path,
                        binary_dir,
                        library_dir,
                        version,
                    )

                # We accept ledger chunk file differences during upgrades
                # from 1.x to 2.x post rc7 ledger. This is necessary because
                # the ledger files may not be chunked at the same interval
                # between those versions (see https://github.com/microsoft/ccf/issues/3613;
                # 1.x ledgers do not contain the header flags to synchronize ledger chunks).
                # This can go once 2.0 is released.
                current_version_past_2_rc7 = primary.version_after("ccf-2.0.0-rc7")
                has_pre_2_rc7_ledger = (
                    not current_version_past_2_rc7 or has_pre_2_rc7_ledger
                )
                is_ledger_chunk_breaking = (
                    has_pre_2_rc7_ledger and current_version_past_2_rc7
                )

                # Inputs for the next iteration's recovery
                snapshots_dir = (
                    network.get_committed_snapshots(primary) if use_snapshot else None
                )

                network.stop_all_nodes(
                    skip_verification=True,
                    accept_ledger_diff=is_ledger_chunk_breaking,
                )
                ledger_dir, committed_ledger_dirs = primary.get_ledger()
                network.save_service_identity(args)

                # Check that ledger and snapshots can be parsed
                ccf.ledger.Ledger(committed_ledger_dirs).get_latest_public_state()
                if snapshots_dir:
                    for s in os.listdir(snapshots_dir):
                        with ccf.ledger.Snapshot(
                            os.path.join(snapshots_dir, s)
                        ) as snapshot:
                            snapshot.get_public_domain()

    return lts_versions
def test_recover_service_with_wrong_identity(network, args):
    """Check that recovery is rejected when the wrong previous service
    identity is supplied, then succeeds with the correct one.

    The first recovery attempt deliberately passes a user certificate as the
    previous service identity and must fail with a specific startup error in
    the node's log; the second attempt restores the correct identity file and
    must complete.

    Returns the successfully recovered network.
    """
    old_primary, _ = network.find_primary()
    snapshots_dir = network.get_committed_snapshots(old_primary)
    network.stop_all_nodes()
    network.save_service_identity(args)
    # Remember the correct identity file so it can be restored after the
    # deliberately-broken attempt
    first_service_identity_file = args.previous_service_identity_file
    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    # Attempt a recovery with the wrong previous service certificate
    args.previous_service_identity_file = network.consortium.user_cert_path("user0")
    broken_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )

    exception = None
    try:
        broken_network.start_in_recovery(
            args,
            ledger_dir=current_ledger_dir,
            committed_ledger_dirs=committed_ledger_dirs,
            snapshots_dir=snapshots_dir,
        )
    except Exception as ex:
        exception = ex

    # The broken network is expected to fail to start cleanly; suppress
    # shutdown errors so teardown itself doesn't fail the test
    broken_network.ignoring_shutdown_errors = True
    broken_network.stop_all_nodes(skip_verification=True)

    if exception is None:
        raise ValueError("Recovery should have failed")
    if not broken_network.nodes[0].check_log_for_error_message(
        "Error starting node: Previous service identity does not endorse the node identity that signed the snapshot"
    ):
        # Fix: corrected message typo ("expect" -> "expected")
        raise ValueError("Node log does not contain the expected error message")

    # Recover, now with the right service identity
    args.previous_service_identity_file = first_service_identity_file
    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )
    recovered_network.recover(args)
    return recovered_network
def test_recover_service_truncated_ledger(
    network,
    args,
    corrupt_first_tx=False,
    corrupt_last_tx=False,
    corrupt_first_sig=False,
):
    """Truncate an uncommitted ledger file mid-transaction, then recover.

    Exactly one of `corrupt_first_tx`, `corrupt_last_tx` or
    `corrupt_first_sig` selects where the uncommitted ledger chunk is cut:
    half-way through its first transaction, its last transaction, or its
    first signature transaction. Recovery should tolerate the truncation.

    Raises ValueError if no corruption mode is selected.
    Returns the recovered network.
    """
    # Fix: the original left `truncate_offset` unbound (UnboundLocalError)
    # when called with all flags at their False defaults — fail fast instead
    if not (corrupt_first_tx or corrupt_last_tx or corrupt_first_sig):
        raise ValueError(
            "One of corrupt_first_tx, corrupt_last_tx or corrupt_first_sig must be set"
        )

    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    LOG.info("Force new ledger chunk for app txs to be in committed chunks")
    network.consortium.force_ledger_chunk(old_primary)

    LOG.info(
        "Fill ledger with dummy entries until at least one ledger chunk is not committed, and contains a signature"
    )
    current_ledger_path = old_primary.remote.ledger_paths()[0]
    while True:
        # A signature will have been emitted by now (wait_for_commit)
        network.consortium.create_and_withdraw_large_proposal(
            old_primary, wait_for_commit=True
        )
        network.consortium.create_and_withdraw_large_proposal(old_primary)
        # Stop as soon as an uncommitted chunk exists
        if not all(
            f.endswith(ccf.ledger.COMMITTED_FILE_SUFFIX)
            for f in os.listdir(current_ledger_path)
        ):
            break

    network.stop_all_nodes()

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    # Corrupt _uncommitted_ ledger before starting new service
    ledger = ccf.ledger.Ledger([current_ledger_dir], committed_only=False)

    def get_middle_tx_offset(tx):
        # Offset half-way into the transaction's on-disk extent
        offset, next_offset = tx.get_offsets()
        return offset + (next_offset - offset) // 2

    # Scan chunks recording candidate truncation offsets; after the loop,
    # chunk_filename and the offsets refer to the last (uncommitted) chunk
    for chunk in ledger:
        chunk_filename = chunk.filename()
        first_tx_offset = None
        last_tx_offset = None
        first_sig_offset = None
        for tx in chunk:
            tables = tx.get_public_domain().get_tables()
            if (
                first_sig_offset is None
                and ccf.ledger.SIGNATURE_TX_TABLE_NAME in tables
            ):
                first_sig_offset = get_middle_tx_offset(tx)
            last_tx_offset = get_middle_tx_offset(tx)
            if first_tx_offset is None:
                first_tx_offset = get_middle_tx_offset(tx)

    truncated_ledger_file_path = os.path.join(current_ledger_dir, chunk_filename)
    if corrupt_first_tx:
        truncate_offset = first_tx_offset
    elif corrupt_last_tx:
        truncate_offset = last_tx_offset
    elif corrupt_first_sig:
        truncate_offset = first_sig_offset

    with open(truncated_ledger_file_path, "r+", encoding="utf-8") as f:
        f.truncate(truncate_offset)
    LOG.warning(
        f"Truncated ledger file {truncated_ledger_file_path} at {truncate_offset}"
    )

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
    )
    recovered_network.recover(args)

    return recovered_network
def test_share_resilience(network, args, from_snapshot=False):
    """Check that recovery completes even if the primary dies mid-share-submission.

    All but one recovery shares are submitted to the recovery network's
    primary; that primary is then stopped, and the final share is submitted
    to the newly elected primary. The service must still reach OPEN.

    Returns the recovered network.
    """
    network.save_service_identity(args)
    former_primary, _ = network.find_primary()

    # Optionally seed recovery from the latest committed snapshot
    snapshots_dir = (
        network.get_committed_snapshots(former_primary) if from_snapshot else None
    )

    network.stop_all_nodes()
    ledger_dir, committed_dirs = former_primary.get_ledger()
    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=ledger_dir,
        committed_ledger_dirs=committed_dirs,
        snapshots_dir=snapshots_dir,
    )
    primary, _ = recovered_network.find_primary()
    recovered_network.consortium.transition_service_to_open(
        primary,
        previous_service_identity=slurp_file(args.previous_service_identity_file),
    )

    # Submit all required recovery shares minus one. Last recovery share is
    # submitted after a new primary is found.
    shares_submitted = 0
    consortium = recovered_network.consortium
    for member in consortium.get_active_members():
        with primary.client() as nc:
            if shares_submitted >= consortium.recovery_threshold - 1:
                # Keep this member's share for after the primary change
                last_member_to_submit = member
                break
            check_commit = infra.checker.Checker(nc)
            check_commit(member.get_and_submit_recovery_share(primary))
            shares_submitted += 1

    LOG.info(
        f"Shutting down node {primary.node_id} before submitting last recovery share"
    )
    primary.stop()
    new_primary, _ = recovered_network.wait_for_new_primary(primary)

    # Final share goes to the freshly elected primary
    last_member_to_submit.get_and_submit_recovery_share(new_primary)

    for joined_node in recovered_network.get_joined_nodes():
        recovered_network.wait_for_state(
            joined_node,
            infra.node.State.PART_OF_NETWORK.value,
            timeout=args.ledger_recovery_timeout,
        )

    recovered_network.consortium.check_for_service(
        new_primary,
        infra.network.ServiceStatus.OPEN,
    )
    if recovered_network.service_load:
        recovered_network.service_load.set_network(recovered_network)
    return recovered_network
def test_recover_service_aborted(network, args, from_snapshot=False):
    """Abort a recovery in progress and start a second recovery from scratch.

    A first recovery network is started but deliberately never completed;
    once it has committed at least two recovery-marked ledger chunks, it is
    stopped and a fresh recovery is performed from its ledger.

    Returns the finally recovered network.
    """
    network.save_service_identity(args)
    former_primary, _ = network.find_primary()

    snapshots_dir = (
        network.get_committed_snapshots(former_primary) if from_snapshot else None
    )

    network.stop_all_nodes()
    ledger_dir, committed_dirs = former_primary.get_ledger()
    aborted_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    aborted_network.start_in_recovery(
        args,
        ledger_dir=ledger_dir,
        committed_ledger_dirs=committed_dirs,
        snapshots_dir=snapshots_dir,
    )

    LOG.info("Fill in ledger to trigger new chunks, which should be marked as recovery")
    primary, _ = aborted_network.find_primary()
    recovery_committed_suffix = (
        f"{ccf.ledger.COMMITTED_FILE_SUFFIX}{ccf.ledger.RECOVERY_FILE_SUFFIX}"
    )

    def committed_recovery_chunk_count():
        # Number of committed ledger files carrying the recovery marker
        return sum(
            1
            for f in os.listdir(primary.remote.ledger_paths()[0])
            if f.endswith(recovery_committed_suffix)
        )

    # Submit large proposal until at least two recovery ledger chunks are committed
    while committed_recovery_chunk_count() < 2:
        aborted_network.consortium.create_and_withdraw_large_proposal(primary)

    LOG.info(
        "Do not complete service recovery on purpose and initiate new recovery from scratch"
    )

    snapshots_dir = (
        network.get_committed_snapshots(primary) if from_snapshot else None
    )

    # Check that all nodes have the same (recovery) ledger files
    aborted_network.stop_all_nodes(
        skip_verification=True, read_recovery_ledger_files=True
    )

    ledger_dir, committed_dirs = primary.get_ledger()
    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=aborted_network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=ledger_dir,
        committed_ledger_dirs=committed_dirs,
        snapshots_dir=snapshots_dir,
    )
    recovered_network.recover(args)
    return recovered_network