def run_corrupted_ledger(args):
    txs = app.LoggingTxs("user0")
    with infra.network.network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        pdb=args.pdb,
        txs=txs,
    ) as network:
        network.start_and_open(args)

        network = test_recover_service_truncated_ledger(
            network, args, corrupt_first_tx=True
        )
        network = test_recover_service_truncated_ledger(
            network, args, corrupt_last_tx=True
        )
        network = test_recover_service_truncated_ledger(
            network, args, corrupt_first_sig=True
        )

        network.stop_all_nodes()

        # Make sure ledger can be read once recovered (i.e. ledger corruption
        # does not affect recovered ledger)
        for node in network.nodes:
            ledger = ccf.ledger.Ledger(node.remote.ledger_paths(), committed_only=False)
            _, last_seqno = ledger.get_latest_public_state()
            LOG.info(
                f"Successfully read ledger for node {node.local_node_id} up to seqno {last_seqno}"
            )
def run(args):
    txs = app.LoggingTxs()
    with infra.network.network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        pdb=args.pdb,
        txs=txs,
    ) as network:
        network.start_and_join(args)

        for i in range(args.recovery):
            # Alternate between recovery with primary change and stable primary-ship,
            # with and without snapshots
            if i % 2 == 0:
                recovered_network = test_share_resilience(
                    network, args, from_snapshot=True
                )
            else:
                recovered_network = test(network, args, from_snapshot=False)
            network.stop_all_nodes()
            network = recovered_network

        LOG.success("Recovery complete on all nodes")
def test_share_resilience(network, args, from_snapshot=False):
    old_primary, _ = network.find_primary()

    snapshot_dir = None
    if from_snapshot:
        snapshot_dir = network.get_committed_snapshots(old_primary)

    current_ledger_dir, committed_ledger_dir = old_primary.get_ledger(
        include_read_only_dirs=True
    )
    network.stop_all_nodes()

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dir=committed_ledger_dir,
        snapshot_dir=snapshot_dir,
    )
    primary, _ = recovered_network.find_primary()
    recovered_network.consortium.transition_service_to_open(primary)

    # Submit all required recovery shares minus one. Last recovery share is
    # submitted after a new primary is found.
    submitted_shares_count = 0
    for m in recovered_network.consortium.get_active_members():
        with primary.client() as nc:
            if (
                submitted_shares_count
                >= recovered_network.consortium.recovery_threshold - 1
            ):
                last_member_to_submit = m
                break

            check_commit = infra.checker.Checker(nc)
            check_commit(m.get_and_submit_recovery_share(primary))
            submitted_shares_count += 1

    LOG.info(
        f"Shutting down node {primary.node_id} before submitting last recovery share"
    )
    primary.stop()
    new_primary, _ = recovered_network.wait_for_new_primary(primary.node_id)

    assert (
        new_primary is not primary
    ), f"Primary {primary.node_id} should have changed after election"

    last_member_to_submit.get_and_submit_recovery_share(new_primary)

    for node in recovered_network.get_joined_nodes():
        recovered_network.wait_for_state(
            node,
            infra.node.State.PART_OF_NETWORK.value,
            timeout=args.ledger_recovery_timeout,
        )

    recovered_network.consortium.check_for_service(
        new_primary,
        infra.network.ServiceStatus.OPEN,
    )
    return recovered_network
def test_all_members(network, args):
    def run_test_all_members(network):
        primary, _ = network.find_primary()
        with primary.client() as c:
            r = c.get("/gov/members")
            assert r.status_code == http.HTTPStatus.OK.value
            response_members = r.body.json()

        network_members = network.get_members()
        assert len(network_members) == len(response_members)

        for member in network_members:
            assert member.service_id in response_members
            response_details = response_members[member.service_id]
            assert response_details["cert"] == member.cert
            assert (
                infra.member.MemberStatus(response_details["status"])
                == member.status_code
            )
            assert response_details["member_data"] == member.member_data
            if member.is_recovery_member:
                recovery_enc_key = open(
                    member.member_info["encryption_public_key_file"],
                    encoding="utf-8",
                ).read()
                assert response_details["public_encryption_key"] == recovery_enc_key
            else:
                assert response_details["public_encryption_key"] is None

    # Test on current network
    run_test_all_members(network)

    # Test on mid-recovery network
    network.save_service_identity(args)
    primary, _ = network.find_primary()
    network.stop_all_nodes()
    current_ledger_dir, committed_ledger_dirs = primary.get_ledger()
    # NB: Don't try to get snapshots, since there may not be any committed,
    # and we cannot wait for commit now that the node is stopped
    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
    )
    run_test_all_members(recovered_network)
    recovered_network.recover(args)

    return recovered_network
def test_recover_service(network, args, from_snapshot=False):
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(old_primary)

    # Start health watcher and stop nodes one by one until a recovery has to be staged
    watcher = infra.health_watcher.NetworkHealthWatcher(network, args, verbose=True)
    watcher.start()

    for node in network.get_joined_nodes():
        time.sleep(args.election_timeout_ms / 1000)
        node.stop()

    watcher.wait_for_recovery()

    # Stop remaining nodes
    network.stop_all_nodes()

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )

    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )

    recovered_network.recover(args)

    return recovered_network
def test(network, args, from_snapshot=False):
    old_primary, _ = network.find_primary()

    snapshot_dir = None
    if from_snapshot:
        snapshot_dir = network.get_committed_snapshots(old_primary)

    current_ledger_dir, committed_ledger_dir = old_primary.get_ledger(
        include_read_only_dirs=True
    )
    network.stop_all_nodes()

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dir=committed_ledger_dir,
        snapshot_dir=snapshot_dir,
    )
    recovered_network.recover(args)
    return recovered_network
def run(args):
    hosts = ["localhost", "localhost", "localhost"]

    txs = app.LoggingTxs()
    with infra.network.network(
        hosts,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        pdb=args.pdb,
        txs=txs,
    ) as network:
        network.start_and_join(args)

        for i in range(args.recovery):
            # Alternate between recovery with primary change and stable primary-ship
            if i % 2 == 0:
                recovered_network = test_share_resilience(network, args)
            else:
                recovered_network = test(network, args)
            network.stop_all_nodes()
            network = recovered_network

        LOG.success("Recovery complete on all nodes")
def run_file_operations(args):
    with tempfile.TemporaryDirectory() as tmp_dir:
        txs = app.LoggingTxs("user0")
        with infra.network.network(
            args.nodes,
            args.binary_dir,
            args.debug_nodes,
            args.perf_nodes,
            pdb=args.pdb,
            txs=txs,
        ) as network:
            args.common_read_only_ledger_dir = tmp_dir
            network.start_and_open(args)

            test_save_committed_ledger_files(network, args)
            test_parse_snapshot_file(network, args)
            test_forced_ledger_chunk(network, args)
            test_forced_snapshot(network, args)

            primary, _ = network.find_primary()
            network.stop_all_nodes()

            test_split_ledger_on_stopped_network(primary, args)
def run(args):
    chosen_suite = []

    if not args.test_suite:
        args.test_suite = ["all"]

    for choice in args.test_suite:
        try:
            chosen_suite.extend(s.suites[choice])
        except KeyError as e:
            raise ValueError(f"Unhandled choice: {choice}") from e

    seed = None
    if os.getenv("SHUFFLE_SUITE"):
        seed = os.getenv("SHUFFLE_SUITE_SEED")
        if seed is None:
            seed = time.time()
        seed = int(seed)
        LOG.success(f"Shuffling full suite with seed {seed}")
        random.seed(seed)
        random.shuffle(chosen_suite)
    s.validate_tests_signature(chosen_suite)

    if args.enforce_reqs is False:
        LOG.warning("Test requirements will be ignored")

    txs = app.LoggingTxs()
    network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, txs=txs
    )
    network.start_and_join(args)

    LOG.info(f"Running {len(chosen_suite)} tests for {args.test_duration} seconds")

    run_tests = {}
    success = True
    elapsed = args.test_duration

    if args.filter is not None:
        filter_re = re.compile(args.filter)

        def filter_fun(x):
            return filter_re is None or filter_re.match(x[1].__name__)

        tests_to_run = filter(filter_fun, enumerate(chosen_suite))
    else:
        tests_to_run = enumerate(chosen_suite)

    for i, test in tests_to_run:
        status = None
        reason = None

        if elapsed <= 0:
            LOG.warning(f"Test duration time ({args.test_duration} seconds) is up!")
            break

        try:
            LOG.debug(f"Running {s.test_name(test)}...")
            test_time_before = time.time()

            # Actually run the test
            new_network = test(network, args)
            status = TestStatus.success
        except reqs.TestRequirementsNotMet as ce:
            LOG.warning(f"Test requirements for {s.test_name(test)} not met")
            status = TestStatus.skipped
            reason = str(ce)
            new_network = network
        except Exception:
            LOG.exception(f"Test {s.test_name(test)} failed")
            status = TestStatus.failure
            new_network = network

        test_elapsed = time.time() - test_time_before

        # Construct test report
        run_tests[i] = {
            "name": s.test_name(test),
            "status": status.name,
            "elapsed (s)": round(test_elapsed, 2),
            "memory": mem_stats(new_network),
        }

        if reason is not None:
            run_tests[i]["reason"] = reason

        # If the test function did not return a network, it is not possible to continue
        if new_network is None:
            raise ValueError(f"Network returned by {s.test_name(test)} is None")

        # If the network was changed (e.g. recovery test), use the new network from now on
        if new_network != network:
            network = new_network

        LOG.debug(f"Test {s.test_name(test)} took {test_elapsed:.2f} secs")

        # For now, if a test fails, the entire test suite is stopped
        if status is TestStatus.failure:
            success = False
            break

        elapsed -= test_elapsed

    network.stop_all_nodes()

    if success:
        LOG.success(f"Full suite passed. Ran {len(run_tests)}/{len(chosen_suite)}")
    else:
        LOG.error(f"Suite failed. Ran {len(run_tests)}/{len(chosen_suite)}")

    if seed:
        LOG.info(f"Full suite was shuffled with seed: {seed}")

    for idx, test in run_tests.items():
        status = test["status"]
        if status == TestStatus.success.name:
            log_fn = LOG.success
        elif status == TestStatus.skipped.name:
            log_fn = LOG.warning
        else:
            log_fn = LOG.error
        log_fn(f"Test #{idx}:\n{json.dumps(test, indent=4)}")

    if not success:
        sys.exit(1)
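# Hedged note on the suite runner above, not taken from the source: because
# run() reads SHUFFLE_SUITE and SHUFFLE_SUITE_SEED from the environment, a
# shuffled run can be reproduced by pinning the seed before calling it. The
# helper name and seed value below are illustrative assumptions.
def _pin_suite_shuffle_seed(seed=1650000000):
    # run() shuffles the chosen suite only if SHUFFLE_SUITE is set, and seeds
    # the shuffle with the integer value of SHUFFLE_SUITE_SEED when present.
    os.environ["SHUFFLE_SUITE"] = "1"
    os.environ["SHUFFLE_SUITE_SEED"] = str(seed)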
def run_ledger_compatibility_since_first(args, local_branch, use_snapshot):
    """
    Tests that a service from the very first LTS can be recovered
    to the next LTS, and so forth, until the version of the local checkout.

    The recovery process uses a snapshot if `use_snapshot` is True. Otherwise,
    the entire historical ledger is used.
    """

    LOG.info("Use snapshot: {}", use_snapshot)
    repo = infra.github.Repository()
    lts_releases = repo.get_lts_releases(local_branch)
    has_pre_2_rc7_ledger = False

    LOG.info(f"LTS releases: {[r[1] for r in lts_releases.items()]}")

    lts_versions = []

    # Add an empty entry to releases to indicate local checkout
    # Note: dicts are ordered from Python 3.7
    lts_releases[None] = None

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
    )
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        for idx, (_, lts_release) in enumerate(lts_releases.items()):
            if lts_release:
                version, install_path = repo.install_release(lts_release)
                lts_versions.append(version)
                set_js_args(args, install_path)
            else:
                version = args.ccf_version
                install_path = LOCAL_CHECKOUT_DIRECTORY
                get_new_constitution_for_install(args, install_path)

            binary_dir, library_dir = get_bin_and_lib_dirs_for_install_path(
                install_path
            )

            if not args.dry_run:
                network_args = {
                    "hosts": args.nodes,
                    "binary_dir": binary_dir,
                    "library_dir": library_dir,
                    "txs": txs,
                    "jwt_issuer": jwt_issuer,
                    "version": version,
                }
                if idx == 0:
                    LOG.info(f"Starting new service (version: {version})")
                    network = infra.network.Network(**network_args)
                    network.start_and_open(args)
                else:
                    LOG.info(f"Recovering service (new version: {version})")
                    network = infra.network.Network(
                        **network_args, existing_network=network
                    )
                    network.start_in_recovery(
                        args,
                        ledger_dir,
                        committed_ledger_dirs,
                        snapshots_dir=snapshots_dir,
                    )
                    network.recover(args)

                nodes = network.get_joined_nodes()
                primary, _ = network.find_primary()

                # Verify that all nodes run the expected CCF version
                for node in nodes:
                    # Note: /node/version endpoint and custom certificate validity
                    # were added in 2.x
                    if not node.major_version or node.major_version > 1:
                        with node.client() as c:
                            r = c.get("/node/version")
                            expected_version = node.version or args.ccf_version
                            version = r.body.json()["ccf_version"]
                            assert (
                                r.body.json()["ccf_version"] == expected_version
                            ), f"Node version is not {expected_version}"
                        node.verify_certificate_validity_period()

                # Rollover JWKS so that new primary must read historical CA bundle table
                # and retrieve new keys via auto refresh
                jwt_issuer.refresh_keys()
                # Note: /gov/jwt_keys/all endpoint was added in 2.x
                primary, _ = network.find_nodes()
                if not primary.major_version or primary.major_version > 1:
                    jwt_issuer.wait_for_refresh(network)
                else:
                    time.sleep(3)

                if idx > 0:
                    test_new_service(
                        network,
                        args,
                        install_path,
                        binary_dir,
                        library_dir,
                        version,
                    )

                # We accept ledger chunk file differences during upgrades
                # from 1.x to 2.x post rc7 ledger. This is necessary because
                # the ledger files may not be chunked at the same interval
                # between those versions (see https://github.com/microsoft/ccf/issues/3613;
                # 1.x ledgers do not contain the header flags to synchronize ledger chunks).
                # This can go once 2.0 is released.
                current_version_past_2_rc7 = primary.version_after("ccf-2.0.0-rc7")
                has_pre_2_rc7_ledger = (
                    not current_version_past_2_rc7 or has_pre_2_rc7_ledger
                )
                is_ledger_chunk_breaking = (
                    has_pre_2_rc7_ledger and current_version_past_2_rc7
                )

                snapshots_dir = (
                    network.get_committed_snapshots(primary) if use_snapshot else None
                )

                network.stop_all_nodes(
                    skip_verification=True,
                    accept_ledger_diff=is_ledger_chunk_breaking,
                )
                ledger_dir, committed_ledger_dirs = primary.get_ledger()
                network.save_service_identity(args)

                # Check that ledger and snapshots can be parsed
                ccf.ledger.Ledger(committed_ledger_dirs).get_latest_public_state()
                if snapshots_dir:
                    for s in os.listdir(snapshots_dir):
                        with ccf.ledger.Snapshot(
                            os.path.join(snapshots_dir, s)
                        ) as snapshot:
                            snapshot.get_public_domain()

    return lts_versions
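# Hedged usage sketch for run_ledger_compatibility_since_first above, not taken
# from the source: infra.e2e_args.cli_args() and the branch name "release/2.x"
# are assumptions used purely for illustration.
if __name__ == "__main__":
    args = infra.e2e_args.cli_args()
    # Walk every LTS release up to the local checkout, recovering each service
    # from the committed snapshots of its predecessor.
    versions = run_ledger_compatibility_since_first(
        args, local_branch="release/2.x", use_snapshot=True
    )
    LOG.info(f"Tested LTS versions: {versions}")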
def test_recover_service_with_wrong_identity(network, args):
    old_primary, _ = network.find_primary()
    snapshots_dir = network.get_committed_snapshots(old_primary)
    network.stop_all_nodes()

    network.save_service_identity(args)
    first_service_identity_file = args.previous_service_identity_file

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    # Attempt a recovery with the wrong previous service certificate
    args.previous_service_identity_file = network.consortium.user_cert_path("user0")

    broken_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )

    exception = None
    try:
        broken_network.start_in_recovery(
            args,
            ledger_dir=current_ledger_dir,
            committed_ledger_dirs=committed_ledger_dirs,
            snapshots_dir=snapshots_dir,
        )
    except Exception as ex:
        exception = ex

    broken_network.ignoring_shutdown_errors = True
    broken_network.stop_all_nodes(skip_verification=True)

    if exception is None:
        raise ValueError("Recovery should have failed")

    if not broken_network.nodes[0].check_log_for_error_message(
        "Error starting node: Previous service identity does not endorse the node identity that signed the snapshot"
    ):
        raise ValueError("Node log does not contain the expected error message")

    # Recover, now with the right service identity
    args.previous_service_identity_file = first_service_identity_file

    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=network,
    )

    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )

    recovered_network.recover(args)

    return recovered_network
def run(args):
    recoveries_count = 5

    txs = app.LoggingTxs("user0")
    with infra.network.network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        pdb=args.pdb,
        txs=txs,
    ) as network:
        network.start_and_open(args)

        if args.with_load:
            # See https://github.com/microsoft/CCF/issues/3788 for justification
            LOG.info("Loading service before recovery...")
            primary, _ = network.find_primary()
            with infra.service_load.load() as load:
                load.begin(
                    network, rate=infra.service_load.DEFAULT_REQUEST_RATE_S * 10
                )
                while True:
                    with primary.client() as c:
                        r = c.get("/node/commit", log_capture=[]).body.json()
                        tx_id = ccf.tx_id.TxID.from_str(r["transaction_id"])
                        if tx_id.seqno > args.sig_tx_interval:
                            LOG.info(f"Loaded service successfully: tx_id, {tx_id}")
                            break
                    time.sleep(0.1)

        ref_msg = get_and_verify_historical_receipt(network, None)

        network = test_recover_service_with_wrong_identity(network, args)

        for i in range(recoveries_count):
            # Issue transactions which will require historical ledger queries on
            # recovery, when the network is shut down
            network.txs.issue(network, number_txs=1)
            network.txs.issue(network, number_txs=1, repeat=True)

            # Alternate between recovery with primary change and stable primary-ship,
            # with and without snapshots
            if i % recoveries_count == 0:
                if args.consensus != "BFT":
                    network = test_share_resilience(network, args, from_snapshot=True)
            elif i % recoveries_count == 1:
                network = test_recover_service_aborted(
                    network, args, from_snapshot=False
                )
            else:
                network = test_recover_service(network, args, from_snapshot=False)

            for node in network.get_joined_nodes():
                node.verify_certificate_validity_period()

            check_snapshots(args, network)
            ref_msg = get_and_verify_historical_receipt(network, ref_msg)

            LOG.success("Recovery complete on all nodes")

        primary, _ = network.find_primary()
        network.stop_all_nodes()

    # Verify that a new ledger chunk was created at the start of each recovery
    ledger = ccf.ledger.Ledger(
        primary.remote.ledger_paths(),
        committed_only=False,
        validator=ccf.ledger.LedgerValidator(accept_deprecated_entry_types=False),
    )
    for chunk in ledger:
        chunk_start_seqno, _ = chunk.get_seqnos()
        for tx in chunk:
            tables = tx.get_public_domain().get_tables()
            seqno = tx.get_public_domain().get_seqno()
            if ccf.ledger.SERVICE_INFO_TABLE_NAME in tables:
                service_status = json.loads(
                    tables[ccf.ledger.SERVICE_INFO_TABLE_NAME][
                        ccf.ledger.WELL_KNOWN_SINGLETON_TABLE_KEY
                    ]
                )["status"]
                if service_status == "Opening" or service_status == "Recovering":
                    LOG.info(
                        f"New ledger chunk found for service {service_status.lower()} at {seqno}"
                    )
                    assert (
                        chunk_start_seqno == seqno
                    ), f"{service_status} service at seqno {seqno} did not start a new ledger chunk (started at {chunk_start_seqno})"

    test_recover_service_with_expired_cert(args)
def test_recover_service_truncated_ledger(
    network,
    args,
    corrupt_first_tx=False,
    corrupt_last_tx=False,
    corrupt_first_sig=False,
):
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    LOG.info("Force new ledger chunk for app txs to be in committed chunks")
    network.consortium.force_ledger_chunk(old_primary)

    LOG.info(
        "Fill ledger with dummy entries until at least one ledger chunk is not committed, and contains a signature"
    )
    current_ledger_path = old_primary.remote.ledger_paths()[0]
    while True:
        network.consortium.create_and_withdraw_large_proposal(
            old_primary, wait_for_commit=True
        )
        # A signature will have been emitted by now (wait_for_commit)
        network.consortium.create_and_withdraw_large_proposal(old_primary)
        if not all(
            f.endswith(ccf.ledger.COMMITTED_FILE_SUFFIX)
            for f in os.listdir(current_ledger_path)
        ):
            break

    network.stop_all_nodes()

    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    # Corrupt _uncommitted_ ledger before starting new service
    ledger = ccf.ledger.Ledger([current_ledger_dir], committed_only=False)

    def get_middle_tx_offset(tx):
        offset, next_offset = tx.get_offsets()
        return offset + (next_offset - offset) // 2

    for chunk in ledger:
        chunk_filename = chunk.filename()
        first_tx_offset = None
        last_tx_offset = None
        first_sig_offset = None
        for tx in chunk:
            tables = tx.get_public_domain().get_tables()
            if (
                first_sig_offset is None
                and ccf.ledger.SIGNATURE_TX_TABLE_NAME in tables
            ):
                first_sig_offset = get_middle_tx_offset(tx)
            last_tx_offset = get_middle_tx_offset(tx)
            if first_tx_offset is None:
                first_tx_offset = get_middle_tx_offset(tx)

    truncated_ledger_file_path = os.path.join(current_ledger_dir, chunk_filename)
    if corrupt_first_tx:
        truncate_offset = first_tx_offset
    elif corrupt_last_tx:
        truncate_offset = last_tx_offset
    elif corrupt_first_sig:
        truncate_offset = first_sig_offset

    with open(truncated_ledger_file_path, "r+", encoding="utf-8") as f:
        f.truncate(truncate_offset)
        LOG.warning(
            f"Truncated ledger file {truncated_ledger_file_path} at {truncate_offset}"
        )

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
    )
    recovered_network.recover(args)

    return recovered_network
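# Standalone sketch (stdlib only) of the corruption strategy used in
# test_recover_service_truncated_ledger above: truncate a file half-way through
# a record so that recovery has to cope with a partially written trailing
# entry. The helper name and record offsets are illustrative assumptions, not
# the real ledger format.
def _truncate_mid_record(path, record_start, record_end):
    # Same arithmetic as get_middle_tx_offset: cut in the middle of the record
    truncate_offset = record_start + (record_end - record_start) // 2
    with open(path, "r+b") as f:
        f.truncate(truncate_offset)
    return truncate_offset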
def test_share_resilience(network, args, from_snapshot=False):
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(old_primary)

    network.stop_all_nodes()
    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )
    primary, _ = recovered_network.find_primary()
    recovered_network.consortium.transition_service_to_open(
        primary,
        previous_service_identity=slurp_file(args.previous_service_identity_file),
    )

    # Submit all required recovery shares minus one. Last recovery share is
    # submitted after a new primary is found.
    encrypted_submitted_shares_count = 0
    for m in recovered_network.consortium.get_active_members():
        with primary.client() as nc:
            if (
                encrypted_submitted_shares_count
                >= recovered_network.consortium.recovery_threshold - 1
            ):
                last_member_to_submit = m
                break

            check_commit = infra.checker.Checker(nc)
            check_commit(m.get_and_submit_recovery_share(primary))
            encrypted_submitted_shares_count += 1

    LOG.info(
        f"Shutting down node {primary.node_id} before submitting last recovery share"
    )
    primary.stop()
    new_primary, _ = recovered_network.wait_for_new_primary(primary)

    last_member_to_submit.get_and_submit_recovery_share(new_primary)

    for node in recovered_network.get_joined_nodes():
        recovered_network.wait_for_state(
            node,
            infra.node.State.PART_OF_NETWORK.value,
            timeout=args.ledger_recovery_timeout,
        )

    recovered_network.consortium.check_for_service(
        new_primary,
        infra.network.ServiceStatus.OPEN,
    )

    if recovered_network.service_load:
        recovered_network.service_load.set_network(recovered_network)
    return recovered_network
def test_recover_service_aborted(network, args, from_snapshot=False):
    network.save_service_identity(args)
    old_primary, _ = network.find_primary()

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(old_primary)

    network.stop_all_nodes()
    current_ledger_dir, committed_ledger_dirs = old_primary.get_ledger()

    aborted_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    aborted_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )

    LOG.info("Fill in ledger to trigger new chunks, which should be marked as recovery")
    primary, _ = aborted_network.find_primary()
    while (
        len(
            [
                f
                for f in os.listdir(primary.remote.ledger_paths()[0])
                if f.endswith(
                    f"{ccf.ledger.COMMITTED_FILE_SUFFIX}{ccf.ledger.RECOVERY_FILE_SUFFIX}"
                )
            ]
        )
        < 2
    ):
        # Submit large proposals until at least two recovery ledger chunks are committed
        aborted_network.consortium.create_and_withdraw_large_proposal(primary)

    LOG.info(
        "Do not complete service recovery on purpose and initiate new recovery from scratch"
    )

    snapshots_dir = None
    if from_snapshot:
        snapshots_dir = network.get_committed_snapshots(primary)

    # Check that all nodes have the same (recovery) ledger files
    aborted_network.stop_all_nodes(
        skip_verification=True, read_recovery_ledger_files=True
    )

    current_ledger_dir, committed_ledger_dirs = primary.get_ledger()
    recovered_network = infra.network.Network(
        args.nodes,
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        existing_network=aborted_network,
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dirs=committed_ledger_dirs,
        snapshots_dir=snapshots_dir,
    )
    recovered_network.recover(args)
    return recovered_network
def run_ledger_compatibility_since_first(args, local_branch, use_snapshot):
    """
    Tests that a service from the very first LTS can be recovered
    to the next LTS, and so forth, until the version of the local checkout.

    The recovery process uses a snapshot if `use_snapshot` is True. Otherwise,
    the entire historical ledger is used.
    """

    LOG.info("Use snapshot: {}", use_snapshot)
    repo = infra.github.Repository()
    lts_releases = repo.get_lts_releases()

    LOG.info(f"LTS releases: {[r[1] for r in lts_releases.items()]}")

    lts_versions = []

    # Add an empty entry to releases to indicate local checkout
    # Note: dicts are ordered from Python 3.7
    lts_releases[None] = None

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
    )
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        for idx, (_, lts_release) in enumerate(lts_releases.items()):
            if lts_release:
                version, install_path = repo.install_release(lts_release)
                lts_versions.append(version)
                set_js_args(args, install_path)
            else:
                version = args.ccf_version
                install_path = LOCAL_CHECKOUT_DIRECTORY

            binary_dir, library_dir = get_bin_and_lib_dirs_for_install_path(
                install_path
            )

            if not args.dry_run:
                network_args = {
                    "hosts": args.nodes,
                    "binary_dir": binary_dir,
                    "library_dir": library_dir,
                    "txs": txs,
                    "jwt_issuer": jwt_issuer,
                    "version": version,
                }
                if idx == 0:
                    LOG.info(f"Starting new service (version: {version})")
                    network = infra.network.Network(**network_args)
                    network.start_and_join(args)
                else:
                    LOG.info(f"Recovering service (new version: {version})")
                    network = infra.network.Network(
                        **network_args, existing_network=network
                    )
                    network.start_in_recovery(
                        args,
                        ledger_dir,
                        committed_ledger_dir,
                        snapshot_dir=snapshot_dir,
                    )
                    network.recover(args)

                nodes = network.get_joined_nodes()
                primary, _ = network.find_primary()

                # Verify that all nodes run the expected CCF version
                for node in nodes:
                    # Note: /node/version endpoint and custom certificate validity
                    # were added in 2.x
                    if not node.major_version or node.major_version > 1:
                        with node.client() as c:
                            r = c.get("/node/version")
                            expected_version = node.version or args.ccf_version
                            version = r.body.json()["ccf_version"]
                            assert (
                                r.body.json()["ccf_version"] == expected_version
                            ), f"Node version is not {expected_version}"
                        node.verify_certificate_validity_period()

                # Rollover JWKS so that new primary must read historical CA bundle table
                # and retrieve new keys via auto refresh
                jwt_issuer.refresh_keys()
                # Note: /gov/jwt_keys/all endpoint was added in 2.x
                primary, _ = network.find_nodes()
                if not primary.major_version or primary.major_version > 1:
                    jwt_issuer.wait_for_refresh(network)
                else:
                    time.sleep(3)

                if idx > 0:
                    test_new_service(
                        network,
                        args,
                        install_path,
                        binary_dir,
                        library_dir,
                        version,
                    )

                snapshot_dir = (
                    network.get_committed_snapshots(primary) if use_snapshot else None
                )

                ledger_dir, committed_ledger_dir = primary.get_ledger(
                    include_read_only_dirs=True
                )
                network.stop_all_nodes(skip_verification=True)

                # Check that ledger and snapshots can be parsed
                ccf.ledger.Ledger([committed_ledger_dir]).get_latest_public_state()
                if snapshot_dir:
                    for s in os.listdir(snapshot_dir):
                        with ccf.ledger.Snapshot(
                            os.path.join(snapshot_dir, s)
                        ) as snapshot:
                            snapshot.get_public_domain()

    return lts_versions