def run(args):
    """Exercise JWT key refresh with both a very short and an effectively
    disabled refresh interval, and verify refresh still works on a backup
    after the primary is killed."""
    # Fast interval: auto-refresh should kick in almost immediately.
    args.jwt_key_refresh_interval_s = 1
    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        network = test_jwt_without_key_policy(network, args)
        network = test_jwt_with_sgx_key_policy(network, args)
        network = test_jwt_with_sgx_key_filter(network, args)
        network = test_jwt_key_auto_refresh(network, args)

        # Check that auto refresh also works on backups: fail over to a new
        # primary and re-run the auto-refresh test.
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary.node_id)
        network = test_jwt_key_auto_refresh(network, args)

    # Huge interval: only the initial (startup) refresh should ever run.
    args.jwt_key_refresh_interval_s = 100000
    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        network = test_jwt_key_initial_refresh(network, args)

        # Check that initial refresh also works on backups
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary.node_id)
        network = test_jwt_key_initial_refresh(network, args)
def run(args):
    """Repeatedly commit a transaction and kill the current primary until the
    network has lost too many nodes to elect one, asserting that progress
    stops exactly at that point."""
    with infra.service_load.load() as load:
        with infra.network.network(
            args.nodes,
            args.binary_dir,
            args.debug_nodes,
            args.perf_nodes,
            pdb=args.pdb,
            service_load=load,
        ) as network:
            check = infra.checker.Checker()
            network.start_and_open(args)
            current_view = None
            primary, current_view = network.find_primary()

            # Number of nodes F to stop until network cannot make progress
            nodes_to_stop = math.ceil(len(args.nodes) / 2)
            if args.consensus == "BFT":
                nodes_to_stop = math.ceil(len(args.nodes) / 3)

            primary_is_known = True
            for node_to_stop in range(nodes_to_stop):
                primary, current_view = network.find_primary()
                LOG.debug(
                    "Commit new transactions, primary:{}, current_view:{}".format(
                        primary.local_node_id, current_view
                    )
                )
                with primary.client("user0") as c:
                    res = c.post(
                        "/app/log/private",
                        {
                            "id": current_view,
                            "msg": "This log is committed in view {}".format(
                                current_view
                            ),
                        },
                    )
                    check(res, result=True)
                    LOG.debug("Waiting for transaction to be committed by all nodes")
                    network.wait_for_all_nodes_to_commit(
                        tx_id=TxID(res.view, res.seqno)
                    )
                try:
                    test_kill_primary_no_reqs(network, args)
                except PrimaryNotFound:
                    # Only the very last kill is allowed to lose the primary.
                    if node_to_stop < nodes_to_stop - 1:
                        raise
                    primary_is_known = False
            assert not primary_is_known, "Primary is still known"
    LOG.success("Test ended successfully.")
def test_kill_primary(network, args, find_new_primary=True):
    """Stop the current primary, wait one election duration, and optionally
    confirm a replacement primary was elected."""
    primary, _ = network.find_primary()
    primary.stop()
    LOG.debug(
        f"Waiting {network.election_duration}s for a new primary to be elected..."
    )
    time.sleep(network.election_duration)
    if not find_new_primary:
        return network
    new_primary, new_term = network.find_primary()
    LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")
    return network
def test_metrics(network, args):
    """Check that /app/endpoint_metrics increments its GET call counter per
    request and its error counter on content-negotiation failures."""

    def fetch_get_bucket(client):
        # One GET (counted by the service) returning the GET metrics bucket.
        response = client.get("/app/endpoint_metrics")
        return response, response.body.json()["metrics"]["endpoint_metrics"]["GET"]

    primary, _ = network.find_primary()
    with primary.client("user0") as c:
        _, baseline = fetch_get_bucket(c)
        calls = baseline["calls"]
        errors = baseline["errors"]

    with primary.client("user0") as c:
        _, bucket = fetch_get_bucket(c)
        assert bucket["calls"] == calls + 1
        _, bucket = fetch_get_bucket(c)
        assert bucket["calls"] == calls + 2

    with primary.client() as c:
        r = c.get("/app/endpoint_metrics", headers={"accept": "nonsense"})
        assert r.status_code == http.HTTPStatus.NOT_ACCEPTABLE.value

    with primary.client() as c:
        _, bucket = fetch_get_bucket(c)
        assert bucket["errors"] == errors + 1

    return network
def test_add_member(network, args):
    """Add a brand-new member carrying structured member_data, check it is not
    yet given a recovery share, then ack it into the consortium."""
    primary, _ = network.find_primary()
    member_data = {
        "example": "of",
        "structured": ["and", {"nested": "arbitrary data"}],
    }
    new_member = network.consortium.generate_and_add_new_member(
        primary,
        curve=infra.network.ParticipantsCurve(args.participants_curve).next(),
        member_data=member_data,
    )

    # An accepted-but-not-active member must not hold a recovery share yet.
    try:
        new_member.get_and_decrypt_recovery_share(
            primary, network.store_current_network_encryption_key()
        )
        assert False, "New accepted members are not given recovery shares"
    except infra.member.NoRecoveryShareFound as e:
        assert (
            e.response.body.text() == "Only active members are given recovery shares"
        )

    new_member.ack(primary)
    return network
def assert_recovery_shares_update(func, network, args, **kwargs):
    """Run a governance operation and assert that, if it changed the recovery
    threshold or the set of active members, new recovery shares were issued.

    :param func: governance test function, dispatched on identity
        (test_retire_member / test_set_recovery_threshold / other).
    :param network: running test network.
    :param args: test arguments forwarded to func.
    :param kwargs: optional extras, e.g. recovery_threshold for
        test_set_recovery_threshold.
    """
    primary, _ = network.find_primary()
    recovery_threshold_before = network.consortium.recovery_threshold
    active_members_before = network.consortium.get_active_members()
    network.store_current_network_encryption_key()
    already_active_member = network.consortium.get_any_active_member()
    defunct_network_enc_pubk = network.store_current_network_encryption_key()
    saved_share = already_active_member.get_and_decrypt_recovery_share(
        primary, defunct_network_enc_pubk
    )

    if func is test_retire_member:
        # When retiring a member, the active member which retrieved their share
        # should not be retired for them to be able to compare their share afterwards.
        member_to_retire = [
            m
            for m in network.consortium.get_active_members()
            if m is not already_active_member
        ][0]
        func(network, args, member_to_retire)
    elif func is test_set_recovery_threshold and "recovery_threshold" in kwargs:
        func(network, args, recovery_threshold=kwargs["recovery_threshold"])
    else:
        # NOTE(review): other kwargs are deliberately not forwarded on this
        # branch — confirm no caller relies on them here.
        func(network, args)

    # BUG FIX: the original compared against the bound method object
    # (get_active_members without parentheses), which never equals a member
    # list, so this condition was always true. Call the method instead.
    if (
        recovery_threshold_before != network.consortium.recovery_threshold
        or active_members_before != network.consortium.get_active_members()
    ):
        new_share = already_active_member.get_and_decrypt_recovery_share(
            primary, defunct_network_enc_pubk
        )
        assert saved_share != new_share, "New recovery shares should have been issued"
def test_all_nodes_cert_renewal(network, args, valid_from=None):
    """Renew every node certificate via governance and verify that nodes which
    expose a self-signed certificate actually reissued it."""
    primary, _ = network.find_primary()
    valid_from = valid_from or datetime.now()
    validity_period_days = args.maximum_node_certificate_validity_days

    # Note: GET /node/self_signed_certificate endpoint was added after 2.0.0-rc6
    self_signed_node_certs_before = {}
    for node in network.get_joined_nodes():
        if node.version_after("ccf-2.0.0-rc6"):
            self_signed_node_certs_before[
                node.local_node_id
            ] = node.retrieve_self_signed_cert()

    network.consortium.set_all_nodes_certificate_validity(
        primary,
        valid_from=valid_from,
        validity_period_days=validity_period_days,
    )

    # Node certificates are updated on global commit hook
    network.wait_for_all_nodes_to_commit(primary)

    for node in network.get_joined_nodes():
        node.set_certificate_validity_period(valid_from, validity_period_days)
        if node.version_after("ccf-2.0.0-rc6"):
            assert (
                self_signed_node_certs_before[node.local_node_id]
                != node.retrieve_self_signed_cert()
            ), f"Self-signed node certificate for node {node.local_node_id} was not renewed"
def test_node_data(network, args):
    """Check node_data can be supplied at join time via a JSON file and later
    modified through governance, for both untrusted and trusted nodes."""
    with tempfile.NamedTemporaryFile(mode="w+") as ntf:
        primary, _ = network.find_primary()
        with primary.client() as c:

            def get_nodes():
                # Map node_id -> node info from /node/network/nodes.
                r = c.get("/node/network/nodes")
                assert r.status_code == 200, (r.status_code, r.body.text())
                return {
                    node_info["node_id"]: node_info
                    for node_info in r.body.json()["nodes"]
                }

            new_node_data = {"my_id": "0xdeadbeef", "location": "The Moon"}
            json.dump(new_node_data, ntf)
            ntf.flush()
            untrusted_node = network.create_node(
                infra.interfaces.HostSpec(
                    rpc_interfaces={
                        infra.interfaces.PRIMARY_RPC_INTERFACE: infra.interfaces.RPCInterface(
                            endorsement=infra.interfaces.Endorsement(
                                authority=infra.interfaces.EndorsementAuthority.Node
                            )
                        )
                    }
                ),
                node_data_json_file=ntf.name,
            )

            # NB: This new node joins but is never trusted
            network.join_node(untrusted_node, args.package, args)

            nodes = get_nodes()
            assert untrusted_node.node_id in nodes, nodes
            new_node_info = nodes[untrusted_node.node_id]
            assert new_node_info["node_data"] == new_node_data, new_node_info

            # Set modified node data
            new_node_data["previous_locations"] = [new_node_data["location"]]
            new_node_data["location"] = "Secret Base"
            network.consortium.set_node_data(
                primary, untrusted_node.node_id, new_node_data
            )

            nodes = get_nodes()
            assert untrusted_node.node_id in nodes, nodes
            new_node_info = nodes[untrusted_node.node_id]
            assert new_node_info["node_data"] == new_node_data, new_node_info

            # Set modified node data on trusted primary
            primary_node_data = "Some plain JSON string"
            network.consortium.set_node_data(
                primary, primary.node_id, primary_node_data
            )

            nodes = get_nodes()
            assert primary.node_id in nodes, nodes
            primary_node_info = nodes[primary.node_id]
            assert (
                primary_node_info["node_data"] == primary_node_data
            ), primary_node_info

    return network
def run(args):
    """Start a network and run the client tutorial and ledger tutorial
    scripts against it, asserting both exit cleanly."""
    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        for node in network.nodes:
            node.curl = True
        network.start_and_join(args)
        primary, _ = network.find_primary()

        client_cmd = ["python", args.client_tutorial, network.common_dir]
        rc = infra.proc.ccall(*client_cmd).returncode
        assert rc == 0, f"Failed to run tutorial script: {rc}"

        ledger_cmd = ["python", args.ledger_tutorial, primary.get_ledger()[1]]
        rc = infra.proc.ccall(*ledger_cmd).returncode
        assert rc == 0, f"Failed to run tutorial script: {rc}"
def test_forced_ledger_chunk(network, args):
    """Propose a forced ledger chunk and verify a committed chunk ends at the
    first signature following the proposal's commit point."""
    primary, _ = network.find_primary()

    # Submit some dummy transactions
    network.txs.issue(network, number_txs=3)

    # Submit a proposal to force a ledger chunk at the following signature
    proposal = network.consortium.force_ledger_chunk(primary)

    # Issue some more transactions
    network.txs.issue(network, number_txs=5)

    ledger_dirs = primary.remote.ledger_paths()

    # Check that there is indeed a ledger chunk that ends at the
    # first signature after proposal.completed_seqno
    ledger = ccf.ledger.Ledger(ledger_dirs)
    chunk, _, last, next_signature = find_ledger_chunk_for_seqno(
        ledger, proposal.completed_seqno
    )
    LOG.info(
        f"Found ledger chunk {chunk.filename()} with chunking proposal @{proposal.completed_seqno} and signature @{next_signature}"
    )
    assert chunk.is_complete and chunk.is_committed()
    assert last == next_signature
    assert next_signature - proposal.completed_seqno < args.sig_tx_interval
    return network
def test_retire_member(network, args, member_to_retire=None, recovery_member=True):
    """Retire a member; when none is supplied, pick any active member matching
    the recovery_member flag."""
    primary, _ = network.find_primary()
    target = member_to_retire
    if target is None:
        target = network.consortium.get_any_active_member(recovery_member)
    network.consortium.retire_member(primary, target)
    return network
def run(args):
    """Drive a set of governance operations (propose, vote, withdraw), then
    verify the ledger faithfully records each of them."""
    # Keep track of governance operations that happened in the test
    governance_operations = set()

    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        primary, _ = network.find_primary()
        ledger_directories = primary.remote.ledger_paths()

        LOG.info("Add new member proposal (implicit vote)")
        (
            new_member_proposal,
            _,
            careful_vote,
        ) = network.consortium.generate_and_propose_new_member(
            primary, curve=infra.network.EllipticCurve.secp256r1
        )
        member = network.consortium.get_member_by_local_id(
            new_member_proposal.proposer_id
        )
        governance_operations.add(
            (new_member_proposal.proposal_id, member.service_id, "propose")
        )

        LOG.info("2/3 members accept the proposal")
        p = network.consortium.vote_using_majority(
            primary, new_member_proposal, careful_vote
        )
        for voter in p.voters:
            governance_operations.add((p.proposal_id, voter, "vote"))
        assert new_member_proposal.state == infra.proposal.ProposalState.ACCEPTED

        LOG.info("Create new proposal but withdraw it before it is accepted")
        new_member_proposal, _, _ = network.consortium.generate_and_propose_new_member(
            primary, curve=infra.network.EllipticCurve.secp256r1
        )
        member = network.consortium.get_member_by_local_id(
            new_member_proposal.proposer_id
        )
        governance_operations.add(
            (new_member_proposal.proposal_id, member.service_id, "propose")
        )

        with primary.client() as c:
            response = network.consortium.get_member_by_local_id(
                new_member_proposal.proposer_id
            ).withdraw(primary, new_member_proposal)
            infra.checker.Checker(c)(response)
            assert response.status_code == http.HTTPStatus.OK.value
            assert response.body.json()["state"] == ProposalState.WITHDRAWN.value
        member = network.consortium.get_member_by_local_id(
            new_member_proposal.proposer_id
        )
        governance_operations.add(
            (new_member_proposal.proposal_id, member.service_id, "withdraw")
        )

        # Force ledger flush of all transactions so far
        network.get_latest_ledger_public_state()
        ledger = ccf.ledger.Ledger(ledger_directories)
        check_operations(ledger, governance_operations)

        test_ledger_is_readable(network, args)
        test_tables_doc(network, args)
def test_add_as_many_pending_nodes(network, args):
    """Join as many pending (untrusted) nodes as there are current nodes and
    check that consensus majority rules are unaffected."""
    # Should not change the raft consensus rules (i.e. majority)
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )

    new_nodes = []
    for _ in range(number_new_nodes):
        node = network.create_node("local://localhost")
        network.join_node(node, args.package, args, from_snapshot=False)
        new_nodes.append(node)

    check_can_progress(primary)

    for node in new_nodes:
        network.retire_node(primary, node)
    wait_for_reconfiguration_to_complete(network)

    # Stop the retired nodes so they don't linger in the background and interfere
    # with subsequent tests
    for node in new_nodes:
        node.stop()

    return network
def test_retire_backup(network, args):
    """Retire and stop one backup node, then confirm the service still makes
    progress without it."""
    primary, _ = network.find_primary()
    backup = network.find_any_backup()
    network.consortium.retire_node(primary, backup)
    backup.stop()
    check_can_progress(primary)
    return network
def test_illegal(network, args, verify=True):
    """Send malformed HTTP to the primary, check the connection is closed by
    the server, then confirm well-formed transactions still succeed.

    :param verify: when True, verify all issued log messages afterwards.
    """
    primary, _ = network.find_primary()

    # Send malformed HTTP traffic and check the connection is closed
    cafile = os.path.join(network.common_dir, "networkcert.pem")
    context = ssl.create_default_context(cafile=cafile)
    context.set_ecdh_curve(ccf.clients.get_curve(cafile).name)
    context.load_cert_chain(
        certfile=os.path.join(network.common_dir, "user0_cert.pem"),
        keyfile=os.path.join(network.common_dir, "user0_privk.pem"),
    )
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    conn = context.wrap_socket(sock, server_side=False, server_hostname=primary.host)
    # Fix: the socket was previously never closed, leaking a TLS connection
    # for the remainder of the test run. Ensure it is closed even on failure.
    try:
        conn.connect((primary.host, primary.pubport))
        conn.sendall(b"NOTAVERB ")
        rv = conn.recv(1024)
        # Server closes the connection on a malformed verb -> empty read.
        assert rv == b"", rv
    finally:
        conn.close()

    # Valid transactions are still accepted
    network.txs.issue(
        network=network,
        number_txs=1,
    )
    network.txs.issue(
        network=network,
        number_txs=1,
        on_backup=True,
    )
    if verify:
        network.txs.verify()
    else:
        LOG.warning("Skipping log messages verification")

    return network
def test_metrics(network, args):
    """Check /app/api/metrics call and error counters for its own GET endpoint."""
    primary, _ = network.find_primary()

    def get_metrics(r, path, method):
        # Select the single metrics entry matching this path/method pair.
        return next(
            entry
            for entry in r.body.json()["metrics"]
            if entry["path"] == path and entry["method"] == method
        )

    with primary.client("user0") as c:
        r = c.get("/app/api/metrics")
        baseline = get_metrics(r, "api/metrics", "GET")
        calls = baseline["calls"]
        errors = baseline["errors"]

    with primary.client("user0") as c:
        r = c.get("/app/api/metrics")
        assert get_metrics(r, "api/metrics", "GET")["calls"] == calls + 1
        r = c.get("/app/api/metrics")
        assert get_metrics(r, "api/metrics", "GET")["calls"] == calls + 2

    with primary.client() as c:
        r = c.get("/app/api/metrics", headers={"accept": "nonsense"})
        assert r.status_code == http.HTTPStatus.NOT_ACCEPTABLE.value

    with primary.client() as c:
        r = c.get("/app/api/metrics")
        assert get_metrics(r, "api/metrics", "GET")["errors"] == errors + 1

    return network
def test(network, args, from_snapshot=False):
    """Recover the service from the current ledger (and optionally the latest
    snapshot), returning the recovered network."""
    old_primary, _ = network.find_primary()

    # Until https://github.com/microsoft/CCF/issues/1539, pause for a
    # little while to make sure the evidence of the snapshot is committed
    if from_snapshot:
        LOG.warning("Pausing for the snapshot evidence to be committed...")
        time.sleep(2)

    # Retrieve ledger and snapshots
    ledger_dir = old_primary.get_ledger()
    snapshot_dir = None
    if from_snapshot:
        snapshot_dir = old_primary.get_snapshots()
        if not os.listdir(snapshot_dir):
            raise RuntimeError(f"No snapshot found in {snapshot_dir}")

    defunct_network_enc_pubk = network.store_current_network_encryption_key()
    recovered_network = infra.network.Network(
        network.hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args, ledger_dir=ledger_dir, snapshot_dir=snapshot_dir
    )
    recovered_network.recover(args, defunct_network_enc_pubk)
    return recovered_network
def test_invalid_client_signature(network, args):
    """Check that malformed HTTP signature headers produce informative
    InvalidAuthenticationInfo errors."""
    primary, _ = network.find_primary()

    def post_proposal_request_raw(node, headers=None, expected_error_msg=None):
        # Raw request (bypassing the CCF client) so we control headers exactly.
        r = requests.post(
            f"https://{node.get_public_rpc_host()}:{node.get_public_rpc_port()}/gov/proposals",
            headers=headers,
            verify=os.path.join(node.common_dir, "service_cert.pem"),
        ).json()
        assert r["error"]["code"] == "InvalidAuthenticationInfo"
        assert (
            expected_error_msg in r["error"]["message"]
        ), f"Expected error message '{expected_error_msg}' not in '{r['error']['message']}'"

    # Verify that _some_ HTTP signature parsing errors are communicated back to the client
    cases = [
        (None, "Missing signature"),
        (
            {"Authorization": "invalid"},
            "'authorization' header only contains one field",
        ),
        (
            {"Authorization": "invalid invalid"},
            "'authorization' scheme for signature should be 'Signature",
        ),
        (
            {"Authorization": "Signature invalid"},
            "Error verifying HTTP 'digest' header: Missing 'digest' header",
        ),
    ]
    for headers, expected in cases:
        post_proposal_request_raw(
            primary, headers=headers, expected_error_msg=expected
        )
def test_anonymous_caller(network, args):
    """Check the anonymous logging endpoint: an unrecorded identity can write
    via /app/log/private/anonymous but cannot read back (C++ app only)."""
    if args.package != "liblogging":
        LOG.warning(
            f"Skipping {inspect.currentframe().f_code.co_name} as application is not C++"
        )
        return network

    primary, _ = network.find_primary()

    # Create a new user but do not record its identity
    network.create_user("user5", args.participants_curve, record=False)

    log_id = 101
    msg = "This message is anonymous"
    with primary.client("user5") as c:
        r = c.post("/app/log/private/anonymous", {"id": log_id, "msg": msg})
        assert r.body.json() == True
        # The anonymous writer is not a known user, so reads are rejected.
        r = c.get(f"/app/log/private?id={log_id}")
        assert r.status_code == http.HTTPStatus.UNAUTHORIZED.value, r

    with primary.client("user0") as c:
        r = c.get(f"/app/log/private?id={log_id}")
        assert msg in r.body.json()["msg"], r

    return network
def test_metrics(network, args):
    """Check /app/endpoint_metrics call counting, and that an unauthenticated
    client is rejected with FORBIDDEN (which counts as an error)."""

    def read_get_bucket(client):
        response = client.get("/app/endpoint_metrics")
        return response, response.body.json()["metrics"]["endpoint_metrics"]["GET"]

    primary, _ = network.find_primary()
    with primary.client("user0") as c:
        _, baseline = read_get_bucket(c)
        calls = baseline["calls"]
        errors = baseline["errors"]

    with primary.client("user0") as c:
        _, bucket = read_get_bucket(c)
        assert bucket["calls"] == calls + 1
        _, bucket = read_get_bucket(c)
        assert bucket["calls"] == calls + 2

    with primary.client() as c:
        r = c.get("/app/endpoint_metrics")
        assert r.status_code == http.HTTPStatus.FORBIDDEN.value

    with primary.client("user0") as c:
        _, bucket = read_get_bucket(c)
        assert bucket["errors"] == errors + 1

    return network
def run_test_all_members(network):
    """Check GET /gov/members returns exactly the known members, with matching
    cert, status, member_data and recovery public encryption key."""
    primary, _ = network.find_primary()
    with primary.client() as c:
        r = c.get("/gov/members")
        assert r.status_code == http.HTTPStatus.OK.value
        response_members = r.body.json()

        network_members = network.get_members()
        assert len(network_members) == len(response_members)

        for member in network_members:
            assert member.service_id in response_members
            response_details = response_members[member.service_id]
            assert response_details["cert"] == member.cert
            assert (
                infra.member.MemberStatus(response_details["status"])
                == member.status_code
            )
            assert response_details["member_data"] == member.member_data
            if member.is_recovery_member:
                # Fix: the key file handle was previously opened without being
                # closed; use a context manager so it is released promptly.
                with open(
                    member.member_info["encryption_public_key_file"],
                    encoding="utf-8",
                ) as key_file:
                    recovery_enc_key = key_file.read()
                assert (
                    response_details["public_encryption_key"] == recovery_enc_key
                )
            else:
                assert response_details["public_encryption_key"] is None
def assert_recovery_shares_update(are_shared_updated, func, network, args, **kwargs):
    """Snapshot every active recovery member's share, run a governance
    operation, then assert shares changed (or not) per are_shared_updated."""
    primary, _ = network.find_primary()
    saved_recovery_shares = {
        m: m.get_and_decrypt_recovery_share(primary)
        for m in network.consortium.get_active_recovery_members()
    }

    if func is test_remove_member:
        recovery_member = kwargs.pop("recovery_member")
        member_to_remove = network.consortium.get_any_active_member(
            recovery_member=recovery_member
        )
        if recovery_member:
            # A removed recovery member no longer holds a share to compare.
            saved_recovery_shares.pop(member_to_remove)
        func(network, args, member_to_remove)
    elif func is test_set_recovery_threshold and "recovery_threshold" in kwargs:
        func(network, args, recovery_threshold=kwargs["recovery_threshold"])
    else:
        func(network, args, **kwargs)

    for member, share_before in saved_recovery_shares.items():
        share_after = member.get_and_decrypt_recovery_share(primary)
        if are_shared_updated:
            assert share_before != share_after
        else:
            assert share_before == share_after
def check(network, args, *nargs, **kwargs):
    """Query the primary for the number of TRUSTED nodes and raise
    TestRequirementsNotMet when killing nodes_to_kill_count nodes would leave
    a CFT network unable to make progress.

    NOTE(review): nodes_to_kill_count is a free variable — presumably bound by
    an enclosing scope; confirm against the surrounding file.
    """
    primary, _ = network.find_primary()
    with primary.client(
        network.consortium.get_any_active_member().local_id
    ) as c:
        r = c.post(
            "/gov/query",
            {
                "text": """tables = ...
                trusted_nodes_count = 0
                tables["public:ccf.gov.nodes.info"]:foreach(function(node_id, details)
                if details["status"] == "TRUSTED" then
                trusted_nodes_count = trusted_nodes_count + 1
                end
                end)
                return trusted_nodes_count
                """
            },
        )
        trusted_nodes_count = r.body.json()
        running_nodes_count = len(network.get_joined_nodes())
        would_leave_nodes_count = running_nodes_count - nodes_to_kill_count
        # CFT requires a strict majority of trusted nodes to stay up.
        minimum_nodes_to_run_count = ceil((trusted_nodes_count + 1) / 2)
        if args.consensus == "cft" and (
            would_leave_nodes_count < minimum_nodes_to_run_count
        ):
            raise TestRequirementsNotMet(
                f"Cannot kill {nodes_to_kill_count} node(s) as the network would not be able to make progress"
                f" (would leave {would_leave_nodes_count} nodes but requires {minimum_nodes_to_run_count} nodes to make progress) "
            )
def test_kill_primary(network, args):
    """Kill the current primary and wait for a replacement to be elected."""
    primary, _ = network.find_primary()
    primary.stop()
    elected, term = network.wait_for_new_primary(primary.node_id)
    LOG.debug(f"New primary is {elected.node_id} in term {term}")
    return network
def test_user_data_ACL(network, args):
    """Toggle an isAdmin flag in a user's user_data and verify the admin-only
    endpoint honours it (C++ app only)."""
    if args.package != "liblogging":
        LOG.warning(
            f"Skipping {inspect.currentframe().f_code.co_name} as application is not C++"
        )
        return network

    primary, _ = network.find_primary()
    user = network.users[0]

    # Give isAdmin permissions to a single user
    network.consortium.set_user_data(
        primary, user.service_id, user_data={"isAdmin": True}
    )

    # Confirm that user can now use this endpoint
    with primary.client(user.local_id) as c:
        r = c.post("/app/log/private/admin_only", {"id": 42, "msg": "hello world"})
        assert r.status_code == http.HTTPStatus.OK.value, r.status_code

    # Remove permission
    network.consortium.set_user_data(
        primary, user.service_id, user_data={"isAdmin": False}
    )

    # Confirm that user is now forbidden on this endpoint
    with primary.client(user.local_id) as c:
        r = c.post("/app/log/private/admin_only", {"id": 42, "msg": "hello world"})
        assert r.status_code == http.HTTPStatus.FORBIDDEN.value, r.status_code

    return network
def test_recover_service_with_expired_cert(args):
    """Recover a checked-in service snapshot whose service certificate has
    expired, then verify the recovered service progresses and serves a valid
    receipt."""
    expired_service_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "expired_service"
    )
    new_common = infra.network.get_common_folder_name(args.workspace, args.label)
    copy_tree(os.path.join(expired_service_dir, "common"), new_common)

    network = infra.network.Network(args.nodes, args.binary_dir)
    args.previous_service_identity_file = os.path.join(
        expired_service_dir, "common", "service_cert.pem"
    )
    network.start_in_recovery(
        args,
        ledger_dir=os.path.join(expired_service_dir, "0.ledger"),
        committed_ledger_dirs=[os.path.join(expired_service_dir, "0.ledger")],
        snapshots_dir=os.path.join(expired_service_dir, "0.snapshots"),
        common_dir=new_common,
    )
    network.recover(args)

    primary, _ = network.find_primary()
    infra.checker.check_can_progress(primary)
    r = primary.get_receipt(2, 3)
    verify_receipt(r.json(), network.cert)
def test_add_as_many_pending_nodes(network, args):
    """Join pending nodes equal to the current node count, kill them all, and
    check the service still makes progress (pending nodes are not counted in
    consensus)."""
    # Killing pending nodes should not change the raft consensus rules
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )

    new_nodes = []
    for _ in range(number_new_nodes):
        node = network.create_node("local://localhost")
        network.join_node(node, args.package, args, from_snapshot=False)
        new_nodes.append(node)

    for node in new_nodes:
        node.stop()

    # Even though pending nodes (half the number of nodes) are stopped,
    # service can still make progress
    check_can_progress(primary)

    # Cleanup killed pending nodes
    for node in new_nodes:
        network.retire_node(primary, node)
    wait_for_reconfiguration_to_complete(network)

    return network
def test_corrupted_signature(network, args):
    """For every supported participant curve, check that requests with
    missing/empty/modified signatures are rejected with 401 UNAUTHORIZED."""
    primary, _ = network.find_primary()

    # Test each supported curve
    for curve in infra.network.ParticipantsCurve:
        LOG.info(f"Testing curve: {curve.name}")
        # Add a member so we have at least one on this curve
        member = network.consortium.generate_and_add_new_member(
            primary,
            curve=curve,
        )
        with primary.client(*member.auth(write=True)) as mc:
            # pylint: disable=protected-access
            # Cache the original auth provider
            original_auth = ccf.clients.RequestClient._auth_provider
            # Fix: restore the auth provider in a finally block so a failing
            # assertion no longer leaves every subsequent client corrupted.
            try:
                # Override the auth provider with invalid ones
                for fn in (missing_signature, empty_signature, modified_signature):
                    ccf.clients.RequestClient._auth_provider = (
                        make_signature_corrupter(fn)
                    )
                    r = mc.post("/gov/proposals")
                    assert (
                        r.status_code == http.HTTPStatus.UNAUTHORIZED
                    ), r.status_code
            finally:
                # Restore original auth provider for future calls!
                ccf.clients.RequestClient._auth_provider = original_auth

        # Remove the new member once we're done with them
        network.consortium.retire_member(primary, member)

    return network
def test_share_resilience(network, args, from_snapshot=False):
    """Recover the service, submit all but one recovery share, kill the
    primary, and check recovery completes after the last share is submitted to
    the newly elected primary."""
    old_primary, _ = network.find_primary()

    snapshot_dir = None
    if from_snapshot:
        snapshot_dir = network.get_committed_snapshots(old_primary)

    current_ledger_dir, committed_ledger_dir = old_primary.get_ledger(
        include_read_only_dirs=True
    )

    recovered_network = infra.network.Network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, network
    )
    recovered_network.start_in_recovery(
        args,
        ledger_dir=current_ledger_dir,
        committed_ledger_dir=committed_ledger_dir,
        snapshot_dir=snapshot_dir,
    )
    primary, _ = recovered_network.find_primary()
    recovered_network.consortium.accept_recovery(primary)

    # Submit all required recovery shares minus one. Last recovery share is
    # submitted after a new primary is found.
    submitted_shares_count = 0
    for m in recovered_network.consortium.get_active_members():
        with primary.client() as nc:
            if (
                submitted_shares_count
                >= recovered_network.consortium.recovery_threshold - 1
            ):
                last_member_to_submit = m
                break
            check_commit = infra.checker.Checker(nc)
            check_commit(m.get_and_submit_recovery_share(primary))
            submitted_shares_count += 1

    LOG.info(
        f"Shutting down node {primary.node_id} before submitting last recovery share"
    )
    primary.stop()
    new_primary, _ = recovered_network.wait_for_new_primary(primary.node_id)
    assert (
        new_primary is not primary
    ), f"Primary {primary.node_id} should have changed after election"

    last_member_to_submit.get_and_submit_recovery_share(new_primary)

    for node in recovered_network.get_joined_nodes():
        recovered_network.wait_for_state(
            node, "partOfNetwork", timeout=args.ledger_recovery_timeout
        )

    recovered_network.consortium.check_for_service(
        new_primary,
        infra.network.ServiceStatus.OPEN,
    )
    return recovered_network
def run(args):
    """Kill primaries one at a time, committing a transaction before each
    kill, until the network can no longer elect a primary — and assert that
    this happens exactly when expected."""
    # Three nodes minimum to make sure that the raft network can still make progress
    # if one node stops
    hosts = ["localhost"] * (4 if args.consensus == "bft" else 3)

    with infra.network.network(
        hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        check = infra.checker.Checker()
        network.start_and_join(args)
        current_view = None

        # Number of nodes F to stop until network cannot make progress
        nodes_to_stop = math.ceil(len(hosts) / 2)
        if args.consensus == "bft":
            nodes_to_stop = math.ceil(len(hosts) / 3)

        primary_is_known = True
        for node_to_stop in range(nodes_to_stop):
            # Note that for the first iteration, the primary is known in advance anyway
            LOG.debug("Find freshly elected primary")
            # After a view change in pbft, finding the new primary takes longer
            primary, current_view = network.find_primary(
                timeout=(30 if args.consensus == "bft" else 3)
            )
            LOG.debug(
                "Commit new transactions, primary:{}, current_view:{}".format(
                    primary.node_id, current_view
                )
            )
            with primary.client("user0") as c:
                res = c.post(
                    "/app/log/private",
                    {
                        "id": current_view,
                        "msg": "This log is committed in view {}".format(
                            current_view
                        ),
                    },
                )
                check(res, result=True)
                seqno = res.seqno
                LOG.debug("Waiting for transaction to be committed by all nodes")
                wait_for_seqno_to_commit(
                    seqno, current_view, network.get_joined_nodes()
                )
            try:
                test_kill_primary(network, args)
            except PrimaryNotFound:
                # Only the final kill is allowed to leave us without a primary.
                if node_to_stop < nodes_to_stop - 1:
                    raise
                primary_is_known = False
        assert not primary_is_known, "Primary is still known"
    LOG.success("Test ended successfully.")