def test_node_replacement(network, args):
    primary, backups = network.find_nodes()

    node_to_replace = backups[-1]
    LOG.info(f"Retiring node {node_to_replace.local_node_id}")
    network.retire_node(primary, node_to_replace)
    node_to_replace.stop()
    check_can_progress(primary)

    LOG.info("Adding one node on same address as retired node")
    replacement_node = network.create_node(
        f"local://{node_to_replace.rpc_host}:{node_to_replace.rpc_port}",
        node_port=node_to_replace.node_port,
    )
    network.join_node(replacement_node, args.package, args, from_snapshot=False)
    network.trust_node(replacement_node, args)

    assert replacement_node.node_id != node_to_replace.node_id
    assert replacement_node.rpc_host == node_to_replace.rpc_host
    assert replacement_node.node_port == node_to_replace.node_port
    assert replacement_node.rpc_port == node_to_replace.rpc_port

    allowed_to_suspend_count = network.get_f() - len(network.get_stopped_nodes())
    backups_to_suspend = backups[:allowed_to_suspend_count]
    LOG.info(
        f"Suspending {len(backups_to_suspend)} other nodes to make progress depend on the replacement"
    )
    for other_backup in backups_to_suspend:
        other_backup.suspend()
    # Confirm the network can make progress
    check_can_progress(primary)
    for other_backup in backups_to_suspend:
        other_backup.resume()

    return network
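# For reference: a minimal sketch of the liveness probe that check_can_progress
# (used throughout these tests) performs. Illustrative only - it assumes the
# sample logging app's /app/log/private endpoint and the "view.seqno" format
# returned by /node/commit; the real infra helper may differ in detail.
def check_can_progress_sketch(node, timeout=3):
    def commit_seqno(client):
        tx_id = client.get("/node/commit").body.json()["transaction_id"]
        return int(tx_id.split(".")[1])

    with node.client() as c:
        seqno_before = commit_seqno(c)
        # Issue a write, then wait for the commit index to advance past it
        with node.client("user0") as uc:
            uc.post("/app/log/private", {"id": 42, "msg": "progress probe"})
        end_time = time.time() + timeout
        while time.time() < end_time:
            if commit_seqno(c) > seqno_before:
                return
            time.sleep(0.1)
        raise TimeoutError(f"Node {node.local_node_id} cannot make progress")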
def test_add_as_many_pending_nodes(network, args):
    # Killing pending nodes should not change the raft consensus rules
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )

    new_nodes = []
    for _ in range(number_new_nodes):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_nodes.append(new_node)

    for new_node in new_nodes:
        new_node.stop()

    # Even though pending nodes (half the number of nodes) are stopped,
    # the service can still make progress
    check_can_progress(primary)

    # Clean up the killed pending nodes
    for new_node in new_nodes:
        network.retire_node(primary, new_node)

    wait_for_reconfiguration_to_complete(network)
    return network
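# Worked example (illustrative) of why stopping the pending nodes above is
# safe: Raft counts quorum over trusted nodes only, so with n trusted nodes
# the majority is n // 2 + 1 regardless of how many pending nodes exist or die.
def raft_majority_sketch(n_trusted):
    return n_trusted // 2 + 1

# e.g. 3 trusted nodes + 3 stopped pending nodes still only need 2 acks
assert raft_majority_sketch(3) == 2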
def test_add_as_many_pending_nodes(network, args):
    # Should not change the raft consensus rules (i.e. majority)
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )

    new_nodes = []
    for _ in range(number_new_nodes):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_nodes.append(new_node)

    check_can_progress(primary)

    for new_node in new_nodes:
        network.retire_node(primary, new_node)

    wait_for_reconfiguration_to_complete(network)

    # Stop the retired nodes so they don't linger in the background and interfere
    # with subsequent tests
    for new_node in new_nodes:
        new_node.stop()

    return network
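# For reference: a hedged sketch of the check behind
# wait_for_reconfiguration_to_complete - poll each joined node until it
# reports a single active Raft configuration, i.e. no joint/pending
# configurations remain. The /node/consensus response shape used here is an
# assumption for illustration; the real helper may inspect different fields.
def wait_for_reconfiguration_to_complete_sketch(network, timeout=10):
    end_time = time.time() + timeout
    while time.time() < end_time:
        settled = True
        for node in network.get_joined_nodes():
            with node.client() as c:
                details = c.get("/node/consensus").body.json()["details"]
                # More than one configuration means a reconfiguration is
                # still in flight on this node
                if len(details.get("configs", [])) > 1:
                    settled = False
        if settled:
            return
        time.sleep(0.5)
    raise TimeoutError("Reconfiguration did not complete in time")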
def test_retire_backup(network, args):
    primary, _ = network.find_primary()
    backup_to_retire = network.find_any_backup()
    network.retire_node(primary, backup_to_retire)
    backup_to_retire.stop()
    check_can_progress(primary)
    wait_for_reconfiguration_to_complete(network)
    return network
def test_new_joiner_helps_liveness(network, args):
    primary, backups = network.find_nodes()

    # Issue some transactions, so there is a ledger history that a new node must receive
    network.txs.issue(network, number_txs=10)

    # Remove a node, leaving the network frail
    network.retire_node(primary, backups[-1])
    backups[-1].stop()

    primary, backups = network.find_nodes()

    with contextlib.ExitStack() as stack:
        # Add a new node, but partition them before trusting them
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_joiner_partition = [new_node]
        new_joiner_rules = stack.enter_context(
            network.partitioner.partition([primary, *backups], new_joiner_partition)
        )

        # Trust the new node, and wait for commit of this (but don't ask the
        # new node itself, which doesn't know this yet)
        network.trust_node(new_node, args, no_wait=True)
        check_can_progress(primary)

        # Partition the primary, temporarily creating a minority service that
        # cannot make progress
        minority_partition = backups[len(backups) // 2 :] + new_joiner_partition
        minority_rules = stack.enter_context(
            network.partitioner.partition(minority_partition)
        )

        # This is an unusual situation, where we've actually produced a dead
        # partitioned node. Initially any write requests will time out (failed
        # attempt at forwarding), and then the node transitions to a candidate
        # with nobody to talk to. Rather than trying to catch the errors of
        # these states quickly, we just sleep until the latter state is
        # reached, and then confirm it was reached.
        time.sleep(network.observed_election_duration)
        with backups[0].client("user0") as c:
            r = c.post("/app/log/private", {"id": 42, "msg": "Hello world"})
            assert r.status_code == http.HTTPStatus.SERVICE_UNAVAILABLE

        # Restore the new node to the service
        new_joiner_rules.drop()

        # Confirm that the new node catches up, and progress can be made in
        # this majority partition
        network.wait_for_new_primary(primary, minority_partition)
        check_can_progress(new_node)

        # Explicitly drop rules before continuing
        minority_rules.drop()

        network.wait_for_primary_unanimity()
        primary, _ = network.find_nodes()
        network.wait_for_all_nodes_to_commit(primary=primary)
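# For reference: a hedged sketch of the agreement check behind
# wait_for_primary_unanimity above - poll every joined node's reported primary
# until they all name the same one. The /node/network endpoint and its
# "primary_id" field are assumptions for illustration; the real helper also
# tolerates elections that are still settling.
def wait_for_primary_unanimity_sketch(network, timeout=10):
    end_time = time.time() + timeout
    while time.time() < end_time:
        primary_ids = set()
        for node in network.get_joined_nodes():
            with node.client() as c:
                primary_ids.add(c.get("/node/network").body.json().get("primary_id"))
        if len(primary_ids) == 1 and None not in primary_ids:
            return
        time.sleep(0.5)
    raise TimeoutError("Nodes did not agree on a single primary")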
def test_retire_backup(network, args):
    primary, _ = network.find_primary()
    backup_to_retire = network.find_any_backup()
    network.retire_node(primary, backup_to_retire)
    network.wait_for_node_in_store(
        primary,
        backup_to_retire.node_id,
        node_status=None,
        timeout=3,
    )
    backup_to_retire.stop()
    check_can_progress(primary)
    wait_for_reconfiguration_to_complete(network)
    return network
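# For reference: wait_for_node_in_store with node_status=None (as above) waits
# until the retired node's entry has been removed from the service's node
# store entirely. A hedged sketch of such a poll, assuming the
# /node/network/nodes/{node_id} endpoint returns 404 once the entry is gone:
def wait_for_node_removed_sketch(remote_node, node_id, timeout=3):
    end_time = time.time() + timeout
    while time.time() < end_time:
        with remote_node.client() as c:
            r = c.get(f"/node/network/nodes/{node_id}")
            if r.status_code == http.HTTPStatus.NOT_FOUND:
                return
        time.sleep(0.1)
    raise TimeoutError(f"Node {node_id} was not removed from the store")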
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    network.wait_for_new_primary(primary, nodes=[backup])
    check_can_progress(backup)

    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1

    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network
def test_new_service(
    network,
    args,
    install_path,
    binary_dir,
    library_dir,
    version,
    cycle_existing_nodes=False,
):
    LOG.info("Update constitution")
    primary, _ = network.find_primary()
    new_constitution = get_new_constitution_for_install(args, install_path)
    network.consortium.set_constitution(primary, new_constitution)

    # Note: Changes to constitution between versions should be tested here

    LOG.info(f"Add node to new service [cycle nodes: {cycle_existing_nodes}]")
    nodes_to_cycle = network.get_joined_nodes() if cycle_existing_nodes else []
    nodes_to_add_count = len(nodes_to_cycle) if cycle_existing_nodes else 1

    for _ in range(0, nodes_to_add_count):
        new_node = network.create_node(
            "local://localhost",
            binary_dir=binary_dir,
            library_dir=library_dir,
            version=version,
        )
        network.join_node(new_node, args.package, args)
        network.trust_node(new_node, args)
        new_node.verify_certificate_validity_period(
            expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS
        )

    for node in nodes_to_cycle:
        network.retire_node(primary, node)
        if primary == node:
            primary, _ = network.wait_for_new_primary(primary)
        node.stop()

    test_all_nodes_cert_renewal(network, args)

    LOG.info("Apply transactions to new nodes only")
    issue_activity_on_live_service(network, args)
    test_random_receipts(network, args, lts=True)
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    new_primary, _ = network.wait_for_new_primary(primary, nodes=[backup])
    # The old primary should automatically be removed from the store
    # once a new primary is elected
    network.wait_for_node_in_store(
        new_primary,
        primary.node_id,
        node_status=None,
        timeout=3,
    )
    check_can_progress(backup)

    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1

    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network
def test_new_service(
    network,
    args,
    install_path,
    binary_dir,
    library_dir,
    version,
    cycle_existing_nodes=False,
):
    LOG.info("Update constitution")
    primary, _ = network.find_primary()
    new_constitution = get_new_constitution_for_install(args, install_path)
    network.consortium.set_constitution(primary, new_constitution)

    all_nodes = network.get_joined_nodes()

    # Note: Changes to constitution between versions should be tested here

    LOG.info(f"Add node to new service [cycle nodes: {cycle_existing_nodes}]")
    nodes_to_cycle = network.get_joined_nodes() if cycle_existing_nodes else []
    nodes_to_add_count = len(nodes_to_cycle) if cycle_existing_nodes else 1

    # Pre-2.0 nodes require X509 time format
    valid_from = str(infra.crypto.datetime_to_X509time(datetime.datetime.now()))

    for _ in range(0, nodes_to_add_count):
        new_node = network.create_node(
            "local://localhost",
            binary_dir=binary_dir,
            library_dir=library_dir,
            version=version,
        )
        network.join_node(new_node, args.package, args)
        network.trust_node(
            new_node,
            args,
            valid_from=valid_from,
        )
        new_node.verify_certificate_validity_period(
            expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS
        )
        all_nodes.append(new_node)

    for node in nodes_to_cycle:
        network.retire_node(primary, node)
        if primary == node:
            primary, _ = network.wait_for_new_primary(primary)
        node.stop()

    test_all_nodes_cert_renewal(network, args, valid_from=valid_from)
    test_service_cert_renewal(network, args, valid_from=valid_from)

    LOG.info("Waiting for retired nodes to be automatically removed")
    for node in all_nodes:
        network.wait_for_node_in_store(
            primary,
            node.node_id,
            node_status=ccf.ledger.NodeStatus.TRUSTED if node.is_joined() else None,
        )

    if args.check_2tx_reconfig_migration:
        test_migration_2tx_reconfiguration(
            network,
            args,
            initial_is_1tx=False,  # Reconfiguration type added in 2.x
            binary_dir=binary_dir,
            library_dir=library_dir,
            version=version,
            valid_from=valid_from,
        )

    LOG.info("Apply transactions to new nodes only")
    issue_activity_on_live_service(network, args)
    test_random_receipts(network, args, lts=True)
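# For reference: a self-contained sketch of the check performed by
# verify_certificate_validity_period above, assuming a PEM-encoded node
# certificate and the standard PyCA "cryptography" package. The real infra
# helper may additionally compare the expected "valid from" date.
def validity_period_days_sketch(cert_pem):
    from cryptography import x509

    cert = x509.load_pem_x509_certificate(cert_pem.encode())
    return (cert.not_valid_after - cert.not_valid_before).days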
def run_code_upgrade_from(
    args,
    from_install_path,
    to_install_path,
    from_version=None,
    to_version=None,
    from_container_image=None,
):
    from_binary_dir, from_library_dir = get_bin_and_lib_dirs_for_install_path(
        from_install_path
    )
    to_binary_dir, to_library_dir = get_bin_and_lib_dirs_for_install_path(
        to_install_path
    )

    set_js_args(args, from_install_path, to_install_path)

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
    )
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        with infra.network.network(
            args.nodes,
            binary_directory=from_binary_dir,
            library_directory=from_library_dir,
            pdb=args.pdb,
            txs=txs,
            jwt_issuer=jwt_issuer,
            version=from_version,
        ) as network:
            network.start_and_open(args, node_container_image=from_container_image)

            old_nodes = network.get_joined_nodes()
            primary, _ = network.find_primary()

            LOG.info("Apply transactions to old service")
            issue_activity_on_live_service(network, args)

            new_code_id = infra.utils.get_code_id(
                args.enclave_type,
                args.oe_binary,
                args.package,
                library_dir=to_library_dir,
            )
            network.consortium.add_new_code(primary, new_code_id)

            # Note: alternate between joining from snapshot and replaying entire ledger
            new_nodes = []
            from_snapshot = True
            for _ in range(0, len(old_nodes)):
                new_node = network.create_node(
                    "local://localhost",
                    binary_dir=to_binary_dir,
                    library_dir=to_library_dir,
                    version=to_version,
                )
                network.join_node(
                    new_node, args.package, args, from_snapshot=from_snapshot
                )
                network.trust_node(
                    new_node,
                    args,
                    # Pre-2.0 nodes require X509 time format
                    valid_from=str(
                        infra.crypto.datetime_to_X509time(datetime.datetime.now())
                    ),
                )
                # For 2.x nodes joining a 1.x service before the constitution is updated,
                # the node certificate validity period is set by the joining node itself
                # as [node startup time, node startup time + 365 days]
                new_node.verify_certificate_validity_period(
                    expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS,
                    ignore_proposal_valid_from=True,
                )
                from_snapshot = not from_snapshot
                new_nodes.append(new_node)

            # Verify that all nodes run the expected CCF version
            for node in network.get_joined_nodes():
                # Note: /node/version endpoint was added in 2.x
                if not node.major_version or node.major_version > 1:
                    with node.client() as c:
                        r = c.get("/node/version")
                        expected_version = node.version or args.ccf_version
                        version = r.body.json()["ccf_version"]
                        assert (
                            version == expected_version
                        ), f"For node {node.local_node_id}, expect version {expected_version}, got {version}"

            LOG.info("Apply transactions to hybrid network, with primary as old node")
            issue_activity_on_live_service(network, args)

            old_code_id = infra.utils.get_code_id(
                args.enclave_type,
                args.oe_binary,
                args.package,
                library_dir=from_library_dir,
            )
            primary, _ = network.find_primary()
            network.consortium.retire_code(primary, old_code_id)

            for index, node in enumerate(old_nodes):
                network.retire_node(primary, node)
                if primary == node:
                    primary, _ = network.wait_for_new_primary(primary)

                # This block is here to test the transition period from a network
                # that does not support custom claims to one that does. It can be
                # removed after the transition is complete.
                # The new build, being unreleased, doesn't have a version at all
                if not primary.major_version:
                    LOG.info("Upgrade to new JS app")
                    # Upgrade to a version of the app containing an endpoint that
                    # registers custom claims
                    network.consortium.set_js_app_from_dir(
                        primary, args.new_js_app_bundle
                    )

                    LOG.info("Run transaction with additional claim")
                    # With wait_for_sync, the client checks that all nodes, including
                    # the minority of old ones, have acked the transaction
                    msg_idx = network.txs.idx + 1
                    txid = network.txs.issue(
                        network, number_txs=1, record_claim=True, wait_for_sync=True
                    )
                    assert len(network.txs.pub[msg_idx]) == 1
                    claims = network.txs.pub[msg_idx][-1]["msg"]

                    LOG.info("Check receipts are fine, including transaction with claims")
                    test_random_receipts(
                        network,
                        args,
                        lts=True,
                        additional_seqnos={txid.seqno: claims.encode()},
                    )
                    # Also check receipts on an old node
                    if index + 1 < len(old_nodes):
                        next_node = old_nodes[index + 1]
                        test_random_receipts(
                            network,
                            args,
                            lts=True,
                            additional_seqnos={txid.seqno: None},
                            node=next_node,
                        )

                node.stop()

            LOG.info("Service is now made of new nodes only")

            # Rollover JWKS so that new primary must read historical CA bundle table
            # and retrieve new keys via auto refresh
            if not os.getenv("CONTAINER_NODES"):
                jwt_issuer.refresh_keys()
                # Note: /gov/jwt_keys/all endpoint was added in 2.x
                primary, _ = network.find_nodes()
                if not primary.major_version or primary.major_version > 1:
                    jwt_issuer.wait_for_refresh(network)
                else:
                    time.sleep(3)
            else:
                # https://github.com/microsoft/CCF/issues/2608#issuecomment-924785744
                LOG.warning("Skipping JWT refresh as running nodes in container")

            # Code update from 1.x to 2.x requires cycling the freshly-added 2.x nodes
            # once. This is because 2.x nodes will not have an endorsed certificate
            # recorded in the store and thus will not be able to have their certificate
            # refreshed, etc.
            test_new_service(
                network,
                args,
                to_install_path,
                to_binary_dir,
                to_library_dir,
                to_version,
                cycle_existing_nodes=True,
            )

            # Check that the ledger can be parsed
            network.get_latest_ledger_public_state()
def test_update_all_nodes(network, args):
    replacement_package = get_replacement_package(args)

    primary, _ = network.find_nodes()

    first_code_id = infra.utils.get_code_id(
        args.enclave_type, args.oe_binary, args.package
    )
    new_code_id = infra.utils.get_code_id(
        args.enclave_type, args.oe_binary, replacement_package
    )

    if args.enclave_type == "virtual":
        # Pretend this was already present
        network.consortium.add_new_code(primary, first_code_id)

    LOG.info("Add new code id")
    network.consortium.add_new_code(primary, new_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": first_code_id, "status": "AllowedToJoin"},
                {"digest": new_code_id, "status": "AllowedToJoin"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    LOG.info("Remove old code id")
    network.consortium.retire_code(primary, first_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": new_code_id, "status": "AllowedToJoin"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    old_nodes = network.nodes.copy()

    LOG.info("Start fresh nodes running new code")
    for _ in range(0, len(old_nodes)):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, replacement_package, args)
        network.trust_node(new_node, args)

    LOG.info("Retire original nodes running old code")
    for node in old_nodes:
        primary, _ = network.find_nodes()
        network.retire_node(primary, node)
        # Elections take (much) longer than a backup removal, which is just
        # a commit, so we need to adjust our timeout accordingly, hence this branch
        if node.node_id == primary.node_id:
            new_primary, _ = network.wait_for_new_primary(primary)
            primary = new_primary
        node.stop()

    LOG.info("Check the network is still functional")
    check_can_progress(new_node)
    return network
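# For reference: a small helper sketch for checking a single measurement's
# status via the /node/code endpoint used above, grounded in the response
# shape asserted in test_update_all_nodes ({"versions": [{"digest", "status"}, ...]}).
def code_status_sketch(node, digest):
    with node.client() as c:
        versions = c.get("/node/code").body.json()["versions"]
    # Returns e.g. "AllowedToJoin", or None if the digest is not present
    return next((v["status"] for v in versions if v["digest"] == digest), None)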
def run_code_upgrade_from(
    args,
    from_install_path,
    to_install_path,
    from_version=None,
    to_version=None,
):
    from_binary_dir, from_library_dir = get_bin_and_lib_dirs_for_install_path(
        from_install_path
    )
    to_binary_dir, to_library_dir = get_bin_and_lib_dirs_for_install_path(
        to_install_path
    )

    set_js_args(args, from_install_path)

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s
    )
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        with infra.network.network(
            args.nodes,
            binary_directory=from_binary_dir,
            library_directory=from_library_dir,
            pdb=args.pdb,
            txs=txs,
            jwt_issuer=jwt_issuer,
            version=from_version,
        ) as network:
            network.start_and_join(args)

            old_nodes = network.get_joined_nodes()
            primary, _ = network.find_primary()

            LOG.info("Apply transactions to old service")
            issue_activity_on_live_service(network, args)

            new_code_id = infra.utils.get_code_id(
                args.enclave_type,
                args.oe_binary,
                args.package,
                library_dir=to_library_dir,
            )
            network.consortium.add_new_code(primary, new_code_id)

            # Note: alternate between joining from snapshot and replaying entire ledger
            new_nodes = []
            from_snapshot = True
            for _ in range(0, len(old_nodes)):
                new_node = network.create_node(
                    "local://localhost",
                    binary_dir=to_binary_dir,
                    library_dir=to_library_dir,
                    version=to_version,
                )
                network.join_node(
                    new_node, args.package, args, from_snapshot=from_snapshot
                )
                network.trust_node(new_node, args)
                # For 2.x nodes joining a 1.x service before the constitution is updated,
                # the node certificate validity period is set by the joining node itself
                # as [node startup time, node startup time + 365 days]
                new_node.verify_certificate_validity_period(
                    expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS,
                    ignore_proposal_valid_from=True,
                )
                from_snapshot = not from_snapshot
                new_nodes.append(new_node)

            # Verify that all nodes run the expected CCF version
            for node in network.get_joined_nodes():
                # Note: /node/version endpoint was added in 2.x
                if not node.major_version or node.major_version > 1:
                    with node.client() as c:
                        r = c.get("/node/version")
                        expected_version = node.version or args.ccf_version
                        version = r.body.json()["ccf_version"]
                        assert (
                            version == expected_version
                        ), f"For node {node.local_node_id}, expect version {expected_version}, got {version}"

            LOG.info("Apply transactions to hybrid network, with primary as old node")
            issue_activity_on_live_service(network, args)

            old_code_id = infra.utils.get_code_id(
                args.enclave_type,
                args.oe_binary,
                args.package,
                library_dir=from_library_dir,
            )
            primary, _ = network.find_primary()
            network.consortium.retire_code(primary, old_code_id)

            for node in old_nodes:
                network.retire_node(primary, node)
                if primary == node:
                    primary, _ = network.wait_for_new_primary(primary)
                node.stop()

            LOG.info("Service is now made of new nodes only")

            # Rollover JWKS so that new primary must read historical CA bundle table
            # and retrieve new keys via auto refresh
            jwt_issuer.refresh_keys()
            # Note: /gov/jwt_keys/all endpoint was added in 2.x
            primary, _ = network.find_nodes()
            if not primary.major_version or primary.major_version > 1:
                jwt_issuer.wait_for_refresh(network)
            else:
                time.sleep(3)

            # Code update from 1.x to 2.x requires cycling the freshly-added 2.x nodes
            # once. This is because 2.x nodes will not have an endorsed certificate
            # recorded in the store and thus will not be able to have their certificate
            # refreshed, etc.
            test_new_service(
                network,
                args,
                to_install_path,
                to_binary_dir,
                to_library_dir,
                to_version,
                cycle_existing_nodes=True,
            )

            # Check that the ledger can be parsed
            network.get_latest_ledger_public_state()