def test_add_as_many_pending_nodes(network, args):
    # Killing pending nodes should not change the raft consensus rules
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )
    new_nodes = []
    for _ in range(number_new_nodes):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_nodes.append(new_node)

    for new_node in new_nodes:
        new_node.stop()

    # Even though pending nodes (half the number of nodes) are stopped,
    # service can still make progress
    check_can_progress(primary)

    # Cleanup killed pending nodes
    for new_node in new_nodes:
        network.retire_node(primary, new_node)
    wait_for_reconfiguration_to_complete(network)
    return network

def test_retire_backup(network, args):
    primary, _ = network.find_primary()
    backup_to_retire = network.find_any_backup()
    network.consortium.retire_node(primary, backup_to_retire)
    backup_to_retire.stop()
    check_can_progress(primary)
    return network

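# NOTE: check_can_progress is used throughout this file but defined in the test
# infra. The sketch below is a hypothetical, minimal reconstruction: it assumes
# the node client API and /app/log/private endpoint used elsewhere in this file,
# and that write responses carry view/seqno fields. The real helper may differ.
import time


def check_can_progress(node, timeout=3):
    # Submit a write, then wait for the node's commit seqno to advance past it
    with node.client("user0") as uc:
        resp = uc.post("/app/log/private", {"id": 42, "msg": "Hello world"})
    end_time = time.time() + timeout
    while time.time() < end_time:
        with node.client() as c:
            commit_tx = c.get("/node/commit").body.json()["transaction_id"]
        if int(commit_tx.split(".")[1]) >= resp.seqno:
            return resp
        time.sleep(0.1)
    assert False, f"No progress past {resp.view}.{resp.seqno} within {timeout}s"
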
def test_add_as_many_pending_nodes(network, args):
    # Should not change the raft consensus rules (i.e. majority)
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )
    new_nodes = []
    for _ in range(number_new_nodes):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_nodes.append(new_node)

    check_can_progress(primary)

    for new_node in new_nodes:
        network.retire_node(primary, new_node)
    wait_for_reconfiguration_to_complete(network)

    # Stop the retired nodes so they don't linger in the background and interfere
    # with subsequent tests
    for new_node in new_nodes:
        new_node.stop()

    return network

def test_node_replacement(network, args):
    primary, backups = network.find_nodes()
    node_to_replace = backups[-1]
    LOG.info(f"Retiring node {node_to_replace.local_node_id}")
    network.retire_node(primary, node_to_replace)
    node_to_replace.stop()
    check_can_progress(primary)

    LOG.info("Adding one node on same address as retired node")
    replacement_node = network.create_node(
        f"local://{node_to_replace.rpc_host}:{node_to_replace.rpc_port}",
        node_port=node_to_replace.node_port,
    )
    network.join_node(replacement_node, args.package, args, from_snapshot=False)
    network.trust_node(replacement_node, args)

    assert replacement_node.node_id != node_to_replace.node_id
    assert replacement_node.rpc_host == node_to_replace.rpc_host
    assert replacement_node.node_port == node_to_replace.node_port
    assert replacement_node.rpc_port == node_to_replace.rpc_port

    allowed_to_suspend_count = network.get_f() - len(network.get_stopped_nodes())
    backups_to_suspend = backups[:allowed_to_suspend_count]
    LOG.info(
        f"Suspending {len(backups_to_suspend)} other nodes to make progress depend on the replacement"
    )
    for other_backup in backups_to_suspend:
        other_backup.suspend()
    # Confirm the network can make progress
    check_can_progress(primary)
    for other_backup in backups_to_suspend:
        other_backup.resume()

    return network

def test_retire_backup(network, args):
    primary, _ = network.find_primary()
    backup_to_retire = network.find_any_backup()
    network.retire_node(primary, backup_to_retire)
    backup_to_retire.stop()
    check_can_progress(primary)
    wait_for_reconfiguration_to_complete(network)
    return network

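# wait_for_reconfiguration_to_complete is likewise an infra helper; a minimal
# sketch, assuming /node/consensus reports the set of active Raft configurations
# under details["configs"] (the real helper may inspect more state). Once every
# joined node reports a single active configuration, the reconfiguration is done.
def wait_for_reconfiguration_to_complete(network, timeout=10):
    end_time = time.time() + timeout
    while time.time() < end_time:
        done = True
        for node in network.get_joined_nodes():
            with node.client() as c:
                details = c.get("/node/consensus").body.json()["details"]
            # More than one active config means a reconfiguration is still in flight
            if len(details["configs"]) > 1:
                done = False
                break
        if done:
            return
        time.sleep(0.5)
    assert False, "Reconfiguration did not complete in time"
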
def test_suspend_primary(network, args):
    primary, _ = network.find_primary()
    primary.suspend()
    new_primary, _ = network.wait_for_new_primary(primary)
    check_can_progress(new_primary)
    primary.resume()
    check_can_progress(new_primary)
    return network

def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)
    primary, backup = network.find_primary_and_any_backup()
    network.consortium.retire_node(primary, primary)
    network.wait_for_new_primary(primary)
    check_can_progress(backup)
    network.nodes.remove(primary)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    return network

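# node_configs and count_nodes, used by test_retire_primary, are sketched here
# under the assumption that each node exposes its view of the membership via
# /node/network/nodes; the real helpers may be more defensive. node_configs
# gathers each reachable node's view, and count_nodes checks that they agree
# and returns the common size.
def node_configs(network):
    configs = {}
    for node in network.get_joined_nodes():
        try:
            with node.client() as c:
                configs[node.node_id] = c.get("/node/network/nodes").body.json()[
                    "nodes"
                ]
        except Exception:
            # A node that is mid-retirement may already be unreachable; skip it
            pass
    return configs


def count_nodes(configs, network):
    counts = set(len(nodes) for nodes in configs.values())
    assert len(counts) == 1, f"Nodes disagree on membership size: {configs}"
    return counts.pop()
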
def test_new_joiner_helps_liveness(network, args):
    primary, backups = network.find_nodes()

    # Issue some transactions, so there is a ledger history that a new node must receive
    network.txs.issue(network, number_txs=10)

    # Remove a node, leaving the network frail
    network.retire_node(primary, backups[-1])
    backups[-1].stop()

    primary, backups = network.find_nodes()

    with contextlib.ExitStack() as stack:
        # Add a new node, but partition them before trusting them
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_joiner_partition = [new_node]
        new_joiner_rules = stack.enter_context(
            network.partitioner.partition([primary, *backups], new_joiner_partition)
        )

        # Trust the new node, and wait for commit of this (but don't ask the new node itself, which doesn't know this yet)
        network.trust_node(new_node, args, no_wait=True)
        check_can_progress(primary)

        # Partition the primary, temporarily creating a minority service that cannot make progress
        minority_partition = backups[len(backups) // 2 :] + new_joiner_partition
        minority_rules = stack.enter_context(
            network.partitioner.partition(minority_partition)
        )

        # This is an unusual situation, where we've actually produced a dead partitioned node.
        # Initially any write requests will timeout (failed attempt at forwarding), and then
        # the node transitions to a candidate with nobody to talk to. Rather than trying to
        # catch the errors of these states quickly, we just sleep until the latter state is
        # reached, and then confirm it was reached.
        time.sleep(network.observed_election_duration)
        with backups[0].client("user0") as c:
            r = c.post("/app/log/private", {"id": 42, "msg": "Hello world"})
            assert r.status_code == http.HTTPStatus.SERVICE_UNAVAILABLE

        # Restore the new node to the service
        new_joiner_rules.drop()

        # Confirm that the new node catches up, and progress can be made in this majority partition
        network.wait_for_new_primary(primary, minority_partition)
        check_can_progress(new_node)

        # Explicitly drop rules before continuing
        minority_rules.drop()

        network.wait_for_primary_unanimity()
        primary, _ = network.find_nodes()
        network.wait_for_all_nodes_to_commit(primary=primary)

def test_retire_backup(network, args):
    primary, _ = network.find_primary()
    backup_to_retire = network.find_any_backup()
    network.retire_node(primary, backup_to_retire)
    network.wait_for_node_in_store(
        primary,
        backup_to_retire.node_id,
        node_status=None,
        timeout=3,
    )
    backup_to_retire.stop()
    check_can_progress(primary)
    wait_for_reconfiguration_to_complete(network)
    return network

def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)
    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    network.wait_for_new_primary(primary, nodes=[backup])
    check_can_progress(backup)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network

def test_learner_does_not_take_part(network, args):
    primary, backups = network.find_nodes()
    f_backups = backups[: network.get_f() + 1]

    new_node = network.create_node("local://localhost")
    network.join_node(new_node, args.package, args, from_snapshot=False)

    with network.partitioner.partition(f_backups):
        check_does_not_progress(primary, timeout=5)

        try:
            network.consortium.trust_node(
                primary,
                new_node.node_id,
                timeout=ceil(args.join_timer * 2 / 1000),
                valid_from=str(infra.crypto.datetime_to_X509time(datetime.now())),
            )
            new_node.wait_for_node_to_join(timeout=ceil(args.join_timer * 2 / 1000))
            join_failed = False
        except Exception:
            join_failed = True

        if not join_failed:
            raise Exception("join succeeded unexpectedly")

        with new_node.client(self_signed_ok=True) as c:
            r = c.get("/node/network/nodes/self")
            assert r.body.json()["status"] == "Learner"
            r = c.get("/node/consensus")
            assert new_node.node_id in r.body.json()["details"]["learners"]

        # New node joins, but cannot be promoted to TRUSTED without f other backups
        check_does_not_progress(primary, timeout=5)

        with new_node.client(self_signed_ok=True) as c:
            r = c.get("/node/network/nodes/self")
            assert r.body.json()["status"] == "Learner"
            r = c.get("/node/consensus")
            assert new_node.node_id in r.body.json()["details"]["learners"]

    network.wait_for_primary_unanimity()
    primary, _ = network.find_nodes()
    network.wait_for_all_nodes_to_commit(primary=primary)
    check_can_progress(primary)

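# check_does_not_progress, used above, is the dual of check_can_progress: submit
# a write and assert that commit does NOT reach it within the timeout, returning
# the response so callers can later verify the write was rolled back. Same
# assumptions as the check_can_progress sketch above; hypothetical.
def check_does_not_progress(node, timeout=3):
    with node.client("user0") as uc:
        resp = uc.post("/app/log/private", {"id": 42, "msg": "Hello world"})
    end_time = time.time() + timeout
    while time.time() < end_time:
        with node.client() as c:
            commit_tx = c.get("/node/commit").body.json()["transaction_id"]
        # Commit must stay strictly behind the submitted write's seqno
        assert int(commit_tx.split(".")[1]) < resp.seqno, f"Unexpected progress to {commit_tx}"
        time.sleep(0.1)
    return resp
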
def test_isolate_and_reconnect_primary(network, args, **kwargs):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        lost_tx_resp = check_does_not_progress(primary)

        new_primary, _ = network.wait_for_new_primary(
            primary, nodes=backups, timeout_multiplier=6
        )
        new_tx_resp = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        try:
            # There will be at least one full election cycle for nothing, where the
            # re-joining node fails to get elected but causes others to rev up their
            # term. After that, a successful election needs to take place, and we
            # arbitrarily allow 3 time periods to avoid being too brittle when
            # raft timeouts line up badly.
            c.wait_for_commit(new_tx_resp, timeout=(network.election_duration * 4))
        except TimeoutError:
            details = c.get("/node/consensus").body.json()
            assert (
                False
            ), f"Stuck before {new_tx_resp.view}.{new_tx_resp.seqno}: {pprint.pformat(details)}"

        # Check it has dropped anything submitted while partitioned
        r = c.get(f"/node/tx?transaction_id={lost_tx_resp.view}.{lost_tx_resp.seqno}")
        status = TxStatus(r.body.json()["status"])
        assert status == TxStatus.Invalid, r

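# TxStatus, used above to interpret /node/tx responses, maps the status strings
# the service reports for a transaction ID; sketched here as a plain enum with
# the four states documented for CCF transaction status.
from enum import Enum


class TxStatus(Enum):
    Unknown = "Unknown"
    Pending = "Pending"
    Committed = "Committed"
    Invalid = "Invalid"
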
def test_add_as_many_pending_nodes(network, args):
    # Should not change the raft consensus rules (i.e. majority)
    primary, _ = network.find_primary()
    number_new_nodes = len(network.nodes)
    LOG.info(
        f"Adding {number_new_nodes} pending nodes - consensus rules should not change"
    )
    new_nodes = [
        network.create_and_add_pending_node(
            args.package,
            "local://localhost",
            args,
        )
        for _ in range(number_new_nodes)
    ]

    check_can_progress(primary)

    for new_node in new_nodes:
        network.consortium.retire_node(primary, new_node)
        network.nodes.remove(new_node)
    return network

def test_verify_quotes(network, args):
    if args.enclave_type == "virtual":
        LOG.warning("Skipping quote test with virtual enclave")
        return network

    LOG.info("Check the network is stable")
    primary, _ = network.find_primary()
    check_can_progress(primary)

    for node in network.get_joined_nodes():
        LOG.info(f"Verifying quote for node {node.node_id}")
        cafile = os.path.join(network.common_dir, "service_cert.pem")
        assert (
            infra.proc.ccall(
                "verify_quote.sh",
                f"https://{node.get_public_rpc_host()}:{node.get_public_rpc_port()}",
                "--cacert",
                f"{cafile}",
                log_output=True,
            ).returncode
            == 0
        ), f"Quote verification for node {node.node_id} failed"

    return network

def test_node_replacement(network, args):
    primary, backups = network.find_nodes()

    nodes = network.get_joined_nodes()
    node_to_replace = backups[-1]
    f = infra.e2e_args.max_f(args, len(nodes))
    f_backups = backups[:f]

    # Retire one node
    network.consortium.retire_node(primary, node_to_replace)
    node_to_replace.stop()
    network.nodes.remove(node_to_replace)
    check_can_progress(primary)

    # Add in a node using the same address
    replacement_node = network.create_and_trust_node(
        args.package,
        f"local://{node_to_replace.host}:{node_to_replace.rpc_port}",
        args,
        node_port=node_to_replace.node_port,
        from_snapshot=False,
    )
    assert replacement_node.node_id != node_to_replace.node_id
    assert replacement_node.host == node_to_replace.host
    assert replacement_node.node_port == node_to_replace.node_port
    assert replacement_node.rpc_port == node_to_replace.rpc_port

    LOG.info(
        f"Suspending {len(f_backups)} other nodes to make progress depend on the replacement"
    )
    for other_backup in f_backups:
        other_backup.suspend()
    # Confirm the network can make progress
    check_can_progress(primary)
    for other_backup in f_backups:
        other_backup.resume()

    return network

def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)
    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    new_primary, _ = network.wait_for_new_primary(primary, nodes=[backup])
    # The old primary should automatically be removed from the store
    # once a new primary is elected
    network.wait_for_node_in_store(
        new_primary,
        primary.node_id,
        node_status=None,
        timeout=3,
    )
    check_can_progress(backup)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network

def test_isolate_and_reconnect_primary(network, args):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        new_primary, _ = network.wait_for_new_primary(
            primary, nodes=backups, timeout_multiplier=6
        )
        new_tx = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        timeout = 5
        end_time = time.time() + timeout
        while time.time() < end_time:
            # Re-fetch the commit point each iteration so the assertion message
            # below reports the last observed state, not the first
            r = c.get("/node/commit")
            current_tx = TxID.from_str(r.body.json()["transaction_id"])
            if current_tx.seqno >= new_tx.seqno:
                return network
            time.sleep(0.1)
        assert False, f"Stuck at {r}"

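# TxID.from_str, used above, parses the "view.seqno" transaction IDs returned by
# endpoints such as /node/commit (the same format interpolated into the /node/tx
# query in test_isolate_and_reconnect_primary). A minimal sketch:
from dataclasses import dataclass


@dataclass
class TxID:
    view: int
    seqno: int

    @staticmethod
    def from_str(s):
        view, seqno = s.split(".")
        return TxID(int(view), int(seqno))
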
def test_learner_does_not_take_part(network, args):
    primary, backups = network.find_nodes()
    f_backups = backups[: network.get_f() + 1]

    # Note: host is supplied explicitly to avoid having differently
    # assigned IPs for the interfaces, something which the test infra doesn't
    # support widely yet.
    operator_rpc_interface = "operator_rpc_interface"
    host = infra.net.expand_localhost()
    new_node = network.create_node(
        infra.interfaces.HostSpec(
            rpc_interfaces={
                infra.interfaces.PRIMARY_RPC_INTERFACE: infra.interfaces.RPCInterface(
                    host=host
                ),
                operator_rpc_interface: infra.interfaces.RPCInterface(
                    host=host,
                    endorsement=infra.interfaces.Endorsement(
                        authority=infra.interfaces.EndorsementAuthority.Node
                    ),
                ),
            }
        )
    )
    network.join_node(new_node, args.package, args, from_snapshot=False)

    LOG.info("Wait for all nodes to have committed join of new pending node")
    network.wait_for_all_nodes_to_commit(primary=primary)

    # Here, we partition a majority of backups. This is very intentional so that
    # the new learner node is not promoted to trusted while the partition is up.
    # However, this means that the isolated majority of backups can (and will)
    # elect one of them as new primary while the partition is up. When the partition
    # is lifted, all the transactions executed on the primary node (including
    # trusting the new node) will be rolled back. Because of this, we issue a new
    # trust_node proposal to make sure the new node ends up being trusted and joins
    # successfully.
    with network.partitioner.partition(f_backups):
        check_does_not_progress(primary, timeout=5)

        try:
            network.consortium.trust_node(
                primary,
                new_node.node_id,
                timeout=ceil(args.join_timer_s * 2),
                valid_from=datetime.now(),
            )
        except TimeoutError:
            LOG.info("Trust node proposal did not commit as expected")
        else:
            raise Exception("Trust node proposal committed unexpectedly")

        check_does_not_progress(primary, timeout=5)

        LOG.info("Majority partition can make progress")
        partition_primary, _ = network.wait_for_new_primary(primary, nodes=f_backups)
        check_can_progress(partition_primary)

        LOG.info("New joiner is not promoted to Trusted without f other backups")
        with new_node.client(
            interface_name=operator_rpc_interface, verify_ca=False
        ) as c:
            r = c.get("/node/network/nodes/self")
            assert r.body.json()["status"] == "Learner"
            r = c.get("/node/consensus")
            assert new_node.node_id in r.body.json()["details"]["learners"]

    LOG.info("Partition is lifted, wait for primary unanimity on original nodes")
    # Note: Because trusting the new node failed, the new node is not considered
    # in the primary unanimity. Indeed, its transition to Trusted may have been rolled back.
    primary = network.wait_for_primary_unanimity()
    network.wait_for_all_nodes_to_commit(primary=primary)

    LOG.info("Trust new joiner again")
    network.trust_node(new_node, args)

    check_can_progress(primary)
    check_can_progress(new_node)

def test_update_all_nodes(network, args):
    replacement_package = get_replacement_package(args)

    primary, _ = network.find_nodes()

    first_code_id = infra.utils.get_code_id(
        args.enclave_type, args.oe_binary, args.package
    )
    new_code_id = infra.utils.get_code_id(
        args.enclave_type, args.oe_binary, replacement_package
    )

    if args.enclave_type == "virtual":
        # Pretend this was already present
        network.consortium.add_new_code(primary, first_code_id)

    LOG.info("Add new code id")
    network.consortium.add_new_code(primary, new_code_id)

    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": first_code_id, "status": "AllowedToJoin"},
                {"digest": new_code_id, "status": "AllowedToJoin"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    LOG.info("Remove old code id")
    network.consortium.retire_code(primary, first_code_id)

    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": new_code_id, "status": "AllowedToJoin"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    old_nodes = network.nodes.copy()

    LOG.info("Start fresh nodes running new code")
    for _ in range(0, len(old_nodes)):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, replacement_package, args)
        network.trust_node(new_node, args)

    LOG.info("Retire original nodes running old code")
    for node in old_nodes:
        primary, _ = network.find_nodes()
        network.retire_node(primary, node)
        # Elections take (much) longer than a backup removal which is just
        # a commit, so we need to adjust our timeout accordingly, hence this branch
        if node.node_id == primary.node_id:
            new_primary, _ = network.wait_for_new_primary(primary)
            primary = new_primary
        node.stop()

    LOG.info("Check the network is still functional")
    check_can_progress(new_node)
    return network

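# get_replacement_package, used by test_update_all_nodes above, only needs to
# name a second, distinct application package so that the code upgrade is
# observable. A plausible sketch - the concrete package names below are
# assumptions, not taken from this file:
def get_replacement_package(args):
    # Swap between two application packages, whichever is not currently in use
    return (
        "samples/apps/logging/liblogging"
        if args.package == "libjs_generic"
        else "libjs_generic"
    )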