def run_join_old_snapshot(args):
    txs = app.LoggingTxs()
    nodes = ["local://localhost"]
    with tempfile.TemporaryDirectory() as tmp_dir:
        with infra.network.network(
            nodes,
            args.binary_dir,
            args.debug_nodes,
            args.perf_nodes,
            pdb=args.pdb,
            txs=txs,
        ) as network:
            network.start_and_join(args)
            primary, _ = network.find_primary()

            # First, retrieve and save one committed snapshot
            txs.issue(network, number_txs=args.snapshot_tx_interval)
            old_committed_snapshots = network.get_committed_snapshots(primary)
            copy(
                os.path.join(
                    old_committed_snapshots, os.listdir(old_committed_snapshots)[0]
                ),
                tmp_dir,
            )

            # Then generate another newer snapshot, and add two more nodes from it
            txs.issue(network, number_txs=args.snapshot_tx_interval)
            for _ in range(0, 2):
                network.create_and_trust_node(
                    args.package,
                    "local://localhost",
                    args,
                    from_snapshot=True,
                )

            # Kill primary and wait for a new one: new primary is
            # guaranteed to have started from the new snapshot
            primary.stop()
            network.wait_for_new_primary(primary)

            # Start new node from the old snapshot
            try:
                network.create_and_trust_node(
                    args.package,
                    "local://localhost",
                    args,
                    from_snapshot=True,
                    snapshot_dir=tmp_dir,
                    timeout=3,
                )
                assert False, "Node should not be able to join from old snapshot"
            except infra.network.StartupSnapshotIsOld:
                pass

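# A minimal sketch of how run_join_old_snapshot could be wired into a test
# entry point. It assumes the e2e harness exposes infra.e2e_args.cli_args()
# for argument parsing and that "liblogging" is a valid enclave package name
# for this build; both are assumptions for illustration, not part of the
# snippet above.
if __name__ == "__main__":
    import infra.e2e_args

    args = infra.e2e_args.cli_args()
    args.package = "liblogging"  # hypothetical package name
    run_join_old_snapshot(args)
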
def test_add_node_from_backup(network, args):
    backup = network.find_any_backup()
    new_node = network.create_and_trust_node(
        args.package, "localhost", args, target_node=backup
    )
    assert new_node
    return network

def test_add_node(network, args):
    new_node = network.create_and_trust_node(args.package, "localhost", args)
    with new_node.client() as c:
        s = c.get("/node/state")
        assert s.body.json()["id"] == new_node.node_id
    assert new_node
    return network

def test_update_all_nodes(network, args):
    primary, _ = network.find_nodes()

    first_code_id, new_code_id = [
        get_code_id(args.oe_binary, infra.path.build_lib_path(pkg, args.enclave_type))
        for pkg in [args.package, args.replacement_package]
    ]

    LOG.info("Add new code id")
    network.consortium.add_new_code(primary, new_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": first_code_id, "status": "ALLOWED_TO_JOIN"},
                {"digest": new_code_id, "status": "ALLOWED_TO_JOIN"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    LOG.info("Remove old code id")
    network.consortium.retire_code(primary, first_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": new_code_id, "status": "ALLOWED_TO_JOIN"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    old_nodes = network.nodes.copy()

    LOG.info("Start fresh nodes running new code")
    for _ in range(0, len(network.nodes)):
        new_node = network.create_and_trust_node(
            args.replacement_package, "local://localhost", args
        )
        assert new_node

    LOG.info("Retire original nodes running old code")
    for node in old_nodes:
        primary, _ = network.find_nodes()
        network.consortium.retire_node(primary, node)
        # Elections take (much) longer than a backup removal which is just
        # a commit, so we need to adjust our timeout accordingly, hence this branch
        if node.node_id == primary.node_id:
            new_primary, new_term = network.wait_for_new_primary(primary.node_id)
            LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")
            primary = new_primary
        network.nodes.remove(node)
        node.stop()

    LOG.info("Check the network is still functional")
    reconfiguration.check_can_progress(new_node)
    return network

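# The test_* functions above take and return the network object, so they can
# be chained inside a single run() harness, each step reusing the network left
# behind by the previous one. A minimal sketch, assuming the same
# infra.network.network context manager and start_and_join() used elsewhere in
# this file; the function name and test ordering are illustrative only.
def run_reconfiguration_suite(args):
    with infra.network.network(
        ["localhost", "localhost", "localhost"],
        args.binary_dir,
        args.debug_nodes,
        args.perf_nodes,
        pdb=args.pdb,
    ) as network:
        network.start_and_join(args)
        network = test_add_node(network, args)
        network = test_add_node_from_backup(network, args)
        network = test_update_all_nodes(network, args)
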
def test_add_node_from_snapshot(network, args, copy_ledger_read_only=True):
    new_node = network.create_and_trust_node(
        args.package,
        "local://localhost",
        args,
        from_snapshot=True,
        copy_ledger_read_only=copy_ledger_read_only,
    )
    assert new_node
    return network

def test_add_node(network, args):
    new_node = network.create_and_trust_node(
        args.package,
        "local://localhost",
        args,
        from_snapshot=False,
    )
    with new_node.client() as c:
        s = c.get("/node/state")
        assert s.body.json()["node_id"] == new_node.node_id
        assert (
            s.body.json()["startup_seqno"] == 0
        ), "Node started without snapshot but reports startup seqno != 0"
    assert new_node
    return network

def test_add_node_from_snapshot(
    network, args, copy_ledger_read_only=True, from_backup=False
):
    # Before adding the node from a snapshot, override at least one app entry
    # and wait for a new committed snapshot covering that entry, so that there
    # is at least one historical entry to verify.
    network.txs.issue(network, number_txs=1)
    for _ in range(1, args.snapshot_tx_interval):
        network.txs.issue(network, number_txs=1, repeat=True)
        last_tx = network.txs.get_last_tx(priv=True)
        if network.wait_for_snapshot_committed_for(seqno=last_tx[1]["seqno"]):
            break

    target_node = None
    snapshot_dir = None
    if from_backup:
        primary, target_node = network.find_primary_and_any_backup()
        # Retrieve snapshot from primary as only primary node
        # generates snapshots
        snapshot_dir = network.get_committed_snapshots(primary)

    new_node = network.create_and_trust_node(
        args.package,
        "local://localhost",
        args,
        copy_ledger_read_only=copy_ledger_read_only,
        target_node=target_node,
        snapshot_dir=snapshot_dir,
    )
    assert new_node

    if copy_ledger_read_only:
        with new_node.client() as c:
            r = c.get("/node/state")
            assert (
                r.body.json()["startup_seqno"] != 0
            ), "Node started from snapshot but reports startup seqno of 0"

    # Finally, verify all app entries on the new node, including historical ones
    network.txs.verify(node=new_node)

    return network

def test_add_node_from_snapshot(
    network, args, copy_ledger_read_only=True, from_backup=False
):
    target_node = None
    snapshot_dir = None
    if from_backup:
        primary, target_node = network.find_primary_and_any_backup()
        # Retrieve snapshot from primary as only primary node
        # generates snapshots
        snapshot_dir = network.get_committed_snapshots(primary)

    new_node = network.create_and_trust_node(
        args.package,
        "local://localhost",
        args,
        copy_ledger_read_only=copy_ledger_read_only,
        target_node=target_node,
        snapshot_dir=snapshot_dir,
    )
    assert new_node
    return network

def test_node_replacement(network, args):
    primary, backups = network.find_nodes()

    nodes = network.get_joined_nodes()
    node_to_replace = backups[-1]
    f = infra.e2e_args.max_f(args, len(nodes))
    f_backups = backups[:f]

    # Retire one node
    network.consortium.retire_node(primary, node_to_replace)
    node_to_replace.stop()
    network.nodes.remove(node_to_replace)
    check_can_progress(primary)

    # Add in a node using the same address
    replacement_node = network.create_and_trust_node(
        args.package,
        f"local://{node_to_replace.host}:{node_to_replace.rpc_port}",
        args,
        node_port=node_to_replace.node_port,
        from_snapshot=False,
    )

    assert replacement_node.node_id != node_to_replace.node_id
    assert replacement_node.host == node_to_replace.host
    assert replacement_node.node_port == node_to_replace.node_port
    assert replacement_node.rpc_port == node_to_replace.rpc_port

    LOG.info(
        f"Stopping {len(f_backups)} other nodes to make progress depend on the replacement"
    )
    for other_backup in f_backups:
        other_backup.suspend()
    # Confirm the network can make progress
    check_can_progress(primary)
    for other_backup in f_backups:
        other_backup.resume()

    return network

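# check_can_progress is used above as a liveness probe after retiring and
# suspending nodes. A rough sketch of what such a helper might look like,
# assuming a logging-app write endpoint (/app/log/private) and a /node/commit
# endpoint returning a "seqno" field; the helper name, endpoints, and response
# shape are assumptions for illustration only.
import time


def check_can_progress_sketch(node, timeout=3):
    with node.client("user0") as c:
        start_seqno = c.get("/node/commit").body.json()["seqno"]
        c.post("/app/log/private", {"id": 42, "msg": "progress probe"})
        end_time = time.time() + timeout
        while time.time() < end_time:
            if c.get("/node/commit").body.json()["seqno"] > start_seqno:
                return
            time.sleep(0.1)
        raise AssertionError("Network failed to make progress within timeout")
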
def run(args):
    hosts = ["localhost", "localhost", "localhost"]

    LOG.info(f"setting seed to {args.seed}")
    random.seed(args.seed)
    txs = app.LoggingTxs()

    with infra.network.network(
        hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb, txs=txs
    ) as network:
        network.start_and_join(args)
        original_nodes = network.get_joined_nodes()
        view_info = {}

        suspend.update_view_info(network, view_info)
        app.test_run_txs(network=network, args=args, num_txs=TOTAL_REQUESTS)
        suspend.update_view_info(network, view_info)

        nodes_to_kill = [network.find_any_backup()]
        nodes_to_keep = [n for n in original_nodes if n not in nodes_to_kill]

        # check that a new node can catch up after all the requests
        late_joiner = network.create_and_trust_node(args.package, "localhost", args)
        nodes_to_keep.append(late_joiner)

        # some requests to be processed while the late joiner catches up
        # (no strict checking that these requests are actually being processed
        # simultaneously with the node catchup)
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=int(TOTAL_REQUESTS / 2),
            nodes=original_nodes,  # doesn't contain late joiner
            verify=False,  # will try to verify for late joiner and it might not be ready yet
        )

        suspend.wait_for_late_joiner(original_nodes[0], late_joiner)

        # kill the old node(s) and ensure we are still making progress
        for backup_to_retire in nodes_to_kill:
            LOG.success(f"Stopping node {backup_to_retire.node_id}")
            backup_to_retire.stop()

        # check nodes are ok after we killed one off
        app.test_run_txs(
            network=network,
            args=args,
            nodes=nodes_to_keep,
            num_txs=len(nodes_to_keep),
            timeout=30,
            ignore_failures=True,
            # in the event of an early view change due to the late joiner this might
            # take longer than usual to complete and we don't want the test to break here
        )

        suspend.test_suspend_nodes(network, args, nodes_to_keep)

        # run txs while nodes get suspended
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=4 * TOTAL_REQUESTS,
            timeout=30,
            ignore_failures=True,
            # in the event of an early view change due to the late joiner this might
            # take longer than usual to complete and we don't want the test to break here
        )

        suspend.update_view_info(network, view_info)

        # check nodes have resumed normal execution before shutting down
        app.test_run_txs(network=network, args=args, num_txs=len(nodes_to_keep))

        # we have asserted that all nodes are caught up
        # assert that view changes actually did occur
        assert len(view_info) > 1

        LOG.success("----------- views and primaries recorded -----------")
        for view, primary in view_info.items():
            LOG.success(f"view {view} - primary {primary}")

def run(args):
    hosts = ["localhost", "localhost"]

    with infra.network.network(
        hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)

        primary, _ = network.find_nodes()
        first_code_id = get_code_id(
            infra.path.build_lib_path(args.package, args.enclave_type)
        )

        with primary.client() as uc:
            r = uc.get("/node/code")
            assert r.body.json() == {
                "versions": [{"digest": first_code_id, "status": "ACCEPTED"}],
            }, r.body

        LOG.info("Adding a new node")
        new_node = network.create_and_trust_node(args.package, "localhost", args)
        assert new_node

        new_code_id = get_code_id(
            infra.path.build_lib_path(args.patched_file_name, args.enclave_type)
        )

        LOG.info(f"Adding a node with unsupported code id {new_code_id}")
        code_not_found_exception = None
        try:
            network.create_and_add_pending_node(
                args.patched_file_name, "localhost", args, timeout=3
            )
        except infra.network.CodeIdNotFound as err:
            code_not_found_exception = err

        assert (
            code_not_found_exception is not None
        ), f"Adding a node with unsupported code id {new_code_id} should fail"

        # Slow quote verification means that any attempt to add a node may cause
        # an election, so confirm primary after adding node
        primary, _ = network.find_primary()

        network.consortium.add_new_code(primary, new_code_id)

        with primary.client() as uc:
            r = uc.get("/node/code")
            versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
            expected = sorted(
                [
                    {"digest": first_code_id, "status": "ACCEPTED"},
                    {"digest": new_code_id, "status": "ACCEPTED"},
                ],
                key=lambda x: x["digest"],
            )
            assert versions == expected, versions

        new_nodes = set()
        old_nodes_count = len(network.nodes)
        new_nodes_count = old_nodes_count + 1

        LOG.info(
            f"Adding more new nodes ({new_nodes_count}) than originally existed ({old_nodes_count})"
        )
        for _ in range(0, new_nodes_count):
            new_node = network.create_and_trust_node(
                args.patched_file_name, "localhost", args
            )
            assert new_node
            new_nodes.add(new_node)

        LOG.info("Stopping all original nodes")
        old_nodes = set(network.nodes).difference(new_nodes)
        for node in old_nodes:
            LOG.debug(f"Stopping old node {node.node_id}")
            node.stop()

        new_primary, _ = network.wait_for_new_primary(primary.node_id)
        LOG.info(f"New primary is {new_primary.node_id}")

        LOG.info("Adding another node to the network")
        new_node = network.create_and_trust_node(
            args.patched_file_name, "localhost", args
        )
        assert new_node
        network.wait_for_node_commit_sync(args.consensus)

        LOG.info("Remove first code id")
        network.consortium.retire_code(new_node, first_code_id)

        with new_node.client() as uc:
            r = uc.get("/node/code")
            versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
            expected = sorted(
                [
                    {"digest": first_code_id, "status": "RETIRED"},
                    {"digest": new_code_id, "status": "ACCEPTED"},
                ],
                key=lambda x: x["digest"],
            )
            assert versions == expected, versions

        LOG.info(f"Adding a node with retired code id {first_code_id}")
        code_not_found_exception = None
        try:
            network.create_and_add_pending_node(
                args.package, "localhost", args, timeout=3
            )
        except infra.network.CodeIdRetired as err:
            code_not_found_exception = err

        assert (
            code_not_found_exception is not None
        ), f"Adding a node with retired code id {first_code_id} should fail"

        LOG.info("Adding another node with the new code to the network")
        new_node = network.create_and_trust_node(
            args.patched_file_name, "localhost", args
        )
        assert new_node
        network.wait_for_node_commit_sync(args.consensus)

def test_add_node_from_snapshot(network, args):
    new_node = network.create_and_trust_node(
        args.package, "localhost", args, from_snapshot=True
    )
    assert new_node
    return network

def run(args):
    hosts = ["localhost", "localhost", "localhost"]

    LOG.info(f"setting seed to {args.seed}")
    random.seed(args.seed)
    txs = app.LoggingTxs()

    with infra.network.network(
        hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb, txs=txs
    ) as network:
        network.start_and_join(args)
        original_nodes = network.get_joined_nodes()
        view_info = {}

        suspend.update_view_info(network, view_info)
        app.test_run_txs(network=network, args=args, num_txs=TOTAL_REQUESTS)

        suspend.test_suspend_nodes(network, args)

        # run txs while nodes get suspended
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=4 * TOTAL_REQUESTS,
            ignore_failures=True,
        )
        suspend.update_view_info(network, view_info)

        late_joiner = network.create_and_trust_node(args.package, "localhost", args)

        # some requests to be processed while the late joiner catches up
        # (no strict checking that these requests are actually being processed
        # simultaneously with the node catchup)
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=int(TOTAL_REQUESTS / 2),
            nodes=original_nodes,  # doesn't contain late joiner
            verify=False,  # will try to verify for late joiner and it might not be ready yet
        )

        caught_up = suspend.wait_for_late_joiner(original_nodes[0], late_joiner)
        if caught_up == suspend.LateJoinerStatus.Stuck:
            # should be removed when node configuration has been implemented to allow
            # a late joiner to force a view change
            LOG.warning("late joiner is stuck, stop trying if catchup fails again")
            suspend.wait_for_late_joiner(original_nodes[0], late_joiner, True)
        elif caught_up == suspend.LateJoinerStatus.NotReady:
            while caught_up == suspend.LateJoinerStatus.NotReady:
                LOG.warning("late joiner is not ready to accept RPCs yet")
                caught_up = suspend.wait_for_late_joiner(original_nodes[0], late_joiner)
        elif caught_up == suspend.LateJoinerStatus.Ready:
            LOG.success("late joiner caught up successfully")

        # check nodes have resumed normal execution before shutting down
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=len(network.get_joined_nodes()),
            timeout=30,
            ignore_failures=True,
        )

        # assert that view changes actually did occur
        assert len(view_info) > 1

        LOG.success("----------- views and primaries recorded -----------")
        for view, primary in view_info.items():
            LOG.success(f"view {view} - primary {primary}")

def test_add_node(network, args):
    new_node = network.create_and_trust_node(args.package, "localhost", args)
    assert new_node
    return network