Example #1
def run_join_old_snapshot(args):
    txs = app.LoggingTxs()
    nodes = ["local://localhost"]

    with tempfile.TemporaryDirectory() as tmp_dir:

        with infra.network.network(
                nodes,
                args.binary_dir,
                args.debug_nodes,
                args.perf_nodes,
                pdb=args.pdb,
                txs=txs,
        ) as network:
            network.start_and_join(args)
            primary, _ = network.find_primary()

            # First, retrieve and save one committed snapshot
            txs.issue(network, number_txs=args.snapshot_tx_interval)
            old_committed_snapshots = network.get_committed_snapshots(primary)
            copy(
                os.path.join(old_committed_snapshots,
                             os.listdir(old_committed_snapshots)[0]),
                tmp_dir,
            )

            # Then generate a newer snapshot and add two more nodes from it
            txs.issue(network, number_txs=args.snapshot_tx_interval)

            for _ in range(0, 2):
                network.create_and_trust_node(
                    args.package,
                    "local://localhost",
                    args,
                    from_snapshot=True,
                )

            # Kill primary and wait for a new one: new primary is
            # guaranteed to have started from the new snapshot
            primary.stop()
            network.wait_for_new_primary(primary)

            # Start new node from the old snapshot
            try:
                network.create_and_trust_node(
                    args.package,
                    "local://localhost",
                    args,
                    from_snapshot=True,
                    snapshot_dir=tmp_dir,
                    timeout=3,
                )
                assert False, "Node should not be able to join from old snapshot"
            except infra.network.StartupSnapshotIsOld:
                pass
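For context, a minimal sketch of how a standalone test such as run_join_old_snapshot might be launched; the infra.e2e_args.cli_args helper and the "liblogging" package name are assumptions, not taken from the example above.

# Hypothetical entrypoint sketch; cli_args() and the package name are assumptions.
if __name__ == "__main__":
    args = infra.e2e_args.cli_args()
    args.package = "liblogging"
    run_join_old_snapshot(args)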
Example #2
def test_add_node_from_backup(network, args):
    backup = network.find_any_backup()
    new_node = network.create_and_trust_node(
        args.package, "localhost", args, target_node=backup
    )
    assert new_node
    return network
Example #3
def test_add_node(network, args):
    new_node = network.create_and_trust_node(args.package, "localhost", args)
    with new_node.client() as c:
        s = c.get("/node/state")
        assert s.body.json()["id"] == new_node.node_id
    assert new_node
    return network
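Example #3 reads "id" from /node/state while Example #6 below reads "node_id", which suggests the response schema differs between versions. A hedged sketch of a version-tolerant lookup (the helper name and fallback logic are illustrative, not part of either test):

def get_reported_node_id(client):
    # Illustrative helper: tolerate both response schemas seen in Examples #3 and #6.
    body = client.get("/node/state").body.json()
    return body.get("node_id", body.get("id"))

Either test could then assert get_reported_node_id(c) == new_node.node_id regardless of which field the node returns.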
Example #4
def test_update_all_nodes(network, args):
    primary, _ = network.find_nodes()

    first_code_id, new_code_id = [
        get_code_id(args.oe_binary, infra.path.build_lib_path(pkg, args.enclave_type))
        for pkg in [args.package, args.replacement_package]
    ]

    LOG.info("Add new code id")
    network.consortium.add_new_code(primary, new_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": first_code_id, "status": "ALLOWED_TO_JOIN"},
                {"digest": new_code_id, "status": "ALLOWED_TO_JOIN"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    LOG.info("Remove old code id")
    network.consortium.retire_code(primary, first_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": new_code_id, "status": "ALLOWED_TO_JOIN"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    old_nodes = network.nodes.copy()

    LOG.info("Start fresh nodes running new code")
    for _ in range(0, len(network.nodes)):
        new_node = network.create_and_trust_node(
            args.replacement_package, "local://localhost", args
        )
        assert new_node

    LOG.info("Retire original nodes running old code")
    for node in old_nodes:
        primary, _ = network.find_nodes()
        network.consortium.retire_node(primary, node)
        # Retiring the primary triggers an election, which takes (much) longer
        # than retiring a backup (just a commit), so wait for the new primary
        # before continuing
        if node.node_id == primary.node_id:
            new_primary, new_term = network.wait_for_new_primary(primary.node_id)
            LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")
            primary = new_primary
        network.nodes.remove(node)
        node.stop()

    LOG.info("Check the network is still functional")
    reconfiguration.check_can_progress(new_node)
    return network
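The /node/code polling-and-comparison block above appears twice in this example (and again in Example #11). A sketch of how it could be factored into a helper; the function name is ours and the expected entries are supplied by the caller:

def assert_code_versions(node, expected_versions):
    # Illustrative helper: compare /node/code against the expected
    # {digest, status} entries, ignoring ordering.
    with node.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(expected_versions, key=lambda x: x["digest"])
        assert versions == expected, versions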
Example #5
def test_add_node_from_snapshot(network, args, copy_ledger_read_only=True):
    new_node = network.create_and_trust_node(
        args.package,
        "local://localhost",
        args,
        from_snapshot=True,
        copy_ledger_read_only=copy_ledger_read_only,
    )
    assert new_node
    return network
Example #6
def test_add_node(network, args):
    new_node = network.create_and_trust_node(
        args.package,
        "local://localhost",
        args,
        from_snapshot=False,
    )
    with new_node.client() as c:
        s = c.get("/node/state")
        assert s.body.json()["node_id"] == new_node.node_id
        assert (
            s.body.json()["startup_seqno"] == 0
        ), "Node started without snapshot but reports startup seqno != 0"
    assert new_node
    return network
Example #7
def test_add_node_from_snapshot(network,
                                args,
                                copy_ledger_read_only=True,
                                from_backup=False):
    # Before adding the node from a snapshot, override at least one app entry
    # and wait for a new committed snapshot covering that entry, so that there
    # is at least one historical entry to verify.
    network.txs.issue(network, number_txs=1)
    for _ in range(1, args.snapshot_tx_interval):
        network.txs.issue(network, number_txs=1, repeat=True)
        last_tx = network.txs.get_last_tx(priv=True)
        if network.wait_for_snapshot_committed_for(seqno=last_tx[1]["seqno"]):
            break

    target_node = None
    snapshot_dir = None
    if from_backup:
        primary, target_node = network.find_primary_and_any_backup()
        # Retrieve snapshot from primary as only primary node
        # generates snapshots
        snapshot_dir = network.get_committed_snapshots(primary)

    new_node = network.create_and_trust_node(
        args.package,
        "local://localhost",
        args,
        copy_ledger_read_only=copy_ledger_read_only,
        target_node=target_node,
        snapshot_dir=snapshot_dir,
    )
    assert new_node

    if copy_ledger_read_only:
        with new_node.client() as c:
            r = c.get("/node/state")
            assert (
                r.body.json()["startup_seqno"] != 0
            ), "Node started from snapshot but reports startup seqno of 0"

    # Finally, verify all app entries on the new node, including historical ones
    network.txs.verify(node=new_node)

    return network
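Examples #6 and #7 check the same /node/state field from opposite directions: a node started without a snapshot should report startup_seqno == 0, while one started from a snapshot should report a non-zero value. A hedged helper combining both checks (the name is ours):

def assert_startup_seqno(node, from_snapshot):
    # Illustrative helper combining the startup_seqno checks from Examples #6 and #7.
    with node.client() as c:
        seqno = c.get("/node/state").body.json()["startup_seqno"]
        if from_snapshot:
            assert seqno != 0, "Node started from snapshot but reports startup seqno of 0"
        else:
            assert seqno == 0, "Node started without snapshot but reports startup seqno != 0"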
Example #8
def test_add_node_from_snapshot(
    network, args, copy_ledger_read_only=True, from_backup=False
):
    target_node = None
    snapshot_dir = None
    if from_backup:
        primary, target_node = network.find_primary_and_any_backup()
        # Retrieve snapshot from primary as only primary node
        # generates snapshots
        snapshot_dir = network.get_committed_snapshots(primary)

    new_node = network.create_and_trust_node(
        args.package,
        "local://localhost",
        args,
        copy_ledger_read_only=copy_ledger_read_only,
        target_node=target_node,
        snapshot_dir=snapshot_dir,
    )
    assert new_node
    return network
Example #9
def test_node_replacement(network, args):
    primary, backups = network.find_nodes()

    nodes = network.get_joined_nodes()
    node_to_replace = backups[-1]
    f = infra.e2e_args.max_f(args, len(nodes))
    f_backups = backups[:f]

    # Retire one node
    network.consortium.retire_node(primary, node_to_replace)
    node_to_replace.stop()
    network.nodes.remove(node_to_replace)
    check_can_progress(primary)

    # Add in a node using the same address
    replacement_node = network.create_and_trust_node(
        args.package,
        f"local://{node_to_replace.host}:{node_to_replace.rpc_port}",
        args,
        node_port=node_to_replace.node_port,
        from_snapshot=False,
    )

    assert replacement_node.node_id != node_to_replace.node_id
    assert replacement_node.host == node_to_replace.host
    assert replacement_node.node_port == node_to_replace.node_port
    assert replacement_node.rpc_port == node_to_replace.rpc_port
    LOG.info(
        f"Stopping {len(f_backups)} other nodes to make progress depend on the replacement"
    )
    for other_backup in f_backups:
        other_backup.suspend()
    # Confirm the network can make progress
    check_can_progress(primary)
    for other_backup in f_backups:
        other_backup.resume()

    return network
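For context on f_backups above, a minimal arithmetic sketch, assuming a CFT network in which n nodes tolerate f = (n - 1) // 2 failures (the three-node figure mirrors the host lists in Examples #10 and #13):

# With 3 joined nodes: f = (3 - 1) // 2 = 1, so suspending one backup leaves
# the primary plus the replacement node responsible for making progress.
n = 3
f = (n - 1) // 2
assert f == 1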
Example #10
def run(args):
    hosts = ["localhost", "localhost", "localhost"]

    LOG.info(f"setting seed to {args.seed}")
    random.seed(args.seed)
    txs = app.LoggingTxs()

    with infra.network.network(
        hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb, txs=txs
    ) as network:
        network.start_and_join(args)
        original_nodes = network.get_joined_nodes()
        view_info = {}

        suspend.update_view_info(network, view_info)
        app.test_run_txs(network=network, args=args, num_txs=TOTAL_REQUESTS)
        suspend.update_view_info(network, view_info)

        nodes_to_kill = [network.find_any_backup()]
        nodes_to_keep = [n for n in original_nodes if n not in nodes_to_kill]

        # check that a new node can catch up after all the requests
        late_joiner = network.create_and_trust_node(args.package, "localhost", args)
        nodes_to_keep.append(late_joiner)

        # some requests to be processed while the late joiner catches up
        # (no strict checking that these requests are actually being processed simultaneously with the node catchup)
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=int(TOTAL_REQUESTS / 2),
            nodes=original_nodes,  # doesn't contain late joiner
            verify=False,  # will try to verify for late joiner and it might not be ready yet
        )

        suspend.wait_for_late_joiner(original_nodes[0], late_joiner)

        # kill the old node(s) and ensure we are still making progress
        for backup_to_retire in nodes_to_kill:
            LOG.success(f"Stopping node {backup_to_retire.node_id}")
            backup_to_retire.stop()

        # check nodes are ok after we killed one off
        app.test_run_txs(
            network=network,
            args=args,
            nodes=nodes_to_keep,
            num_txs=len(nodes_to_keep),
            timeout=30,
            ignore_failures=True,
            # in the event of an early view change due to the late joiner this might
            # take longer than usual to complete and we don't want the test to break here
        )

        suspend.test_suspend_nodes(network, args, nodes_to_keep)

        # run txs while nodes get suspended
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=4 * TOTAL_REQUESTS,
            timeout=30,
            ignore_failures=True,
            # in the event of an early view change due to the late joiner this might
            # take longer than usual to complete and we don't want the test to break here
        )

        suspend.update_view_info(network, view_info)

        # check nodes have resumed normal execution before shutting down
        app.test_run_txs(network=network, args=args, num_txs=len(nodes_to_keep))

        # we have asserted that all nodes are caught up
        # assert that view changes actually did occur
        assert len(view_info) > 1

        LOG.success("----------- views and primaries recorded -----------")
        for view, primary in view_info.items():
            LOG.success(f"view {view} - primary {primary}")
Example #11
def run(args):
    hosts = ["localhost", "localhost"]

    with infra.network.network(hosts,
                               args.binary_dir,
                               args.debug_nodes,
                               args.perf_nodes,
                               pdb=args.pdb) as network:
        network.start_and_join(args)
        primary, _ = network.find_nodes()

        first_code_id = get_code_id(
            infra.path.build_lib_path(args.package, args.enclave_type))

        with primary.client() as uc:
            r = uc.get("/node/code")
            assert r.body.json() == {
                "versions": [{
                    "digest": first_code_id,
                    "status": "ACCEPTED"
                }],
            }, r.body

        LOG.info("Adding a new node")
        new_node = network.create_and_trust_node(args.package, "localhost",
                                                 args)
        assert new_node

        new_code_id = get_code_id(
            infra.path.build_lib_path(args.patched_file_name,
                                      args.enclave_type))

        LOG.info(f"Adding a node with unsupported code id {new_code_id}")
        code_not_found_exception = None
        try:
            network.create_and_add_pending_node(args.patched_file_name,
                                                "localhost",
                                                args,
                                                timeout=3)
        except infra.network.CodeIdNotFound as err:
            code_not_found_exception = err

        assert (
            code_not_found_exception is not None
        ), f"Adding a node with unsupported code id {new_code_id} should fail"

        # Slow quote verification means that any attempt to add a node may cause an election, so confirm primary after adding node
        primary, _ = network.find_primary()

        network.consortium.add_new_code(primary, new_code_id)

        with primary.client() as uc:
            r = uc.get("/node/code")
            versions = sorted(r.body.json()["versions"],
                              key=lambda x: x["digest"])
            expected = sorted(
                [
                    {
                        "digest": first_code_id,
                        "status": "ACCEPTED"
                    },
                    {
                        "digest": new_code_id,
                        "status": "ACCEPTED"
                    },
                ],
                key=lambda x: x["digest"],
            )
            assert versions == expected, versions

        new_nodes = set()
        old_nodes_count = len(network.nodes)
        new_nodes_count = old_nodes_count + 1

        LOG.info(
            f"Adding more new nodes ({new_nodes_count}) than originally existed ({old_nodes_count})"
        )
        for _ in range(0, new_nodes_count):
            new_node = network.create_and_trust_node(args.patched_file_name,
                                                     "localhost", args)
            assert new_node
            new_nodes.add(new_node)

        LOG.info("Stopping all original nodes")
        old_nodes = set(network.nodes).difference(new_nodes)
        for node in old_nodes:
            LOG.debug(f"Stopping old node {node.node_id}")
            node.stop()

        new_primary, _ = network.wait_for_new_primary(primary.node_id)
        LOG.info(f"New_primary is {new_primary.node_id}")

        LOG.info("Adding another node to the network")
        new_node = network.create_and_trust_node(args.patched_file_name,
                                                 "localhost", args)
        assert new_node
        network.wait_for_node_commit_sync(args.consensus)

        LOG.info("Remove first code id")
        network.consortium.retire_code(new_node, first_code_id)

        with new_node.client() as uc:
            r = uc.get("/node/code")
            versions = sorted(r.body.json()["versions"],
                              key=lambda x: x["digest"])
            expected = sorted(
                [
                    {
                        "digest": first_code_id,
                        "status": "RETIRED"
                    },
                    {
                        "digest": new_code_id,
                        "status": "ACCEPTED"
                    },
                ],
                key=lambda x: x["digest"],
            )
            assert versions == expected, versions

        LOG.info(f"Adding a node with retired code id {first_code_id}")
        code_not_found_exception = None
        try:
            network.create_and_add_pending_node(args.package,
                                                "localhost",
                                                args,
                                                timeout=3)
        except infra.network.CodeIdRetired as err:
            code_not_found_exception = err

        assert (
            code_not_found_exception is not None
        ), f"Adding a node with unsupported code id {new_code_id} should fail"

        LOG.info("Adding another node with the new code to the network")
        new_node = network.create_and_trust_node(args.patched_file_name,
                                                 "localhost", args)
        assert new_node
        network.wait_for_node_commit_sync(args.consensus)
Example #12
def test_add_node_from_snapshot(network, args):
    new_node = network.create_and_trust_node(
        args.package, "localhost", args, from_snapshot=True
    )
    assert new_node
    return network
Example #13
def run(args):
    hosts = ["localhost", "localhost", "localhost"]

    LOG.info(f"setting seed to {args.seed}")
    random.seed(args.seed)
    txs = app.LoggingTxs()

    with infra.network.network(hosts,
                               args.binary_dir,
                               args.debug_nodes,
                               args.perf_nodes,
                               pdb=args.pdb,
                               txs=txs) as network:
        network.start_and_join(args)
        original_nodes = network.get_joined_nodes()
        view_info = {}

        suspend.update_view_info(network, view_info)
        app.test_run_txs(network=network, args=args, num_txs=TOTAL_REQUESTS)
        suspend.test_suspend_nodes(network, args)

        # run txs while nodes get suspended
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=4 * TOTAL_REQUESTS,
            ignore_failures=True,
        )
        suspend.update_view_info(network, view_info)
        late_joiner = network.create_and_trust_node(args.package, "localhost",
                                                    args)

        # some requests to be processed while the late joiner catches up
        # (no strict checking that these requests are actually being processed simultaneously with the node catchup)
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=int(TOTAL_REQUESTS / 2),
            nodes=original_nodes,  # doesn't contain late joiner
            verify=False,  # will try to verify for late joiner and it might not be ready yet
        )

        caught_up = suspend.wait_for_late_joiner(original_nodes[0],
                                                 late_joiner)
        if caught_up == suspend.LateJoinerStatus.Stuck:
            # should be removed when node configuration has been implemented to allow
            # a late joiner to force a view change
            LOG.warning(
                "late joiner is stuck, stop trying if catchup fails again")
            suspend.wait_for_late_joiner(original_nodes[0], late_joiner, True)
        elif caught_up == suspend.LateJoinerStatus.NotReady:
            while caught_up == suspend.LateJoinerStatus.NotReady:
                LOG.warning("late joiner is not ready to accept RPC's yet")
                caught_up = suspend.wait_for_late_joiner(
                    original_nodes[0], late_joiner)
        elif caught_up == suspend.LateJoinerStatus.Ready:
            LOG.success("late joiner caught up successfully")

        # check nodes have resumed normal execution before shutting down
        app.test_run_txs(
            network=network,
            args=args,
            num_txs=len(network.get_joined_nodes()),
            timeout=30,
            ignore_failures=True,
        )

        # assert that view changes actually did occur
        assert len(view_info) > 1

        LOG.success("----------- views and primaries recorded -----------")
        for view, primary in view_info.items():
            LOG.success(f"view {view} - primary {primary}")
Example #14
def test_add_node(network, args):
    new_node = network.create_and_trust_node(args.package, "localhost", args)
    assert new_node
    return network
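Most of the test_* examples above share a convention: they take (network, args) and return the network so that tests can be chained. A minimal sketch of how such chaining might look in a run(args) entrypoint, reusing only calls already shown in the examples (the three-host list mirrors Examples #10 and #13); this wiring is illustrative, not taken from any single example.

def run(args):
    txs = app.LoggingTxs()
    hosts = ["localhost", "localhost", "localhost"]
    with infra.network.network(
        hosts, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb, txs=txs
    ) as network:
        network.start_and_join(args)
        # Chain the reconfiguration tests; each returns the network it received.
        network = test_add_node(network, args)
        network = test_add_node_from_backup(network, args)
        network = test_add_node_from_snapshot(network, args)
        network = test_node_replacement(network, args)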