Example 1
def test_partition_majority(network, args):
    primary, backups = network.find_nodes()

    # Create a partition with primary + half remaining nodes (i.e. majority)
    partition = [primary]
    partition.extend(backups[len(backups) // 2:])

    # Wait for all nodes to have reached the same level of commit, so that
    # nodes outside of partition can become primary after this one is dropped
    network.wait_for_all_nodes_to_commit(primary=primary)

    # The primary should remain stable while the partition is active
    # Note: Context manager
    initial_view = None
    with network.partitioner.partition(partition):
        try:
            network.wait_for_new_primary(primary)
            assert False, "No new primary should be elected when partitioning majority"
        except TimeoutError:
            LOG.info("No new primary, as expected")
            with primary.client() as c:
                res = c.get("/node/network")  # Well-known read-only endpoint
                body = res.body.json()
                initial_view = body["current_view"]

    # The partitioned nodes will have called elections, increasing their view.
    # When the partition is lifted, the nodes must elect a new leader, in at least this
    # increased term. The winning node could come from either partition, and could even
    # be the original primary.
    network.wait_for_primary_unanimity(min_view=initial_view)

    return network
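All of these examples hinge on wait_for_new_primary either returning the newly elected node or raising TimeoutError. As an aside, here is a minimal, self-contained sketch of that polling pattern; the probe callable, names and timings are illustrative assumptions, not the actual infra.network implementation.

import time


def wait_for_change(probe, old_value, timeout=5.0, period=0.1):
    """Poll probe() until it returns a value different from old_value,
    or raise TimeoutError once the timeout expires."""
    end = time.time() + timeout
    while time.time() < end:
        current = probe()
        if current is not None and current != old_value:
            return current
        time.sleep(period)
    raise TimeoutError(f"Still {old_value!r} after {timeout}s")


# Toy usage mirroring the assertion above: the probe never reports a new
# primary, so the helper times out and the caller treats that as success.
try:
    wait_for_change(lambda: "node-0", "node-0", timeout=0.5)
    assert False, "No new primary should be elected when partitioning majority"
except TimeoutError:
    pass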
Example 2
def run(args):
    args.jwt_key_refresh_interval_s = 1

    with infra.network.network(args.nodes,
                               args.binary_dir,
                               args.debug_nodes,
                               args.perf_nodes,
                               pdb=args.pdb) as network:
        network.start_and_join(args)
        network = test_jwt_without_key_policy(network, args)
        network = test_jwt_with_sgx_key_policy(network, args)
        network = test_jwt_with_sgx_key_filter(network, args)
        network = test_jwt_key_auto_refresh(network, args)

        # Check that auto refresh also works on backups
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary.node_id)
        network = test_jwt_key_auto_refresh(network, args)

    args.jwt_key_refresh_interval_s = 100000
    with infra.network.network(args.nodes,
                               args.binary_dir,
                               args.debug_nodes,
                               args.perf_nodes,
                               pdb=args.pdb) as network:
        network.start_and_join(args)
        network = test_jwt_key_initial_refresh(network, args)

        # Check that initial refresh also works on backups
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary.node_id)
        network = test_jwt_key_initial_refresh(network, args)
Example 3
def test_isolate_primary_from_one_backup(network, args):
    primary, backups = network.find_nodes()

    # Issue one transaction, waiting for all nodes to have reached
    # the same level of commit, so that nodes outside of partition can
    # become primary after this one is dropped
    # Note: Because of https://github.com/microsoft/CCF/issues/2224, we need to
    # issue a write transaction instead of just reading the TxID of the latest entry
    network.txs.issue(network)

    # Isolate first backup from primary so that first backup becomes candidate
    # in a new term and wins the election
    # Note: Managed manually
    rules = network.partitioner.isolate_node(primary, backups[0])

    new_primary, new_view = network.wait_for_new_primary(primary,
                                                         nodes=backups,
                                                         timeout_multiplier=6)

    # Explicitly drop rules before continuing
    rules.drop()

    # Old primary should now report the new primary
    new_primary_, new_view_ = network.wait_for_new_primary(primary,
                                                           nodes=[primary])
    assert (
        new_primary == new_primary_
    ), f"New primary {new_primary_.local_node_id} after partition is dropped is different than before {new_primary.local_node_id}"
    assert (
        new_view == new_view_
    ), f"Consensus view {new_view} should not have changed after partition is dropped: now {new_view_}"

    return network
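Unlike Example 1, which relies on the partitioner's context manager, this example manages the isolation rules by hand and must remember to call rules.drop(). A hedged sketch of tying that lifetime to a context manager instead, assuming only the isolate_node and drop calls already shown above:

import contextlib


@contextlib.contextmanager
def isolated(partitioner, node, other):
    # Create the isolation rules up front, and guarantee they are dropped
    # even if the body of the with-block raises.
    rules = partitioner.isolate_node(node, other)
    try:
        yield rules
    finally:
        rules.drop()


# Hypothetical usage, mirroring the manual pattern above:
#
#   with isolated(network.partitioner, primary, backups[0]):
#       network.wait_for_new_primary(primary, nodes=backups)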
Example 4
def run_join_old_snapshot(args):
    txs = app.LoggingTxs("user0")
    nodes = ["local://localhost"]

    with tempfile.TemporaryDirectory() as tmp_dir:

        with infra.network.network(
            nodes,
            args.binary_dir,
            args.debug_nodes,
            args.perf_nodes,
            pdb=args.pdb,
            txs=txs,
        ) as network:
            network.start_and_open(args)
            primary, _ = network.find_primary()

            # First, retrieve and save one committed snapshot
            txs.issue(network, number_txs=args.snapshot_tx_interval)
            old_committed_snapshots = network.get_committed_snapshots(primary)
            copy(
                os.path.join(
                    old_committed_snapshots, os.listdir(old_committed_snapshots)[0]
                ),
                tmp_dir,
            )

            # Then generate another newer snapshot, and add two more nodes from it
            txs.issue(network, number_txs=args.snapshot_tx_interval)

            for _ in range(0, 2):
                new_node = network.create_node("local://localhost")
                network.join_node(
                    new_node,
                    args.package,
                    args,
                    from_snapshot=True,
                )
                network.trust_node(new_node, args)

            # Kill primary and wait for a new one: new primary is
            # guaranteed to have started from the new snapshot
            primary.stop()
            network.wait_for_new_primary(primary)

            # Start new node from the old snapshot
            try:
                new_node = network.create_node("local://localhost")
                network.join_node(
                    new_node,
                    args.package,
                    args,
                    from_snapshot=True,
                    snapshots_dir=tmp_dir,
                    timeout=3,
                )
            except infra.network.StartupSnapshotIsOld:
                pass
Example 5
def run_manual(args):
    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        test_jwt_key_initial_refresh(network, args)

        # Check that initial refresh also works on backups
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary)
        test_jwt_key_initial_refresh(network, args)
Example 6
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.consortium.retire_node(primary, primary)
    network.wait_for_new_primary(primary)
    check_can_progress(backup)
    network.nodes.remove(primary)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    return network
Example 7
def test_new_joiner_helps_liveness(network, args):
    primary, backups = network.find_nodes()

    # Issue some transactions, so there is a ledger history that a new node must receive
    network.txs.issue(network, number_txs=10)

    # Remove a node, leaving the network frail
    network.retire_node(primary, backups[-1])
    backups[-1].stop()

    primary, backups = network.find_nodes()

    with contextlib.ExitStack() as stack:
        # Add a new node, but partition them before trusting them
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, args.package, args, from_snapshot=False)
        new_joiner_partition = [new_node]
        new_joiner_rules = stack.enter_context(
            network.partitioner.partition([primary, *backups],
                                          new_joiner_partition))

        # Trust the new node, and wait for commit of this (but don't ask the new node itself, which doesn't know this yet)
        network.trust_node(new_node, args, no_wait=True)
        check_can_progress(primary)

        # Partition the primary, temporarily creating a minority service that cannot make progress
        minority_partition = backups[len(backups) // 2:] + new_joiner_partition
        minority_rules = stack.enter_context(
            network.partitioner.partition(minority_partition))
        # This is an unusual situation, where we've actually produced a dead partitioned node.
        # Initially any write requests will timeout (failed attempt at forwarding), and then
        # the node transitions to a candidate with nobody to talk to. Rather than trying to
        # catch the errors of these states quickly, we just sleep until the latter state is
        # reached, and then confirm it was reached.
        time.sleep(network.observed_election_duration)
        with backups[0].client("user0") as c:
            r = c.post("/app/log/private", {"id": 42, "msg": "Hello world"})
            assert r.status_code == http.HTTPStatus.SERVICE_UNAVAILABLE

        # Restore the new node to the service
        new_joiner_rules.drop()

        # Confirm that the new node catches up, and progress can be made in this majority partition
        network.wait_for_new_primary(primary, minority_partition)
        check_can_progress(new_node)

        # Explicitly drop rules before continuing
        minority_rules.drop()

        network.wait_for_primary_unanimity()
        primary, _ = network.find_nodes()
        network.wait_for_all_nodes_to_commit(primary=primary)
Example 8
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    network.wait_for_new_primary(primary, nodes=[backup])
    check_can_progress(backup)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network
Example 9
def test_kill_primary(network, args):
    primary, _ = network.find_primary_and_any_backup()
    primary.stop()
    network.wait_for_new_primary(primary)

    # Verify that the TxID reported just after an election is valid
    # Note that the first TxID read after an election may be of a signature
    # Tx (time-based signature generation) in the new term rather than the
    # last entry in the previous term
    for node in network.get_joined_nodes():
        with node.client() as c:
            r = c.get("/node/network")
            c.wait_for_commit(r)

    return network
Example 10
def run_auto(args):
    with infra.network.network(
        args.nodes, args.binary_dir, args.debug_nodes, args.perf_nodes, pdb=args.pdb
    ) as network:
        network.start_and_join(args)
        test_jwt_without_key_policy(network, args)
        if args.enclave_type != "virtual":
            test_jwt_with_sgx_key_policy(network, args)
            test_jwt_with_sgx_key_filter(network, args)
        test_jwt_key_auto_refresh(network, args)

        # Check that auto refresh also works on backups
        primary, _ = network.find_primary()
        primary.stop()
        network.wait_for_new_primary(primary)
        test_jwt_key_auto_refresh(network, args)
Example 11
def test_kill_primary_no_reqs(network, args):
    old_primary, _ = network.find_primary_and_any_backup()
    old_primary.stop()
    new_primary, _ = network.wait_for_new_primary(old_primary)

    # Verify that the TxID reported just after an election is valid
    # Note that the first TxID read after an election may be of a signature
    # Tx (time-based signature generation) in the new term rather than the
    # last entry in the previous term
    for node in network.get_joined_nodes():
        with node.client() as c:
            r = c.get("/node/network")
            c.wait_for_commit(r)

            # Also verify that reported last ack times are as expected
            r = c.get("/node/consensus")
            acks = r.body.json()["details"]["acks"]
            for ack in acks.values():
                if node is new_primary:
                    assert (ack["last_received_ms"] <
                            network.args.election_timeout_ms), acks
                else:
                    assert (
                        ack["last_received_ms"] == 0
                    ), f"Backup {node.local_node_id} should report a last ack time of 0: {acks}"

    return network
Example 12
def test_update_all_nodes(network, args):
    primary, _ = network.find_nodes()

    first_code_id, new_code_id = [
        get_code_id(args.oe_binary, infra.path.build_lib_path(pkg, args.enclave_type))
        for pkg in [args.package, args.replacement_package]
    ]

    LOG.info("Add new code id")
    network.consortium.add_new_code(primary, new_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": first_code_id, "status": "ALLOWED_TO_JOIN"},
                {"digest": new_code_id, "status": "ALLOWED_TO_JOIN"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    LOG.info("Remove old code id")
    network.consortium.retire_code(primary, first_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {"digest": new_code_id, "status": "ALLOWED_TO_JOIN"},
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    old_nodes = network.nodes.copy()

    LOG.info("Start fresh nodes running new code")
    for _ in range(0, len(network.nodes)):
        new_node = network.create_and_trust_node(
            args.replacement_package, "local://localhost", args
        )
        assert new_node

    LOG.info("Retire original nodes running old code")
    for node in old_nodes:
        primary, _ = network.find_nodes()
        network.consortium.retire_node(primary, node)
        # Elections take (much) longer than a backup removal which is just
        # a commit, so we need to adjust our timeout accordingly, hence this branch
        if node.node_id == primary.node_id:
            new_primary, new_term = network.wait_for_new_primary(primary.node_id)
            LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")
            primary = new_primary
        network.nodes.remove(node)
        node.stop()

    LOG.info("Check the network is still functional")
    reconfiguration.check_can_progress(new_node)
    return network
Example 13
def test_kill_primary(network, args):
    primary, _ = network.find_primary()
    primary.stop()
    new_primary, new_term = network.wait_for_new_primary(primary.node_id)
    LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")

    return network
Example 14
def test_kill_primary(network, args):
    primary, backup = network.find_primary_and_any_backup()
    primary.stop()

    # When the consensus is BFT there is no status message timer that triggers a new election.
    # Instead, an election is triggered when a message times out without executing. We therefore
    # send a message that will not execute because the primary is stopped, which then triggers a view change
    if args.consensus == "bft":
        try:
            with backup.client("user0") as c:
                _ = c.post(
                    "/app/log/private",
                    {
                        "id": -1,
                        "msg": "This is submitted to force a view change",
                    },
                )
        except CCFConnectionException:
            LOG.warning(
                f"Could not successfully connect to node {backup.node_id}.")

    new_primary, new_term = network.wait_for_new_primary(primary.node_id)
    LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")

    return network
Example 15
def test_isolate_and_reconnect_primary(network, args, **kwargs):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        lost_tx_resp = check_does_not_progress(primary)

        new_primary, _ = network.wait_for_new_primary(primary,
                                                      nodes=backups,
                                                      timeout_multiplier=6)
        new_tx_resp = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        try:
            # There will be at least one full election cycle for nothing, where the
            # re-joining node fails to get elected but causes others to rev up their
            # term. After that, a successful election needs to take place, and we
            # arbitrarily allow 3 time periods to avoid being too brittle when
            # raft timeouts line up badly.
            c.wait_for_commit(new_tx_resp,
                              timeout=(network.election_duration * 4))
        except TimeoutError:
            details = c.get("/node/consensus").body.json()
            assert (
                False
            ), f"Stuck before {new_tx_resp.view}.{new_tx_resp.seqno}: {pprint.pformat(details)}"

        # Check it has dropped anything submitted while partitioned
        r = c.get(
            f"/node/tx?transaction_id={lost_tx_resp.view}.{lost_tx_resp.seqno}"
        )
        status = TxStatus(r.body.json()["status"])
        assert status == TxStatus.Invalid, r
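The final check above queries /node/tx once and expects TxStatus.Invalid, relying on the earlier wait_for_commit to have reached a terminal state. A hedged sketch of a reusable poll-until-terminal helper is below; the endpoint is the one used in these examples, but the status strings and helper name are assumptions rather than the project's own code.

import time

# Statuses assumed to be terminal; names follow the TxStatus usage above.
TERMINAL_STATUSES = ("Committed", "Invalid")


def wait_for_terminal_status(client, view, seqno, timeout=5.0):
    """Poll /node/tx until the transaction reaches a terminal status."""
    end = time.time() + timeout
    status = None
    while time.time() < end:
        r = client.get(f"/node/tx?transaction_id={view}.{seqno}")
        status = r.body.json()["status"]
        if status in TERMINAL_STATUSES:
            return status
        time.sleep(0.1)
    raise TimeoutError(f"Tx {view}.{seqno} still {status} after {timeout}s")


# Hypothetical usage, based on the check above:
#
#   with primary.client() as c:
#       assert wait_for_terminal_status(c, lost_tx_resp.view, lost_tx_resp.seqno) == "Invalid"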
Example 16
def test_suspend_primary(network, args):
    primary, _ = network.find_primary()
    primary.suspend()
    new_primary, _ = network.wait_for_new_primary(primary)
    check_can_progress(new_primary)
    primary.resume()
    check_can_progress(new_primary)
    return network
Example 17
def test_join_straddling_primary_replacement(network, args):
    # We need a fourth node before we attempt the replacement, otherwise
    # we will reach a situation where two out of four nodes in the voting quorum
    # are unable to participate (one retired and one not yet joined).
    test_add_node(network, args)
    primary, _ = network.find_primary()
    new_node = network.create_node("local://localhost")
    network.join_node(new_node, args.package, args)
    proposal_body = {
        "actions": [
            {
                "name": "transition_node_to_trusted",
                "args": {
                    "node_id": new_node.node_id,
                    "valid_from": str(datetime.now()),
                },
            },
            {
                "name": "remove_node",
                "args": {
                    "node_id": primary.node_id
                },
            },
        ]
    }

    proposal = network.consortium.get_any_active_member().propose(
        primary, proposal_body)
    network.consortium.vote_using_majority(
        primary,
        proposal,
        {
            "ballot":
            "export function vote (proposal, proposer_id) { return true }"
        },
        timeout=10,
    )

    network.wait_for_new_primary(primary)
    new_node.wait_for_node_to_join(timeout=10)

    primary.stop()
    network.nodes.remove(primary)
    wait_for_reconfiguration_to_complete(network)
    return network
Example 18
def test_suspend_primary(network, args):
    primary, _ = network.find_primary()
    primary.suspend()
    new_primary, new_term = network.wait_for_new_primary(primary.node_id)
    LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")
    reconfiguration.check_can_progress(new_primary)
    primary.resume()
    reconfiguration.check_can_progress(new_primary)
    return network
Example 19
def test_partition_majority(network, args):
    primary, backups = network.find_nodes()

    # Create a partition with primary + half remaining nodes (i.e. majority)
    partition = [primary]
    partition.extend(backups[len(backups) // 2 :])

    # Wait for all nodes to have reached the same level of commit, so that
    # nodes outside of partition can become primary after this one is dropped
    network.wait_for_all_nodes_to_commit(primary=primary)

    # The primary should remain stable while the partition is active
    # Note: Context manager
    with network.partitioner.partition(partition):
        try:
            network.wait_for_new_primary(primary)
            assert False, "No new primary should be elected when partitioning majority"
        except TimeoutError:
            pass

    # A new leader should be elected once the partition is dropped
    network.wait_for_new_primary(primary)

    return network
Example 20
def run(args):
    # This is deliberately 5, because the rest of the test depends on this
    # to grow a prefix and allow just enough nodes to resume to reach the
    # desired election result. Conversion to a general f isn't trivial.
    hosts = ["local://localhost"] * 5

    with infra.network.network(hosts,
                               args.binary_dir,
                               args.debug_nodes,
                               args.perf_nodes,
                               pdb=args.pdb) as network:
        network.start_and_join(args)
        primary, backups = network.find_nodes()

        # Take three of the backups out of action (suspend two, stop one) to prevent commit
        backups[1].suspend()
        backups[2].suspend()
        backups[3].stop()

        txs = []
        # Run some transactions that can't be committed
        with primary.client("user0") as uc:
            for i in range(3):
                txs.append(
                    uc.post("/app/log/private", {
                        "id": 100 + i,
                        "msg": "Hello world"
                    }))

        sig_view, sig_seqno = txs[-1].view, txs[-1].seqno + 1
        with backups[0].client() as bc:
            wait_for_pending(bc, sig_view, sig_seqno)

        # Kill the primary, restore other backups
        primary.stop()
        backups[1].resume()
        backups[2].resume()
        new_primary, new_term = network.wait_for_new_primary(
            primary.node_id, timeout_multiplier=6)
        LOG.debug(f"New primary is {new_primary.node_id} in term {new_term}")

        # Check that uncommitted but committable suffix is preserved
        with new_primary.client("user0") as uc:
            check_commit = infra.checker.Checker(uc)
            for tx in txs:
                check_commit(tx)
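wait_for_pending is referenced above but not defined in this excerpt. A hedged sketch of what such a helper could look like follows, using the same /node/tx polling shape as the sketch after Example 15; the status strings are assumptions about the real helper.

import time


def wait_for_pending(client, view, seqno, timeout=5.0):
    """Poll /node/tx until the given transaction is at least replicated
    (reported as Pending or Committed), or raise TimeoutError."""
    end = time.time() + timeout
    while time.time() < end:
        r = client.get(f"/node/tx?transaction_id={view}.{seqno}")
        if r.body.json()["status"] in ("Pending", "Committed"):
            return
        time.sleep(0.1)
    raise TimeoutError(f"Tx {view}.{seqno} not pending after {timeout}s")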
Example 21
def test_isolate_primary_from_one_backup(network, args):
    p, backups = network.find_nodes()
    b_0, b_1 = backups

    # Issue one transaction, waiting for all nodes to have reached
    # the same level of commit, so that nodes outside of partition can
    # become primary after this one is dropped
    # Note: Because of https://github.com/microsoft/CCF/issues/2224, we need to
    # issue a write transaction instead of just reading the TxID of the latest entry
    initial_txid = network.txs.issue(network)

    # Isolate first backup from primary so that first backup becomes candidate
    # in a new term and wins the election
    # Note: Managed manually
    rules = network.partitioner.isolate_node(p, b_0)

    # Now wait for several elections to occur. We expect:
    # - b_0 to call and win an election with b_1's help
    # - b_0 to produce a new signature, and commit it with b_1's help
    # - p to call its own election, and lose because it doesn't have this signature
    # - In the resulting election race:
    #   - If p calls first, it loses and we're in the same situation
    #   - If b_0 calls first, it wins, but then p calls its election and we've returned to the same situation
    #   - If b_1 calls first, it can win and then bring _both_ nodes up-to-date, becoming a _stable_ primary
    # So we repeat elections until b_1 is primary

    new_primary = network.wait_for_primary_unanimity(
        min_view=initial_txid.view, timeout_multiplier=30)
    assert new_primary == b_1

    new_view = network.txs.issue(network).view

    # The partition is now between 2 backups, but both can talk to the new primary
    # Explicitly drop rules before continuing
    rules.drop()

    # Original primary should now, or very soon, report the new primary
    new_primary_, new_view_ = network.wait_for_new_primary(p, nodes=[p])
    assert (
        new_primary == new_primary_
    ), f"New primary {new_primary_.local_node_id} after partition is dropped is different than before {new_primary.local_node_id}"
    assert (
        new_view == new_view_
    ), f"Consensus view {new_view} should not have changed after partition is dropped: now {new_view_}"

    return network
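wait_for_primary_unanimity does most of the work in this example. A hedged sketch of the kind of check it presumably performs: poll every node's /node/consensus (the endpoint used elsewhere in this section) until all report the same primary. The field names, return shape and the omission of the min_view argument are assumptions, not the actual infra.network code.

import time


def wait_for_primary_agreement(nodes, timeout=30.0):
    """Poll each node's /node/consensus until all report the same,
    non-null primary_id, and return that id."""
    end = time.time() + timeout
    primary_ids = set()
    while time.time() < end:
        primary_ids = set()
        for node in nodes:
            with node.client() as c:
                details = c.get("/node/consensus").body.json()["details"]
                primary_ids.add(details["primary_id"])
        if len(primary_ids) == 1 and None not in primary_ids:
            return primary_ids.pop()
        time.sleep(0.5)
    raise TimeoutError(f"Nodes still disagree on primary: {primary_ids}")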
Example 22
def test_new_service(
    network,
    args,
    install_path,
    binary_dir,
    library_dir,
    version,
    cycle_existing_nodes=False,
):
    LOG.info("Update constitution")
    primary, _ = network.find_primary()
    new_constitution = get_new_constitution_for_install(args, install_path)
    network.consortium.set_constitution(primary, new_constitution)

    # Note: Changes to constitution between versions should be tested here

    LOG.info(f"Add node to new service [cycle nodes: {cycle_existing_nodes}]")
    nodes_to_cycle = network.get_joined_nodes() if cycle_existing_nodes else []
    nodes_to_add_count = len(nodes_to_cycle) if cycle_existing_nodes else 1

    for _ in range(0, nodes_to_add_count):
        new_node = network.create_node(
            "local://localhost",
            binary_dir=binary_dir,
            library_dir=library_dir,
            version=version,
        )
        network.join_node(new_node, args.package, args)
        network.trust_node(new_node, args)
        new_node.verify_certificate_validity_period(
            expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS
        )

    for node in nodes_to_cycle:
        network.retire_node(primary, node)
        if primary == node:
            primary, _ = network.wait_for_new_primary(primary)
        node.stop()

    test_all_nodes_cert_renewal(network, args)

    LOG.info("Apply transactions to new nodes only")
    issue_activity_on_live_service(network, args)
    test_random_receipts(network, args, lts=True)
Example 23
def test_isolate_and_reconnect_primary(network, args):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        new_primary, _ = network.wait_for_new_primary(
            primary, nodes=backups, timeout_multiplier=6
        )
        new_tx = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        r = c.get("/node/commit")
        timeout = 5
        end_time = time.time() + timeout
        while time.time() < end_time:
            current_tx = TxID.from_str(
                c.get("/node/commit").body.json()["transaction_id"]
            )
            if current_tx.seqno >= new_tx.seqno:
                return network
            time.sleep(0.1)
        assert False, f"Stuck at {r}"
Example 24
def test_retire_primary(network, args):
    pre_count = count_nodes(node_configs(network), network)

    primary, backup = network.find_primary_and_any_backup()
    network.retire_node(primary, primary, timeout=15)
    # Query this backup to find the new primary. If we ask any other
    # node, then this backup may not know the new primary by the
    # time we call check_can_progress.
    new_primary, _ = network.wait_for_new_primary(primary, nodes=[backup])
    # The old primary should automatically be removed from the store
    # once a new primary is elected
    network.wait_for_node_in_store(
        new_primary,
        primary.node_id,
        node_status=None,
        timeout=3,
    )
    check_can_progress(backup)
    post_count = count_nodes(node_configs(network), network)
    assert pre_count == post_count + 1
    primary.stop()
    wait_for_reconfiguration_to_complete(network)
    return network
Example 25
def run(args):
    # This is deliberately 5, because the rest of the test depends on this
    # to grow a prefix and allow just enough nodes to resume to reach the
    # desired election result. Conversion to a general f isn't trivial.
    hosts = ["local://localhost"] * 5

    with infra.network.network(hosts,
                               args.binary_dir,
                               args.debug_nodes,
                               args.perf_nodes,
                               pdb=args.pdb) as network:
        network.start_and_open(args)
        primary, backups = network.find_nodes()

        # Take three of the backups out of action (suspend two, stop one) to prevent commit
        backups[1].suspend()
        backups[2].suspend()
        backups[3].stop()

        committable_txs = []
        # Run some transactions that can't be committed now
        with primary.client("user0") as uc:
            for i in range(3):
                committable_txs.append(
                    uc.post("/app/log/private", {
                        "id": 100 + i,
                        "msg": "Hello world"
                    }))

        last_tx = committable_txs[-1]
        sig_view, sig_seqno = last_tx.view, last_tx.seqno + 1
        with backups[0].client() as bc:
            wait_for_pending(bc, sig_view, sig_seqno)

        # Suspend the final backup and run some transactions which only the partitioned
        # primary hears, which should be discarded by the new primary
        # NB: We can't guarantee that these will be discarded. Since we can't control
        # what order the queued actions occur in after resuming, they may be appended
        # before an election is called. The key assertion is that this primary is able
        # to rejoin the network whatever happens, even when (in the usual case) they
        # hold a suffix which has been discarded.
        backups[0].suspend()
        post_partition_txs = []
        with primary.client("user0") as uc:
            for i in range(3):
                post_partition_txs.append(
                    uc.post("/app/log/private", {
                        "id": 100 + i,
                        "msg": "Hello world"
                    }))

        # Sleep long enough that this primary should be instantly replaced when nodes wake
        sleep_time = 2 * args.election_timeout_ms / 1000
        LOG.info(f"Sleeping {sleep_time}s")
        time.sleep(sleep_time)

        # Suspend the primary, resume other backups
        primary.suspend()
        backups[0].resume()
        backups[1].resume()
        backups[2].resume()
        new_primary, _ = network.wait_for_new_primary(primary,
                                                      timeout_multiplier=10)

        with new_primary.client("user0") as uc:
            # Check that uncommitted but committable suffix is preserved
            check_commit = infra.checker.Checker(uc)
            for tx in committable_txs:
                check_commit(tx)

        # Check that new transactions can be committed
        with new_primary.client("user0") as uc:
            for i in range(3):
                r = uc.post("/app/log/private", {
                    "id": 100 + i,
                    "msg": "Hello world"
                })
                assert r.status_code == 200
                uc.wait_for_commit(r)

        # Resume original primary, check that they rejoin correctly, including new transactions
        primary.resume()
        network.wait_for_node_commit_sync(timeout=16)
Example 26
def test_isolate_primary_from_one_backup(network, args):
    p, backups = network.find_nodes()
    b_0, b_1 = backups

    # Issue one transaction, waiting for all nodes to have reached
    # the same level of commit, so that nodes outside of partition can
    # become primary after this one is dropped
    # Note: Because of https://github.com/microsoft/CCF/issues/2224, we need to
    # issue a write transaction instead of just reading the TxID of the latest entry
    initial_txid = network.txs.issue(network)

    # Isolate first backup from primary so that first backup becomes candidate
    # in a new term and wins the election
    # Note: Managed manually
    rules = network.partitioner.isolate_node(p, b_0)

    LOG.info(
        f"Check that primary {p.local_node_id} reports increasing last ack time for partitioned backup {b_0.local_node_id}"
    )
    last_ack = 0
    while True:
        with p.client() as c:
            r = c.get("/node/consensus", log_capture=[]).body.json()["details"]
            ack = r["acks"][b_0.node_id]["last_received_ms"]
        if r["primary_id"] is not None:
            assert (
                ack >= last_ack
            ), f"Nodes {p.local_node_id} and {b_0.local_node_id} are no longer partitioned"
            last_ack = ack
        else:
            LOG.debug(f"Node {p.local_node_id} is no longer primary")
            break
        time.sleep(0.1)

    # Now wait for several elections to occur. We expect:
    # - b_0 to call and win an election with b_1's help
    # - b_0 to produce a new signature, and commit it with b_1's help
    # - p to call its own election, and lose because it doesn't have this signature
    # - In the resulting election race:
    #   - If p calls first, it loses and we're in the same situation
    #   - If b_0 calls first, it wins, but then p calls its election and we've returned to the same situation
    #   - If b_1 calls first, it can win and then bring _both_ nodes up-to-date, becoming a _stable_ primary
    # So we repeat elections until b_1 is primary

    new_primary = network.wait_for_primary_unanimity(
        min_view=initial_txid.view, timeout_multiplier=30)
    assert new_primary == b_1

    new_view = network.txs.issue(network).view

    # The partition is now between 2 backups, but both can talk to the new primary
    # Explicitly drop rules before continuing
    rules.drop()

    LOG.info(
        f"Check that new primary {new_primary.local_node_id} reports stable acks"
    )
    last_ack = 0
    end_time = time.time() + 2 * network.args.election_timeout_ms // 1000
    while time.time() < end_time:
        with new_primary.client() as c:
            acks = c.get("/node/consensus",
                         log_capture=[]).body.json()["details"]["acks"]
            delayed_acks = [
                ack for ack in acks.values()
                if ack["last_received_ms"] > args.election_timeout_ms
            ]
            if delayed_acks:
                raise RuntimeError(
                    f"New primary reported some delayed acks: {acks}")
        time.sleep(0.1)

    # Original primary should now, or very soon, report the new primary
    new_primary_, new_view_ = network.wait_for_new_primary(p, nodes=[p])
    assert (
        new_primary == new_primary_
    ), f"New primary {new_primary_.local_node_id} after partition is dropped is different than before {new_primary.local_node_id}"
    assert (
        new_view == new_view_
    ), f"Consensus view {new_view} should not have changed after partition is dropped: now {new_view_}"

    return network
Example 27
def run(args):
    hosts = ["localhost", "localhost"]

    with infra.network.network(hosts,
                               args.binary_dir,
                               args.debug_nodes,
                               args.perf_nodes,
                               pdb=args.pdb) as network:
        network.start_and_join(args)
        primary, _ = network.find_nodes()

        first_code_id = get_code_id(
            infra.path.build_lib_path(args.package, args.enclave_type))

        with primary.client() as uc:
            r = uc.get("/node/code")
            assert r.body.json() == {
                "versions": [{
                    "digest": first_code_id,
                    "status": "ACCEPTED"
                }],
            }, r.body

        LOG.info("Adding a new node")
        new_node = network.create_and_trust_node(args.package, "localhost",
                                                 args)
        assert new_node

        new_code_id = get_code_id(
            infra.path.build_lib_path(args.patched_file_name,
                                      args.enclave_type))

        LOG.info(f"Adding a node with unsupported code id {new_code_id}")
        code_not_found_exception = None
        try:
            network.create_and_add_pending_node(args.patched_file_name,
                                                "localhost",
                                                args,
                                                timeout=3)
        except infra.network.CodeIdNotFound as err:
            code_not_found_exception = err

        assert (
            code_not_found_exception is not None
        ), f"Adding a node with unsupported code id {new_code_id} should fail"

        # Slow quote verification means that any attempt to add a node may cause an election, so confirm primary after adding node
        primary, _ = network.find_primary()

        network.consortium.add_new_code(primary, new_code_id)

        with primary.client() as uc:
            r = uc.get("/node/code")
            versions = sorted(r.body.json()["versions"],
                              key=lambda x: x["digest"])
            expected = sorted(
                [
                    {
                        "digest": first_code_id,
                        "status": "ACCEPTED"
                    },
                    {
                        "digest": new_code_id,
                        "status": "ACCEPTED"
                    },
                ],
                key=lambda x: x["digest"],
            )
            assert versions == expected, versions

        new_nodes = set()
        old_nodes_count = len(network.nodes)
        new_nodes_count = old_nodes_count + 1

        LOG.info(
            f"Adding more new nodes ({new_nodes_count}) than originally existed ({old_nodes_count})"
        )
        for _ in range(0, new_nodes_count):
            new_node = network.create_and_trust_node(args.patched_file_name,
                                                     "localhost", args)
            assert new_node
            new_nodes.add(new_node)

        LOG.info("Stopping all original nodes")
        old_nodes = set(network.nodes).difference(new_nodes)
        for node in old_nodes:
            LOG.debug(f"Stopping old node {node.node_id}")
            node.stop()

        new_primary, _ = network.wait_for_new_primary(primary.node_id)
        LOG.info(f"New primary is {new_primary.node_id}")

        LOG.info("Adding another node to the network")
        new_node = network.create_and_trust_node(args.patched_file_name,
                                                 "localhost", args)
        assert new_node
        network.wait_for_node_commit_sync(args.consensus)

        LOG.info("Remove first code id")
        network.consortium.retire_code(new_node, first_code_id)

        with new_node.client() as uc:
            r = uc.get("/node/code")
            versions = sorted(r.body.json()["versions"],
                              key=lambda x: x["digest"])
            expected = sorted(
                [
                    {
                        "digest": first_code_id,
                        "status": "RETIRED"
                    },
                    {
                        "digest": new_code_id,
                        "status": "ACCEPTED"
                    },
                ],
                key=lambda x: x["digest"],
            )
            assert versions == expected, versions

        LOG.info(f"Adding a node with retired code id {first_code_id}")
        code_not_found_exception = None
        try:
            network.create_and_add_pending_node(args.package,
                                                "localhost",
                                                args,
                                                timeout=3)
        except infra.network.CodeIdRetired as err:
            code_not_found_exception = err

        assert (
            code_not_found_exception is not None
        ), f"Adding a node with retired code id {first_code_id} should fail"

        LOG.info("Adding another node with the new code to the network")
        new_node = network.create_and_trust_node(args.patched_file_name,
                                                 "localhost", args)
        assert new_node
        network.wait_for_node_commit_sync(args.consensus)
Example 28
def test_new_service(
    network,
    args,
    install_path,
    binary_dir,
    library_dir,
    version,
    cycle_existing_nodes=False,
):
    LOG.info("Update constitution")
    primary, _ = network.find_primary()
    new_constitution = get_new_constitution_for_install(args, install_path)
    network.consortium.set_constitution(primary, new_constitution)

    all_nodes = network.get_joined_nodes()

    # Note: Changes to constitution between versions should be tested here

    LOG.info(f"Add node to new service [cycle nodes: {cycle_existing_nodes}]")
    nodes_to_cycle = network.get_joined_nodes() if cycle_existing_nodes else []
    nodes_to_add_count = len(nodes_to_cycle) if cycle_existing_nodes else 1

    # Pre-2.0 nodes require X509 time format
    valid_from = str(infra.crypto.datetime_to_X509time(
        datetime.datetime.now()))

    for _ in range(0, nodes_to_add_count):
        new_node = network.create_node(
            "local://localhost",
            binary_dir=binary_dir,
            library_dir=library_dir,
            version=version,
        )
        network.join_node(new_node, args.package, args)
        network.trust_node(
            new_node,
            args,
            valid_from=valid_from,
        )
        new_node.verify_certificate_validity_period(
            expected_validity_period_days=DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS
        )
        all_nodes.append(new_node)

    for node in nodes_to_cycle:
        network.retire_node(primary, node)
        if primary == node:
            primary, _ = network.wait_for_new_primary(primary)
        node.stop()

    test_all_nodes_cert_renewal(network, args, valid_from=valid_from)
    test_service_cert_renewal(network, args, valid_from=valid_from)

    LOG.info("Waiting for retired nodes to be automatically removed")
    for node in all_nodes:
        network.wait_for_node_in_store(
            primary,
            node.node_id,
            node_status=ccf.ledger.NodeStatus.TRUSTED
            if node.is_joined() else None,
        )

    if args.check_2tx_reconfig_migration:
        test_migration_2tx_reconfiguration(
            network,
            args,
            initial_is_1tx=False,  # Reconfiguration type added in 2.x
            binary_dir=binary_dir,
            library_dir=library_dir,
            version=version,
            valid_from=valid_from,
        )

    LOG.info("Apply transactions to new nodes only")
    issue_activity_on_live_service(network, args)
    test_random_receipts(network, args, lts=True)
Example 29
def run_code_upgrade_from(
    args,
    from_install_path,
    to_install_path,
    from_version=None,
    to_version=None,
    from_container_image=None,
):
    from_binary_dir, from_library_dir = get_bin_and_lib_dirs_for_install_path(
        from_install_path)
    to_binary_dir, to_library_dir = get_bin_and_lib_dirs_for_install_path(
        to_install_path)

    set_js_args(args, from_install_path, to_install_path)

    jwt_issuer = infra.jwt_issuer.JwtIssuer(
        "https://localhost", refresh_interval=args.jwt_key_refresh_interval_s)
    with jwt_issuer.start_openid_server():
        txs = app.LoggingTxs(jwt_issuer=jwt_issuer)
        with infra.network.network(
                args.nodes,
                binary_directory=from_binary_dir,
                library_directory=from_library_dir,
                pdb=args.pdb,
                txs=txs,
                jwt_issuer=jwt_issuer,
                version=from_version,
        ) as network:
            network.start_and_open(args,
                                   node_container_image=from_container_image)

            old_nodes = network.get_joined_nodes()
            primary, _ = network.find_primary()

            LOG.info("Apply transactions to old service")
            issue_activity_on_live_service(network, args)

            new_code_id = infra.utils.get_code_id(
                args.enclave_type,
                args.oe_binary,
                args.package,
                library_dir=to_library_dir,
            )
            network.consortium.add_new_code(primary, new_code_id)

            # Note: alternate between joining from snapshot and replaying entire ledger
            new_nodes = []
            from_snapshot = True
            for _ in range(0, len(old_nodes)):
                new_node = network.create_node(
                    "local://localhost",
                    binary_dir=to_binary_dir,
                    library_dir=to_library_dir,
                    version=to_version,
                )
                network.join_node(new_node,
                                  args.package,
                                  args,
                                  from_snapshot=from_snapshot)
                network.trust_node(
                    new_node,
                    args,
                    valid_from=str(  # Pre-2.0 nodes require X509 time format
                        infra.crypto.datetime_to_X509time(
                            datetime.datetime.now())),
                )
                # For 2.x nodes joining a 1.x service before the constitution is updated,
                # the node certificate validity period is set by the joining node itself
                # as [node startup time, node startup time + 365 days]
                new_node.verify_certificate_validity_period(
                    expected_validity_period_days=
                    DEFAULT_NODE_CERTIFICATE_VALIDITY_DAYS,
                    ignore_proposal_valid_from=True,
                )
                from_snapshot = not from_snapshot
                new_nodes.append(new_node)

            # Verify that all nodes run the expected CCF version
            for node in network.get_joined_nodes():
                # Note: /node/version endpoint was added in 2.x
                if not node.major_version or node.major_version > 1:
                    with node.client() as c:
                        r = c.get("/node/version")
                        expected_version = node.version or args.ccf_version
                        version = r.body.json()["ccf_version"]
                        assert (
                            version == expected_version
                        ), f"For node {node.local_node_id}, expect version {expected_version}, got {version}"

            LOG.info(
                "Apply transactions to hybrid network, with primary as old node"
            )
            issue_activity_on_live_service(network, args)

            old_code_id = infra.utils.get_code_id(
                args.enclave_type,
                args.oe_binary,
                args.package,
                library_dir=from_library_dir,
            )
            primary, _ = network.find_primary()
            network.consortium.retire_code(primary, old_code_id)

            for index, node in enumerate(old_nodes):
                network.retire_node(primary, node)
                if primary == node:
                    primary, _ = network.wait_for_new_primary(primary)
                    # This block is here to test the transition period from a network that
                    # does not support custom claims to one that does. It can be removed after
                    # the transition is complete.
                    #
                    # The new build, being unreleased, doesn't have a version at all
                    if not primary.major_version:
                        LOG.info("Upgrade to new JS app")
                        # Upgrade to a version of the app containing an endpoint that
                        # registers custom claims
                        network.consortium.set_js_app_from_dir(
                            primary, args.new_js_app_bundle)
                        LOG.info("Run transaction with additional claim")
                        # With wait_for_sync, the client checks that all nodes, including
                        # the minority of old ones, have acked the transaction
                        msg_idx = network.txs.idx + 1
                        txid = network.txs.issue(network,
                                                 number_txs=1,
                                                 record_claim=True,
                                                 wait_for_sync=True)
                        assert len(network.txs.pub[msg_idx]) == 1
                        claims = network.txs.pub[msg_idx][-1]["msg"]

                        LOG.info(
                            "Check receipts are fine, including transaction with claims"
                        )
                        test_random_receipts(
                            network,
                            args,
                            lts=True,
                            additional_seqnos={txid.seqno: claims.encode()},
                        )
                        # Also check receipts on an old node
                        if index + 1 < len(old_nodes):
                            next_node = old_nodes[index + 1]
                            test_random_receipts(
                                network,
                                args,
                                lts=True,
                                additional_seqnos={txid.seqno: None},
                                node=next_node,
                            )
                node.stop()

            LOG.info("Service is now made of new nodes only")

            # Rollover JWKS so that new primary must read historical CA bundle table
            # and retrieve new keys via auto refresh
            if not os.getenv("CONTAINER_NODES"):
                jwt_issuer.refresh_keys()
                # Note: /gov/jwt_keys/all endpoint was added in 2.x
                primary, _ = network.find_nodes()
                if not primary.major_version or primary.major_version > 1:
                    jwt_issuer.wait_for_refresh(network)
                else:
                    time.sleep(3)
            else:
                # https://github.com/microsoft/CCF/issues/2608#issuecomment-924785744
                LOG.warning(
                    "Skipping JWT refresh as running nodes in container")

            # Code update from 1.x to 2.x requires cycling the freshly-added 2.x nodes
            # once. This is because 2.x nodes will not have an endorsed certificate
            # recorded in the store and thus will not be able to have their certificate
            # refreshed, etc.
            test_new_service(
                network,
                args,
                to_install_path,
                to_binary_dir,
                to_library_dir,
                to_version,
                cycle_existing_nodes=True,
            )

            # Check that the ledger can be parsed
            network.get_latest_ledger_public_state()
Example 30
def test_update_all_nodes(network, args):
    replacement_package = get_replacement_package(args)

    primary, _ = network.find_nodes()

    first_code_id = infra.utils.get_code_id(args.enclave_type, args.oe_binary,
                                            args.package)
    new_code_id = infra.utils.get_code_id(args.enclave_type, args.oe_binary,
                                          replacement_package)

    if args.enclave_type == "virtual":
        # Pretend this was already present
        network.consortium.add_new_code(primary, first_code_id)

    LOG.info("Add new code id")
    network.consortium.add_new_code(primary, new_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {
                    "digest": first_code_id,
                    "status": "AllowedToJoin"
                },
                {
                    "digest": new_code_id,
                    "status": "AllowedToJoin"
                },
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    LOG.info("Remove old code id")
    network.consortium.retire_code(primary, first_code_id)
    with primary.client() as uc:
        r = uc.get("/node/code")
        versions = sorted(r.body.json()["versions"], key=lambda x: x["digest"])
        expected = sorted(
            [
                {
                    "digest": new_code_id,
                    "status": "AllowedToJoin"
                },
            ],
            key=lambda x: x["digest"],
        )
        assert versions == expected, versions

    old_nodes = network.nodes.copy()

    LOG.info("Start fresh nodes running new code")
    for _ in range(0, len(old_nodes)):
        new_node = network.create_node("local://localhost")
        network.join_node(new_node, replacement_package, args)
        network.trust_node(new_node, args)

    LOG.info("Retire original nodes running old code")
    for node in old_nodes:
        primary, _ = network.find_nodes()
        network.retire_node(primary, node)
        # Elections take (much) longer than a backup removal which is just
        # a commit, so we need to adjust our timeout accordingly, hence this branch
        if node.node_id == primary.node_id:
            new_primary, _ = network.wait_for_new_primary(primary)
            primary = new_primary
        node.stop()

    LOG.info("Check the network is still functional")
    check_can_progress(new_node)
    return network
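Several of the longer examples above (12, 22, 28, 29 and 30) repeat the same retire-and-stop loop: retire each old node, and if the retired node was the primary, wait for its replacement before stopping it. A hedged sketch of that loop factored into a helper, using only calls that appear in the examples (some variants use consortium.retire_node or compare node ids instead):

def retire_nodes(network, nodes):
    """Retire and stop each node in turn, waiting for a new primary
    whenever the retired node was the current one."""
    for node in nodes:
        primary, _ = network.find_primary()
        network.retire_node(primary, node)
        if node == primary:
            # Elections take longer than a plain backup removal, so this
            # wait dominates the loop's runtime when cycling the primary.
            primary, _ = network.wait_for_new_primary(primary)
        node.stop()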