Example 1
def wait_for_seqno_to_commit(seqno, view, nodes):
    """
    Wait for a specific seqno at a specific view to be committed on all nodes.
    """
    for _ in range(infra.ccf.Network.replication_delay * 10):
        up_to_date_nodes = []
        for node in nodes:
            with node.node_client() as c:
                r = c.get("tx", {"view": view, "seqno": seqno})
                assert (r.status == http.HTTPStatus.OK
                        ), f"tx request returned HTTP status {r.status}"
                status = TxStatus(r.result["status"])
                if status == TxStatus.Committed:
                    up_to_date_nodes.append(node.node_id)
                elif status == TxStatus.Invalid:
                    raise RuntimeError(
                        f"Node {node.node_id} reports transaction ID {view}.{seqno} is invalid and will never be committed"
                    )
        if len(up_to_date_nodes) == len(nodes):
            break
        time.sleep(0.1)
    assert len(up_to_date_nodes) == len(
        nodes
    ), f"Only {len(up_to_date_nodes)} out of {len(nodes)} nodes are up to date"
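These snippets are excerpts from a larger test module: they assume module-level imports (time, http, infra and friends) and a TxStatus enum mirroring the status strings returned by the service's tx endpoint. A minimal sketch of that enum is below; the exact string values are an assumption and must match whatever the endpoint actually reports.

from enum import Enum


class TxStatus(Enum):
    # Assumed values: these strings must match what the tx endpoint returns.
    Unknown = "UNKNOWN"
    Pending = "PENDING"
    Committed = "COMMITTED"
    Invalid = "INVALID"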
Example 2
def wait_for_global_commit(node_client, seqno, view, mksign=False, timeout=3):
    """
    Given a client to a CCF network and a seqno/view pair, this function
    waits for this specific commit index to be globally committed by the
    network in this view.
    A TimeoutError exception is raised if the commit index is not globally
    committed within the given timeout.
    """
    # Waiting for a global commit can significantly slow down tests as
    # signatures take some time to be emitted and globally committed.
    # Forcing a signature accelerates this process for common operations
    # (e.g. governance proposals)
    if mksign:
        r = node_client.rpc("mkSign")
        if r.error is not None:
            raise RuntimeError(f"mkSign returned an error: {r.error}")

    end_time = time.time() + timeout
    while time.time() < end_time:
        r = node_client.get("tx", {"view": view, "seqno": seqno})
        assert (r.status == http.HTTPStatus.OK
                ), f"tx request returned HTTP status {r.status}"
        status = TxStatus(r.result["status"])
        if status == TxStatus.Committed:
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        else:
            time.sleep(0.1)
    raise TimeoutError("Timed out waiting for commit")
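A minimal usage sketch for the function above, assuming a write has already been submitted and that its response exposes view/seqno fields; the client type, endpoint and payload are borrowed from Example 6 and are not guaranteed to apply here.

# Hypothetical usage: submit a write, then block until it is globally committed.
with primary.user_client() as c:
    r = c.rpc("log/private", {"id": 42, "msg": "hello"})
    wait_for_global_commit(c, r.seqno, r.view)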
Example 3
def test_isolate_and_reconnect_primary(network, args, **kwargs):
    primary, backups = network.find_nodes()
    with network.partitioner.partition(backups):
        lost_tx_resp = check_does_not_progress(primary)

        new_primary, _ = network.wait_for_new_primary(primary,
                                                      nodes=backups,
                                                      timeout_multiplier=6)
        new_tx_resp = check_can_progress(new_primary)

    # Check reconnected former primary has caught up
    with primary.client() as c:
        try:
            # There will be at least one full election cycle for nothing, where the
            # re-joining node fails to get elected but causes others to rev up their
            # term. After that, a successful election needs to take place, and we
            # arbitrarily allow 3 more election periods (4 in total) to avoid being
            # too brittle when raft timeouts line up badly.
            c.wait_for_commit(new_tx_resp,
                              timeout=(network.election_duration * 4))
        except TimeoutError:
            details = c.get("/node/consensus").body.json()
            assert (
                False
            ), f"Stuck before {new_tx_resp.view}.{new_tx_resp.seqno}: {pprint.pformat(details)}"

        # Check it has dropped anything submitted while partitioned
        r = c.get(
            f"/node/tx?transaction_id={lost_tx_resp.view}.{lost_tx_resp.seqno}"
        )
        status = TxStatus(r.body.json()["status"])
        assert status == TxStatus.Invalid, r
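This test relies on two helpers, check_does_not_progress and check_can_progress, whose definitions are not shown. A rough sketch of what they might look like, reusing the client API seen in this example and wait_for_pending from Example 5; the identity name, endpoint path and request bodies are assumptions.

def check_can_progress(node, timeout=3):
    # Hypothetical sketch: submit a write and expect it to commit within the timeout.
    with node.client("user0") as c:
        r = c.post("/app/log/private", {"id": 42, "msg": "Can progress"})
        c.wait_for_commit(r, timeout=timeout)
        return r


def check_does_not_progress(node, timeout=3):
    # Hypothetical sketch: submit a write, check that it does not commit within the
    # timeout, and return the response so the caller can later verify the transaction
    # was rolled back (marked Invalid) once the partition heals.
    with node.client("user0") as c:
        r = c.post("/app/log/private", {"id": 42, "msg": "Cannot progress"})
        wait_for_pending(c, r.view, r.seqno, timeout=timeout)
        time.sleep(timeout)
        tx = c.get(f"/node/tx?transaction_id={r.view}.{r.seqno}")
        assert TxStatus(tx.body.json()["status"]) != TxStatus.Committed, tx
        return r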
Example 4
    def wait_for_all_nodes_to_catch_up(self, primary, timeout=3):
        """
        Wait for all nodes to have joined the network and globally replicated
        all transactions globally executed on the primary (including transactions
        which added the nodes).
        """
        end_time = time.time() + timeout
        while time.time() < end_time:
            with primary.node_client() as c:
                resp = c.get("getCommit")
                commit_leader = resp.result["commit"]
                term_leader = resp.result["term"]
                if commit_leader != 0:
                    break
            time.sleep(0.1)
        assert (
            commit_leader != 0
        ), f"Primary {primary.node_id} has not made any progress yet (term: {term_leader}, commit: {commit_leader})"

        while time.time() < end_time:
            caught_up_nodes = []
            for node in self.get_joined_nodes():
                with node.node_client() as c:
                    resp = c.get(
                        "tx",
                        {
                            "view": term_leader,
                            "seqno": commit_leader
                        },
                    )
                    if resp.error is not None:
                        # Node may not have joined the network yet, try again
                        break
                    status = TxStatus(resp.result["status"])
                    if status == TxStatus.Committed:
                        caught_up_nodes.append(node)
                    elif status == TxStatus.Invalid:
                        raise RuntimeError(
                            f"Node {node.node_id} reports transaction ID {term_leader}.{commit_leader} is invalid and will never be committed"
                        )
                    else:
                        pass

            if len(caught_up_nodes) == len(self.get_joined_nodes()):
                break
            time.sleep(0.1)
        assert len(caught_up_nodes) == len(
            self.get_joined_nodes()
        ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have caught up with the primary"
Example 5
def wait_for_pending(client, view, seqno, timeout=3):
    end_time = time.time() + timeout
    while time.time() < end_time:
        r = client.get(f"/node/tx?transaction_id={view}.{seqno}")
        assert (r.status_code == http.HTTPStatus.OK
                ), f"tx request returned HTTP status {r.status_code}"
        status = TxStatus(r.body.json()["status"])
        if status == TxStatus.Pending:
            return
        elif status == TxStatus.Invalid:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is marked invalid and will never be committed"
            )
        elif status == TxStatus.Committed:
            raise RuntimeError(
                f"Transaction ID {view}.{seqno} is unexpectedly marked committed"
            )
        else:
            time.sleep(0.1)
    raise TimeoutError("Timed out waiting for transaction to be pending")
Example 6
def test_tx_statuses(network, args):
    primary, _ = network.find_primary()

    with primary.user_client() as c:
        check = infra.checker.Checker()
        r = c.rpc("log/private", {"id": 0, "msg": "Ignored"})
        check(r)
        # Until this tx is globally committed, poll for the status of this and some other
        # related transactions around it (and also any historical transactions we're tracking)
        target_view = r.view
        target_seqno = r.seqno
        SentTxs.update_status(target_view, target_seqno)
        SentTxs.update_status(target_view, target_seqno + 1)
        SentTxs.update_status(target_view - 1, target_seqno, TxStatus.Invalid)

        end_time = time.time() + 10
        while True:
            if time.time() > end_time:
                raise TimeoutError(
                    f"Took too long waiting for global commit of {target_view}.{target_seqno}"
                )

            done = False
            for view, seqno in SentTxs.get_all_tx_ids():
                r = c.get("tx", {"view": view, "seqno": seqno})
                check(r)
                status = TxStatus(r.result["status"])
                SentTxs.update_status(view, seqno, status)
                if (status == TxStatus.Committed and target_view == view
                        and target_seqno == seqno):
                    done = True

            if done:
                break
            time.sleep(0.1)

    return network
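Example 6 depends on a SentTxs helper that records the last known status of each transaction ID the test has touched; its definition is not shown here. A minimal sketch of the interface the test uses (update_status and get_all_tx_ids) follows; the real helper presumably also validates status transitions, which this sketch omits.

class SentTxs:
    # Hypothetical sketch: maps (view, seqno) to the last TxStatus observed,
    # or None if the ID has only been registered so far.
    txs = {}

    @staticmethod
    def update_status(view, seqno, status=None):
        # Register the ID if it is new; record the latest status when one is given.
        key = (view, seqno)
        if status is not None or key not in SentTxs.txs:
            SentTxs.txs[key] = status

    @staticmethod
    def get_all_tx_ids():
        return list(SentTxs.txs.keys())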
Example 7
def test_view_history(network, args):
    if args.consensus == "pbft":
        # This appears to work in PBFT, but it is unacceptably slow:
        # - Each /tx request is a write, with a non-trivial roundtrip response time
        # - Since each read (e.g. /tx and /commit) produces writes and a unique tx ID,
        #   there are too many IDs to test exhaustively
        # We could rectify this by making this test non-exhaustive (bisecting for view changes,
        # sampling within a view), but for now it is exhaustive and Raft-only
        LOG.warning("Skipping view reconstruction in PBFT")
        return network

    check = infra.checker.Checker()

    for node in network.get_joined_nodes():
        with node.user_client() as c:
            r = c.get("commit")
            check(r)

            commit_view = r.view
            commit_seqno = r.global_commit

            # Temporarily disable logging of RPCs for readability
            rpc_loggers = c.rpc_loggers
            c.rpc_loggers = ()
            LOG.warning("RPC logging temporarily suppressed")

            # Retrieve status for all possible Tx IDs
            seqno_to_views = {}
            for seqno in range(1, commit_seqno + 1):
                views = []
                for view in range(1, commit_view + 1):
                    r = c.get("tx", {"view": view, "seqno": seqno})
                    check(r)
                    status = TxStatus(r.result["status"])
                    if status == TxStatus.Committed:
                        views.append(view)
                seqno_to_views[seqno] = views

            c.rpc_loggers = rpc_loggers
            LOG.warning("RPC logging restored")

            # Check we have exactly one Tx ID for each seqno
            txs_ok = True
            for seqno, views in seqno_to_views.items():
                if len(views) != 1:
                    txs_ok = False
                    LOG.error(
                        f"Node {node.node_id}: Found {len(views)} committed Tx IDs for seqno {seqno}"
                    )

            tx_ids_condensed = ", ".join(
                " OR ".join(f"{view}.{seqno}" for view in views or ["UNKNOWN"])
                for seqno, views in seqno_to_views.items())

            if txs_ok:
                LOG.success(
                    f"Node {node.node_id}: Found a valid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
            else:
                LOG.error(
                    f"Node {node.node_id}: Invalid sequence of Tx IDs:\n{tx_ids_condensed}"
                )
                raise RuntimeError(
                    f"Node {node.node_id}: Incomplete or inconsistent view history"
                )

    return network