Ejemplo n.º 1
0
 def wait_for_new_primary(self, old_primary, nodes=None, timeout_multiplier=2):
     """Block until a primary different from old_primary has been elected.

     The timeout is a multiple of the election duration because the first
     round of election sometimes fails (short timeouts are particularly
     susceptible to this). Returns a (primary, term) tuple, or raises
     TimeoutError / PrimaryNotFound if no new primary appears in time.
     """
     timeout = self.election_duration * timeout_multiplier
     LOG.info(
         f"Waiting up to {timeout}s for a new primary different from {old_primary.local_node_id} ({old_primary.node_id}) to be elected..."
     )
     deadline = time.time() + timeout
     failure = TimeoutError
     captured = []
     while time.time() < deadline:
         captured = []
         try:
             primary, term = self.find_primary(nodes=nodes, log_capture=captured)
         except PrimaryNotFound:
             # Remember the more specific failure for the final raise
             failure = PrimaryNotFound
         except Exception:
             pass
         else:
             if primary.node_id != old_primary.node_id:
                 flush_info(captured, None)
                 LOG.info(
                     f"New primary is {primary.local_node_id} ({primary.node_id}) in term {term}"
                 )
                 return (primary, term)
         time.sleep(0.1)
     flush_info(captured, None)
     raise failure(f"A new primary was not elected after {timeout} seconds")
Ejemplo n.º 2
0
 def wait_for_new_primary_in(
     self,
     expected_node_ids,
     nodes=None,
     timeout_multiplier=DEFAULT_TIMEOUT_MULTIPLIER,
 ):
     """Block until one of expected_node_ids has been elected primary.

     The timeout is a multiple of the observed election duration because the
     first round of election sometimes fails (short timeouts are
     particularly susceptible to this). Returns a (primary, term) tuple, or
     raises TimeoutError / PrimaryNotFound on expiry.
     """
     timeout = self.observed_election_duration * timeout_multiplier
     LOG.info(
         f"Waiting up to {timeout}s for a new primary in {expected_node_ids} to be elected..."
     )
     started = time.time()
     deadline = started + timeout
     failure = TimeoutError
     captured = []
     while time.time() < deadline:
         captured = []
         try:
             primary, term = self.find_primary(nodes=nodes, log_capture=captured)
         except PrimaryNotFound:
             # Remember the more specific failure for the final raise
             failure = PrimaryNotFound
         except Exception:
             pass
         else:
             if primary.node_id in expected_node_ids:
                 flush_info(captured, None)
                 elapsed = time.time() - started
                 LOG.info(
                     f"New primary after {elapsed}s is {primary.local_node_id} ({primary.node_id}) in term {term}"
                 )
                 return (primary, term)
         time.sleep(0.1)
     flush_info(captured, None)
     raise failure(f"A new primary was not elected after {timeout} seconds")
Ejemplo n.º 3
0
    def wait_for_all_nodes_to_commit(self, primary=None, tx_id=None, timeout=10):
        """
        Wait for all nodes to have joined the network and committed all transactions
        executed on the primary.

        :param primary: node used to retrieve the latest readable TxID when
            tx_id is not given.
        :param tx_id: specific TxID that every joined node must report committed.
        :param timeout: overall deadline in seconds (shared by both phases).
        :raises ValueError: if neither primary nor tx_id is specified.
        :raises RuntimeError: if any node reports the transaction as invalid.
        :raises AssertionError: if not all nodes catch up before the timeout.
        """
        if not (primary or tx_id):
            raise ValueError("Either a valid TxID or primary node should be specified")

        end_time = time.time() + timeout

        # If no TxID is specified, retrieve latest readable one
        # (fix: compare to None with `is`, not `==`)
        if tx_id is None:
            while time.time() < end_time:
                with primary.client() as c:
                    resp = c.get(
                        "/node/network/nodes/self"
                    )  # Well-known read-only endpoint
                    tx_id = TxID(resp.view, resp.seqno)
                    if tx_id.valid():
                        break
                time.sleep(0.1)
            assert (
                tx_id.valid()
            ), f"Primary {primary.node_id} has not made any progress yet ({tx_id})"

        caught_up_nodes = []
        logs = {}
        while time.time() < end_time:
            caught_up_nodes = []
            for node in self.get_joined_nodes():
                with node.client() as c:
                    logs[node.node_id] = []
                    resp = c.get(
                        f"/node/local_tx?transaction_id={tx_id}",
                        log_capture=logs[node.node_id],
                    )
                    if resp.status_code != 200:
                        # Node may not have joined the network yet, try again
                        break
                    status = TxStatus(resp.body.json()["status"])
                    if status == TxStatus.Committed:
                        caught_up_nodes.append(node)
                    elif status == TxStatus.Invalid:
                        flush_info(logs[node.node_id], None, 0)
                        raise RuntimeError(
                            f"Node {node.node_id} reports transaction ID {tx_id} is invalid and will never be committed"
                        )
                    else:
                        # Still pending: keep polling until committed or timed out
                        pass

            if len(caught_up_nodes) == len(self.get_joined_nodes()):
                break
            time.sleep(0.1)

        for lines in logs.values():
            flush_info(lines, None, 0)
        assert len(caught_up_nodes) == len(
            self.get_joined_nodes()
        ), f"Only {len(caught_up_nodes)} (out of {len(self.get_joined_nodes())}) nodes have joined the network"
Ejemplo n.º 4
0
    def wait_for_new_primary(self,
                             old_primary,
                             nodes=None,
                             timeout_multiplier=DEFAULT_TIMEOUT_MULTIPLIER):
        """Block until a primary different from old_primary has been elected.

        A multiple of the observed election duration is used as the timeout
        because the first round of election sometimes fails (short timeouts
        are particularly susceptible to this). Returns a (primary, term)
        tuple, or raises TimeoutError / PrimaryNotFound on expiry.
        """
        timeout = self.observed_election_duration * timeout_multiplier
        LOG.info(
            f"Waiting up to {timeout}s for a new primary different from {old_primary.local_node_id} ({old_primary.node_id}) to be elected..."
        )
        started = time.time()
        deadline = started + timeout
        failure = TimeoutError
        captured = []

        # Under BFT, submit a write to a backup to encourage a view change
        backup = self.find_any_backup(old_primary)
        if backup.get_consensus() == "bft":
            try:
                with backup.client("user0") as c:
                    _ = c.post(
                        "/app/log/private",
                        {
                            "id": -1,
                            "msg": "This is submitted to force a view change"
                        },
                    )
                time.sleep(1)
            except CCFConnectionException:
                LOG.warning(
                    f"Could not successfully connect to node {backup.node_id}."
                )

        while time.time() < deadline:
            captured = []
            try:
                primary, term = self.find_primary(nodes=nodes, log_capture=captured)
            except PrimaryNotFound:
                # Remember the more specific failure for the final raise
                failure = PrimaryNotFound
            except Exception:
                pass
            else:
                if primary.node_id != old_primary.node_id:
                    flush_info(captured, None)
                    elapsed = time.time() - started
                    LOG.info(
                        f"New primary after {elapsed}s is {primary.local_node_id} ({primary.node_id}) in term {term}"
                    )
                    return (primary, term)
            time.sleep(0.1)
        flush_info(captured, None)
        raise failure(f"A new primary was not elected after {timeout} seconds")
Ejemplo n.º 5
0
    def find_primary(self, nodes=None, timeout=3, log_capture=None):
        """
        Find the identity of the primary in the network and return its identity
        and the current view.
        """
        primary_id = None
        view = None
        logs = []

        # Query only the given nodes, defaulting to every joined node
        candidates = nodes or self.get_joined_nodes()
        deadline = time.time() + timeout
        while time.time() < deadline:
            for node in candidates:
                with node.client() as c:
                    try:
                        logs = []
                        res = c.get("/node/network",
                                    timeout=1,
                                    log_capture=logs)
                        assert res.status_code == http.HTTPStatus.OK.value, res

                        payload = res.body.json()
                        view = payload["current_view"]
                        primary_id = payload["primary_id"]
                    except Exception:
                        LOG.warning(
                            f"Could not successfully connect to node {node.local_node_id}. Retrying..."
                        )
                        continue
                # Stop asking as soon as any node reports a primary
                if primary_id is not None:
                    break

            if primary_id is not None:
                break
            time.sleep(0.1)

        flush_info(logs, log_capture, 0)
        if primary_id is None:
            raise PrimaryNotFound

        return (self._get_node_by_service_id(primary_id), view)
Ejemplo n.º 6
0
    def wait_for_primary_unanimity(
            self,
            timeout_multiplier=DEFAULT_TIMEOUT_MULTIPLIER,
            min_view=None):
        """
        Wait until every joined node reports the same primary (optionally in a
        view strictly greater than min_view) and return that primary.

        :raises AssertionError: if unanimity is not reached before the timeout.
        """
        timeout = self.observed_election_duration * timeout_multiplier
        LOG.info(
            f"Waiting up to {timeout}s for all nodes to agree on the primary")
        start_time = time.time()
        end_time = start_time + timeout

        primaries = []
        # Initialised here so the failure path below cannot hit a NameError
        # when the loop body never runs (e.g. no joined nodes, or the
        # deadline has already expired).
        logs = []
        while time.time() < end_time:
            primaries = []
            for node in self.get_joined_nodes():
                logs = []
                try:
                    # Ask each node individually which primary it reports
                    primary, view = self.find_primary(nodes=[node],
                                                      log_capture=logs)
                    if min_view is None or view > min_view:
                        primaries.append(primary)
                except PrimaryNotFound:
                    pass
            # Stop checking once all primaries are the same
            if (len(self.get_joined_nodes()) == len(primaries)
                    and len(set(primaries)) <= 1):
                break
            time.sleep(0.1)
        all_good = (len(self.get_joined_nodes()) == len(primaries)
                    and len(set(primaries)) <= 1)
        if not all_good:
            # NOTE(review): only the logs captured for the last node polled
            # are flushed here — confirm whether that is intended
            flush_info(logs)
            for node in self.get_joined_nodes():
                with node.client() as c:
                    r = c.get("/node/consensus")
                    pprint.pprint(r.body.json())
        assert all_good, f"Multiple primaries: {primaries}"
        delay = time.time() - start_time
        LOG.info(
            f"Primary unanimity after {delay}s: {primaries[0].local_node_id} ({primaries[0].node_id})"
        )
        return primaries[0]
Ejemplo n.º 7
0
    def find_primary(self, timeout=3, log_capture=None):
        """
        Find the identity of the primary in the network and return its identity
        and the current view.

        :raises PrimaryNotFound: if no node reports a primary before the
            timeout, or if a view change is still in progress.
        """
        primary_id = None
        view = None
        # Default to False so the check after the loop cannot raise a
        # NameError when every node query fails before the response body
        # is read (previously the name was only bound inside the try).
        view_change_in_progress = False

        logs = []

        end_time = time.time() + timeout
        while time.time() < end_time:
            for node in self.get_joined_nodes():
                with node.client() as c:
                    try:
                        logs = []
                        res = c.get("/node/network", log_capture=logs)
                        assert res.status_code == 200, res
                        body = res.body.json()
                        primary_id = body["primary_id"]
                        view = body["current_view"]
                        view_change_in_progress = body[
                            "view_change_in_progress"]
                        # Stop asking once any node reports a primary
                        if primary_id is not None:
                            break

                    except CCFConnectionException:
                        LOG.warning(
                            f"Could not successfully connect to node {node.node_id}. Retrying..."
                        )
            if primary_id is not None:
                break
            time.sleep(0.1)

        if primary_id is None or view_change_in_progress:
            flush_info(logs, log_capture, 0)
            raise PrimaryNotFound

        flush_info(logs, log_capture, 0)
        return (self._get_node_by_id(primary_id), view)
Ejemplo n.º 8
0
    def stop_all_nodes(self,
                       skip_verification=False,
                       verbose_verification=False):
        """
        Stop every node in the network.

        Unless skip_verification is set, first verify that all committed txs
        are readable, and after shutdown verify that every committed ledger
        file on each stopped node also exists (with an identical checksum) on
        the node with the longest ledger.

        :raises NodeShutdownError: if a node reported a fatal error during
            shutdown and shutdown errors are not being ignored.
        """
        if not skip_verification:
            if self.txs is not None:
                LOG.info(
                    "Verifying that all committed txs can be read before shutdown"
                )
                log_capture = []
                self.txs.verify(self, log_capture=log_capture)
                if verbose_verification:
                    flush_info(log_capture, None)

        fatal_error_found = False

        for node in self.nodes:
            _, fatal_errors = node.stop()
            if fatal_errors:
                fatal_error_found = True

        LOG.info("All nodes stopped")

        if not skip_verification:
            longest_ledger_seqno = 0
            most_up_to_date_node = None
            committed_ledger_dirs = {}

            for node in self.nodes:
                # Find stopped node with longest ledger
                _, committed_ledger_dir = node.get_ledger(
                    include_read_only_dirs=True)
                ledger_end_seqno = 0
                for ledger_file in os.listdir(committed_ledger_dir):
                    end_seqno = infra.node.get_committed_ledger_end_seqno(
                        ledger_file)
                    if end_seqno > ledger_end_seqno:
                        ledger_end_seqno = end_seqno

                if ledger_end_seqno > longest_ledger_seqno:
                    longest_ledger_seqno = ledger_end_seqno
                    most_up_to_date_node = node
                # Keyed by local_node_id (see the exclusion filter below)
                committed_ledger_dirs[node.local_node_id] = [
                    committed_ledger_dir,
                    ledger_end_seqno,
                ]

            # Verify that all ledger files on stopped nodes exist on most up-to-date node
            # and are identical
            if most_up_to_date_node:
                longest_ledger_dir, _ = committed_ledger_dirs[
                    most_up_to_date_node.local_node_id]
                # Fix: the dict is keyed by local_node_id, so the most
                # up-to-date node must be excluded by local_node_id too.
                # Comparing against node_id never matched, so that node was
                # pointlessly compared against itself.
                for node_id, (committed_ledger_dir, _) in (
                        l for l in committed_ledger_dirs.items()
                        if l[0] != most_up_to_date_node.local_node_id):
                    for ledger_file in os.listdir(committed_ledger_dir):
                        if ledger_file not in os.listdir(longest_ledger_dir):
                            raise Exception(
                                f"Ledger file on node {node_id} does not exist on most up-to-date node {most_up_to_date_node.local_node_id}: {ledger_file}"
                            )
                        if infra.path.compute_file_checksum(
                                os.path.join(longest_ledger_dir, ledger_file)
                        ) != infra.path.compute_file_checksum(
                                os.path.join(committed_ledger_dir,
                                             ledger_file)):
                            raise Exception(
                                f"Ledger file checksums between node {node_id} and most up-to-date node {most_up_to_date_node.node_id} did not match: {ledger_file}"
                            )

                LOG.success(
                    f"Verified ledger files consistency on all {len(self.nodes)} stopped nodes"
                )

        if fatal_error_found:
            if self.ignoring_shutdown_errors:
                LOG.warning("Ignoring shutdown errors")
            else:
                raise NodeShutdownError(
                    "Fatal error found during node shutdown")