Example #1
0
    async def _explore(
        self,
        node_id: NodeID,
        max_distance: int,
    ) -> None:
        """
        Explore the neighborhood around the given `node_id` out to the
        specified `max_distance`
        """
        async with trio.open_nursery() as nursery:
            for distances in partition_all(2, range(max_distance, 0, -1)):
                try:
                    found_enrs = await self._network.find_nodes(
                        node_id, *distances)
                except trio.TooSlowError:
                    self.unresponsive.add(node_id)
                    return
                except MissingEndpointFields:
                    self.unreachable.add(node_id)
                    return
                except ValidationError:
                    self.invalid.add(node_id)
                    return
                else:
                    # once we encounter a pair of buckets that elicits an empty
                    # response we assume that all subsequent buckets will also
                    # be empty.
                    if not found_enrs:
                        self.logger.debug(
                            "explore-finish: node_id=%s  covered=%d-%d",
                            node_id.hex(),
                            max_distance,
                            distances[0],
                        )
                        break

                for enr in found_enrs:
                    try:
                        self._network.enr_db.set_enr(enr)
                    except OldSequenceNumber:
                        pass

                # check if we have found any new records.  If so, queue them and
                # wake up the new workers.  This is guarded by the `condition`
                # object to ensure we maintain a consistent view of the `seen`
                # nodes.
                async with self._condition:
                    new_enrs = tuple(enr for enr in reduce_enrs(found_enrs)
                                     if enr.node_id not in self.seen)

                    if new_enrs:
                        self.seen.update(enr.node_id for enr in new_enrs)
                        self._condition.notify_all()

                # use the `NetworkProtocol.bond` to perform a liveliness check
                for enr in new_enrs:
                    nursery.start_soon(self._bond_then_send, enr)
Example #2
0
    async def bond(
        self, node_id: NodeID, *, endpoint: Optional[Endpoint] = None
    ) -> bool:
        self.logger.debug2(
            "Bonding with %s", node_id.hex(),
        )

        try:
            pong = await self.ping(node_id, endpoint=endpoint)
        except trio.TooSlowError:
            self.logger.debug("Bonding with %s timed out during ping", node_id.hex())
            return False
        except KeyError:
            self.logger.debug(
                "Unable to lookup endpoint information for node: %s", node_id.hex()
            )
            return False

        try:
            enr = await self.lookup_enr(
                node_id, enr_seq=pong.enr_seq, endpoint=endpoint
            )
        except trio.TooSlowError:
            self.logger.debug(
                "Bonding with %s timed out during ENR retrieval", node_id.hex(),
            )
            return False

        self.routing_table.update(enr.node_id)

        self.logger.debug(
            "Bonded with %s successfully", node_id.hex(),
        )

        self._routing_table_ready.set()
        return True
Example #3
0
    async def bond(
        self, node_id: NodeID, *, endpoint: Optional[Endpoint] = None
    ) -> bool:
        self.logger.debug2(
            "Bonding with %s", node_id.hex(),
        )

        try:
            pong = await self.ping(node_id, endpoint=endpoint)
        except trio.TooSlowError:
            self.logger.debug("Bonding with %s timed out during ping", node_id.hex())
            return False
        except MissingEndpointFields:
            self.logger.debug(
                "Bonding with %s failed due to missing endpoint information",
                node_id.hex(),
            )
            return False

        try:
            enr = await self.lookup_enr(
                node_id, enr_seq=pong.enr_seq, endpoint=endpoint
            )
        except trio.TooSlowError:
            self.logger.debug(
                "Bonding with %s timed out during ENR retrieval", node_id.hex(),
            )
            return False
        except EmptyFindNodesResponse:
            self.logger.debug(
                "Bonding with %s failed due to them not returing their ENR record",
                node_id.hex(),
            )
            return False

        self.routing_table.update(enr.node_id)

        self.logger.debug(
            "Bonded with %s successfully", node_id.hex(),
        )

        self._routing_table_ready.set()
        return True
Example #4
0
async def common_recursive_find_nodes(
    network: NetworkProtocol,
    target: NodeID,
    *,
    concurrency: int = 3,
    unresponsive_cache: Dict[NodeID, float] = UNRESPONSIVE_CACHE,
) -> AsyncIterator[trio.abc.ReceiveChannel[ENRAPI]]:
    """
    An optimized version of the recursive lookup algorithm for a kademlia
    network.

    Continually lookup nodes in the target part of the network, keeping track
    of all of the nodes we have seen.

    Exit once we have queried all of the `k` closest nodes to the target.

    The concurrency structure here is optimized to minimize the effect of
    unresponsive nodes on the total time it takes to perform the recursive
    lookup.  Some requests will hang for up to 10 seconds.  The
    `adaptive_timeout` combined with the multiple concurrent workers helps
    mitigate the overall slowdown caused by a few unresponsive nodes since the
    other queries can be issues concurrently.
    """
    network.logger.debug2("Recursive find nodes: %s", target.hex())
    start_at = trio.current_time()

    # The set of NodeID values we have already queried.
    queried_node_ids: Set[NodeID] = set()

    # The set of NodeID that timed out
    #
    # The `local_node_id` is
    # included in this as a convenience mechanism so that we don't have to
    # continually fiter it out of the various filters
    unresponsive_node_ids: Set[NodeID] = {network.local_node_id}

    # We maintain a cache of nodes that were recently deemed unresponsive
    # within the last 10 minutes.
    unresponsive_node_ids.update(
        node_id
        for node_id, last_unresponsive_at in unresponsive_cache.items()
        if trio.current_time() - last_unresponsive_at < 300
    )

    # Accumulator of the node_ids we have seen
    received_node_ids: Set[NodeID] = set()

    # Tracker for node_ids that are actively being requested.
    in_flight: Set[NodeID] = set()

    condition = trio.Condition()

    def get_unqueried_node_ids() -> Tuple[NodeID, ...]:
        """
        Get the three nodes that are closest to the target such that the node
        is in the closest `k` nodes which haven't been deemed unresponsive.
        """
        # Construct an iterable of *all* the nodes we know about ordered by
        # closeness to the target.
        candidates = iter_closest_nodes(
            target, network.routing_table, received_node_ids
        )
        # Remove any unresponsive nodes from that iterable
        responsive_candidates = itertools.filterfalse(
            lambda node_id: node_id in unresponsive_node_ids, candidates
        )
        # Grab the closest K
        closest_k_candidates = take(
            network.routing_table.bucket_size, responsive_candidates,
        )
        # Filter out any from the closest K that we've already queried or that are in-flight
        closest_k_unqueried = itertools.filterfalse(
            lambda node_id: node_id in queried_node_ids or node_id in in_flight,
            closest_k_candidates,
        )

        return tuple(take(3, closest_k_unqueried))

    async def do_lookup(
        node_id: NodeID, send_channel: trio.abc.SendChannel[ENRAPI]
    ) -> None:
        """
        Perform an individual lookup on the target part of the network from the
        given `node_id`
        """
        if node_id == target:
            distance = 0
        else:
            distance = compute_log_distance(node_id, target)

        try:
            found_enrs = await network.find_nodes(node_id, distance)
        except (trio.TooSlowError, MissingEndpointFields, ValidationError):
            unresponsive_node_ids.add(node_id)
            unresponsive_cache[node_id] = trio.current_time()
            return
        except trio.Cancelled:
            # We don't add these to the unresponsive cache since they didn't
            # necessarily exceed the fulle 10s request/response timeout.
            unresponsive_node_ids.add(node_id)
            raise

        for enr in found_enrs:
            try:
                network.enr_db.set_enr(enr)
            except OldSequenceNumber:
                pass

        async with condition:
            new_enrs = tuple(
                enr for enr in found_enrs if enr.node_id not in received_node_ids
            )
            received_node_ids.update(enr.node_id for enr in new_enrs)

        for enr in new_enrs:
            try:
                await send_channel.send(enr)
            except (trio.BrokenResourceError, trio.ClosedResourceError):
                # In the event that the consumer of `recursive_find_nodes`
                # exits early before the lookup has completed we can end up
                # operating on a closed channel.
                return

    async def worker(
        worker_id: NodeID, send_channel: trio.abc.SendChannel[ENRAPI]
    ) -> None:
        """
        Pulls unqueried nodes from the closest k nodes and performs a
        concurrent lookup on them.
        """
        for round in itertools.count():
            async with condition:
                node_ids = get_unqueried_node_ids()

                if not node_ids:
                    await condition.wait()
                    continue

                # Mark the node_ids as having been queried.
                queried_node_ids.update(node_ids)
                # Mark the node_ids as being in-flight.
                in_flight.update(node_ids)

                # Some of the node ids may have come from our routing table.
                # These won't be present in the `received_node_ids` so we
                # detect this here and send them over the channel.
                try:
                    for node_id in node_ids:
                        if node_id not in received_node_ids:
                            enr = network.enr_db.get_enr(node_id)
                            received_node_ids.add(node_id)
                            await send_channel.send(enr)
                except (trio.BrokenResourceError, trio.ClosedResourceError):
                    # In the event that the consumer of `recursive_find_nodes`
                    # exits early before the lookup has completed we can end up
                    # operating on a closed channel.
                    return

            if len(node_ids) == 1:
                await do_lookup(node_ids[0], send_channel)
            else:
                tasks = tuple(
                    (do_lookup, (node_id, send_channel)) for node_id in node_ids
                )
                try:
                    await adaptive_timeout(*tasks, threshold=1, variance=2.0)
                except trio.TooSlowError:
                    pass

            async with condition:
                # Remove the `node_ids` from the in_flight set.
                in_flight.difference_update(node_ids)

                condition.notify_all()

    async def _monitor_done(send_channel: trio.abc.SendChannel[ENRAPI]) -> None:
        async with send_channel:
            async with condition:
                while True:
                    # this `fail_after` is a failsafe to prevent deadlock situations
                    # which are possible with `Condition` objects.
                    with trio.move_on_after(60) as scope:
                        node_ids = get_unqueried_node_ids()

                        if not node_ids and not in_flight:
                            break
                        else:
                            await condition.wait()

                    if scope.cancelled_caught:
                        network.logger.error("Deadlock")

    send_channel, receive_channel = trio.open_memory_channel[ENRAPI](256)

    async with trio.open_nursery() as nursery:
        nursery.start_soon(_monitor_done, send_channel)

        for worker_id in range(concurrency):
            nursery.start_soon(worker, worker_id, send_channel)

        async with receive_channel:
            yield receive_channel

        nursery.cancel_scope.cancel()

    elapsed = trio.current_time() - start_at

    network.logger.debug(
        "Lookup for %s finished in %f seconds: seen=%d  queried=%d  unresponsive=%d",
        target.hex(),
        elapsed,
        len(received_node_ids),
        len(queried_node_ids),
        len(unresponsive_node_ids),
    )