async def _explore(
    self,
    node_id: NodeID,
    max_distance: int,
) -> None:
    """
    Explore the neighborhood around the given `node_id` out to the
    specified `max_distance`.
    """
    async with trio.open_nursery() as nursery:
        for distances in partition_all(2, range(max_distance, 0, -1)):
            try:
                found_enrs = await self._network.find_nodes(node_id, *distances)
            except trio.TooSlowError:
                self.unresponsive.add(node_id)
                return
            except MissingEndpointFields:
                self.unreachable.add(node_id)
                return
            except ValidationError:
                self.invalid.add(node_id)
                return
            else:
                # Once we encounter a pair of buckets that elicits an empty
                # response we assume that all subsequent buckets will also
                # be empty.
                if not found_enrs:
                    self.logger.debug(
                        "explore-finish: node_id=%s covered=%d-%d",
                        node_id.hex(),
                        max_distance,
                        distances[0],
                    )
                    break

            for enr in found_enrs:
                try:
                    self._network.enr_db.set_enr(enr)
                except OldSequenceNumber:
                    pass

            # Check whether we have found any new records.  If so, queue
            # them and wake up the other workers.  This is guarded by the
            # `condition` object to ensure we maintain a consistent view
            # of the `seen` nodes.
            async with self._condition:
                new_enrs = tuple(
                    enr
                    for enr in reduce_enrs(found_enrs)
                    if enr.node_id not in self.seen
                )
                if new_enrs:
                    self.seen.update(enr.node_id for enr in new_enrs)
                    self._condition.notify_all()

                # Use `NetworkProtocol.bond` to perform a liveliness check.
                for enr in new_enrs:
                    nursery.start_soon(self._bond_then_send, enr)
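
# For illustration (a sketch, assuming `partition_all` is the `toolz` helper
# of that name, as re-exported by `eth_utils.toolz`): the loop above walks
# the log-distances from `max_distance` down to 1 in pairs, so each
# FIND_NODES request covers two buckets at a time.
from toolz import partition_all

assert list(partition_all(2, range(8, 0, -1))) == [(8, 7), (6, 5), (4, 3), (2, 1)]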
async def bond(
    self, node_id: NodeID, *, endpoint: Optional[Endpoint] = None
) -> bool:
    self.logger.debug2(
        "Bonding with %s",
        node_id.hex(),
    )

    try:
        pong = await self.ping(node_id, endpoint=endpoint)
    except trio.TooSlowError:
        self.logger.debug("Bonding with %s timed out during ping", node_id.hex())
        return False
    except KeyError:
        self.logger.debug(
            "Unable to lookup endpoint information for node: %s", node_id.hex()
        )
        return False

    try:
        enr = await self.lookup_enr(
            node_id, enr_seq=pong.enr_seq, endpoint=endpoint
        )
    except trio.TooSlowError:
        self.logger.debug(
            "Bonding with %s timed out during ENR retrieval",
            node_id.hex(),
        )
        return False

    self.routing_table.update(enr.node_id)

    self.logger.debug(
        "Bonded with %s successfully",
        node_id.hex(),
    )
    self._routing_table_ready.set()
    return True
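
# A minimal sketch of the `_bond_then_send` helper that `_explore` schedules
# above.  The name comes from `_explore`; the body and the `_send_channel`
# attribute are assumptions: bond first, and only forward records for peers
# that pass the liveliness check.
async def _bond_then_send(self, enr: ENRAPI) -> None:
    if await self._network.bond(enr.node_id):
        await self._send_channel.send(enr)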
async def bond(
    self, node_id: NodeID, *, endpoint: Optional[Endpoint] = None
) -> bool:
    self.logger.debug2(
        "Bonding with %s",
        node_id.hex(),
    )

    try:
        pong = await self.ping(node_id, endpoint=endpoint)
    except trio.TooSlowError:
        self.logger.debug("Bonding with %s timed out during ping", node_id.hex())
        return False
    except MissingEndpointFields:
        self.logger.debug(
            "Bonding with %s failed due to missing endpoint information",
            node_id.hex(),
        )
        return False

    try:
        enr = await self.lookup_enr(
            node_id, enr_seq=pong.enr_seq, endpoint=endpoint
        )
    except trio.TooSlowError:
        self.logger.debug(
            "Bonding with %s timed out during ENR retrieval",
            node_id.hex(),
        )
        return False
    except EmptyFindNodesResponse:
        self.logger.debug(
            "Bonding with %s failed due to them not returning their ENR record",
            node_id.hex(),
        )
        return False

    self.routing_table.update(enr.node_id)

    self.logger.debug(
        "Bonded with %s successfully",
        node_id.hex(),
    )
    self._routing_table_ready.set()
    return True
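
# Hypothetical usage: bond with each bootnode until one succeeds.  The
# `network` and `bootnode_enrs` names here are assumptions for illustration.
async def _example_bootstrap(
    network: NetworkProtocol, bootnode_enrs: Tuple[ENRAPI, ...]
) -> None:
    for enr in bootnode_enrs:
        if await network.bond(enr.node_id):
            break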
async def common_recursive_find_nodes(
    network: NetworkProtocol,
    target: NodeID,
    *,
    concurrency: int = 3,
    unresponsive_cache: Dict[NodeID, float] = UNRESPONSIVE_CACHE,
) -> AsyncIterator[trio.abc.ReceiveChannel[ENRAPI]]:
    """
    An optimized version of the recursive lookup algorithm for a kademlia
    network.

    Continually look up nodes in the target part of the network, keeping
    track of all of the nodes we have seen.  Exit once we have queried all of
    the `k` closest nodes to the target.

    The concurrency structure here is optimized to minimize the effect of
    unresponsive nodes on the total time it takes to perform the recursive
    lookup.  Some requests will hang for up to 10 seconds.  The
    `adaptive_timeout` combined with the multiple concurrent workers helps
    mitigate the overall slowdown caused by a few unresponsive nodes since
    the other queries can be issued concurrently.
    """
    network.logger.debug2("Recursive find nodes: %s", target.hex())
    start_at = trio.current_time()

    # The set of NodeID values we have already queried.
    queried_node_ids: Set[NodeID] = set()

    # The set of NodeID values that timed out.
    #
    # The `local_node_id` is included as a convenience so that we don't have
    # to continually filter it out in the checks below.
    unresponsive_node_ids: Set[NodeID] = {network.local_node_id}

    # Seed with the cache of nodes that were deemed unresponsive within the
    # last five minutes (the 300 second check below).
    unresponsive_node_ids.update(
        node_id
        for node_id, last_unresponsive_at in unresponsive_cache.items()
        if trio.current_time() - last_unresponsive_at < 300
    )

    # Accumulator of the node_ids we have seen.
    received_node_ids: Set[NodeID] = set()

    # Tracker for node_ids that are actively being requested.
    in_flight: Set[NodeID] = set()

    condition = trio.Condition()

    def get_unqueried_node_ids() -> Tuple[NodeID, ...]:
        """
        Get up to three unqueried nodes from among the closest `k` nodes to
        the target which have not been deemed unresponsive.
        """
        # Construct an iterable of *all* the nodes we know about ordered by
        # closeness to the target.
        candidates = iter_closest_nodes(
            target, network.routing_table, received_node_ids
        )
        # Remove any unresponsive nodes from that iterable.
        responsive_candidates = itertools.filterfalse(
            lambda node_id: node_id in unresponsive_node_ids, candidates
        )
        # Grab the closest `k`.
        closest_k_candidates = take(
            network.routing_table.bucket_size,
            responsive_candidates,
        )
        # Filter out any of the closest `k` that we've already queried or
        # that are in-flight.
        closest_k_unqueried = itertools.filterfalse(
            lambda node_id: node_id in queried_node_ids or node_id in in_flight,
            closest_k_candidates,
        )
        return tuple(take(3, closest_k_unqueried))

    async def do_lookup(
        node_id: NodeID, send_channel: trio.abc.SendChannel[ENRAPI]
    ) -> None:
        """
        Perform an individual lookup on the target part of the network from
        the given `node_id`.
        """
        if node_id == target:
            distance = 0
        else:
            distance = compute_log_distance(node_id, target)

        try:
            found_enrs = await network.find_nodes(node_id, distance)
        except (trio.TooSlowError, MissingEndpointFields, ValidationError):
            unresponsive_node_ids.add(node_id)
            unresponsive_cache[node_id] = trio.current_time()
            return
        except trio.Cancelled:
            # We don't add these to the unresponsive cache since they didn't
            # necessarily exceed the full 10s request/response timeout.
            unresponsive_node_ids.add(node_id)
            raise

        for enr in found_enrs:
            try:
                network.enr_db.set_enr(enr)
            except OldSequenceNumber:
                pass

        async with condition:
            new_enrs = tuple(
                enr for enr in found_enrs if enr.node_id not in received_node_ids
            )
            received_node_ids.update(enr.node_id for enr in new_enrs)

            for enr in new_enrs:
                try:
                    await send_channel.send(enr)
                except (trio.BrokenResourceError, trio.ClosedResourceError):
                    # In the event that the consumer of `recursive_find_nodes`
                    # exits early before the lookup has completed we can end
                    # up operating on a closed channel.
                    return

    async def worker(
        worker_id: int, send_channel: trio.abc.SendChannel[ENRAPI]
    ) -> None:
        """
        Pulls unqueried nodes from the closest `k` nodes and performs a
        concurrent lookup on them.
        """
        for _ in itertools.count():
            async with condition:
                node_ids = get_unqueried_node_ids()

                if not node_ids:
                    await condition.wait()
                    continue

                # Mark the node_ids as having been queried.
                queried_node_ids.update(node_ids)
                # Mark the node_ids as being in-flight.
                in_flight.update(node_ids)

                # Some of the node ids may have come from our routing table.
                # These won't be present in the `received_node_ids` so we
                # detect this here and send them over the channel.
                try:
                    for node_id in node_ids:
                        if node_id not in received_node_ids:
                            enr = network.enr_db.get_enr(node_id)
                            received_node_ids.add(node_id)
                            await send_channel.send(enr)
                except (trio.BrokenResourceError, trio.ClosedResourceError):
                    # In the event that the consumer of `recursive_find_nodes`
                    # exits early before the lookup has completed we can end
                    # up operating on a closed channel.
                    return

            if len(node_ids) == 1:
                await do_lookup(node_ids[0], send_channel)
            else:
                tasks = tuple(
                    (do_lookup, (node_id, send_channel)) for node_id in node_ids
                )
                try:
                    await adaptive_timeout(*tasks, threshold=1, variance=2.0)
                except trio.TooSlowError:
                    pass

            async with condition:
                # Remove the `node_ids` from the in-flight set.
                in_flight.difference_update(node_ids)
                condition.notify_all()

    async def _monitor_done(send_channel: trio.abc.SendChannel[ENRAPI]) -> None:
        async with send_channel:
            async with condition:
                while True:
                    # This timeout is a failsafe to prevent deadlock
                    # situations which are possible with `Condition` objects.
                    with trio.move_on_after(60) as scope:
                        node_ids = get_unqueried_node_ids()

                        if not node_ids and not in_flight:
                            break
                        else:
                            await condition.wait()

                    if scope.cancelled_caught:
                        network.logger.error("Deadlock")

    send_channel, receive_channel = trio.open_memory_channel[ENRAPI](256)

    async with trio.open_nursery() as nursery:
        nursery.start_soon(_monitor_done, send_channel)

        for worker_id in range(concurrency):
            nursery.start_soon(worker, worker_id, send_channel)

        async with receive_channel:
            yield receive_channel

        nursery.cancel_scope.cancel()

    elapsed = trio.current_time() - start_at
    network.logger.debug(
        "Lookup for %s finished in %f seconds: seen=%d queried=%d unresponsive=%d",
        target.hex(),
        elapsed,
        len(received_node_ids),
        len(queried_node_ids),
        len(unresponsive_node_ids),
    )
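
# A minimal usage sketch, assuming `common_recursive_find_nodes` is wrapped
# with `contextlib.asynccontextmanager` (its single `yield` of the receive
# channel suggests as much); `network` and `target` are placeholder names.
async def _example_lookup(network: NetworkProtocol, target: NodeID) -> None:
    async with common_recursive_find_nodes(network, target) as enr_stream:
        async for enr in enr_stream:
            network.logger.debug("Found: %s", enr.node_id.hex())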