Example #1
async def _expire_old_entries(clock: Clock, expiry_seconds: int) -> None:
    """Walks the global cache list to find cache entries that haven't been
    accessed in the given number of seconds.
    """

    now = int(clock.time())
    node = GLOBAL_ROOT.prev_node
    assert node is not None

    i = 0

    logger.debug("Searching for stale caches")

    while node is not GLOBAL_ROOT:
        # Only the root node isn't a `_TimedListNode`.
        assert isinstance(node, _TimedListNode)

        if node.last_access_ts_secs > now - expiry_seconds:
            break

        cache_entry = node.get_cache_entry()
        next_node = node.prev_node

        # The node should always have a reference to a cache entry and a valid
        # `prev_node`, as we only drop them when we remove the node from the
        # list.
        assert next_node is not None
        assert cache_entry is not None
        cache_entry.drop_from_cache()

        # If we do lots of work at once, we yield to allow other stuff to happen.
        if (i + 1) % 10000 == 0:
            logger.debug("Waiting during drop")
            await clock.sleep(0)
            logger.debug("Waking during drop")

        node = next_node

        # If we've yielded then our current node may have been evicted, so we
        # need to check that it's still valid.
        if node.prev_node is None:
            break

        i += 1

    logger.info("Dropped %d items from caches", i)
Example #2
def _create_event_id(clock: Clock, hostname: str) -> str:
    """Create a new event ID

    Args:
        clock: A Clock, used for the timestamp component of the event ID
        hostname: The server name for the event ID

    Returns:
        The new event ID
    """

    global _event_id_counter

    i = str(_event_id_counter)
    _event_id_counter += 1

    local_part = str(int(clock.time())) + i + random_string(5)

    e_id = EventID(local_part, hostname)

    return e_id.to_string()
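For illustration, a hypothetical stand-in for the `random_string` helper used above, plus a usage sketch. The assumption that it returns lowercase ASCII letters is mine; the real helper's alphabet may differ.

import random
import string


def random_string(length: int) -> str:
    # Assumed behaviour: `length` random lowercase ASCII letters.
    return "".join(random.choice(string.ascii_lowercase) for _ in range(length))


# Hypothetical usage, assuming a Clock instance `clock` is in scope and
# that EventID renders as "$<local_part>:<hostname>", the usual Matrix
# form:
#     _create_event_id(clock, "example.com")
# would yield e.g. "$17000000000abcde:example.com" (epoch seconds, then
# the counter, then five random characters).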
Example #3
def update_last_access(self, clock: Clock) -> None:
    self.last_access_ts_secs = int(clock.time())
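A hypothetical call site to show where this fits: presumably a cache read touches its list node so the eviction walks in Examples #1 and #4 see a fresh `last_access_ts_secs`. Every name other than `update_last_access` below is invented for illustration.

# Sketch only: the cache's internal field names are assumptions.
def get(self, key: str):
    entry = self._entries[key]  # assumed internal mapping
    entry.node.update_last_access(self._clock)  # mark as recently used
    return entry.value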
Example #4
async def _expire_old_entries(clock: Clock, expiry_seconds: int,
                              autotune_config: Optional[dict]) -> None:
    """Walks the global cache list to find cache entries that haven't been
    accessed in the given number of seconds, or if a given memory threshold has been breached.
    """
    if autotune_config:
        max_cache_memory_usage = autotune_config["max_cache_memory_usage"]
        target_cache_memory_usage = autotune_config["target_cache_memory_usage"]
        min_cache_ttl = autotune_config["min_cache_ttl"] / 1000

    now = int(clock.time())
    node = GLOBAL_ROOT.prev_node
    assert node is not None

    i = 0

    logger.debug("Searching for stale caches")

    evicting_due_to_memory = False

    # Determine whether we're evicting due to memory pressure.
    jemalloc_interface = get_jemalloc_stats()
    if jemalloc_interface and autotune_config:
        try:
            jemalloc_interface.refresh_stats()
            mem_usage = jemalloc_interface.get_stat("allocated")
            if mem_usage > max_cache_memory_usage:
                logger.info("Begin memory-based cache eviction.")
                evicting_due_to_memory = True
        except Exception:
            logger.warning(
                "Unable to read allocated memory, skipping memory-based cache eviction."
            )

    while node is not GLOBAL_ROOT:
        # Only the root node isn't a `_TimedListNode`.
        assert isinstance(node, _TimedListNode)

        # If the node has not aged past expiry_seconds and we are not evicting
        # due to memory usage, there's nothing more to do.
        if (node.last_access_ts_secs > now - expiry_seconds
                and not evicting_due_to_memory):
            break

        # If the entry is newer than min_cache_ttl then do not evict it, and
        # stop: everything further along the list is newer still.
        if evicting_due_to_memory and now - node.last_access_ts_secs < min_cache_ttl:
            break

        cache_entry = node.get_cache_entry()
        next_node = node.prev_node

        # The node should always have a reference to a cache entry and a valid
        # `prev_node`, as we only drop them when we remove the node from the
        # list.
        assert next_node is not None
        assert cache_entry is not None
        cache_entry.drop_from_cache()

        # Check memory allocation periodically if we are evicting a bunch of
        # caches.
        if jemalloc_interface and evicting_due_to_memory and (i + 1) % 100 == 0:
            try:
                jemalloc_interface.refresh_stats()
                mem_usage = jemalloc_interface.get_stat("allocated")
                if mem_usage < target_cache_memory_usage:
                    evicting_due_to_memory = False
                    logger.info("Stop memory-based cache eviction.")
            except Exception:
                logger.warning(
                    "Unable to read allocated memory, this may affect memory-based cache eviction."
                )
                # If we've failed to read the current memory usage then we
                # should stop trying to evict based on memory usage
                evicting_due_to_memory = False

        # If we do lots of work at once, we yield to allow other stuff to happen.
        if (i + 1) % 10000 == 0:
            logger.debug("Waiting during drop")
            if node.last_access_ts_secs > now - expiry_seconds:
                await clock.sleep(0.5)
            else:
                await clock.sleep(0)
            logger.debug("Waking during drop")

        node = next_node

        # If we've yielded then our current node may have been evicted, so we
        # need to check that it's still valid.
        if node.prev_node is None:
            break

        i += 1

    logger.info("Dropped %d items from caches", i)
Example #5
class WellKnownResolver:
    """Handles well-known lookups for matrix servers."""
    def __init__(
        self,
        reactor: IReactorTime,
        agent: IAgent,
        user_agent: bytes,
        well_known_cache: Optional[TTLCache] = None,
        had_well_known_cache: Optional[TTLCache] = None,
    ):
        self._reactor = reactor
        self._clock = Clock(reactor)

        if well_known_cache is None:
            well_known_cache = _well_known_cache

        if had_well_known_cache is None:
            had_well_known_cache = _had_valid_well_known_cache

        self._well_known_cache = well_known_cache
        self._had_valid_well_known_cache = had_well_known_cache
        self._well_known_agent = RedirectAgent(agent)
        self.user_agent = user_agent

    async def get_well_known(self,
                             server_name: bytes) -> WellKnownLookupResult:
        """Attempt to fetch and parse a .well-known file for the given server

        Args:
            server_name: name of the server, from the requested url

        Returns:
            The result of the lookup
        """
        try:
            prev_result, expiry, ttl = self._well_known_cache.get_with_expiry(
                server_name)

            now = self._clock.time()
            if now < expiry - WELL_KNOWN_GRACE_PERIOD_FACTOR * ttl:
                return WellKnownLookupResult(delegated_server=prev_result)
        except KeyError:
            prev_result = None

        # TODO: should we linearise so that we don't end up doing two .well-known
        # requests for the same server in parallel?
        try:
            with Measure(self._clock, "get_well_known"):
                result, cache_period = await self._fetch_well_known(
                    server_name)  # type: Optional[bytes], float

        except _FetchWellKnownFailure as e:
            if prev_result and e.temporary:
                # This is a temporary failure and we have a still valid cached
                # result, so lets return that. Hopefully the next time we ask
                # the remote will be back up again.
                return WellKnownLookupResult(delegated_server=prev_result)

            result = None

            if self._had_valid_well_known_cache.get(server_name, False):
                # We have recently seen a valid well-known record for this
                # server, so we cache the lack of well-known for a shorter time.
                cache_period = WELL_KNOWN_DOWN_CACHE_PERIOD
            else:
                cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD

            # add some randomness to the TTL to avoid a stampeding herd
            cache_period *= random.uniform(
                1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
                1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
            )

        if cache_period > 0:
            self._well_known_cache.set(server_name, result, cache_period)

        return WellKnownLookupResult(delegated_server=result)

    async def _fetch_well_known(self,
                                server_name: bytes) -> Tuple[bytes, float]:
        """Actually fetch and parse a .well-known, without checking the cache

        Args:
            server_name: name of the server, from the requested url

        Raises:
            _FetchWellKnownFailure if we fail to lookup a result

        Returns:
            The lookup result and cache period.
        """

        had_valid_well_known = self._had_valid_well_known_cache.get(
            server_name, False)

        # We do this in two steps to differentiate between possibly transient
        # errors (e.g. can't connect to host, 503 response) and more permanent
        # errors (such as getting a 404 response).
        response, body = await self._make_well_known_request(
            server_name, retry=had_valid_well_known)

        try:
            if response.code != 200:
                raise Exception("Non-200 response %s" % (response.code, ))

            parsed_body = json_decoder.decode(body.decode("utf-8"))
            logger.info("Response from .well-known: %s", parsed_body)

            result = parsed_body["m.server"].encode("ascii")
        except defer.CancelledError:
            # Bail if we've been cancelled
            raise
        except Exception as e:
            logger.info("Error parsing well-known for %s: %s", server_name, e)
            raise _FetchWellKnownFailure(temporary=False)

        cache_period = _cache_period_from_headers(
            response.headers, time_now=self._reactor.seconds)
        if cache_period is None:
            cache_period = WELL_KNOWN_DEFAULT_CACHE_PERIOD
            # add some randomness to the TTL to avoid a stampeding herd every 24 hours
            # after startup
            cache_period *= random.uniform(
                1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
                1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
            )
        else:
            cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD)
            cache_period = max(cache_period, WELL_KNOWN_MIN_CACHE_PERIOD)

        # We got a success, mark as such in the cache
        self._had_valid_well_known_cache.set(
            server_name,
            bool(result),
            cache_period + WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID,
        )

        return result, cache_period

    async def _make_well_known_request(self, server_name: bytes,
                                       retry: bool) -> Tuple[IResponse, bytes]:
        """Make the well known request.

        This will retry the request if requested and it fails (with unable
        to connect or receives a 5xx error).

        Args:
            server_name: name of the server, from the requested url
            retry: Whether to retry the request if it fails.

        Raises:
            _FetchWellKnownFailure if we fail to lookup a result

        Returns:
            Returns the response object and body. Response may be a non-200 response.
        """
        uri = b"https://%s/.well-known/matrix/server" % (server_name, )
        uri_str = uri.decode("ascii")

        headers = {
            b"User-Agent": [self.user_agent],
        }

        i = 0
        while True:
            i += 1

            logger.info("Fetching %s", uri_str)
            try:
                response = await make_deferred_yieldable(
                    self._well_known_agent.request(b"GET",
                                                   uri,
                                                   headers=Headers(headers)))
                body_stream = BytesIO()
                await make_deferred_yieldable(
                    read_body_with_max_size(response, body_stream,
                                            WELL_KNOWN_MAX_SIZE))
                body = body_stream.getvalue()

                if 500 <= response.code < 600:
                    raise Exception("Non-200 response %s" % (response.code, ))

                return response, body
            except defer.CancelledError:
                # Bail if we've been cancelled
                raise
            except BodyExceededMaxSize:
                # If the well-known file was too large, do not keep attempting
                # to download it, but consider it a temporary error.
                logger.warning(
                    "Requested .well-known file for %s is too large > %r bytes",
                    server_name.decode("ascii"),
                    WELL_KNOWN_MAX_SIZE,
                )
                raise _FetchWellKnownFailure(temporary=True)
            except Exception as e:
                if not retry or i >= WELL_KNOWN_RETRY_ATTEMPTS:
                    logger.info("Error fetching %s: %s", uri_str, e)
                    raise _FetchWellKnownFailure(temporary=True)

                logger.info("Error fetching %s: %s. Retrying", uri_str, e)

            # Sleep briefly in the hopes that they come back up
            await self._clock.sleep(0.5)
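A hypothetical usage sketch for the resolver, assuming a standard Twisted `Agent` as the underlying HTTP client; the user-agent string and function name are invented.

from twisted.internet import reactor
from twisted.web.client import Agent


async def lookup_delegated_server(server_name: bytes):
    resolver = WellKnownResolver(
        reactor,
        agent=Agent(reactor),
        user_agent=b"example-agent/0.1",
    )
    result = await resolver.get_well_known(server_name)
    # delegated_server is the parsed "m.server" value, or None if the
    # lookup failed or returned nothing usable.
    return result.delegated_server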