Example #1
@responses.activate  # assumes the responses mock is activated here rather than by a fixture
def test_fetch_feed_no_new_content(session, capture_signal_notification_received):
    feed = Feed(topic="http://test.com/feed.rss")

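    # Prime the feed with the same body so the mocked fetch finds nothing new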
    body = "<rss></rss>"
    feed.new_content(body)

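    # Mock the HTTP GET for the feed topic to return that same body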
    responses.add(
        responses.GET,
        feed.topic,
        body=body,
        status=200,
        content_type="application/rss+xml",
    )

    result = FeedFetcher.fetch_feed(feed)
    assert result is False
    assert feed.last_status_code == 200
    assert not feed.etag
    assert not feed.last_modified
    assert feed.next_scheduled_update > Feed.current_timestamp()

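    # No notification_received signal should fire when the content is unchanged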
    assert len(capture_signal_notification_received) == 0

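    # A stats row is still recorded even though nothing new was fetched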
    stats = FeedStats.query.filter_by(feed_id=feed.id).first()
    assert stats
    assert stats.status_code == 200
    assert stats.new_entries == 0
    assert stats.updated_entries == 0
    assert stats.type == FetchType.PULL
    assert stats.latency > 0
    assert stats.parse_time == 0
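
The test above relies on a capture_signal_notification_received fixture that is not shown in this example. A minimal sketch of such a fixture, assuming notification_received is a blinker signal exposed by the application (the import path and wiring below are assumptions, not the project's code):

import pytest

from app.signals import notification_received  # assumed import path


@pytest.fixture
def capture_signal_notification_received():
    # Collect (sender, kwargs) for every notification_received emission during the test
    received = []

    def handler(sender, **kwargs):
        received.append((sender, kwargs))

    notification_received.connect(handler)
    yield received
    notification_received.disconnect(handler)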
Example #2
    def fetch_feed(cls, feed: Feed, force: bool = False) -> bool:
        """
        Fetch a single Feed.

        Attempts to fetch the latest version of the Feed. Statistics are saved and the next
        fetch is scheduled regardless of fetch success. If the fetch fails or the feed content
        is unchanged, no notification_received signal is sent and the method returns early.

        :param feed: Feed to be fetched
        :param force: Always fetch regardless of last-modified values
        :return: True if the fetch succeeded and new content was found
        """
        fetched: bool = False

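        # Response defaults recorded when the request fails before a usable response arrives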
        content: str = ""
        content_length: int = 0
        response_url: str = ""
        headers: dict = {}
        status_code: int = 500
        encoding: str = ""

        app.logger.info("Fetching %s", feed)

        start = time.perf_counter()
        try:
            with requests.get(
                feed.topic,
                headers=feed.fetch_headers(force),
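                # (connect, read) timeouts in seconds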
                timeout=(3.05, 10),
                stream=True,
            ) as response:

                # Capture response variables before raising any exceptions
                status_code = response.status_code
                headers = response.headers
                encoding = response.encoding or "utf-8"
                # Final URL after redirects; used below as a content-location fallback
                response_url = response.url

                # Only set content if content length is acceptable, else raise ContentLengthException
                content_length = int(response.headers.get("Content-Length", 0))
                if content_length < app.config.get("MAX_FEED_LENGTH"):
                    content = response.text
                else:
                    raise ContentLengthException()

                response.raise_for_status()

        except requests.Timeout as t:
            app.logger.warning("Timeout fetching Feed %s: %s", feed, t)
        except requests.ConnectionError as c:
            app.logger.warning("ConnectionError fetching feed %s: %s", feed, c)
        except requests.RequestException as e:
            app.logger.warning("Error fetching Feed %s: %s", feed, e)
        except ContentLengthException:
            app.logger.warning("TOO BIG: feed=%s size=%s", feed, content_length)
        else:
            fetched = True

        request_time_ms = int((time.perf_counter() - start) * 1000)

        # Fall back to measuring the body when the server did not send Content-Length
        if content and content_length == 0:
            content_length = utf8len(content)

        app.logger.info(
            "FETCHED: topic=%s duration=%dms status=%s size=%sb",
            feed.topic,
            request_time_ms,
            status_code,
            content_length,
        )

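        # Persist fetch metadata and schedule the next attempt regardless of success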
        feed.last_status_code = status_code
        feed.last_fetch = datetime.utcnow()
        feed.set_next_scheduled_update()

        stats = FeedStats.create_stats(
            feed.id,
            FetchType.PULL,
            status_code=status_code,
            latency=request_time_ms,
            content_length=content_length,
        )

        # Set feed to inactive if Feed is Gone
        if status_code == 410:
            feed.gone = True

        # Exit if Feed not successfully fetched
        if not fetched:
            return cls.no_change(feed, stats)

        # Set Feed ETag from response
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag
        etag = headers.get("etag")
        if etag:
            feed.etag = etag

        # Set Feed Last-Modified from response
        # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified
        last_modified = headers.get("last-modified")
        if last_modified:
            feed.last_modified = feed.set_last_modified(last_modified)

        # Save and return if not modified code received
        if status_code == 304:
            return cls.no_change(feed, stats)

        # Check if content hash is new
        new_content = feed.new_content(content, encoding or "UTF-8")

        # Save and return if no new content and not forcing update
        if not new_content and not force:
            return cls.no_change(feed, stats)

        db.session.commit()

        # Set content-location header to final url if not already set
        if "content-location" not in headers:
            headers["content-location"] = response_url

        notification_received.send(
            cls,
            feed=feed,
            content_type=parse_options_header(headers.get("content-type")),
            content=content,
            encoding=encoding,
            stats=stats,
            headers=headers_to_dict(headers),
        )
        return True
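
fetch_feed also leans on helpers that are not shown here: feed.fetch_headers(force) to build conditional request headers, cls.no_change(feed, stats) to persist state and report that nothing new was fetched, and utf8len() to measure the body. A minimal sketch of the header and length helpers, inferred from the call sites above rather than taken from the project:

def utf8len(value: str) -> int:
    # Byte length of the body once encoded as UTF-8 (used when Content-Length is absent)
    return len(value.encode("utf-8"))


class Feed:  # sketch of the header-building behaviour only, not the real model
    etag: str = ""
    last_modified: str = ""

    def fetch_headers(self, force: bool = False) -> dict:
        # With force=True the validators are omitted so the server always returns a full body
        headers: dict = {}
        if force:
            return headers
        if self.etag:
            headers["If-None-Match"] = self.etag
        if self.last_modified:
            headers["If-Modified-Since"] = self.last_modified
        return headers

no_change itself presumably just commits the session and returns False, which matches the first example's expectation that fetch_feed returns False when the content hash is unchanged.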