Example #1
    async def parse_item(
        self, request: Request, response: Response, *args, **kwargs
    ) -> AsyncGeneratorType:
        """
        Parse a fetched response into a scored FeedInfo item.

        Requires a "parse_type" entry in kwargs selecting JSON or XML
        parsing. May yield a follow-up favicon request first, then
        yields the FeedInfo item itself.

        :param request: Request that produced this response
        :param response: fetched Response containing the feed payload
        :raises ValueError: if "parse_type" is missing from kwargs
        """
        logger.info("Parsing: Feed %s", response.url)

        if "parse_type" not in kwargs:
            raise ValueError("type keyword argument is required")

        parse_type = kwargs["parse_type"]

        # Record how the content was parsed alongside how it was served.
        content_type = create_content_type(
            parse_type,
            response.encoding,
            response.headers.get(hdrs.CONTENT_TYPE, "").lower(),
        )

        item = FeedInfo(url=response.url, content_type=content_type)

        # Check link headers first for WebSub content discovery
        # https://www.w3.org/TR/websub/#discovery
        if response.headers:
            item.hubs, item.self_url = self.header_links(response.headers)

        try:
            valid_feed = False

            if parse_type == ParseTypes.JSON:
                valid_feed = self.parse_json(item, response.json)
            elif parse_type == ParseTypes.XML:
                valid_feed = self.parse_xml(
                    item,
                    response.data,
                    response.encoding,
                    headers_to_dict(response.headers),
                )

            if not valid_feed:
                logger.debug("Invalid Feed: %s", item)
                return
        except Exception as e:
            # Parsing is best-effort: an unparseable feed is dropped, not fatal.
            logger.exception("Failed to parse feed %s, Error: %s", item, e)
            return

        # Optionally fetch the favicon so it can be inlined as a data URI.
        if item.favicon and self.crawler.favicon_data_uri:
            favicon = Favicon(
                url=item.favicon,
                priority=1,
            )
            yield self.follow(
                item.favicon,
                self.crawler.parse_favicon_data_uri,
                cb_kwargs=dict(favicon=favicon),
            )

        self.validate_self_url(item)

        item.content_length = response.content_length
        # NOTE(review): assumes response.history is non-empty — confirm the
        # crawler always records at least the originating request.
        self.score_item(item, response.history[0])
        yield item
Example #2
    def parse_json(self, item: FeedInfo, data: dict) -> bool:
        """
        Get info from JSON feed.

        :param item: FeedInfo object
        :param data: JSON object
        :return: None
        """
        item.version = data.get("version")
        if "https://jsonfeed.org/version/" not in item.version:
            item.bozo = 1
            return False

        if not data.get("items"):
            return False

        item.title = data.get("title")
        item.description = data.get("description")

        favicon = data.get("favicon")
        if favicon:
            item.favicon = URL(favicon)

        # Only search if no hubs already present from headers
        if not item.hubs:
            try:
                item.hubs = list(hub.get("url") for hub in data.get("hubs", []))
            except (IndexError, AttributeError):
                pass

        if item.hubs:
            item.is_push = True

        try:
            dates = []
            now_date: date = datetime.utcnow().date()

            entries = data.get("items", [])
            item.item_count = len(entries)

            dates.extend(
                FeedInfoParser.entry_dates(
                    entries, ["date_modified", "date_published"], now_date
                )
            )

            if dates:
                item.last_updated = sorted(dates, reverse=True)[0]
                item.velocity = self.entry_velocity(dates)
        except Exception as e:
            logger.exception("Unable to get feed published date: %s", e)
            pass

        return True
Example #3
    def score_item(item: FeedInfo, original_url: URL):
        score = 0

        url_str = str(item.url).lower()

        # -- Score Decrement --

        if original_url:
            host = remove_www(original_url.host)

            if host not in item.url.host:
                score -= 20

        # Decrement the score by every extra path in the url
        parts_len = len(item.url.parts)
        if parts_len > 2:
            score -= (parts_len - 2) * 2

        if item.bozo:
            score -= 20
        if not item.description:
            score -= 10
        if "georss" in url_str:
            score -= 10
        if "alt" in url_str:
            score -= 7
        if "comments" in url_str or "comments" in item.title.lower():
            score -= 15
        if "feedburner" in url_str:
            score -= 10

        # -- Score Increment --
        if item.url.scheme == "https":
            score += 10
        if item.is_push:
            score += 10
        if "index" in url_str:
            score += 30

        if "comments" in url_str or "comments" in item.title.lower():
            score -= 15
        else:
            score += int(item.velocity)

        if any(map(url_str.count, ["/home", "/top", "/most", "/magazine"])):
            score += 10

        kw = ["atom", "rss", ".xml", "feed", "rdf"]
        for p, t in zip(range(len(kw) * 2, 0, -2), kw):
            if t in url_str:
                score += p

        item.score = score
Example #4
    def validate_self_url(item: FeedInfo) -> None:
        """
        Normalise and sanity-check the feed's advertised self url.

        Clears the self url when it cannot be parsed or cannot be
        reconciled with the feed's actual url.

        :param item: FeedInfo item, validated in place
        """
        try:
            item.self_url = URL(item.self_url)
        except ValueError:
            item.self_url = ""
            return

        # Nothing to reconcile when absent or already identical.
        if not item.self_url or item.self_url == item.url:
            return

        # The item url may carry a trailing slash the self url lacks.
        stripped = str(item.url).strip("/")
        if stripped == str(item.self_url):
            item.url = URL(stripped)
            return

        # An absolute self url that merely differs is left untouched.
        if item.self_url.is_absolute():
            return

        # A relative self url is adopted from the item url when it is a
        # substring of it, otherwise discarded.
        item.self_url = item.url if str(item.self_url) in str(item.url) else ""
Example #5
    def parse_xml(
        self, item: FeedInfo, data: Union[str, bytes], encoding: str, headers: Dict
    ) -> bool:
        """
        Get info from XML (RSS or ATOM) feed.

        :param item: FeedInfo object to populate in place
        :param data: raw feed body as text or bytes
        :param encoding: character encoding reported for the response
        :param headers: response headers as a plain dict
        :return: True if the data parsed as a feed with entries
        """

        # Parse data with feedparser
        try:
            parsed = self.parse_raw_data(data, encoding, headers)
        except Exception as e:
            logger.exception("Unable to parse feed %s: %s", item, e)
            return False

        if not parsed:
            logger.warning("No valid feed data for %s", item)
            return False

        # feedparser flags malformed feeds with bozo=1: an encoding override
        # is tolerated (recorded on the item), while unknown-encoding or
        # undeclared-namespace errors reject the feed outright.
        if parsed.get("bozo") == 1:
            bozo_exception = parsed.get("bozo_exception", None)
            if isinstance(bozo_exception, feedparser.CharacterEncodingOverride):
                item.bozo = 1
            elif isinstance(
                bozo_exception,
                (feedparser.CharacterEncodingUnknown, feedparser.UndeclaredNamespace),
            ):
                logger.warning("No valid feed data for %s: %s", item, bozo_exception)
                return False

        feed = parsed.get("feed")
        if not feed:
            return False
        if not parsed.get("entries"):
            return False

        # Only search if no hubs already present from headers
        if not item.hubs:
            item.hubs, item.self_url = self.websub_links(feed)

        # WebSub push requires both a hub and an advertised self url here.
        if item.hubs and item.self_url:
            item.is_push = True

        item.version = parsed.get("version")
        item.title = self.feed_title(feed)
        item.description = self.feed_description(feed)
        item.is_podcast = self.is_podcast(parsed)

        # Best effort: entry dates feed last_updated and velocity, but a
        # failure here must not reject an otherwise valid feed.
        try:
            dates = []
            now_date = datetime.utcnow().date()

            entries = parsed.get("entries", [])
            item.item_count = len(entries)

            dates.extend(
                FeedInfoParser.entry_dates(entries, ["updated", "published"], now_date)
            )

            if dates:
                # Most recent entry date wins; velocity derives from all dates.
                item.last_updated = sorted(dates, reverse=True)[0]
                item.velocity = self.entry_velocity(dates)
            elif feed.get("updated"):
                # Fall back to the feed-level updated timestamp.
                item.last_updated = datestring_to_utc_datetime(feed.get("updated"))
        except Exception as e:
            logger.exception("Unable to get feed published date: %s", e)
            pass

        return True