Code Example #1
    def should_follow_link(self, link: bs4.Tag) -> Optional[Tuple[URL, int]]:
        """
        Check whether the link should be followed because it may contain feed information.

        :param link: Link tag
        :return: Tuple of (URL, priority) if the link should be followed, otherwise None
        """
        href: str = link.get("href")
        link_type: str = link.get("type")

        url: URL = parse_href_to_url(self.logger, href)
        if not url:
            return None

        # If the link may have a valid feed type then follow it regardless of the url text.
        if (link_type and any(
                map(link_type.lower().count,
                    ["application/json", "rss", "atom", "rdf"]))
                and "json+oembed" not in link_type):
            # A link with a possible feed type has the highest priority after callbacks.
            return url, 2

        is_feedlike_href: bool = self.is_href_matching(str(url),
                                                       feedlike_regex)
        is_feedlike_querystring: bool = self.is_querystring_matching(
            url, feedlike_regex)

        is_podcast_href: bool = self.is_href_matching(str(url), podcast_regex)
        is_podcast_querystring: bool = self.is_querystring_matching(
            url, podcast_regex)

        is_feedlike_url = is_feedlike_querystring or is_feedlike_href
        is_podcast_url = is_podcast_href or is_podcast_querystring

        if not self.full_crawl and not is_feedlike_url and not is_podcast_url:
            return

        # is_one_jump: bool = self.is_one_jump_from_original_domain(url, self.response)
        # if not is_one_jump:
        #     return

        has_author_info: bool = self.is_href_matching(href, author_regex)
        is_low_priority: bool = self.is_low_priority(href)

        priority: int = Request.priority
        # A low priority url should be fetched last.
        if is_low_priority:
            priority = Request.priority + 2
        # Podcast pages are lower priority than authors or feeds.
        if is_podcast_url:
            priority = 5
        # Potential author info has a medium priority.
        if has_author_info:
            priority = 4
        # A feedlike url has high priority.
        if is_feedlike_url:
            priority = 3

        # Validate the actual URL string.
        follow = (
            # is_one_jump
            not self.has_invalid_contents(href)
            and self.is_valid_filetype(href)
            and not self.has_invalid_querystring(url))
        # If full_crawl then follow all valid URLs regardless of the feedlike quality of the URL.
        # Otherwise only follow URLs if they look like they might contain feed information.
        if follow and (self.full_crawl or is_feedlike_url or is_podcast_href):

            # Remove the querystring unless it may point to a feed.
            if not is_feedlike_querystring:
                url = url.with_query(None)

            return url, priority
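
The (URL, priority) tuple returned above is presumably consumed by a parse callback that walks the link tags of a fetched page and schedules the matching URLs. The following is a minimal sketch of such a caller, not part of the original code: the method name parse_links, the soup argument, and the self.parse callback are assumptions; follow is the coroutine shown in Code Example #2.

    async def parse_links(self, response: Response, soup: bs4.BeautifulSoup):
        # Hypothetical caller: inspect every <link> and <a> tag that has an href.
        for tag in soup.find_all(["link", "a"], href=True):
            result = self.should_follow_link(tag)
            if not result:
                continue

            url, priority = result
            # Schedule the URL with the priority chosen by should_follow_link.
            request = await self.follow(url,
                                        callback=self.parse,
                                        response=response,
                                        priority=priority)
            if request:
                yield request

Yielding the Request back to the crawler's queue is also an assumption about the surrounding framework; the point of the sketch is only how the returned tuple feeds into follow.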
Code Example #2
    async def follow(
        self,
        url: Union[str, URL],
        callback=None,
        response: Response = None,
        method: str = "GET",
        delay: Union[float, None] = None,
        priority: int = 0,
        allow_domain: bool = False,
        cb_kwargs: Dict = None,
        max_content_length: int = None,
        timeout: float = None,
        retries: int = None,
        **kwargs,
    ) -> Union[Request, None]:
        """
        Follow a URL by creating an HTTP Request.

        If the URL is not absolute then it is joined with the previous Response URL.
        The previous Response history is copied to the Request.

        Before a Request is followed, first check that the Request URL has not already been seen,
        that the max URL depth has not been reached, and that the URI scheme is allowed.

        These checks are performed before the Request is created so that we don't yield multiple requests
        to the same URL to the queue for further processing. We want to stop duplicates and invalid
        requests as early as possible.

        :param url: URL to follow.
        :param callback: Callback method to run if the Request is successful.
        :param response: Previous Response that contained the Request URL.
        :param method: HTTP method for the Request.
        :param delay: Optionally override the default delay for the Request.
        :param priority: Optionally override the default priority of the Request.
        :param allow_domain: Optionally override the allowed domains check.
        :param cb_kwargs: Optional dictionary of keyword arguments to be passed to the callback function.
        :param max_content_length: Optionally override the maximum allowed size in bytes of the Response body.
        :param timeout: Optionally override the Request timeout.
        :param retries: Optionally override the number of Request retries.
        :param kwargs: Optional Request keyword arguments. See Request for details.
        :return: Request if the URL should be followed, otherwise None
        """
        original_url = copy.copy(url)
        if isinstance(url, str):
            url = parse_href_to_url(self.logger, url)

        if not url:
            self.logger.warning("Attempted to follow invalid URL: %s",
                                original_url)
            return

        history = []
        if response:
            # Join the URL to the Response URL if it doesn't contain a domain.
            if not url.is_absolute():
                url = response.origin.join(url)

            # Restrict the depth of the Request chain to the maximum depth.
            # This test happens before the URL duplicate check so that the URL might still be reachable by another path.
            if self.max_depth and len(response.history) >= self.max_depth:
                self.logger.debug("Max Depth of '%d' reached: %s",
                                  self.max_depth, url)
                return

            # Copy the Response history so that it isn't a reference to a mutable object.
            history = copy.deepcopy(response.history)

        # The URL scheme must be in the list of allowed schemes.
        if self.allowed_schemes and url.scheme not in self.allowed_schemes:
            self.logger.debug("URI Scheme '%s' not allowed: %s", url.scheme,
                              url)
            return

        # The URL host must be in the list of allowed domains.
        if not allow_domain and not self.is_allowed_domain(url):
            self.logger.debug("Domain '%s' not allowed: %s", url.host, url)
            return

        # Check if URL is not already seen, and add it to the duplicate filter seen list.
        if await self._duplicate_filter.url_seen(url, method):
            return

        request = Request(
            url=url,
            request_session=self._session,
            history=history,
            callback=callback,
            xml_parser=self.parse_xml,
            max_content_length=max_content_length or self.max_content_length,
            timeout=timeout or self.request_timeout,
            method=method,
            delay=delay if isinstance(delay, float) else self.delay,
            retries=retries or self.max_retries,
            cb_kwargs=cb_kwargs,
            **kwargs,
        )

        # Override the Request priority only if the kwarg is provided.
        if priority:
            request.priority = priority

        return request
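
The duplicate check in follow only relies on an awaitable url_seen(url, method) method on self._duplicate_filter; the filter itself is not shown in these examples. Below is a minimal in-memory sketch of that assumed interface, reusing the Set, Tuple, and URL names already imported in the examples above. The class name SetDuplicateFilter is hypothetical.

class SetDuplicateFilter:
    """Minimal in-memory duplicate filter sketch for the assumed url_seen interface."""

    def __init__(self) -> None:
        self._seen: Set[Tuple[str, str]] = set()

    async def url_seen(self, url: URL, method: str) -> bool:
        # Normalise by dropping the fragment so "#section" variants count as seen.
        key = (method.upper(), str(url.with_fragment(None)))
        if key in self._seen:
            return True
        self._seen.add(key)
        return False

A persistent or probabilistic filter (for example a Bloom filter) could expose the same coroutine without changing follow.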
Code Example #3
    def create_start_urls(self, urls: List[Union[URL, str]]) -> List[URL]:
        """
        Create the start URLs for the crawl from the initial URLs. May be overridden.

        :param urls: Initial URLs
        :return: List of start URLs
        """
        crawl_start_urls: Set[URL] = set()

        for url in urls + self.start_urls:
            if isinstance(url, str):
                if "//" not in url:
                    url = f"//{url}"
                url = parse_href_to_url(self.logger, url)

            # Skip URLs that could not be parsed.
            if not url:
                continue

            if url.scheme.lower() not in ["http", "https"]:
                url = url.with_scheme("http")

            crawl_start_urls.add(url)

        origins = set(url.origin() for url in crawl_start_urls)

        if self.try_urls:
            # Common paths for feeds.
            suffixes = {
                "index.xml",
                "atom.xml",
                "feeds",
                "feeds/default",
                "feed",
                "feed/default",
                "feeds/posts/default",
                "?feed=rss",
                "?feed=atom",
                "?feed=rss2",
                "?feed=rdf",
                "rss",
                "atom",
                "rdf",
                "index.rss",
                "index.rdf",
                "index.atom",
                "data/rss",
                "rss.xml",
                "index.json",
                "about",
                "about/feeds",
                "rss-feeds",
            }

            for origin in origins:
                if isinstance(self.try_urls, list):
                    crawl_start_urls.update(
                        origin.join(URL(suffix)) for suffix in self.try_urls)
                else:
                    crawl_start_urls.update(
                        origin.join(URL(suffix)) for suffix in suffixes)

        # Crawl the origin urls of the start urls for Site metadata.
        if self.crawl_hosts:
            crawl_start_urls.update(origins)

        return list(crawl_start_urls)
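
The URL handling in create_start_urls leans on yarl's origin() and join() semantics: every start URL is reduced to its origin, and each suffix is resolved against that origin. The snippet below is a standalone illustration of that behaviour; the host example.com and the chosen suffixes are purely illustrative.

from yarl import URL

# A bare host gets "//" prepended before parsing, so it has no scheme yet
# and "http" is then used as the default.
bare = URL("//example.com").with_scheme("http")
print(bare)  # http://example.com

origin = URL("http://example.com/blog/post").origin()
print(origin)                          # http://example.com
print(origin.join(URL("feed")))        # http://example.com/feed
print(origin.join(URL("?feed=rss")))   # http://example.com/?feed=rss
print(origin.join(URL("index.xml")))   # http://example.com/index.xml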