Example #1
    def test_roots(self):
        # Only hosts among the crawl roots are allowed; unknown hosts and bare IPs are rejected.
        crawler = crawling.Crawler(['http://a', 'http://b', 'not-a-host'],
                                   loop=self.loop)
        self.addCleanup(crawler.close)
        self.assertTrue(verify.url_allowed("http://a/a", crawler.root_domains))
        self.assertTrue(verify.url_allowed("http://b/b", crawler.root_domains))
        self.assertFalse(verify.url_allowed("http://c/c", crawler.root_domains))
        self.assertFalse(verify.url_allowed("http://127.0.0.1", crawler.root_domains))
Example #2
    def parse_links(self, web_page_html, base_url, _content_type, _encoding):
        """Return a list of links."""
        links = set()
        tree = html.fromstring(web_page_html)
        tree.make_links_absolute(base_url)
        urls = [link[2] for link in tree.iterlinks()]
        for url in urls:
            defragmented, frag = urllib.parse.urldefrag(url)
            if verify.url_allowed(
                defragmented, self.root_domains, exclude=self.exclude
            ):  # Keep only valid links, checked against the exclude regexp and the root domains
                links.add(defragmented)
        if urls:
            LOGGER.info(
                "got %r urls from %r new links: %i visited: %i",
                len(urls),
                base_url,
                len(links - self.seen_urls),
                len(self.seen_urls),
            )
        new_links = [link for link in links.difference(self.seen_urls)]

        self.record_statistic(
            url=base_url,
            content_type=_content_type,
            encoding=_encoding,
            num_urls=len(links),
            num_new_urls=len(links - self.seen_urls),
        )
        return new_links
Example #3
    def test_deep_root(self):
        # Make sure 'a' is a root domain if the root is a link deep in 'a'.
        crawler = crawling.Crawler(['http://a/a#fragment'], loop=self.loop)
        self.addCleanup(crawler.close)
        self.assertTrue(verify.url_allowed("http://a/b", crawler.root_domains))
Example #4
    def test_exclude(self):
        # URLs matching the exclude pattern are rejected even on an allowed root domain.
        crawler = crawling.Crawler(['http://example.com'],
                                   exclude=r'.*pattern', loop=self.loop)
        self.addCleanup(crawler.close)
        self.assertTrue(verify.url_allowed("http://example.com",
                                           crawler.root_domains,
                                           exclude=crawler.exclude))
        self.assertFalse(verify.url_allowed("http://example.com/pattern",
                                            crawler.root_domains,
                                            exclude=crawler.exclude))
Example #5
    def test_lenient_host_checking(self):
        # With strict=False, subdomains of a root domain are also allowed.
        crawler = crawling.Crawler(['http://example.com'], strict=False,
                                   loop=self.loop)
        self.addCleanup(crawler.close)
        self.assertTrue(verify.url_allowed("http://www.example.com",
                                           crawler.root_domains, strict=False))
        self.assertTrue(verify.url_allowed("http://foo.example.com",
                                           crawler.root_domains, strict=False))