def test_roots(self):
    crawler = crawling.Crawler(['http://a', 'http://b', 'not-a-host'],
                               loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(verify.url_allowed("http://a/a", crawler.root_domains))
    self.assertTrue(verify.url_allowed("http://b/b", crawler.root_domains))
    self.assertFalse(verify.url_allowed("http://c/c", crawler.root_domains))
    self.assertFalse(verify.url_allowed("http://127.0.0.1",
                                        crawler.root_domains))

def parse_links(self, web_page_html, base_url, _content_type, _encoding):
    """Return a list of links."""
    links = set()
    tree = html.fromstring(web_page_html)
    tree.make_links_absolute(base_url)
    urls = [link[2] for link in tree.iterlinks()]
    for url in urls:
        defragmented, _frag = urllib.parse.urldefrag(url)
        # Keep only valid links, tested against the exclude regexp
        # and the root_domains.
        if verify.url_allowed(
            defragmented, self.root_domains, exclude=self.exclude
        ):
            links.add(defragmented)
    new_links = list(links - self.seen_urls)
    if urls:
        LOGGER.info(
            "got %r urls from %r new links: %i visited: %i",
            len(urls),
            base_url,
            len(new_links),
            len(self.seen_urls),
        )
    self.record_statistic(
        url=base_url,
        content_type=_content_type,
        encoding=_encoding,
        num_urls=len(links),
        num_new_urls=len(new_links),
    )
    return new_links

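# A minimal, standalone sketch of the lxml calls parse_links relies on.
# tree.iterlinks() yields (element, attribute, link, pos) 4-tuples, which
# is why parse_links takes link[2]; make_links_absolute() resolves relative
# hrefs against the base URL, and urldefrag() splits off the '#fragment'.
# The sample HTML and base URL here are illustrative, not from the crawler.
import urllib.parse

from lxml import html

page = '<a href="/docs">docs</a> <a href="/docs#intro">intro</a>'
tree = html.fromstring(page)
tree.make_links_absolute('http://example.com/')
for element, attribute, link, pos in tree.iterlinks():
    defragmented, fragment = urllib.parse.urldefrag(link)
    print(attribute, defragmented, fragment)
# Prints: href http://example.com/docs
#         href http://example.com/docs intro
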
def test_deep_root(self):
    # Make sure 'a' is a root domain if the root is a link deep in 'a'.
    crawler = crawling.Crawler(['http://a/a#fragment'], loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(verify.url_allowed("http://a/b", crawler.root_domains))

def test_exclude(self):
    crawler = crawling.Crawler(['http://example.com'],
                               exclude=r'.*pattern',
                               loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(verify.url_allowed("http://example.com",
                                       crawler.root_domains,
                                       exclude=crawler.exclude))
    self.assertFalse(verify.url_allowed("http://example.com/pattern",
                                        crawler.root_domains,
                                        exclude=crawler.exclude))

def test_lenient_host_checking(self):
    crawler = crawling.Crawler(['http://example.com'], strict=False,
                               loop=self.loop)
    self.addCleanup(crawler.close)
    self.assertTrue(verify.url_allowed("http://www.example.com",
                                       crawler.root_domains, strict=False))
    self.assertTrue(verify.url_allowed("http://foo.example.com",
                                       crawler.root_domains, strict=False))

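# A minimal sketch of what verify.url_allowed might look like, reconstructed
# only from the behaviour the tests above exercise; the real verify module
# may well differ. The helper _lenient_host is hypothetical.
import re
import urllib.parse


def _lenient_host(host):
    # Reduce e.g. 'foo.example.com' to 'example.com' for lenient matching.
    return '.'.join(host.split('.')[-2:])


def url_allowed(url, root_domains, exclude=None, strict=True):
    """Return True if url passes the exclude regexp and host checks."""
    if exclude and re.search(exclude, url):
        return False
    host = (urllib.parse.urlparse(url).hostname or '').lower()
    if host in root_domains:
        return True
    if strict:
        # Strict mode: only exact root-domain hosts are crawled, which is
        # why test_roots rejects 'http://c/c' and 'http://127.0.0.1'.
        return False
    # Lenient mode: subdomains such as www.example.com or foo.example.com
    # match the root example.com, as test_lenient_host_checking expects.
    return _lenient_host(host) in root_domains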