def create_crawler(self, urls=None, *args, **kwargs):
    """Replace any existing crawler with a fresh one rooted at *urls*.

    Defaults to the test app's URL when *urls* is None; the new crawler's
    close() is registered as a cleanup so tests never leak it.
    """
    if self.crawler:
        self.crawler.close()
    urls = urls if urls is not None else [self.app_url]
    self.crawler = crawling.Crawler(urls, *args, loop=self.loop, **kwargs)
    self.addCleanup(self.crawler.close)
def test_roots(self):
    """Seed URLs define the root domains; anything else is rejected."""
    seeds = ['http://a', 'http://b', 'not-a-host']
    crawler = crawling.Crawler(seeds, loop=self.loop)
    self.addCleanup(crawler.close)
    domains = crawler.root_domains
    for allowed_url in ("http://a/a", "http://b/b"):
        self.assertTrue(verify.url_allowed(allowed_url, domains))
    for rejected_url in ("http://c/c", "http://127.0.0.1"):
        self.assertFalse(verify.url_allowed(rejected_url, domains))
def test_deep_root(self):
    """A seed link deep inside 'a' (including a fragment) still makes 'a' a root domain."""
    crawler = crawling.Crawler(['http://a/a#fragment'], loop=self.loop)
    self.addCleanup(crawler.close)
    allowed = verify.url_allowed("http://a/b", crawler.root_domains)
    self.assertTrue(allowed)
def test_exclude(self):
    """URLs matching the exclude regex are rejected even on a root domain."""
    crawler = crawling.Crawler(
        ['http://example.com'],
        exclude=r'.*pattern',
        loop=self.loop,
    )
    self.addCleanup(crawler.close)
    domains, pattern = crawler.root_domains, crawler.exclude
    self.assertTrue(
        verify.url_allowed("http://example.com", domains, exclude=pattern))
    self.assertFalse(
        verify.url_allowed("http://example.com/pattern", domains, exclude=pattern))
def test_lenient_host_checking(self):
    """With strict=False, subdomains of a root domain are also allowed."""
    crawler = crawling.Crawler(['http://example.com'], strict=False, loop=self.loop)
    self.addCleanup(crawler.close)
    for subdomain_url in ("http://www.example.com", "http://foo.example.com"):
        self.assertTrue(
            verify.url_allowed(subdomain_url, crawler.root_domains, strict=False))