def test_url_is_from_spider_class_attributes(self):
    """A Spider *class* (not an instance) is accepted; its name acts as the domain."""
    class MySpider(Spider):
        name = 'example.com'
    should_match = (
        'http://www.example.com/some/page.html',
        'http://sub.example.com/some/page.html',
    )
    should_not_match = (
        'http://www.example.org/some/page.html',
        'http://www.example.net/some/page.html',
    )
    for url in should_match:
        self.assertTrue(url_is_from_spider(url, MySpider))
    for url in should_not_match:
        self.assertFalse(url_is_from_spider(url, MySpider))
def test_url_is_from_spider(self):
    """The spider name alone ('example.com') defines which URLs match."""
    spider = Spider(name='example.com')
    matching = [
        'http://www.example.com/some/page.html',
        'http://sub.example.com/some/page.html',
    ]
    non_matching = [
        'http://www.example.org/some/page.html',
        'http://www.example.net/some/page.html',
    ]
    # Any subdomain of the name's domain matches ...
    for url in matching:
        self.assertTrue(url_is_from_spider(url, spider))
    # ... while unrelated TLDs do not.
    for url in non_matching:
        self.assertFalse(url_is_from_spider(url, spider))
def test_url_is_from_spider_with_allowed_domains(self):
    """allowed_domains (list, set, or tuple) should extend the matching domains."""
    spider = Spider(name='example.com',
                    allowed_domains=['example.org', 'example.net'])
    accepted = [
        'http://www.example.com/some/page.html',
        'http://sub.example.com/some/page.html',
        'http://example.com/some/page.html',
        'http://www.example.org/some/page.html',
        'http://www.example.net/some/page.html',
    ]
    for url in accepted:
        self.assertTrue(url_is_from_spider(url, spider))
    # A domain listed nowhere is rejected.
    self.assertFalse(
        url_is_from_spider('http://www.example.us/some/page.html', spider))
    # The container type of allowed_domains must not matter.
    for domains in (set(('example.com', 'example.net')),
                    ('example.com', 'example.net')):
        spider = Spider(name='example.com', allowed_domains=domains)
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html', spider))
def test_url_is_from_spider_class_attributes(self):
    """url_is_from_spider also accepts a Spider subclass instead of an instance."""
    class MySpider(Spider):
        name = 'example.com'
    cases = (
        ('http://www.example.com/some/page.html', True),
        ('http://sub.example.com/some/page.html', True),
        ('http://www.example.org/some/page.html', False),
        ('http://www.example.net/some/page.html', False),
    )
    for url, expected in cases:
        if expected:
            self.assertTrue(url_is_from_spider(url, MySpider))
        else:
            self.assertFalse(url_is_from_spider(url, MySpider))
def process_response(self, request, response, spider):
    """Redirect to a page's rel="canonical" URL when the spider opts in.

    Only HTML responses with a non-empty body are considered, and only
    when the spider sets follow_canonical_links = True.  The canonical
    URL must differ from the requested one and still belong to the
    spider; otherwise the response passes through untouched.
    """
    wants_canonical = getattr(spider, 'follow_canonical_links', False)
    if not (wants_canonical and isinstance(response, HtmlResponse) and response.body):
        return response
    links = self._extractor.extract_links(response)
    if not links:
        return response
    canonical = links[0].url
    if canonical == request.url or not url_is_from_spider(canonical, spider):
        return response
    log.msg("Redirecting (rel=\"canonical\") to %s from %s" % (canonical, request),
            level=log.DEBUG, spider=spider)
    # Re-issue the request at the canonical URL; the callback keeps the
    # canonical page only when it answers 200, else falls back to the
    # original response.
    return request.replace(url=canonical,
                           callback=lambda r: r if r.status == 200 else response)
def test_url_is_from_spider_with_allowed_domains(self):
    """allowed_domains extends matching; set and tuple forms behave like lists."""
    spider = Spider(name="example.com", allowed_domains=["example.org", "example.net"])
    cases = [
        ("http://www.example.com/some/page.html", True),
        ("http://sub.example.com/some/page.html", True),
        ("http://example.com/some/page.html", True),
        ("http://www.example.org/some/page.html", True),
        ("http://www.example.net/some/page.html", True),
        ("http://www.example.us/some/page.html", False),
    ]
    for url, expected in cases:
        check = self.assertTrue if expected else self.assertFalse
        check(url_is_from_spider(url, spider))
    # The container type of allowed_domains must not matter.
    for domains in (set(("example.com", "example.net")),
                    ("example.com", "example.net")):
        spider = Spider(name="example.com", allowed_domains=domains)
        self.assertTrue(url_is_from_spider("http://www.example.com/some/page.html", spider))
def test_url_is_from_spider_with_allowed_domains(self):
    """allowed_domains may be given as a list, a set, or a tuple."""
    list_spider = Spider(name='example.com',
                         allowed_domains=['example.org', 'example.net'])
    # The spider's own name always matches, with or without a subdomain.
    self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', list_spider))
    self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', list_spider))
    self.assertTrue(url_is_from_spider('http://example.com/some/page.html', list_spider))
    # Every extra allowed domain matches too.
    self.assertTrue(url_is_from_spider('http://www.example.org/some/page.html', list_spider))
    self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', list_spider))
    # A domain not listed anywhere is rejected.
    self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', list_spider))
    set_spider = Spider(name='example.com',
                        allowed_domains=set(('example.com', 'example.net')))
    self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', set_spider))
    tuple_spider = Spider(name='example.com',
                          allowed_domains=('example.com', 'example.net'))
    self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', tuple_spider))
def fromurl(self, url):
    """Return the spider responsible for *url*, or None if nobody matches.

    Resolution order: the forced domain (if set), then a direct lookup of
    the URL's hostname in the spider registry, then a linear scan asking
    each spider via url_is_from_spider().
    """
    if self.force_domain:
        return self._spiders.get(self.force_domain)
    domain = urlparse.urlparse(url).hostname
    if not domain:
        # BUGFIX: the original did str(domain), turning a missing hostname
        # (None) into the truthy string 'None' and falling through to the
        # per-spider scan for malformed/relative URLs.
        return None
    # BUGFIX: strip only a *leading* 'www.' -- replace('www.', '') also
    # mangled hostnames containing 'www.' elsewhere (e.g. 'xwww.example.com').
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    if domain in self._spiders:
        # Fast path: direct domain lookup.
        return self._spiders[domain]
    # Slow path: ask each spider whether the URL belongs to it.
    for spider in self._spiders.values():
        if url_is_from_spider(url, spider):
            return spider
def process_response(self, request, response, spider):
    """Follow a page's rel="canonical" link when the spider enables it.

    Applies only to HTML responses with a body, for spiders that set
    follow_canonical_links = True.  The canonical target must be a new
    URL that still belongs to the spider.
    """
    if not getattr(spider, 'follow_canonical_links', False):
        return response
    if not (isinstance(response, HtmlResponse) and response.body):
        return response
    extracted = self._extractor.extract_links(response)
    if extracted:
        target = extracted[0].url
        if target != request.url and url_is_from_spider(target, spider):
            log.msg("Redirecting (rel=\"canonical\") to %s from %s" % (target, request),
                    level=log.DEBUG, spider=spider)
            # Retry at the canonical URL; keep the canonical page only on
            # a 200, otherwise hand back the original response.
            return request.replace(
                url=target,
                callback=lambda r: r if r.status == 200 else response)
    return response

# Snippet imported from snippets.scrapy.org (which no longer works)
# author: pablo
# date : Aug 27, 2010
def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
    """allowed_domains declared as a class attribute works like the kwarg form."""
    class MySpider(Spider):
        name = 'example.com'
        allowed_domains = ('example.org', 'example.net')
    ok = [
        'http://www.example.com/some/page.html',
        'http://sub.example.com/some/page.html',
        'http://example.com/some/page.html',
        'http://www.example.org/some/page.html',
        'http://www.example.net/some/page.html',
    ]
    for url in ok:
        self.assertTrue(url_is_from_spider(url, MySpider))
    self.assertFalse(
        url_is_from_spider('http://www.example.us/some/page.html', MySpider))
def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
    """Class-level allowed_domains on a BaseSpider subclass extends matching."""
    class MySpider(BaseSpider):
        name = "example.com"
        allowed_domains = ["example.org", "example.net"]
    for url in ("http://www.example.com/some/page.html",
                "http://sub.example.com/some/page.html",
                "http://example.com/some/page.html",
                "http://www.example.org/some/page.html",
                "http://www.example.net/some/page.html"):
        self.assertTrue(url_is_from_spider(url, MySpider))
    # A domain listed nowhere must not match.
    self.assertFalse(
        url_is_from_spider("http://www.example.us/some/page.html", MySpider))
def handles_request(cls, request):
    """Tell whether this spider is responsible for *request*'s URL."""
    target = request.url
    return url_is_from_spider(target, cls)
def handles_request(cls, request):
    """Return True if the requested URL belongs to this spider.

    (Docstring translated from Chinese: "Decide whether the current
    request's URL falls under this spider's allowed_domains.")
    """
    requested_url = request.url
    return url_is_from_spider(requested_url, cls)
def find_by_request(self, request):
    """Return the names of all registered spiders that match *request*'s URL.

    :param request: a Request-like object exposing a ``url`` attribute.
    :returns: list of spider names for which url_is_from_spider() is true.
    """
    # .items() instead of the Python-2-only .iteritems(): identical
    # behavior, and the code then runs on both Python 2 and 3.
    return [name for name, spider in self._spiders.items()
            if url_is_from_spider(request.url, spider)]
def test_url_is_from_spider(self):
    """A bare Spider matches URLs on its name's domain and subdomains only."""
    spider = Spider(name='example.com')
    # Same registered domain, with or without a subdomain: match.
    self.assertTrue(
        url_is_from_spider('http://www.example.com/some/page.html', spider))
    self.assertTrue(
        url_is_from_spider('http://sub.example.com/some/page.html', spider))
    # Different TLDs: no match.
    self.assertFalse(
        url_is_from_spider('http://www.example.org/some/page.html', spider))
    self.assertFalse(
        url_is_from_spider('http://www.example.net/some/page.html', spider))
def test_url_is_from_spider(self):
    """With no allowed_domains, only the name's domain (example.com) matches."""
    spider = BaseSpider(name="example.com")
    in_domain = ("http://www.example.com/some/page.html",
                 "http://sub.example.com/some/page.html")
    out_of_domain = ("http://www.example.org/some/page.html",
                     "http://www.example.net/some/page.html")
    for url in in_domain:
        self.assertTrue(url_is_from_spider(url, spider))
    for url in out_of_domain:
        self.assertFalse(url_is_from_spider(url, spider))