Code Example #1
File: test_utils_url.py  Project: 01-/scrapy
 def test_url_is_from_spider_class_attributes(self):
     class MySpider(Spider):
         name = 'example.com'
     self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', MySpider))
     self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', MySpider))
     self.assertFalse(url_is_from_spider('http://www.example.org/some/page.html', MySpider))
     self.assertFalse(url_is_from_spider('http://www.example.net/some/page.html', MySpider))
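The test above exercises the basic rule of url_is_from_spider: when a spider defines no allowed_domains, its name is treated as its domain, and subdomains of that domain also match. A minimal standalone call, sketched under the assumption that a recent Scrapy is installed (the helper lives in scrapy.utils.url), looks like this:

 from scrapy import Spider
 from scrapy.utils.url import url_is_from_spider

 class ExampleSpider(Spider):
     # With no allowed_domains, the spider name doubles as its domain.
     name = 'example.com'

 print(url_is_from_spider('http://sub.example.com/some/page.html', ExampleSpider))  # True
 print(url_is_from_spider('http://www.example.org/some/page.html', ExampleSpider))  # False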
Code Example #2
File: test_utils_url.py  Project: pyarnold/scrapy
 def test_url_is_from_spider(self):
     spider = Spider(name='example.com')
     self.assertTrue(
         url_is_from_spider('http://www.example.com/some/page.html', spider))
     self.assertTrue(
         url_is_from_spider('http://sub.example.com/some/page.html', spider))
     self.assertFalse(
         url_is_from_spider('http://www.example.org/some/page.html', spider))
     self.assertFalse(
         url_is_from_spider('http://www.example.net/some/page.html', spider))
Code Example #3
File: test_utils_url.py  Project: pyarnold/scrapy
    def test_url_is_from_spider_with_allowed_domains(self):
        spider = Spider(
            name='example.com', allowed_domains=['example.org', 'example.net'])
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html', spider))
        self.assertTrue(
            url_is_from_spider('http://sub.example.com/some/page.html', spider))
        self.assertTrue(
            url_is_from_spider('http://example.com/some/page.html', spider))
        self.assertTrue(
            url_is_from_spider('http://www.example.org/some/page.html', spider))
        self.assertTrue(
            url_is_from_spider('http://www.example.net/some/page.html', spider))
        self.assertFalse(
            url_is_from_spider('http://www.example.us/some/page.html', spider))

        spider = Spider(
            name='example.com', allowed_domains=set(('example.com', 'example.net')))
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html', spider))

        spider = Spider(
            name='example.com', allowed_domains=('example.com', 'example.net'))
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html', spider))
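This test shows that the spider's name and its allowed_domains are combined: example.com URLs still match through the name even though allowed_domains only lists example.org and example.net, and lists, sets, and tuples of domains are all accepted. The check can be sketched roughly as below; this is a simplified illustration of the behaviour the tests document (Scrapy's own helper delegates the matching to url_is_from_any_domain), not the library's actual implementation:

 from urllib.parse import urlparse

 def url_matches_spider(url, spider):
     """Simplified sketch: the URL's hostname must equal, or be a
     subdomain of, the spider name or any entry in allowed_domains."""
     domains = [spider.name] + list(getattr(spider, 'allowed_domains', None) or [])
     host = (urlparse(url).hostname or '').lower()
     return any(host == d.lower() or host.endswith('.' + d.lower()) for d in domains)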
Code Example #4
    def test_url_is_from_spider_class_attributes(self):
        class MySpider(Spider):
            name = 'example.com'

        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               MySpider))
        self.assertTrue(
            url_is_from_spider('http://sub.example.com/some/page.html',
                               MySpider))
        self.assertFalse(
            url_is_from_spider('http://www.example.org/some/page.html',
                               MySpider))
        self.assertFalse(
            url_is_from_spider('http://www.example.net/some/page.html',
                               MySpider))
Code Example #5
File: url.py  Project: bright-pan/ebooks
 def process_response(self, request, response, spider):
     if isinstance(response, HtmlResponse) and response.body and getattr(spider, 'follow_canonical_links', False):
         rel_canonical = self._extractor.extract_links(response)
         if rel_canonical:
             rel_canonical = rel_canonical[0].url
             if rel_canonical != request.url and url_is_from_spider(rel_canonical, spider):
                 log.msg("Redirecting (rel=\"canonical\") to %s from %s" % (rel_canonical, request), level=log.DEBUG, spider=spider)
                 return request.replace(url=rel_canonical, callback=lambda r: r if r.status == 200 else response)
     return response
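This downloader middleware follows rel="canonical" links for spiders that opt in, and uses url_is_from_spider to make sure the canonical URL still belongs to the spider before re-issuing the request (note that it relies on the old scrapy.log API). Enabling a middleware like this requires registering it in the project settings and setting the opt-in attribute on the spider; the module path and priority below are placeholders for illustration, not part of the snippet above:

 # settings.py (hypothetical module path and priority)
 DOWNLOADER_MIDDLEWARES = {
     'myproject.middlewares.CanonicalLinkMiddleware': 543,
 }

 # spiders/example.py
 from scrapy import Spider

 class ExampleSpider(Spider):
     name = 'example.com'
     # Opt-in attribute checked by the middleware's process_response()
     follow_canonical_links = True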
Code Example #6
File: canonical.py  Project: nyov/scrapyext
 def process_response(self, request, response, spider):
     if isinstance(response, HtmlResponse) and response.body and getattr(spider, 'follow_canonical_links', False):
         rel_canonical = self._extractor.extract_links(response)
         if rel_canonical:
             rel_canonical = rel_canonical[0].url
             if rel_canonical != request.url and url_is_from_spider(rel_canonical, spider):
                 log.msg("Redirecting (rel=\"canonical\") to %s from %s" % (rel_canonical, request), level=log.DEBUG, spider=spider)
                 return request.replace(url=rel_canonical, callback=lambda r: r if r.status == 200 else response)
     return response
Code Example #7
File: test_utils_url.py  Project: lopuhin/scrapy
    def test_url_is_from_spider_with_allowed_domains(self):
        spider = Spider(name="example.com", allowed_domains=["example.org", "example.net"])
        self.assertTrue(url_is_from_spider("http://www.example.com/some/page.html", spider))
        self.assertTrue(url_is_from_spider("http://sub.example.com/some/page.html", spider))
        self.assertTrue(url_is_from_spider("http://example.com/some/page.html", spider))
        self.assertTrue(url_is_from_spider("http://www.example.org/some/page.html", spider))
        self.assertTrue(url_is_from_spider("http://www.example.net/some/page.html", spider))
        self.assertFalse(url_is_from_spider("http://www.example.us/some/page.html", spider))

        spider = Spider(name="example.com", allowed_domains=set(("example.com", "example.net")))
        self.assertTrue(url_is_from_spider("http://www.example.com/some/page.html", spider))

        spider = Spider(name="example.com", allowed_domains=("example.com", "example.net"))
        self.assertTrue(url_is_from_spider("http://www.example.com/some/page.html", spider))
Code Example #8
    def test_url_is_from_spider_with_allowed_domains(self):
        spider = Spider(name='example.com', allowed_domains=['example.org', 'example.net'])
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://example.com/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://www.example.org/some/page.html', spider))
        self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', spider))
        self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', spider))

        spider = Spider(name='example.com', allowed_domains=set(('example.com', 'example.net')))
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))

        spider = Spider(name='example.com', allowed_domains=('example.com', 'example.net'))
        self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
Code Example #9
File: spidermanager.py  Project: serkanh/scrapy
 def fromurl(self, url):
     if self.force_domain:
         return self._spiders.get(self.force_domain)
     domain = urlparse.urlparse(url).hostname
     domain = str(domain).replace('www.', '')
     if domain:
         if domain in self._spiders:         # try first locating by domain
             return self._spiders[domain]
         else:                               # else search spider by spider
             plist = self._spiders.values()
             for p in plist:
                 if url_is_from_spider(url, p):
                     return p
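This older SpiderManager excerpt is Python 2 code (urlparse.urlparse, dict values used as a list): it first tries a direct hostname lookup and only then falls back to url_is_from_spider for each registered spider. The hostname normalisation, roughly translated to Python 3, might look like this:

 from urllib.parse import urlparse

 def hostname_for_lookup(url):
     # Take the hostname and drop a leading "www." before using it as a key,
     # mirroring the intent of the str(domain).replace('www.', '') line above.
     host = urlparse(url).hostname or ''
     return host[4:] if host.startswith('www.') else host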
Code Example #10
     def process_response(self, request, response, spider):
         if isinstance(response, HtmlResponse) and response.body and getattr(spider, 'follow_canonical_links', False):
             rel_canonical = self._extractor.extract_links(response)
             if rel_canonical:
                 rel_canonical = rel_canonical[0].url
                 if rel_canonical != request.url and url_is_from_spider(rel_canonical, spider):
                     log.msg("Redirecting (rel=\"canonical\") to %s from %s" % (rel_canonical, request), level=log.DEBUG, spider=spider)
                     return request.replace(url=rel_canonical, callback=lambda r: r if r.status == 200 else response)
         return response
  
 # Snippet imported from snippets.scrapy.org (which no longer works)
 # author: pablo
 # date  : Aug 27, 2010
Code Example #11
 def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
     class MySpider(Spider):
         name = 'example.com'
         allowed_domains = ('example.org', 'example.net')
     self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', MySpider))
     self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', MySpider))
     self.assertTrue(url_is_from_spider('http://example.com/some/page.html', MySpider))
     self.assertTrue(url_is_from_spider('http://www.example.org/some/page.html', MySpider))
     self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', MySpider))
     self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', MySpider))
Code Example #12
    def test_url_is_from_spider_with_allowed_domains_class_attributes(self):
        class MySpider(BaseSpider):
            name = "example.com"
            allowed_domains = ["example.org", "example.net"]

        self.assertTrue(url_is_from_spider("http://www.example.com/some/page.html", MySpider))
        self.assertTrue(url_is_from_spider("http://sub.example.com/some/page.html", MySpider))
        self.assertTrue(url_is_from_spider("http://example.com/some/page.html", MySpider))
        self.assertTrue(url_is_from_spider("http://www.example.org/some/page.html", MySpider))
        self.assertTrue(url_is_from_spider("http://www.example.net/some/page.html", MySpider))
        self.assertFalse(url_is_from_spider("http://www.example.us/some/page.html", MySpider))
Code Example #13
File: spider.py  Project: 535521469/crawler_sth
 def handles_request(cls, request):
     return url_is_from_spider(request.url, cls)
Code Example #14
File: spider.py  Project: ddesign84/scrapy
 def handles_request(cls, request):
     """
     判断当前请求的URL是否属于当前蜘蛛
     allowed_domains
     """
     return url_is_from_spider(request.url, cls)
Code Example #15
File: spidermanager.py  Project: kenzouyeh/scrapy
 def find_by_request(self, request):
     """Returns list of spiders names that match the given Request"""
     return [name for name, spider in self._spiders.iteritems()
             if url_is_from_spider(request.url, spider)]
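find_by_request ties the previous examples together: it returns the names of all registered spiders whose name or allowed_domains match the request URL via url_is_from_spider (the excerpt uses Python 2's iteritems()). In later Scrapy versions the same lookup is exposed on the spider loader; a usage sketch, assuming it runs inside a Scrapy project:

 from scrapy import Request
 from scrapy.spiderloader import SpiderLoader
 from scrapy.utils.project import get_project_settings

 loader = SpiderLoader.from_settings(get_project_settings())
 # Names of all spiders whose name/allowed_domains match the URL.
 print(loader.find_by_request(Request('http://www.example.com/some/page.html')))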
Code Example #16
 def test_url_is_from_spider(self):
     spider = Spider(name='example.com')
     self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
     self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
     self.assertFalse(url_is_from_spider('http://www.example.org/some/page.html', spider))
     self.assertFalse(url_is_from_spider('http://www.example.net/some/page.html', spider))
Code Example #17
 def handles_request(cls, request):
     return url_is_from_spider(request.url, cls)
Code Example #18
 def test_url_is_from_spider(self):
     spider = BaseSpider(name="example.com")
     self.assertTrue(url_is_from_spider("http://www.example.com/some/page.html", spider))
     self.assertTrue(url_is_from_spider("http://sub.example.com/some/page.html", spider))
     self.assertFalse(url_is_from_spider("http://www.example.org/some/page.html", spider))
     self.assertFalse(url_is_from_spider("http://www.example.net/some/page.html", spider))