def test_extraction(self):
    """Extractor with default arguments should pick up every link in the fixture response."""
    lx = RegexLinkExtractor()
    extracted = lx.extract_links(self.response)
    # Expected links, in document order, as found in self.response's fixture HTML.
    expected = [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    self.assertEqual(extracted, expected)
def test_link_wrong_href(self):
    """A malformed href (stray '[') must be skipped while valid neighbours are kept."""
    html = """ <a href="http://example.org/item1.html">Item 1</a> <a href="http://[example.org/item2.html">Item 2</a> <a href="http://example.org/item3.html">Item 3</a> """
    response = HtmlResponse("http://example.org/index.html", body=html)
    lx = RegexLinkExtractor()
    links = list(lx.extract_links(response))
    # Item 2's URL is unparseable, so only items 1 and 3 survive extraction.
    self.assertEqual(links, [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ])
class MySpider(CrawlSpider):
    """Crawl spider that follows ALLOWED_RE links via a fallback chain of extractors."""

    name = 'recorder'
    allowed_domains = [DOMAIN]
    start_urls = ['http://' + DOMAIN]
    # Try the primary LinkExtractor first, falling back to the regex-based one;
    # every matched page is handed to parse_page and followed further.
    rules = [
        Rule(
            FallbackLinkExtractor([
                LinkExtractor(allow=ALLOWED_RE),
                RegexLinkExtractor(allow=ALLOWED_RE),
            ]),
            callback='parse_page',
            follow=True,
        ),
    ]

    def parse_page(self, response):
        # Intentionally a no-op: the crawl itself (which URLs get visited)
        # is what this spider is used to record.
        pass
def test_html_base_href(self):
    """Relative hrefs must resolve against the <base href>, not the response URL."""
    html = """ <html> <head> <base href="http://b.com/"> </head> <body> <a href="test.html"></a> </body> </html> """
    response = HtmlResponse("http://a.com/", body=html)
    lx = RegexLinkExtractor()
    links = list(lx.extract_links(response))
    # 'test.html' joins with the base tag's b.com, overriding the a.com response URL.
    self.assertEqual(links, [
        Link(url='http://b.com/test.html', text=u'', nofollow=False),
    ])