def test_extraction(self):
    # An extractor built with default arguments is run against
    # self.response (prepared outside this method) and must yield exactly
    # the four links below, in this order.
    extractor = RegexLinkExtractor()
    found = extractor.extract_links(self.response)
    expected = [
        Link(url='http://example.com/sample2.html', text=u'sample 2'),
        Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
        Link(url='http://www.google.com/something', text=u''),
        Link(url='http://example.com/innertag.html', text=u'inner tag'),
    ]
    self.assertEqual(found, expected)
def test_link_wrong_href(self):
    # The second anchor's href ("http://[example.org/item2.html") has a
    # stray "[" in the host part. The expectation below contains only
    # items 1 and 3, i.e. the malformed link is left out while its
    # neighbours are still extracted.
    html = """ <a href="http://example.org/item1.html">Item 1</a> <a href="http://[example.org/item2.html">Item 2</a> <a href="http://example.org/item3.html">Item 3</a> """
    response = HtmlResponse("http://example.org/index.html", body=html)
    extractor = RegexLinkExtractor()
    extracted = list(extractor.extract_links(response))
    expected = [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ]
    self.assertEqual(extracted, expected)
def test_extraction(self):
    # Default-argument extractor applied to self.response (prepared
    # outside this method); the result must equal the four links listed
    # here, in the same order.
    expected_links = [
        Link(url="http://example.com/sample2.html", text=u"sample 2"),
        Link(url="http://example.com/sample3.html", text=u"sample 3 text"),
        Link(url="http://www.google.com/something", text=u""),
        Link(url="http://example.com/innertag.html", text=u"inner tag"),
    ]
    extractor = RegexLinkExtractor()
    self.assertEqual(extractor.extract_links(self.response), expected_links)
def test_html_base_href(self):
    # The page is served from http://a.com/ but declares
    # <base href="http://b.com/">. The relative href "test.html" is
    # expected to come back as http://b.com/test.html, i.e. resolved
    # against the <base> element rather than the response URL.
    html = """ <html> <head> <base href="http://b.com/"> </head> <body> <a href="test.html"></a> </body> </html> """
    response = HtmlResponse("http://a.com/", body=html)
    extractor = RegexLinkExtractor()
    extracted = list(extractor.extract_links(response))
    expected = [
        Link(url='http://b.com/test.html', text=u'', nofollow=False),
    ]
    self.assertEqual(extracted, expected)