def test_extraction(self):
    # Default arguments
    lx = RegexLinkExtractor()
    self.assertEqual(lx.extract_links(self.response),
                     [Link(url='http://example.com/sample2.html', text=u'sample 2'),
                      Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                      Link(url='http://www.google.com/something', text=u''),
                      Link(url='http://example.com/innertag.html', text=u'inner tag')])
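def test_extraction_deny(self):
    # Sketch, not part of the original suite: the same fixture with a deny
    # pattern. Assumes RegexLinkExtractor accepts the standard allow/deny
    # filtering arguments and that self.response is the fixture used in
    # test_extraction above, so only the google link should be dropped.
    lx = RegexLinkExtractor(deny=r'google')
    self.assertEqual(lx.extract_links(self.response),
                     [Link(url='http://example.com/sample2.html', text=u'sample 2'),
                      Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                      Link(url='http://example.com/innertag.html', text=u'inner tag')])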
def test_link_wrong_href(self):
    # The second href is malformed (unbalanced bracket in the host) and
    # should be skipped rather than break extraction.
    html = """
    <a href="http://example.org/item1.html">Item 1</a>
    <a href="http://[example.org/item2.html">Item 2</a>
    <a href="http://example.org/item3.html">Item 3</a>
    """
    response = HtmlResponse("http://example.org/index.html", body=html)
    lx = RegexLinkExtractor()
    self.assertEqual([link for link in lx.extract_links(response)], [
        Link(url='http://example.org/item1.html', text=u'Item 1', nofollow=False),
        Link(url='http://example.org/item3.html', text=u'Item 3', nofollow=False),
    ])
def test_html_base_href(self):
    # Relative links should be resolved against the <base href>, not the
    # response URL.
    html = """
    <html>
        <head>
            <base href="http://b.com/">
        </head>
        <body>
            <a href="test.html"></a>
        </body>
    </html>
    """
    response = HtmlResponse("http://a.com/", body=html)
    lx = RegexLinkExtractor()
    self.assertEqual([link for link in lx.extract_links(response)], [
        Link(url='http://b.com/test.html', text=u'', nofollow=False),
    ])
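
# FallbackLinkExtractor is not a Scrapy class and is not defined in this
# snippet; likewise, the Scrapy imports (CrawlSpider, Rule, LinkExtractor,
# RegexLinkExtractor) and the DOMAIN / ALLOWED_RE constants are assumed to be
# defined elsewhere. Below is a minimal sketch of one plausible implementation:
# it delegates to the first wrapped extractor and falls back to the next one
# when extraction fails or yields no links. The actual class used with the
# spider below may differ.
class FallbackLinkExtractor(object):

    def __init__(self, extractors):
        self.extractors = extractors

    def extract_links(self, response):
        for lx in self.extractors:
            try:
                links = lx.extract_links(response)
            except Exception:
                # This extractor could not parse the response; try the next one.
                continue
            if links:
                return links
        return []
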
class MySpider(CrawlSpider):
    name = 'recorder'
    start_urls = [
        'http://' + DOMAIN,
    ]
    allowed_domains = [DOMAIN]

    rules = [Rule(FallbackLinkExtractor([
        LinkExtractor(allow=ALLOWED_RE),
        RegexLinkExtractor(allow=ALLOWED_RE),
    ]), callback='parse_page', follow=True)]

    def parse_page(self, response):
        # No-op callback; the crawl only exercises the extraction rule above.
        pass
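
# Usage sketch, assuming DOMAIN and ALLOWED_RE are defined at module level and
# a recent Scrapy version; the LOG_LEVEL setting is only illustrative.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess({'LOG_LEVEL': 'INFO'})
    process.crawl(MySpider)
    process.start()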