Esempio n. 1
0
    def test_attrs(self):
        """Exercise the ``attrs`` argument of the link extractor under test.

        Four configurations are checked in turn against their expected
        ``Link`` lists: ``attrs="href"``; ``href``/``src`` combined with
        explicit tags and an empty ``deny_extensions``; ``attrs=None``; and
        an ``SgmlLinkExtractor`` run on a small inline document.
        """
        # 1) Only the "href" attribute is requested.
        href_extractor = self.extractor_cls(attrs="href")
        expected_href_links = [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ]
        self.assertEqual(href_extractor.extract_links(self.response),
                         expected_href_links)

        # 2) "href" and "src" on a/area/img tags with no denied extensions:
        #    the expected list additionally contains the sample2.jpg link.
        multi_attr_options = {
            "attrs": ("href", "src"),
            "tags": ("a", "area", "img"),
            "deny_extensions": (),
        }
        multi_attr_extractor = self.extractor_cls(**multi_attr_options)
        expected_multi_attr_links = [
            Link(url='http://example.com/sample1.html', text=u''),
            Link(url='http://example.com/sample2.html', text=u'sample 2'),
            Link(url='http://example.com/sample2.jpg', text=u''),
            Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
            Link(url='http://www.google.com/something', text=u''),
            Link(url='http://example.com/innertag.html', text=u'inner tag'),
        ]
        self.assertEqual(multi_attr_extractor.extract_links(self.response),
                         expected_multi_attr_links)

        # 3) attrs=None: no links are expected at all.
        no_attr_extractor = self.extractor_cls(attrs=None)
        self.assertEqual(no_attr_extractor.extract_links(self.response), [])

        # 4) Inline document whose <a> tag carries "ref" instead of "href":
        #    only the <area> link is expected.
        markup = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        inline_response = HtmlResponse("http://example.com/index.html", body=markup)
        sgml_extractor = SgmlLinkExtractor(attrs="href")
        expected_inline_links = [
            Link(url='http://example.com/sample1.html', text=u''),
        ]
        self.assertEqual(sgml_extractor.extract_links(inline_response),
                         expected_inline_links)
 def test_deny_extensions(self):
     """Check extraction when ``deny_extensions`` is given as the string "jpg".

     The document links to ``page.html`` and ``photo.jpg``; only the
     ``page.html`` link is expected in the result.
     """
     markup = """<a href="page.html">asd</a> and <a href="photo.jpg">"""
     page = HtmlResponse("http://example.org/", body=markup)
     extractor = SgmlLinkExtractor(deny_extensions="jpg")

     extracted = extractor.extract_links(page)
     expected = [Link(url='http://example.org/page.html', text=u'asd')]
     self.assertEqual(extracted, expected)
Esempio n. 3
0
    def test_attrs(self):
        """Verify extracted links for several ``attrs`` configurations.

        The first three cases build ``self.extractor_cls`` with different
        keyword options and compare its output on ``self.response`` with an
        expected ``Link`` list. The last case runs ``SgmlLinkExtractor`` on
        a small inline document.
        """
        # (constructor options, links expected from self.response), checked
        # in this order.
        cases = [
            (
                {"attrs": "href"},
                [
                    Link(url='http://example.com/sample1.html', text=u''),
                    Link(url='http://example.com/sample2.html', text=u'sample 2'),
                    Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                    Link(url='http://www.google.com/something', text=u''),
                    Link(url='http://example.com/innertag.html', text=u'inner tag'),
                ],
            ),
            (
                {
                    "attrs": ("href", "src"),
                    "tags": ("a", "area", "img"),
                    "deny_extensions": (),
                },
                [
                    Link(url='http://example.com/sample1.html', text=u''),
                    Link(url='http://example.com/sample2.html', text=u'sample 2'),
                    Link(url='http://example.com/sample2.jpg', text=u''),
                    Link(url='http://example.com/sample3.html', text=u'sample 3 text'),
                    Link(url='http://www.google.com/something', text=u''),
                    Link(url='http://example.com/innertag.html', text=u'inner tag'),
                ],
            ),
            # attrs=None is expected to produce no links.
            ({"attrs": None}, []),
        ]
        for options, expected_links in cases:
            extractor = self.extractor_cls(**options)
            self.assertEqual(extractor.extract_links(self.response), expected_links)

        # Inline document: the <a> tag has "ref" rather than "href", so only
        # the <area> link is expected.
        markup = """<html><area href="sample1.html"></area><a ref="sample2.html">sample text 2</a></html>"""
        inline_response = HtmlResponse("http://example.com/index.html", body=markup)
        sgml_extractor = SgmlLinkExtractor(attrs="href")
        self.assertEqual(
            sgml_extractor.extract_links(inline_response),
            [Link(url='http://example.com/sample1.html', text=u'')])
 def test_attrs_sgml(self):
     """Check ``SgmlLinkExtractor(attrs="href")`` on a two-tag document.

     The ``<area>`` tag has an ``href`` while the ``<a>`` tag only has a
     ``ref`` attribute; a single link (from ``<area>``) is expected.
     """
     markup = """<html><area href="sample1.html"></area>
     <a ref="sample2.html">sample text 2</a></html>"""
     page = HtmlResponse("http://example.com/index.html", body=markup)
     extractor = SgmlLinkExtractor(attrs="href")

     expected = [Link(url='http://example.com/sample1.html', text=u'')]
     self.assertEqual(extractor.extract_links(page), expected)
Esempio n. 5
0
 def test_link_nofollow(self):
     """Check the ``nofollow`` flag on links from a default extractor.

     The link marked ``rel="nofollow"`` is expected with ``nofollow=True``
     and the link without a ``rel`` attribute with ``nofollow=False``.
     """
     markup = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     """
     page = HtmlResponse("http://example.org/page.html", body=markup)
     extractor = SgmlLinkExtractor()

     # Materialise the result as a list before comparing.
     extracted = list(extractor.extract_links(page))
     expected = [
         Link(url='http://example.org/page.html?action=print', text=u'Printer-friendly page', nofollow=True),
         Link(url='http://example.org/about.html', text=u'About us', nofollow=False),
     ]
     self.assertEqual(extracted, expected)
 def test_link_nofollow(self):
     """Check the ``nofollow`` flag for three differently marked links.

     Expected flags: ``rel="nofollow"`` -> True, no ``rel`` -> False, and
     ``rel="external nofollow"`` -> True.
     """
     markup = """
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     <a href="http://google.com/something" rel="external nofollow">Something</a>
     """
     page = HtmlResponse("http://example.org/page.html", body=markup)
     extractor = SgmlLinkExtractor()

     expected = [
         Link(url="http://example.org/page.html?action=print", text=u"Printer-friendly page", nofollow=True),
         Link(url="http://example.org/about.html", text=u"About us", nofollow=False),
         Link(url="http://google.com/something", text=u"Something", nofollow=True),
     ]
     # Materialise the result as a list before comparing.
     extracted = list(extractor.extract_links(page))
     self.assertEqual(extracted, expected)
Esempio n. 7
0
 def parse(self, response):
     """Follow every Douban movie subject link found in ``response``.

     Links are extracted with an ``SgmlLinkExtractor`` restricted by an
     ``allow`` pattern for ``https://movie.douban.com/subject/<digits>``.

     Yields:
         One ``Request`` per extracted link, sent with ``self.headers`` and
         handled by ``self.parse_item``.
     """
     # The dots in the host name are escaped so they match a literal "."
     # instead of any character; the pattern is passed as a plain string
     # (the previous surrounding parentheses did not make it a tuple).
     subject_links = SgmlLinkExtractor(
         allow=r'https://movie\.douban\.com/subject/\d+')
     for link in subject_links.extract_links(response):
         yield Request(link.url, callback=self.parse_item,
                       headers=self.headers)