def test_extraction_encoding(self):
        """Check extracted links for three differently-declared responses.

        One fixture body is wrapped twice (once with an explicit UTF-8
        ``Content-Type`` header, once with no headers); a second fixture
        body is wrapped without headers. The first two responses must
        yield the same links.
        """
        noenc_body = get_testdata('link_extractor', 'linkextractor_noenc.html')
        with_header = HtmlResponse(
            url='http://example.com/utf8',
            body=noenc_body,
            headers={'Content-Type': ['text/html; charset=utf-8']})
        without_header = HtmlResponse(url='http://example.com/noenc',
                                      body=noenc_body)
        latin1_body = get_testdata('link_extractor',
                                   'linkextractor_latin1.html')
        latin1_response = HtmlResponse(url='http://example.com/latin1',
                                       body=latin1_body)

        extractor = LxmlLinkExtractor(unique=False)

        # Same expectation whether or not the charset is declared in headers.
        expected_utf8 = [
            Link(url='http://example.com/sample_%C3%B1.html', text=''),
            Link(url='http://example.com/sample_%E2%82%AC.html',
                 text='sample \xe2\x82\xac text'.decode('utf-8')),
        ]
        for response in (with_header, without_header):
            self.assertEqual(extractor.extract_links(response), expected_utf8)

        expected_latin1 = [
            Link(url='http://example.com/sample_%F1.html', text=''),
            Link(url='http://example.com/sample_%E1.html',
                 text='sample \xe1 text'.decode('latin1')),
        ]
        self.assertEqual(extractor.extract_links(latin1_response),
                         expected_latin1)
    def test_base_url(self):
        """Relative links are resolved against the page's ``<base href>``.

        Three base forms are exercised: a fully-qualified URL, a
        host-relative path (``/``) and a scheme-less URL (``//host/path/``).
        """
        extractor = LxmlLinkExtractor(unique=False)

        # base url is a fully-qualified URL on another domain
        page = '''<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
        <body><p><a href="item/12.html">Item 12</a></p>
        </body></html>'''
        resp = HtmlResponse('http://example.org/somepage/index.html',
                            body=page)
        expected = [
            Link(url='http://otherdomain.com/base/item/12.html',
                 text='Item 12'),
        ]
        self.assertEqual(extractor.extract_links(resp), expected)

        # base url is an absolute path and relative to host
        page = '''<html><head><title>Page title<title><base href="/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>'''
        resp = HtmlResponse('https://example.org/somepage/index.html',
                            body=page)
        expected = [Link(url='https://example.org/item/12.html',
                         text='Item 12')]
        self.assertEqual(extractor.extract_links(resp), expected)

        # base url has no scheme
        page = '''<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />
        <body><p><a href="item/12.html">Item 12</a></p></body></html>'''
        resp = HtmlResponse('https://example.org/somepage/index.html',
                            body=page)
        expected = [
            Link(url='https://noschemedomain.com/path/to/item/12.html',
                 text='Item 12'),
        ]
        self.assertEqual(extractor.extract_links(resp), expected)
 def test_nothing(self):
     """A page with an empty ``<head>`` passes through untouched.

     The middleware must return the very same response object and must
     not put ``canonical_url`` into its meta.
     """
     request = Request('http://a.com')
     response = HtmlResponse(
         request.url,
         body='''<html><head></head><body></body></html>''',
         request=request)
     self.assertIs(response, self.mw.process_response(response))
     self.assertNotIn('canonical_url', response.meta)
 def test_deny_extensions(self):
     """A default extractor returns ``page.html`` but not ``photo.jpg``."""
     markup = '''<a href="page.html">asd</a> and <a href="photo.jpg">'''
     page = HtmlResponse('http://example.org/', body=markup)
     found = LxmlLinkExtractor().extract_links(page)
     expected = [Link(url='http://example.org/page.html', text=u'asd')]
     self.assertEqual(found, expected)
    def test_selector_nested(self):
        '''Nested selector tests.

        Selects the ``div`` with class "two" and checks what absolute
        (``//li``), child-relative (``./ul/li``, ``./li``) and
        descendant-relative (``.//li``) XPath expressions return when they
        are evaluated from that node.
        '''
        body = '''<body>
                    <div class='one'>
                      <ul>
                        <li>one</li><li>two</li>
                      </ul>
                    </div>
                    <div class='two'>
                      <ul>
                        <li>four</li><li>five</li><li>six</li>
                      </ul>
                    </div>
                  </body>'''

        response = HtmlResponse(url='http://example.com', body=body)
        x = self.hxs_cls(response)
        divtwo = x.select('//div[@class="two"]')

        def stripped(xpath):
            # Extract the matches relative to ``divtwo`` and strip whitespace.
            return map(unicode.strip, divtwo.select(xpath).extract())

        second_div_items = ['<li>four</li>', '<li>five</li>', '<li>six</li>']

        # An absolute path ignores the context node and matches every <li>.
        self.assertEqual(stripped('//li'),
                         ['<li>one</li>', '<li>two</li>'] + second_div_items)
        self.assertEqual(stripped('./ul/li'), second_div_items)
        self.assertEqual(stripped('.//li'), second_div_items)
        # <li> elements are not direct children of the div.
        self.assertEqual(divtwo.select('./li').extract(), [])
 def test_meta_refresh_with_high_interval(self):
     """A meta-refresh with a high interval must not trigger a redirect.

     The response built with ``interval=1000`` has to come back from the
     middleware as the very same object.
     """
     request = Request(url='http://example.org')
     page = self._body(interval=1000)
     response = HtmlResponse(url='http://example.org',
                             body=page,
                             request=request)
     self.assertIs(response, self.mw.process_response(response))
 def test_encoded_url(self):
     """The percent-encoded ``%2F`` in the page URL survives link joining."""
     markup = '''<html><body><div><a href="?page=2">BinB</a></body></html>'''
     page = HtmlResponse("http://known.fm/AC%2FDC/",
                         body=markup,
                         encoding='utf8')
     found = LxmlLinkExtractor().extract_links(page)
     expected = [Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB')]
     self.assertEqual(found, expected)
 def test_redirect_urls(self):
     """Two chained redirects record the earlier URLs in ``history``."""
     first_request = Request('http://test.org/first')
     first_response = HtmlResponse(first_request.url,
                                   body=self._body(url='/redirected'),
                                   request=first_request)
     second_request = self.mw.process_response(first_response)
     self.assertIsInstance(second_request, Request)

     second_response = HtmlResponse(second_request.url,
                                    body=self._body(url='/redirected2'),
                                    request=second_request)
     third_request = self.mw.process_response(second_response)
     self.assertIsInstance(third_request, Request)

     first_url = 'http://test.org/first'
     redirected_url = 'http://test.org/redirected'
     self.assertEqual(second_request.url, redirected_url)
     self.assertListEqual(second_request.history, [first_url])
     self.assertEqual(third_request.url, 'http://test.org/redirected2')
     self.assertListEqual(third_request.history,
                          [first_url, redirected_url])
    def test_tag(self):
        """``<link rel="canonical">`` ends up in ``meta['canonical_url']``.

        An absolute href is stored as-is; a relative href is expected to be
        resolved against the request URL. In both cases the middleware must
        return the same response object.
        """
        template = '''<html><head><link rel="canonical" href="%s" /></head></html>'''
        cases = (
            # (href in the page, expected canonical url)
            ('https://b.sk/hello', 'https://b.sk/hello'),  # absolute url
            ('/hello/world', 'http://a.com/hello/world'),  # relative url
        )
        for href, canonical in cases:
            request = Request('http://a.com/pom')
            response = HtmlResponse(request.url,
                                    body=template % href,
                                    request=request)
            self.assertIs(response, self.mw.process_response(response))
            self.assertEqual(response.meta['canonical_url'], canonical)
    def test_max_redirect_times(self):
        """With ``max_redirect_times`` of 1, a second redirect yields ``None``."""
        self.mw.max_redirect_times = 1
        original = Request('http://test.org/max')
        response = HtmlResponse(original.url,
                                body=self._body(),
                                request=original)

        redirected = self.mw.process_response(response)
        self.assertIsInstance(redirected, Request)
        self.assertEqual(len(redirected.history), 1)

        # Feed the same response back in, now attributed to the redirected
        # request.
        response.request = redirected
        self.assertIsNone(self.mw.process_response(response))
 def test_link_text_wrong_encoding(self):
     """A ``\\xed`` byte in a response declared as utf-8 becomes U+FFFD in the link text."""
     markup = '''<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>'''
     page = HtmlResponse('http://www.example.com',
                         body=markup,
                         encoding='utf-8')
     extractor = LxmlLinkExtractor(unique=False)
     expected = [
         Link(url='http://www.example.com/item/12.html',
              text=u'Wrong: \ufffd'),
     ]
     self.assertEqual(extractor.extract_links(page), expected)
 def test_invalid(self):
     """An XHTML document with an XML declaration parses without error and yields no links."""
     # parser shouldn't fail or anything
     markup = '''<?xml version="1.0" encoding="utf-8"?>
         <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
         <html xmlns="http://www.w3.org/1999/xhtml" lang="cs">
         <body>
         </body>
         </html>'''
     page = HtmlResponse('http://www.example.com',
                         body=markup,
                         encoding='utf-8')
     found = LxmlLinkExtractor(unique=False).extract_links(page)
     self.assertEqual(found, [])
 def test_link_nofollow(self):
     """``rel="nofollow"`` on an anchor sets ``nofollow=True`` on its ``Link``."""
     markup = '''
     <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
     <a href="about.html">About us</a>
     '''
     page = HtmlResponse('http://example.org/page.html', body=markup)
     printer_link = Link(url='http://example.org/page.html?action=print',
                         text=u'Printer-friendly page',
                         nofollow=True)
     about_link = Link(url='http://example.org/about.html',
                       text=u'About us',
                       nofollow=False)
     found = LxmlLinkExtractor().extract_links(page)
     self.assertEqual(found, [printer_link, about_link])
    def test_open_in_browser(self):
        """``open_in_browser`` writes a page containing a ``<base>`` tag.

        A stand-in opener reads the file it is handed and checks that a
        ``<base href>`` for the original URL is present. A plain
        ``Response`` (not an ``HtmlResponse``) must raise ``TypeError``.
        """
        url = 'http:///www.example.com/some/page.html'
        body = '<html> <head> <title>test page</title> </head> <body>test body</body> </html>'

        def browser_open(burl):
            path = urlparse.urlparse(burl).path
            if not os.path.exists(path):
                # Fall back to stripping the scheme when the parsed path
                # does not exist on disk.
                path = burl.replace('file://', '')
            # Close the file deterministically instead of leaking the handle.
            with open(path) as page:
                bbody = page.read()
            self.assertIn('<base href="%s">' % url, bbody,
                          '<base> tag not added')
            return True

        response = HtmlResponse(url, body=body)
        self.assertTrue(open_in_browser(response, _openfunc=browser_open),
                        'Browser not called')
        self.assertRaises(TypeError, open_in_browser, Response(url, body=body))
    def test_process_response_force_recalculate_encoding(self):
        """A gzipped body is decompressed and its encoding re-detected.

        The ``Content-Type`` header carries no charset; the only charset
        declaration is the ``<meta>`` tag inside the compressed body. After
        processing, the response must expose the plain body and the
        normalized ``gb2312`` encoding.
        """
        headers = {
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
        }
        f = StringIO()
        plainbody = '''<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">'''
        zf = GzipFile(fileobj=f, mode='wb')
        zf.write(plainbody)
        zf.close()
        # Use a well-formed scheme separator ("://") in the URL.
        response = HtmlResponse('http://www.example.com/page.html',
                                headers=headers,
                                body=f.getvalue())

        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plainbody)
        self.assertEqual(new_response.encoding, normalize_encoding('gb2312'))
# Example #16
    def test_css(self):
        """Parse the fixture with a CSS-based ``S`` template and check that
        two invalid ``S`` constructions raise ``TypeError``."""
        body = get_testdata('pages', 'ip_page.html')
        response = HtmlResponse(url='http://myip.com/list', body=body)
        hxs = response.selector

        valid_ts = S('_',
                     css='div#main',
                     quant='1',
                     children=[
                         S('all_ip', css='span.ip', quant='7'),
                         S('_',
                           css='ul#ip_list',
                           quant='1',
                           children=[S('list_ip', css='span.ip', quant='6')]),
                     ])
        # The parse result is not inspected; the call only has to complete
        # without raising.
        valid_ts.parse(hxs)

        # Only a name, no selector expression at all.
        self.assertRaises(TypeError, S, '_')
        # Both a positional expression and ``css=`` supplied.
        self.assertRaises(TypeError, S, '_', 'div[@id="main]', css='div#main')
    def test_process_links(self):
        """``process_links`` can drop links and rewrite their URLs.

        The callback discards the ``othercat.html`` link and points every
        other link at ``http://gogo.com/``; a single link is expected back.
        """
        def _process(link):
            # Returning None removes the link from the result.
            if link.url.endswith('othercat.html'):
                return None
            link.url = 'http://gogo.com/'
            return link

        markup = '''<html><head><title>Page title<title>
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/">&gt;&gt;</a></p>
        <p><a href="/">mimi</a></p>
        <p><a href="/hello">mimino</a></p>
        </body></html>'''
        page = HtmlResponse('http://example.org/', body=markup)
        extractor = LxmlLinkExtractor()
        found = extractor.extract_links(page, process_links=_process)
        self.assertEqual(found, [Link(url='http://gogo.com/', text=u'>>')])
    def test_http_header_encoding_precedence(self):
        """The charset in the HTTP header wins over the one in ``<meta>``.

        The page declares iso-8859-1 in its ``<meta>`` tag but is encoded
        as utf-8 and served with a utf-8 ``Content-Type`` header; the pound
        sign must be selected as ``u'\\xa3'``.
        """
        # u'\xa3'     = pound symbol in unicode
        # '\xc2\xa3'  = pound symbol encoded as utf-8 (bytes)
        # '\xa3'      = pound symbol encoded as latin-1 / iso-8859-1 (bytes)

        meta = u'<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
        head = u'<head>' + meta + u'</head>'
        body_content = u'<span id="blank">\xa3</span>'
        body = u'<body>' + body_content + u'</body>'
        html = u'<html>' + head + body + u'</html>'
        encoding = 'utf-8'
        html_utf8 = html.encode(encoding)

        headers = {'Content-Type': ['text/html; charset=utf-8']}
        response = HtmlResponse(url='http://example.com',
                                headers=headers,
                                body=html_utf8)
        x = self.hxs_cls(response)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(
            x.select('//span[@id="blank"]/text()').extract(), [u'\xa3'])
 def test_relative_paths(self):
     """Relative hrefs with ``.`` and ``..`` segments resolve to normalized absolute URLs."""
     markup = '''<html><head></head><body>
     <a href="hello/world/"></a>
     <a href="/hello/world/"></a>
     <a href="/hello/././world"></a>
     <a href="/hello/../world"></a>
     <a href="../hello/world"></a>
     <a href="./hello/world"></a>
     <a href="../../hello/././world"></a>
     </body></html>'''
     page = HtmlResponse('http://exmaple.org/yay/', body=markup)
     extractor = LxmlLinkExtractor(unique=False)
     # One expected URL per anchor, in document order.
     expected_urls = [
         'http://exmaple.org/yay/hello/world/',
         'http://exmaple.org/hello/world/',
         'http://exmaple.org/hello/world',
         'http://exmaple.org/world',
         'http://exmaple.org/hello/world',
         'http://exmaple.org/yay/hello/world',
         'http://exmaple.org/hello/world',
     ]
     found_urls = [link.url for link in extractor.extract_links(page)]
     self.assertEqual(found_urls, expected_urls)
    def test_meta_refresh_trough_posted_request(self):
        """A meta-refresh on a POST response is followed with a bare GET.

        The redirected request must use ``GET``, must not carry the
        ``Content-Type`` / ``Content-Length`` headers of the original
        request, and must have an empty body.
        """
        req = Request(url='http://example.org',
                      method='POST',
                      body='test',
                      headers={
                          'Content-Type': 'text/plain',
                          'Content-length': '4'
                      })
        rsp = HtmlResponse(req.url, body=self._body(), request=req)
        req2 = self.mw.process_response(rsp)

        self.assertIsInstance(req2, Request)
        self.assertEqual(req2.url, 'http://example.org/newpage')
        self.assertEqual(req2.method, 'GET')
        self.assertNotIn(
            'Content-Type', req2.headers,
            'Content-Type header must not be present in redirected request')
        self.assertNotIn(
            'Content-Length', req2.headers,
            'Content-Length header must not be present in redirected request')
        # assertFalse exists on both stdlib unittest and Twisted trial test
        # cases, whereas assertNot is a trial-only alias.
        self.assertFalse(req2.body,
                         'Redirected body must be empty, not `%s`' % req2.body)
    def test_basic(self):
        """Anchors are extracted in document order; the ``<img>`` is not a link."""
        markup = '''<html><head><title>Page title<title>
        <body><p><a href="item/12.html">Item 12</a></p>
        <p><a href="/about.html">About us</a></p>
        <img src="/logo.png" alt="Company logo (not a link)" />
        <p><a href="../othercat.html">Other category</a></p>
        <p><a href="/">&gt;&gt;</a></p>
        <p><a href="/" /></p>
        </body></html>'''
        page = HtmlResponse('http://example.org/somepage/index.html',
                            body=markup)
        extractor = LxmlLinkExtractor(unique=False)

        expected = [
            Link(url='http://example.org/somepage/item/12.html',
                 text='Item 12'),
            Link(url='http://example.org/about.html', text='About us'),
            Link(url='http://example.org/othercat.html',
                 text='Other category'),
            Link(url='http://example.org/', text='>>'),
            Link(url='http://example.org/', text=''),
        ]
        self.assertListEqual(extractor.extract_links(page), expected)
# Example #22
 def setUp(self):
     """Store a github.com response whose ``<base>`` tag points at w3schools.com."""
     markup = '''<head>
     <base href="http://www.w3schools.com/" target="_blank"></head>
     <body></body>'''
     self.resp = HtmlResponse('http://github.com/', body=markup)
# Example #23
    def test_xpath(self):
        """Parse the ``ip_page.html`` fixture with a nested XPath ``S``
        template and verify every extracted field."""
        page = get_testdata('pages', 'ip_page.html')
        response = HtmlResponse(url='http://myip.com/list', body=page)
        hxs = response.selector

        # test valid parsing: the template pieces are built one by one, in
        # the order they appear in the final tree
        heading_xpath = 'h1|div[@id="subtitle"]/h2'
        title = S('title', 'h1', quant='1', value='text()')
        full_title_script = S('full_title_script',
                              heading_xpath,
                              quant='2',
                              value='descendant-or-self::text()')
        # although the following statement gives the same elements, it gives them in different order
        # S('full_title_script', '(h1|div[@id="subtitle"]/h2)/descendant-or-self::*', quant='+', value='text()'),
        full_title_no_script = S(
            'full_title_no_script',
            heading_xpath,
            quant='2',
            value='descendant-or-self::*[name()!="script"]/text()')
        full_title_script_bad = S('full_title_script_bad',
                                  '(h1|div[@id="subtitle"]/h2)//*',
                                  quant='+',
                                  value='text()')
        ip_entry = S('_ips',
                     'li',
                     quant='6',
                     group='ips',
                     children=[
                         S('ip',
                           'span[@class="ip"]',
                           quant='1',
                           value='text()'),
                         S('port',
                           'span[@class="port"]',
                           quant='1',
                           value='text()'),
                         S('ip_port',
                           'self::*',
                           value='descendant-or-self::text()'),
                     ])
        ip_list = S('_list',
                    'ul[@id="ip_list"]',
                    quant='1',
                    children=[ip_entry])
        url = S('url',
                'descendant-or-self::a',
                quant='1',
                value='@href',
                callback=S.absolute_url)
        empty = S('empty', 'div[@id="empty"]', quant='1', value='text()')
        footer = S('footer',
                   'following-sibling::div[@id="footer"]',
                   quant='1',
                   children=[
                       S('footer_links',
                         'a',
                         quant='+',
                         value='@href',
                         callback=S.absolute_url)
                   ])
        nonexistent = S('nonexistent', 'div/div/div', quant='?', value='text()')

        valid_ts = S('_',
                     '//div[@id="main"]',
                     quant='1',
                     children=[
                         title, full_title_script, full_title_no_script,
                         full_title_script_bad, ip_list, url, empty, footer,
                         nonexistent
                     ])

        # validation without context, when context is expected
        self.assertRaises(SValidationError, valid_ts.parse, hxs)

        parsed = valid_ts.parse(response)

        expected_keys = [
            'title', 'full_title_script', 'full_title_no_script',
            'full_title_script_bad', 'ips', 'url', 'empty', 'footer',
            'footer_links'
        ]  # nonexistent is missing!
        self.assertItemsEqual(parsed, expected_keys)

        # title: text inside <strong> is not parsed
        self.assertIsInstance(parsed['title'], list)
        self.assertListEqual(parsed['title'],
                             [u'Here is the list of some ', u' addresses '])

        # full_title: order of text nodes is preserved
        with_script = [
            u'Here is the list of some ', u'ip', u' addresses ', u'!!!',
            u'Just ', u'some', u' ', u'this is bad', u' other text.'
        ]
        self.assertListEqual(parsed['full_title_script'], with_script)
        # same result as before, excluding script content
        without_script = [
            u'Here is the list of some ', u'ip', u' addresses ', u'!!!',
            u'Just ', u'some', u' ', u' other text.'
        ]
        self.assertListEqual(parsed['full_title_no_script'], without_script)
        # this only took the inner nodes
        self.assertListEqual(parsed['full_title_script_bad'],
                             [u'ip', u'!!!', u'some', u'this is bad'])

        # ips
        self.assertIsInstance(parsed['ips'], list)
        self.assertEqual(len(parsed['ips']), 6)
        first = parsed['ips'][0]
        self.assertIsInstance(first, defaultdict)
        self.assertItemsEqual(first, ['ip', 'port', 'ip_port'])
        self.assertListEqual(first['ip'], [u'123.44.1.9'])
        # parsed objects are always unicode
        self.assertIsInstance(first['ip'][0], unicode)
        self.assertListEqual(first['port'], [u'80'])
        self.assertListEqual(first['ip_port'], [u'123.44.1.9', u':', u'80'])

        # url: even urls are unicode after being processed
        self.assertListEqual(parsed['url'], [u'http://myip.com/url1'])
        self.assertIsInstance(parsed['url'][0], unicode)

        # empty: even though we matched 1 empty tag, there was no text and
        # the returned list is empty
        self.assertListEqual(parsed['empty'], [])

        # footer
        self.assertIsInstance(parsed['footer'][0], HtmlXPathSelector)
        self.assertListEqual(parsed['footer_links'],
                             [u'http://myip.com/url2', u'http://google.com/'])
    def test_get_meta_refresh(self):
        url = 'http://example.org'
        body = '''
            <html>
                <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
                <body>blahablsdfsal&amp;</body>
            </html>'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (5, 'http://example.org/newpage'))

        # refresh without url should return (None, None)
        body = '''<meta http-equiv="refresh" content="5" />'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (None, None))

        body = '''<meta http-equiv="refresh" content="5;
            url=http://example.org/newpage" /></head>'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (5, 'http://example.org/newpage'))

        # meta refresh in multiple lines
        body = '''<html><head>
<META
HTTP-EQUIV="Refresh"
CONTENT="1; URL=http://example.org/newpage">'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (1, 'http://example.org/newpage'))

        # entities in the redirect url
        body = '''<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (3, 'http://www.example.com/other'))

        url = 'http://example.com/page/this.html'
        # relative redirects
        body = '''<meta http-equiv="refresh" content="3; url=other.html">'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (3, 'http://example.com/page/other.html'))

        # non-standard encodings (utf-16)
        url = 'http://example.com'
        body = '''<meta http-equiv="refresh" content="3; url=http://example.com/redirect">'''
        body = body.decode('ascii').encode('utf-16')
        self.assertEqual(
            get_meta_refresh(HtmlResponse(url, body=body, encoding='utf-16')),
            (3,
             'http://example.com/%FF%FEh%00t%00t%00p%00:%00/%00/%00e%00x%00a%00m%00p%00l%00e%00.%00c%00o%00m%00/%00r%00e%00d%00i%00r%00e%00c%00t%00'
             ))

        # non-ascii chars in the url (utf8 - default)
        body = '''<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (3, 'http://example.com/to%C2%A3'))

        # non-ascii chars in the url (latin1)
        body = '''<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">'''
        self.assertEqual(
            get_meta_refresh(HtmlResponse(url, body=body, encoding='latin1')),
            (3, 'http://example.com/to%A3'))

        # html commented meta refresh header must not directed
        body = '''<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (None, None))

        # html comments must not interfere with uncommented meta refresh header
        body = '''<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (3, 'http://example.com/'))

        # float refresh intervals
        body = '''<meta http-equiv="refresh" content=".1;URL=index.html" />'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (0.1, 'http://example.com/index.html'))

        body = '''<meta http-equiv="refresh" content="3.1;URL=index.html" />'''
        self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                         (3.1, 'http://example.com/index.html'))
 def setUp(self):
     """Load the ``sgml_linkextractor.html`` fixture into ``self.response``."""
     fixture = get_testdata('link_extractor', 'sgml_linkextractor.html')
     self.response = HtmlResponse(url='http://example.com/index',
                                  body=fixture)
 def test_priority_adjust(self):
     """The request returned by the middleware has a higher priority than the original."""
     req = Request('http://a.com')
     rsp = HtmlResponse(req.url, body=self._body(), request=req)
     req2 = self.mw.process_response(rsp)
     # assertGreater reports both priorities on failure, unlike
     # assertTrue(a > b) which only reports "False is not true".
     self.assertGreater(req2.priority, req.priority)
 def test_meta_refresh(self):
     """Processing a ``self._body()`` page yields a ``Request`` for ``/newpage``."""
     request = Request(url='http://example.org')
     response = HtmlResponse(request.url,
                             body=self._body(),
                             request=request)
     redirected = self.mw.process_response(response)
     self.assertIsInstance(redirected, Request)
     self.assertEqual(redirected.url, 'http://example.org/newpage')
# Example #28
 def setUp(self):
     """Parse the ``ip_page.html`` fixture with ``basic_ts`` and keep the result in ``self.parsed``."""
     fixture = get_testdata('pages', 'ip_page.html')
     page = HtmlResponse(url='http://myip.com/list', body=fixture)
     self.parsed = basic_ts.parse(page.selector)
 def test_empty_body(self):
     """Responses with no body, or a whitespace-only body, yield no links."""
     extractor = LxmlLinkExtractor()

     no_body = HtmlResponse('http://www.example.com')
     self.assertEqual(extractor.extract_links(no_body), [])

     whitespace_only = HtmlResponse('http://www.example.com',
                                    body='\n\r\n\n')
     self.assertEqual(extractor.extract_links(whitespace_only), [])