def test_extraction_encoding(self):
    body = get_testdata('link_extractor', 'linkextractor_noenc.html')
    response_utf8 = HtmlResponse(
        url='http://example.com/utf8', body=body,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    response_noenc = HtmlResponse(url='http://example.com/noenc', body=body)
    body = get_testdata('link_extractor', 'linkextractor_latin1.html')
    response_latin1 = HtmlResponse(url='http://example.com/latin1', body=body)

    lx = LxmlLinkExtractor(unique=False)
    self.assertEqual(lx.extract_links(response_utf8), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8')),
    ])
    self.assertEqual(lx.extract_links(response_noenc), [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8')),
    ])
    self.assertEqual(lx.extract_links(response_latin1), [
        Link(url='http://example.com/sample_%F1.html', text=''),
        Link(url='http://example.com/sample_%E1.html',
             text='sample \xe1 text'.decode('latin1')),
    ])

def test_base_url(self):
    html = '''<html><head><title>Page title<title><base href="http://otherdomain.com/base/" />
    <body><p><a href="item/12.html">Item 12</a></p>
    </body></html>'''
    response = HtmlResponse('http://example.org/somepage/index.html', body=html)
    lx = LxmlLinkExtractor(unique=False)
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://otherdomain.com/base/item/12.html', text='Item 12')
    ])

    # base url is an absolute path and relative to host
    html = '''<html><head><title>Page title<title><base href="/" />
    <body><p><a href="item/12.html">Item 12</a></p></body></html>'''
    response = HtmlResponse('https://example.org/somepage/index.html', body=html)
    self.assertEqual(
        lx.extract_links(response),
        [Link(url='https://example.org/item/12.html', text='Item 12')])

    # base url has no scheme
    html = '''<html><head><title>Page title<title><base href="//noschemedomain.com/path/to/" />
    <body><p><a href="item/12.html">Item 12</a></p></body></html>'''
    response = HtmlResponse('https://example.org/somepage/index.html', body=html)
    self.assertEqual(lx.extract_links(response), [
        Link(url='https://noschemedomain.com/path/to/item/12.html', text='Item 12')
    ])

def test_nothing(self):
    body = '''<html><head></head><body></body></html>'''
    req = Request('http://a.com')
    rsp = HtmlResponse(req.url, body=body, request=req)
    rsp2 = self.mw.process_response(rsp)
    self.assertIs(rsp, rsp2)
    self.assertNotIn('canonical_url', rsp.meta)

def test_deny_extensions(self):
    html = '''<a href="page.html">asd</a> and <a href="photo.jpg">'''
    response = HtmlResponse('http://example.org/', body=html)
    lx = LxmlLinkExtractor()
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.org/page.html', text=u'asd'),
    ])

def test_selector_nested(self):
    '''Nested selector tests.'''
    body = '''<body>
        <div class='one'>
            <ul>
                <li>one</li><li>two</li>
            </ul>
        </div>
        <div class='two'>
            <ul>
                <li>four</li><li>five</li><li>six</li>
            </ul>
        </div>
    </body>'''
    response = HtmlResponse(url='http://example.com', body=body)
    x = self.hxs_cls(response)

    divtwo = x.select('//div[@class="two"]')
    self.assertEqual(map(unicode.strip, divtwo.select('//li').extract()), [
        '<li>one</li>', '<li>two</li>',
        '<li>four</li>', '<li>five</li>', '<li>six</li>'
    ])
    self.assertEqual(
        map(unicode.strip, divtwo.select('./ul/li').extract()),
        ['<li>four</li>', '<li>five</li>', '<li>six</li>'])
    self.assertEqual(
        map(unicode.strip, divtwo.select('.//li').extract()),
        ['<li>four</li>', '<li>five</li>', '<li>six</li>'])
    self.assertEqual(divtwo.select('./li').extract(), [])

def test_meta_refresh_with_high_interval(self):
    # a meta-refresh with a high interval doesn't trigger a redirect
    req = Request(url='http://example.org')
    rsp = HtmlResponse(url='http://example.org',
                       body=self._body(interval=1000), request=req)
    rsp2 = self.mw.process_response(rsp)
    self.assertIs(rsp, rsp2)

def test_encoded_url(self):
    body = '''<html><body><div><a href="?page=2">BinB</a></body></html>'''
    response = HtmlResponse("http://known.fm/AC%2FDC/", body=body, encoding='utf8')
    lx = LxmlLinkExtractor()
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://known.fm/AC%2FDC/?page=2', text=u'BinB'),
    ])

def test_redirect_urls(self):
    req1 = Request('http://test.org/first')
    rsp1 = HtmlResponse(req1.url, body=self._body(url='/redirected'), request=req1)
    req2 = self.mw.process_response(rsp1)
    self.assertIsInstance(req2, Request)

    rsp2 = HtmlResponse(req2.url, body=self._body(url='/redirected2'), request=req2)
    req3 = self.mw.process_response(rsp2)
    self.assertIsInstance(req3, Request)

    self.assertEqual(req2.url, 'http://test.org/redirected')
    self.assertListEqual(req2.history, ['http://test.org/first'])
    self.assertEqual(req3.url, 'http://test.org/redirected2')
    self.assertListEqual(
        req3.history,
        ['http://test.org/first', 'http://test.org/redirected'])

def test_tag(self):
    body = '''<html><head><link rel="canonical" href="%s" /></head></html>'''

    # absolute url
    req = Request('http://a.com/pom')
    rsp = HtmlResponse(req.url, body=body % 'https://b.sk/hello', request=req)
    rsp2 = self.mw.process_response(rsp)
    self.assertIs(rsp, rsp2)
    self.assertEqual(rsp.meta['canonical_url'], 'https://b.sk/hello')

    # relative url
    req = Request('http://a.com/pom')
    rsp = HtmlResponse(req.url, body=body % '/hello/world', request=req)
    rsp2 = self.mw.process_response(rsp)
    self.assertIs(rsp, rsp2)
    self.assertEqual(rsp.meta['canonical_url'], 'http://a.com/hello/world')

def test_max_redirect_times(self):
    self.mw.max_redirect_times = 1
    req = Request('http://test.org/max')
    rsp = HtmlResponse(req.url, body=self._body(), request=req)

    req = self.mw.process_response(rsp)
    self.assertIsInstance(req, Request)
    self.assertEqual(len(req.history), 1)

    rsp.request = req
    self.assertIsNone(self.mw.process_response(rsp))

def test_link_text_wrong_encoding(self):
    html = '''<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>'''
    response = HtmlResponse('http://www.example.com', body=html, encoding='utf-8')
    lx = LxmlLinkExtractor(unique=False)
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd'),
    ])

def test_invalid(self):
    # the parser should not raise on an XHTML doctype or otherwise invalid markup
    html = '''<?xml version="1.0" encoding="utf-8"?>
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml" lang="cs">
    <body>
    </body>
    </html>'''
    response = HtmlResponse('http://www.example.com', body=html, encoding='utf-8')
    lx = LxmlLinkExtractor(unique=False)
    self.assertEqual(lx.extract_links(response), [])

def test_link_nofollow(self):
    html = '''
    <a href="page.html?action=print" rel="nofollow">Printer-friendly page</a>
    <a href="about.html">About us</a>
    '''
    response = HtmlResponse('http://example.org/page.html', body=html)
    lx = LxmlLinkExtractor()
    self.assertEqual(lx.extract_links(response), [
        Link(url='http://example.org/page.html?action=print',
             text=u'Printer-friendly page', nofollow=True),
        Link(url='http://example.org/about.html', text=u'About us', nofollow=False)
    ])

def test_open_in_browser(self):
    def browser_open(burl):
        path = urlparse.urlparse(burl).path
        if not os.path.exists(path):
            path = burl.replace('file://', '')
        bbody = open(path).read()
        self.assertIn('<base href="%s">' % url, bbody, '<base> tag not added')
        return True

    url = 'http://www.example.com/some/page.html'
    body = '<html> <head> <title>test page</title> </head> <body>test body</body> </html>'
    response = HtmlResponse(url, body=body)
    self.assertTrue(open_in_browser(response, _openfunc=browser_open),
                    'Browser not called')
    self.assertRaises(TypeError, open_in_browser, Response(url, body=body))

def test_process_response_force_recalculate_encoding(self):
    headers = {
        'Content-Type': 'text/html',
        'Content-Encoding': 'gzip',
    }
    f = StringIO()
    plainbody = '''<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">'''
    zf = GzipFile(fileobj=f, mode='wb')
    zf.write(plainbody)
    zf.close()
    response = HtmlResponse('http://www.example.com/page.html',
                            headers=headers, body=f.getvalue())
    new_response = self.mw.process_response(response)
    self.assertIsInstance(new_response, HtmlResponse)
    self.assertEqual(new_response.body, plainbody)
    self.assertEqual(new_response.encoding, normalize_encoding('gb2312'))

def test_css(self):
    body = get_testdata('pages', 'ip_page.html')
    response = HtmlResponse(url='http://myip.com/list', body=body)
    hxs = response.selector

    valid_ts = S('_', css='div#main', quant='1', children=[
        S('all_ip', css='span.ip', quant='7'),
        S('_', css='ul#ip_list', quant='1',
          children=[S('list_ip', css='span.ip', quant='6')]),
    ])
    parsed = valid_ts.parse(hxs)

    self.assertRaises(TypeError, S, '_')
    self.assertRaises(TypeError, S, '_', 'div[@id="main"]', css='div#main')

def test_process_links(self):
    def _process(link):
        if link.url.endswith('othercat.html'):
            return None
        else:
            link.url = 'http://gogo.com/'
            return link

    html = '''<html><head><title>Page title<title>
    <p><a href="../othercat.html">Other category</a></p>
    <p><a href="/">>></a></p>
    <p><a href="/">mimi</a></p>
    <p><a href="/hello">mimino</a></p>
    </body></html>'''
    response = HtmlResponse('http://example.org/', body=html)
    lx = LxmlLinkExtractor()
    self.assertEqual(lx.extract_links(response, process_links=_process),
                     [Link(url='http://gogo.com/', text=u'>>')])

def test_http_header_encoding_precedence(self):
    # u'\xa3'    = pound symbol in unicode
    # '\xc2\xa3' = pound symbol encoded in utf-8
    # '\xa3'     = pound symbol encoded in latin-1 (iso-8859-1)
    meta = u'<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
    head = u'<head>' + meta + u'</head>'
    body_content = u'<span id="blank">\xa3</span>'
    body = u'<body>' + body_content + u'</body>'
    html = u'<html>' + head + body + u'</html>'
    encoding = 'utf-8'
    html_utf8 = html.encode(encoding)

    headers = {'Content-Type': ['text/html; charset=utf-8']}
    response = HtmlResponse(url='http://example.com', headers=headers, body=html_utf8)
    x = self.hxs_cls(response)
    self.assertEqual(
        x.select('//span[@id="blank"]/text()').extract(), [u'\xa3'])

def test_relative_paths(self):
    html = '''<html><head></head><body>
    <a href="hello/world/"></a>
    <a href="/hello/world/"></a>
    <a href="/hello/././world"></a>
    <a href="/hello/../world"></a>
    <a href="../hello/world"></a>
    <a href="./hello/world"></a>
    <a href="../../hello/././world"></a>
    </body></html>'''
    response = HtmlResponse('http://example.org/yay/', body=html)
    lx = LxmlLinkExtractor(unique=False)
    self.assertEqual([link.url for link in lx.extract_links(response)], [
        'http://example.org/yay/hello/world/',
        'http://example.org/hello/world/',
        'http://example.org/hello/world',
        'http://example.org/world',
        'http://example.org/hello/world',
        'http://example.org/yay/hello/world',
        'http://example.org/hello/world',
    ])

def test_meta_refresh_through_posted_request(self):
    req = Request(url='http://example.org', method='POST', body='test',
                  headers={
                      'Content-Type': 'text/plain',
                      'Content-length': '4'
                  })
    rsp = HtmlResponse(req.url, body=self._body(), request=req)
    req2 = self.mw.process_response(rsp)
    self.assertIsInstance(req2, Request)
    self.assertEqual(req2.url, 'http://example.org/newpage')
    self.assertEqual(req2.method, 'GET')
    self.assertNotIn(
        'Content-Type', req2.headers,
        'Content-Type header must not be present in redirected request')
    self.assertNotIn(
        'Content-Length', req2.headers,
        'Content-Length header must not be present in redirected request')
    self.assertFalse(req2.body,
                     'Redirected body must be empty, not `%s`' % req2.body)

def test_basic(self):
    html = '''<html><head><title>Page title<title>
    <body><p><a href="item/12.html">Item 12</a></p>
    <p><a href="/about.html">About us</a></p>
    <img src="/logo.png" alt="Company logo (not a link)" />
    <p><a href="../othercat.html">Other category</a></p>
    <p><a href="/">>></a></p>
    <p><a href="/" /></p>
    </body></html>'''
    response = HtmlResponse('http://example.org/somepage/index.html', body=html)
    lx = LxmlLinkExtractor(unique=False)
    self.assertListEqual(lx.extract_links(response), [
        Link(url='http://example.org/somepage/item/12.html', text='Item 12'),
        Link(url='http://example.org/about.html', text='About us'),
        Link(url='http://example.org/othercat.html', text='Other category'),
        Link(url='http://example.org/', text='>>'),
        Link(url='http://example.org/', text='')
    ])

def setUp(self):
    self.resp = HtmlResponse('http://github.com/', body='''<head>
        <base href="http://www.w3schools.com/" target="_blank"></head>
        <body></body>''')

def test_xpath(self):
    body = get_testdata('pages', 'ip_page.html')
    response = HtmlResponse(url='http://myip.com/list', body=body)
    hxs = response.selector

    # test valid parsing
    valid_ts = S(
        '_', '//div[@id="main"]', quant='1', children=[
            S('title', 'h1', quant='1', value='text()'),
            S('full_title_script', 'h1|div[@id="subtitle"]/h2', quant='2',
              value='descendant-or-self::text()'),
            # although the following statement gives the same elements, it gives them in a different order
            # S('full_title_script', '(h1|div[@id="subtitle"]/h2)/descendant-or-self::*', quant='+', value='text()'),
            S('full_title_no_script', 'h1|div[@id="subtitle"]/h2', quant='2',
              value='descendant-or-self::*[name()!="script"]/text()'),
            S('full_title_script_bad', '(h1|div[@id="subtitle"]/h2)//*', quant='+',
              value='text()'),
            S('_list', 'ul[@id="ip_list"]', quant='1', children=[
                S('_ips', 'li', quant='6', group='ips', children=[
                    S('ip', 'span[@class="ip"]', quant='1', value='text()'),
                    S('port', 'span[@class="port"]', quant='1', value='text()'),
                    S('ip_port', 'self::*', value='descendant-or-self::text()'),
                ])
            ]),
            S('url', 'descendant-or-self::a', quant='1', value='@href',
              callback=S.absolute_url),
            S('empty', 'div[@id="empty"]', quant='1', value='text()'),
            S('footer', 'following-sibling::div[@id="footer"]', quant='1', children=[
                S('footer_links', 'a', quant='+', value='@href',
                  callback=S.absolute_url)
            ]),
            S('nonexistent', 'div/div/div', quant='?', value='text()')
        ])

    # validation without context, when context is expected
    self.assertRaises(SValidationError, valid_ts.parse, hxs)

    parsed = valid_ts.parse(response)
    self.assertItemsEqual(parsed, [
        'title', 'full_title_script', 'full_title_no_script',
        'full_title_script_bad', 'ips', 'url', 'empty', 'footer',
        'footer_links'
    ])  # nonexistent is missing!

    # title
    self.assertIsInstance(parsed['title'], list)
    self.assertListEqual(
        parsed['title'],
        [u'Here is the list of some ', u' addresses '])  # text inside strong is not parsed

    # full_title
    self.assertListEqual(parsed['full_title_script'], [
        u'Here is the list of some ', u'ip', u' addresses ', u'!!!',
        u'Just ', u'some', u' ', u'this is bad', u' other text.'
    ])  # order of text nodes is preserved
    self.assertListEqual(parsed['full_title_no_script'], [
        u'Here is the list of some ', u'ip', u' addresses ', u'!!!',
        u'Just ', u'some', u' ', u' other text.'
    ])  # same result as before, excluding script content
    self.assertListEqual(
        parsed['full_title_script_bad'],
        [u'ip', u'!!!', u'some', u'this is bad'])  # this only took the inner nodes

    # ips
    self.assertIsInstance(parsed['ips'], list)
    self.assertEqual(len(parsed['ips']), 6)
    first = parsed['ips'][0]
    self.assertIsInstance(first, defaultdict)
    self.assertItemsEqual(first, ['ip', 'port', 'ip_port'])
    self.assertListEqual(first['ip'], [u'123.44.1.9'])
    self.assertIsInstance(first['ip'][0], unicode)  # parsed objects are always unicode
    self.assertListEqual(first['port'], [u'80'])
    self.assertListEqual(first['ip_port'], [u'123.44.1.9', u':', u'80'])

    # url
    self.assertListEqual(parsed['url'], [u'http://myip.com/url1'])
    self.assertIsInstance(
        parsed['url'][0], unicode)  # even urls are unicode after being processed

    # empty
    self.assertListEqual(
        parsed['empty'], []
    )  # even though the empty tag was matched, there was no text, so the returned list is empty

    # footer
    self.assertIsInstance(parsed['footer'][0], HtmlXPathSelector)
    self.assertListEqual(parsed['footer_links'],
                         [u'http://myip.com/url2', u'http://google.com/'])

def test_get_meta_refresh(self):
    url = 'http://example.org'
    body = '''
    <html>
    <head><title>Dummy</title><meta http-equiv="refresh" content="5;url=http://example.org/newpage" /></head>
    <body>blahablsdfsal&</body>
    </html>'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (5, 'http://example.org/newpage'))

    # refresh without url should return (None, None)
    body = '''<meta http-equiv="refresh" content="5" />'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (None, None))

    body = '''<meta http-equiv="refresh" content="5; url=http://example.org/newpage" /></head>'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (5, 'http://example.org/newpage'))

    # meta refresh in multiple lines
    body = '''<html><head>
<META HTTP-EQUIV="Refresh" CONTENT="1; URL=http://example.org/newpage">'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (1, 'http://example.org/newpage'))

    # entities in the redirect url
    body = '''<meta http-equiv="refresh" content="3; url=&#39;http://www.example.com/other&#39;">'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (3, 'http://www.example.com/other'))

    url = 'http://example.com/page/this.html'
    # relative redirects
    body = '''<meta http-equiv="refresh" content="3; url=other.html">'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (3, 'http://example.com/page/other.html'))

    # non-standard encodings (utf-16)
    url = 'http://example.com'
    body = '''<meta http-equiv="refresh" content="3; url=http://example.com/redirect">'''
    body = body.decode('ascii').encode('utf-16')
    self.assertEqual(
        get_meta_refresh(HtmlResponse(url, body=body, encoding='utf-16')),
        (3, 'http://example.com/%FF%FEh%00t%00t%00p%00:%00/%00/%00e%00x%00a%00m%00p%00l%00e%00.%00c%00o%00m%00/%00r%00e%00d%00i%00r%00e%00c%00t%00'))

    # non-ascii chars in the url (utf8 - default)
    body = '''<meta http-equiv="refresh" content="3; url=http://example.com/to\xc2\xa3">'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (3, 'http://example.com/to%C2%A3'))

    # non-ascii chars in the url (latin1)
    body = '''<meta http-equiv="refresh" content="3; url=http://example.com/to\xa3">'''
    self.assertEqual(
        get_meta_refresh(HtmlResponse(url, body=body, encoding='latin1')),
        (3, 'http://example.com/to%A3'))

    # an html-commented meta refresh header must not redirect
    body = '''<!--<meta http-equiv="refresh" content="3; url=http://example.com/">-->'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (None, None))

    # html comments must not interfere with an uncommented meta refresh header
    body = '''<!-- commented --><meta http-equiv="refresh" content="3; url=http://example.com/">-->'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (3, 'http://example.com/'))

    # float refresh intervals
    body = '''<meta http-equiv="refresh" content=".1;URL=index.html" />'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (0.1, 'http://example.com/index.html'))

    body = '''<meta http-equiv="refresh" content="3.1;URL=index.html" />'''
    self.assertEqual(get_meta_refresh(HtmlResponse(url, body=body)),
                     (3.1, 'http://example.com/index.html'))

def setUp(self):
    body = get_testdata('link_extractor', 'sgml_linkextractor.html')
    self.response = HtmlResponse(url='http://example.com/index', body=body)

def test_priority_adjust(self):
    req = Request('http://a.com')
    rsp = HtmlResponse(req.url, body=self._body(), request=req)
    req2 = self.mw.process_response(rsp)
    self.assertTrue(req2.priority > req.priority)

def test_meta_refresh(self):
    req = Request(url='http://example.org')
    rsp = HtmlResponse(req.url, body=self._body(), request=req)
    req2 = self.mw.process_response(rsp)
    self.assertIsInstance(req2, Request)
    self.assertEqual(req2.url, 'http://example.org/newpage')

def setUp(self):
    body = get_testdata('pages', 'ip_page.html')
    response = HtmlResponse(url='http://myip.com/list', body=body)
    hxs = response.selector
    self.parsed = basic_ts.parse(hxs)

def test_empty_body(self):
    lx = LxmlLinkExtractor()
    response = HtmlResponse('http://www.example.com')
    self.assertEqual(lx.extract_links(response), [])

    response = HtmlResponse('http://www.example.com', body='\n\r\n\n')
    self.assertEqual(lx.extract_links(response), [])