def test_extraction_encoding(self):
    # Extract links from the "noenc" fixture twice (once served with an
    # explicit utf-8 Content-Type header, once with no declared encoding)
    # and from the "latin1" fixture, then compare the resulting URLs and
    # anchor texts with the expected Link objects.
    noenc_markup = get_testdata('link_extractor', 'linkextractor_noenc.html')
    utf8_headers = {'Content-Type': ['text/html; charset=utf-8']}
    declared_utf8 = HtmlResponse(url='http://example.com/utf8',
                                 body=noenc_markup,
                                 headers=utf8_headers)
    undeclared = HtmlResponse(url='http://example.com/noenc',
                              body=noenc_markup)
    latin1_markup = get_testdata('link_extractor', 'linkextractor_latin1.html')
    latin1_page = HtmlResponse(url='http://example.com/latin1',
                               body=latin1_markup)

    extractor = BaseSgmlLinkExtractor()

    # Both responses built from the "noenc" body must yield the same links,
    # whether or not the charset was announced in the headers.
    noenc_links = [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html',
             text='sample \xe2\x82\xac text'.decode('utf-8')),
    ]
    for page in (declared_utf8, undeclared):
        self.assertEqual(extractor.extract_links(page), noenc_links)

    # Document encoding does not affect the URL path component, only the
    # query part.  Reference encodings behind the escapes expected below:
    #   u'sample_\xf1.html'.encode('utf8')  -> 'sample_\xc3\xb1.html'
    #   u'sample_\xe1.html'.encode('utf8')  -> 'sample_\xc3\xa1.html'
    #   u'sample_\xf6.html'.encode('utf8')  -> 'sample_\xc3\xb6.html'
    #   u'\xa332'.encode('latin1')          -> '\xa332'
    #   u'\xb5'.encode('latin1')            -> '\xb5'
    latin1_links = [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%C3%A1.html',
             text='sample \xe1 text'.decode('latin1')),
        Link(url='http://example.com/sample_%C3%B6.html?price=%A332&%B5=unit',
             text=''),
    ]
    self.assertEqual(extractor.extract_links(latin1_page), latin1_links)
def test_extraction_encoding(self):
    # Run the extractor over three responses: the "noenc" fixture with a
    # utf-8 Content-Type header, the same fixture with no declared encoding,
    # and the "latin1" fixture.  Each result is compared with the Link
    # objects expected for that response.
    noenc_markup = get_testdata('link_extractor', 'linkextractor_noenc.html')
    with_header = HtmlResponse(
        url='http://example.com/utf8',
        body=noenc_markup,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    without_header = HtmlResponse(url='http://example.com/noenc',
                                  body=noenc_markup)
    latin1_markup = get_testdata('link_extractor', 'linkextractor_latin1.html')
    latin1_page = HtmlResponse(url='http://example.com/latin1',
                               body=latin1_markup)

    extractor = BaseSgmlLinkExtractor()

    # The header-declared and undeclared responses share one body, so the
    # same expectation applies to both.
    euro_text = 'sample \xe2\x82\xac text'.decode('utf-8')
    noenc_links = [
        Link(url='http://example.com/sample_%C3%B1.html', text=''),
        Link(url='http://example.com/sample_%E2%82%AC.html', text=euro_text),
    ]
    self.assertEqual(extractor.extract_links(with_header), noenc_links)
    self.assertEqual(extractor.extract_links(without_header), noenc_links)

    # For the latin1 fixture the expected URLs carry single-byte escapes
    # (%F1, %E1) instead of the two-byte UTF-8 ones.
    accented_text = 'sample \xe1 text'.decode('latin1')
    latin1_links = [
        Link(url='http://example.com/sample_%F1.html', text=''),
        Link(url='http://example.com/sample_%E1.html', text=accented_text),
    ]
    self.assertEqual(extractor.extract_links(latin1_page), latin1_links)
def test_base_url(self):
    # Each case is (page URL, HTML body, expected absolute link URL).  Every
    # body holds one relative anchor, "item/12.html", and a <base href=...>
    # element; the expected URL shows what the anchor must resolve to.
    cases = [
        # base href is a full URL on another domain
        ("http://example.org/somepage/index.html",
         '<html><head><title>Page title<title>'
         '<base href="http://otherdomain.com/base/" />'
         ' <body><p><a href="item/12.html">Item 12</a></p>'
         ' </body></html>',
         'http://otherdomain.com/base/item/12.html'),
        # base url is an absolute path and relative to host
        ("https://example.org/somepage/index.html",
         '<html><head><title>Page title<title>'
         '<base href="/" />'
         ' <body><p><a href="item/12.html">Item 12</a></p></body></html>',
         'https://example.org/item/12.html'),
        # base url has no scheme
        ("https://example.org/somepage/index.html",
         '<html><head><title>Page title<title>'
         '<base href="//noschemedomain.com/path/to/" />'
         ' <body><p><a href="item/12.html">Item 12</a></p></body></html>',
         'https://noschemedomain.com/path/to/item/12.html'),
    ]

    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href
    for page_url, markup, resolved_url in cases:
        page = HtmlResponse(page_url, body=markup)
        self.assertEqual(extractor.extract_links(page),
                         [Link(url=resolved_url, text='Item 12')])
def test_extraction_encoding(self):
    # Compare extracted links against expectations for three responses:
    # the "noenc" fixture served with a utf-8 charset header, the same
    # fixture with no encoding information, and the "latin1" fixture.
    noenc_markup = get_testdata("link_extractor", "linkextractor_noenc.html")
    charset_headers = {"Content-Type": ["text/html; charset=utf-8"]}
    page_utf8 = HtmlResponse(url="http://example.com/utf8", body=noenc_markup, headers=charset_headers)
    page_noenc = HtmlResponse(url="http://example.com/noenc", body=noenc_markup)
    latin1_markup = get_testdata("link_extractor", "linkextractor_latin1.html")
    page_latin1 = HtmlResponse(url="http://example.com/latin1", body=latin1_markup)

    extractor = BaseSgmlLinkExtractor()

    # Expected links for the "noenc" body; identical for both responses
    # created from it.
    links_for_noenc_body = [
        Link(url="http://example.com/sample_%C3%B1.html", text=""),
        Link(
            url="http://example.com/sample_%E2%82%AC.html",
            text="sample \xe2\x82\xac text".decode("utf-8"),
        ),
    ]
    # Expected links for the latin1 body: single-byte percent escapes.
    links_for_latin1_body = [
        Link(url="http://example.com/sample_%F1.html", text=""),
        Link(
            url="http://example.com/sample_%E1.html",
            text="sample \xe1 text".decode("latin1"),
        ),
    ]

    checks = (
        (page_utf8, links_for_noenc_body),
        (page_noenc, links_for_noenc_body),
        (page_latin1, links_for_latin1_body),
    )
    for page, wanted in checks:
        self.assertEqual(extractor.extract_links(page), wanted)
def test_link_text_wrong_encoding(self):
    # The anchor text ends with the escape \xed while the response is
    # declared as utf-8; the extracted link text is expected to carry
    # U+FFFD in that position.
    markup = '<body><p><a href="item/12.html">Wrong: \xed</a></p></body></html>'
    page = HtmlResponse("http://www.example.com", body=markup, encoding='utf-8')
    extractor = BaseSgmlLinkExtractor()

    wanted = [
        Link(url='http://www.example.com/item/12.html', text=u'Wrong: \ufffd'),
    ]
    found = extractor.extract_links(page)
    self.assertEqual(found, wanted)
def test_extraction_encoding(self):
    # Check URL escaping and anchor text for the "noenc" fixture (served
    # once with a utf-8 charset header and once without any declared
    # encoding) and for the "latin1" fixture.

    def link(url, text=''):
        # Small shorthand so the expectations below read as a table.
        return Link(url=url, text=text)

    noenc_markup = get_testdata('link_extractor', 'linkextractor_noenc.html')
    resp_declared = HtmlResponse(
        url='http://example.com/utf8', body=noenc_markup,
        headers={'Content-Type': ['text/html; charset=utf-8']})
    resp_undeclared = HtmlResponse(url='http://example.com/noenc',
                                   body=noenc_markup)
    latin1_markup = get_testdata('link_extractor', 'linkextractor_latin1.html')
    resp_latin1 = HtmlResponse(url='http://example.com/latin1',
                               body=latin1_markup)

    extractor = BaseSgmlLinkExtractor()

    # Same body, so the same links are expected with or without the header.
    from_noenc = [
        link('http://example.com/sample_%C3%B1.html'),
        link('http://example.com/sample_%E2%82%AC.html',
             'sample \xe2\x82\xac text'.decode('utf-8')),
    ]
    self.assertEqual(extractor.extract_links(resp_declared), from_noenc)
    self.assertEqual(extractor.extract_links(resp_undeclared), from_noenc)

    # The latin1 fixture is expected to produce single-byte escapes.
    from_latin1 = [
        link('http://example.com/sample_%F1.html'),
        link('http://example.com/sample_%E1.html',
             'sample \xe1 text'.decode('latin1')),
    ]
    self.assertEqual(extractor.extract_links(resp_latin1), from_latin1)
def test_base_url(self):
    # Verify that a relative anchor ("item/12.html") is resolved against the
    # document's <base href=...> for three kinds of base value: a full URL,
    # a host-relative path, and a scheme-less URL.
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href

    def links_in(page_url, markup):
        # Wrap the markup in a response served from page_url and extract.
        return extractor.extract_links(HtmlResponse(page_url, body=markup))

    def item_link(url):
        # Every document below has a single anchor with the text "Item 12".
        return [Link(url=url, text='Item 12')]

    # base url is a complete URL on a different domain
    full_base = (
        '<html><head><title>Page title<title>'
        '<base href="http://otherdomain.com/base/" />'
        ' <body><p><a href="item/12.html">Item 12</a></p>'
        ' </body></html>'
    )
    self.assertEqual(
        links_in("http://example.org/somepage/index.html", full_base),
        item_link('http://otherdomain.com/base/item/12.html'))

    # base url is an absolute path and relative to host
    path_base = (
        '<html><head><title>Page title<title>'
        '<base href="/" />'
        ' <body><p><a href="item/12.html">Item 12</a></p></body></html>'
    )
    self.assertEqual(
        links_in("https://example.org/somepage/index.html", path_base),
        item_link('https://example.org/item/12.html'))

    # base url has no scheme
    schemeless_base = (
        '<html><head><title>Page title<title>'
        '<base href="//noschemedomain.com/path/to/" />'
        ' <body><p><a href="item/12.html">Item 12</a></p></body></html>'
    )
    self.assertEqual(
        links_in("https://example.org/somepage/index.html", schemeless_base),
        item_link('https://noschemedomain.com/path/to/item/12.html'))
def test_basic(self):
    # Extract links from a small page mixing relative, host-relative and
    # parent-relative hrefs, an <img> (which must not appear in the result),
    # an anchor whose text is ">>" and a self-closing anchor (expected with
    # empty text).
    markup = (
        '<html><head><title>Page title<title>'
        ' <body><p><a href="item/12.html">Item 12</a></p>'
        ' <p><a href="/about.html">About us</a></p>'
        ' <img src="/logo.png" alt="Company logo (not a link)" />'
        ' <p><a href="../othercat.html">Other category</a></p>'
        ' <p><a href="/">>></a></p>'
        ' <p><a href="/" /></p>'
        ' </body></html>'
    )
    page = HtmlResponse("http://example.org/somepage/index.html", body=markup)
    extractor = BaseSgmlLinkExtractor()  # default: tag=a, attr=href

    # (absolute url, anchor text) in document order
    wanted_pairs = [
        ('http://example.org/somepage/item/12.html', 'Item 12'),
        ('http://example.org/about.html', 'About us'),
        ('http://example.org/othercat.html', 'Other category'),
        ('http://example.org/', '>>'),
        ('http://example.org/', ''),
    ]
    wanted = [Link(url=href, text=label) for href, label in wanted_pairs]

    self.assertEqual(extractor.extract_links(page), wanted)