def test_encoding_w3c(p_w3c_test, p_expected_encoding):
    """Check encoding detection for one W3C "input byte stream" fixture.

    The HTML fixture is selected by ``p_w3c_test``. When a sibling
    ``.headers`` file exists, its contents (minus the first 14 bytes,
    stripped) are passed as the ``content-type`` header. The detected
    encoding must be ``None`` when ``p_expected_encoding`` is ``None``;
    otherwise its ``name`` must equal ``p_expected_encoding``. The document
    is parsed afterwards.
    """
    from cosrlib.document.html import HTMLDocument

    html_path = os.path.join(
        "tests/testdata/html_w3c_encoding_testcases",
        "the-input-byte-stream-%s.html" % p_w3c_test,
    )
    headers_path = html_path + ".headers"

    with open(html_path, "rb") as html_file:
        headers = {}
        if os.path.isfile(headers_path):
            with open(headers_path, "rb") as headers_file:
                raw_headers = headers_file.read()
            # Presumably skips a 14-character "Content-Type: " prefix --
            # confirm against the fixture files.
            headers["content-type"] = raw_headers[14:].strip()

        html = html_file.read()
        doc = HTMLDocument(html, url=None, headers=headers)

        detected = doc.encoding.detect()
        if p_expected_encoding is not None:
            assert detected.name == p_expected_encoding
        else:
            assert detected is None

        doc.parse()
def format_html_title(title, url=None):
    """Return ``format_title`` applied to a minimal HTML page titled *title*.

    The page is built by interpolating *title* into a fixed template, wrapped
    in an ``HTMLDocument`` created with the given *url*, and parsed before
    being handed to ``format_title`` together with an empty dict.
    """
    markup = """ <html><head><title>%s</title></head><body>Hello</body></html> """ % title
    document = HTMLDocument(markup, url=url)
    document.parse()
    return format_title(document, {})
def test_encoding_xml():
    """The encoding named in an XML declaration is detected as ``shift_jis``."""
    from cosrlib.document.html import HTMLDocument

    markup = """<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html> <html lang="en" ></html> """
    doc = HTMLDocument(markup)

    detected = doc.encoding.detect()
    assert detected.name == "shift_jis"

    # Parsing must also go through with this declared encoding.
    doc.parse()
def test_encoding_x_user_defined():
    """A ``<meta charset="x-user-defined">`` page is detected under that name."""
    from cosrlib.document.html import HTMLDocument

    markup = """<html><head><meta charset="x-user-defined"></head><body>Hello</body></html>"""
    doc = HTMLDocument(markup)

    detected = doc.encoding.detect()
    assert detected.name == "x-user-defined"

    # Parsing is exercised as well after the detection check.
    doc.parse()
def test_encoding_xml():
    """Detect ``shift_jis`` from the ``encoding`` of an XML declaration."""
    from cosrlib.document.html import HTMLDocument

    xml_prefixed_page = """<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html> <html lang="en" ></html> """
    document = HTMLDocument(xml_prefixed_page)

    encoding = document.encoding.detect()
    assert encoding.name == "shift_jis"

    # The document is parsed after the detection assertion.
    document.parse()
def test_reparse():
    """An iso-8859-15 page is detected as such and its title comes back as UTF-8.

    The title contains the byte ``\\xe9``; after ``parse()`` the title is
    expected to be ``Mac\\xc3\\xa9o``.
    """
    from cosrlib.document.html import HTMLDocument

    markup = """<html><head><meta charset="iso-8859-15"><title>Mac\xe9o</title></head></html>"""
    doc = HTMLDocument(markup)

    detected = doc.encoding.detect()
    assert detected.name == "iso8859-15"

    # gumbo only accepts utf-8, so parsing should trigger a re-parse of the
    # document.
    doc.parse()

    title = doc.get_title()
    assert title == "Mac\xc3\xa9o"
def test_encoding_aliases():
    """Both ``tis-620`` and ``windows-874`` charsets are detected as ``cp874``."""
    from cosrlib.document.html import HTMLDocument

    pages = (
        """<html><head><meta charset="tis-620"></head><body>Hello</body></html>""",
        """<html><head><meta charset="windows-874"></head><body>Hello</body></html>""",
    )

    # Each page is checked and parsed in turn, in the order listed above.
    for page in pages:
        doc = HTMLDocument(page)
        assert doc.encoding.detect().name == "cp874"
        doc.parse()
def test_reparse():
    """Check detection and title extraction for an iso-8859-15 encoded page.

    The detected encoding name must be ``iso8859-15`` and, once parsed, the
    title must equal ``Mac\\xc3\\xa9o``.
    """
    from cosrlib.document.html import HTMLDocument

    latin9_page = """<html><head><meta charset="iso-8859-15"><title>Mac\xe9o</title></head></html>"""
    document = HTMLDocument(latin9_page)

    encoding = document.encoding.detect()
    assert encoding.name == "iso8859-15"

    # A re-parse of the document should be triggered here, because gumbo
    # only accepts utf-8.
    document.parse()

    assert document.get_title() == "Mac\xc3\xa9o"
def test_encoding_aliases():
    """Pages declaring ``tis-620`` or ``windows-874`` must resolve to ``cp874``."""
    from cosrlib.document.html import HTMLDocument

    tis_page = """<html><head><meta charset="tis-620"></head><body>Hello</body></html>"""
    windows_page = """<html><head><meta charset="windows-874"></head><body>Hello</body></html>"""

    # The tis-620 page is handled first, then the windows-874 one.
    for markup in (tis_page, windows_page):
        document = HTMLDocument(markup)
        encoding = document.encoding.detect()
        assert encoding.name == "cp874"
        document.parse()
def test_encoding_w3c(p_w3c_test, p_expected_encoding):
    """Run one W3C "input byte stream" encoding-detection test case.

    ``p_w3c_test`` picks the HTML fixture. If a ``.headers`` file sits next
    to it, everything after its first 14 bytes (stripped) is used as the
    ``content-type`` header. With ``p_expected_encoding`` set to ``None`` no
    encoding may be detected; otherwise the detected encoding's ``name`` must
    match it. The document is parsed at the end.
    """
    from cosrlib.document.html import HTMLDocument

    fixture_dir = "tests/testdata/html_w3c_encoding_testcases"
    fixture_name = "the-input-byte-stream-%s.html" % p_w3c_test
    fixture_path = os.path.join(fixture_dir, fixture_name)
    sidecar_path = fixture_path + ".headers"

    with open(fixture_path, "rb") as fixture:
        headers = {}
        if os.path.isfile(sidecar_path):
            with open(sidecar_path, "rb") as sidecar:
                # NOTE(review): the 14-byte offset looks like the length of
                # a "Content-Type: " prefix -- verify against the fixtures.
                headers["content-type"] = sidecar.read()[14:].strip()

        body = fixture.read()
        document = HTMLDocument(body, url=None, headers=headers)

        encoding = document.encoding.detect()
        if p_expected_encoding is None:
            assert encoding is None
        else:
            assert encoding.name == p_expected_encoding

        document.parse()
def test_encoding_x_user_defined():
    """Detect the ``x-user-defined`` charset declared in a ``<meta>`` tag."""
    from cosrlib.document.html import HTMLDocument

    user_defined_page = """<html><head><meta charset="x-user-defined"></head><body>Hello</body></html>"""
    document = HTMLDocument(user_defined_page)

    encoding = document.encoding.detect()
    assert encoding.name == "x-user-defined"

    # The document is parsed once the detected name has been checked.
    document.parse()
def format_html_title(title, url=None):
    """Format the title of a minimal UTF-8-declared HTML page holding *title*.

    *title* is interpolated into a fixed template that declares
    ``<meta charset="UTF-8">``. The resulting ``HTMLDocument`` (created with
    the given *url*) is parsed and passed to ``format_title`` with an empty
    dict; that call's result is returned.
    """
    template = """ <html><head><meta charset="UTF-8"><title>%s</title></head><body>Hello</body></html> """
    page = HTMLDocument(template % title, url=url)
    page.parse()
    return format_title(page, {})