def test_basic_prescan_length():
    """Check encoding detection on a document that is exactly 1024 bytes long.

    The document holds a non-ASCII title, a comment and a trailing
    ``<meta charset='utf-8'>``. The comment body is stretched so the whole
    byte string is exactly 1024 bytes, and the binary input stream is then
    expected to report ``utf-8``.
    """
    # NOTE(review): 1024 is presumably the size of the stream's pre-scan
    # window -- confirm against HTMLBinaryInputStream.
    target_size = 1024
    markup = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode(
        'utf-8')

    # The single "a" in the comment is swapped for a run of "a"s; the +1
    # accounts for the one byte that the swap removes.
    filler = b"a" * (target_size - len(markup) + 1)
    markup = markup.replace(b"-a-", b"-" + filler + b"-")
    assert len(markup) == target_size  # Sanity

    stream = inputstream.HTMLBinaryInputStream(markup, chardet=False)
    detected = stream.charEncoding[0]
    assert 'utf-8' == detected.name
def runPreScanEncodingTest(data, encoding):
    """Assert that the binary input stream detects ``encoding`` for ``data``.

    Args:
        data: Raw document bytes passed to ``HTMLBinaryInputStream``.
        encoding: Expected encoding label as a byte string; it is
            lower-cased and decoded as ASCII before being compared.

    Returns:
        None. Inputs longer than ``stream.numBytesMeta`` are skipped without
        asserting anything.
    """
    stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
    expected = encoding.lower().decode('ascii')

    # Very crude way to ignore irrelevant tests
    if len(data) > stream.numBytesMeta:
        return

    # Compare the detected encoding's name rather than the encoding object
    # itself, matching how the other tests in this file read charEncoding[0].
    detected = stream.charEncoding[0].name
    assert expected == detected, errorMessage(data, expected, detected)
def test_parser_reparse():
    """Check that the parser reports utf-8 even when the stream alone does not.

    The document is stretched to exactly 10240 bytes with the
    ``<meta charset='utf-8'>`` at the very end. The bare input stream is
    expected to report ``windows-1252`` for it, while a full parse is
    expected to end with ``utf-8`` as the document encoding and a correctly
    decoded title.
    """
    # NOTE(review): 10240 is presumably chosen to push the meta element
    # beyond what the stream inspects up front -- confirm.
    target_size = 10240
    markup = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode(
        'utf-8')

    # The single "a" in the comment is swapped for a run of "a"s; the +1
    # accounts for the one byte that the swap removes.
    filler = b"a" * (target_size - len(markup) + 1)
    markup = markup.replace(b"-a-", b"-" + filler + b"-")
    assert len(markup) == target_size  # Sanity

    # The stream on its own reports windows-1252 for this input.
    stream = inputstream.HTMLBinaryInputStream(markup, chardet=False)
    stream_encoding = stream.charEncoding[0]
    assert 'windows-1252' == stream_encoding.name

    # A full parse must end up with utf-8 and decode the title correctly.
    parser = HTMLParser(namespaceHTMLElements=False)
    tree = parser.parse(markup, useChardet=False)
    assert 'utf-8' == parser.documentEncoding
    title = tree.find(".//title")
    assert title.text == "Caf\u00E9"