def test_basic_prescan_length():
    """Check that a trailing ``<meta charset>`` is found in a 1024-byte input.

    The document is padded, by growing its comment, to exactly 1024 bytes
    with the ``<meta charset='utf-8'>`` declaration at the very end; the
    binary input stream must still report ``utf-8``.
    """
    target_size = 1024
    markup = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')

    # Replacing the 3-byte "-a-" with "-" + filler + "-" grows the document
    # by len(filler) - 1 bytes, hence the extra "+ 1".
    filler = b"a" * (target_size - len(markup) + 1)
    markup = markup.replace(b"-a-", b"-" + filler + b"-")
    assert len(markup) == 1024  # Sanity

    stream = _inputstream.HTMLBinaryInputStream(markup, useChardet=False)
    detected = stream.charEncoding[0].name
    assert 'utf-8' == detected
def test_parser_args(expected, data, kwargs):
    """Check that encoding keyword arguments yield the expected encoding.

    The same ``kwargs`` are handed both to the binary input stream directly
    and to ``HTMLParser.parse``; each must end up reporting ``expected``.

    :arg expected: name of the encoding that should be selected
    :arg data: input passed to the stream and to the parser
    :arg kwargs: extra keyword arguments forwarded to both
    """
    # First: the input stream on its own.
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
    stream_encoding = stream.charEncoding[0].name
    assert expected == stream_encoding

    # Second: the same arguments routed through the parser.
    parser = HTMLParser()
    parser.parse(data, useChardet=False, **kwargs)
    assert expected == parser.documentEncoding
def test_prescan_encoding(data, encoding):
    """Check that the input stream detects ``encoding`` for ``data``.

    :arg data: input handed to the input stream
    :arg encoding: expected encoding name as ASCII bytes (any case)
    """
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
    expected_name = encoding.lower().decode("ascii")

    # Very crude way to ignore irrelevant tests: inputs longer than the
    # stream's numBytesMeta are not checked at all.
    if len(data) <= stream.numBytesMeta:
        detected_name = stream.charEncoding[0].name
        assert expected_name == detected_name, errorMessage(data, expected_name, detected_name)
def test_parser_reparse():
    """Check that the parser ends up on utf-8 when the stream alone does not.

    The document is padded, by growing its comment, to exactly 10240 bytes
    with ``<meta charset='utf-8'>`` at the very end.  The bare input stream
    reports ``windows-1252`` for it, yet after a full parse the parser must
    report ``utf-8`` and expose the correctly decoded title text.
    """
    target_size = 10240
    markup = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')

    # Replacing the 3-byte "-a-" with "-" + filler + "-" grows the document
    # by len(filler) - 1 bytes, hence the extra "+ 1".
    filler = b"a" * (target_size - len(markup) + 1)
    markup = markup.replace(b"-a-", b"-" + filler + b"-")
    assert len(markup) == 10240  # Sanity

    # The stream by itself does not pick up the late declaration.
    stream = _inputstream.HTMLBinaryInputStream(markup, useChardet=False)
    stream_encoding = stream.charEncoding[0].name
    assert 'windows-1252' == stream_encoding

    # The parser, however, must finish with the declared encoding.
    parser = HTMLParser(namespaceHTMLElements=False)
    tree = parser.parse(markup, useChardet=False)
    assert 'utf-8' == parser.documentEncoding
    title = tree.find(".//title")
    assert title.text == "Caf\u00E9"