def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
                line_numbers=True, linenumber_attribute=None,
                replace_entities=True, fix_newlines=True):
    """Parse HTML markup with ``HTMLParser`` and return the document root.

    The input is normalised to text, cleaned up, and fed to the parser. If
    the tree builder raises ``NamespacedHTMLPresent``, the offending
    ``prefix:`` is stripped from the tags in the markup and parsing is
    retried from scratch.

    :param raw: The markup, as ``bytes`` or text. Bytes are decoded with
        ``decoder(raw)`` when a decoder is supplied, otherwise with
        ``xml_to_unicode(raw)[0]``.
    :param decoder: Optional callable taking bytes and returning text.
    :param log: Not used by this function; kept for interface
        compatibility.
    :param discard_namespaces: When true, ``NoNamespaceTreeBuilder`` is used
        instead of ``TreeBuilder``, the parser is created with
        ``namespaceHTMLElements=False`` and the root tag is expected to be
        plain ``html``.
    :param line_numbers: Forwarded to ``FastStream`` as ``track_position``
        and to ``HTMLParser`` as ``track_positions``.
    :param linenumber_attribute: Forwarded to the tree builder.
    :param replace_entities: When true, the text is passed through
        ``xml_replace_entities`` before parsing.
    :param fix_newlines: When true, ``\\r\\n`` and bare ``\\r`` are converted
        to ``\\n`` before parsing.
    :return: The object returned by ``parser.tree.getDocument()``.
    :raises ValueError: If the resulting root element does not have the
        expected tag (or, when namespaces are kept, has a prefix).
    :raises NamespacedHTMLPresent: If stripping the reported prefix does not
        change the markup, so a retry could not make progress.
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = replace_chars.sub('', raw)

    stream_class = partial(FastStream, track_position=line_numbers)
    stream = stream_class(raw)
    builder = partial(
        NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder,
        linenumber_attribute=linenumber_attribute)

    while True:
        try:
            parser = HTMLParser(
                tree=builder, track_positions=line_numbers,
                namespaceHTMLElements=not discard_namespaces)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=DataLossWarning)
                try:
                    parser.parse(stream, parseMeta=False, useChardet=False)
                finally:
                    # Reset the builder's proxy cache whether or not the
                    # parse succeeded.
                    parser.tree.proxy_cache = None
        except NamespacedHTMLPresent as err:
            # Strip "prefix:" from opening and closing tags, then re-parse.
            # The prefix comes from the document, so it is escaped before
            # being embedded in the pattern; otherwise regex metacharacters
            # in it could mis-match or raise re.error.
            pattern = r'<\s*/{0,1}(%s:)' % re.escape(err.prefix)
            stripped = re.sub(
                pattern, lambda m: m.group().replace(m.group(1), ''),
                raw, flags=re.I)
            if stripped == raw:
                # Nothing was removed: parsing the same text again would
                # fail the same way, so propagate instead of looping forever.
                raise
            raw = stripped
            stream = stream_class(raw)
            continue
        break

    root = parser.tree.getDocument()
    if discard_namespaces:
        bad_root = root.tag != 'html'
    else:
        bad_root = root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix
    if bad_root:
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root