def html_to_lxml(raw):
    """Convert an HTML fragment into a strictly parsed XML tree.

    The fragment is wrapped in a ``<div>`` and parsed with ``parse``. The
    first ``<div>`` descendant of the result is given an ``xmlns`` attribute
    naming the XHTML namespace and serialized to text, which is then
    re-parsed as strict XML (``recover=False``). If that fails, every
    attribute whose name contains ``:`` is removed from the descendants of
    the ``<div>`` and the strict parse is retried. If it fails again, the
    markup is handed to calibre's ``_html4_parse``.

    :param raw: HTML markup; interpolated into the wrapper with ``%s``.
    :return: Whatever ``safe_xml_fromstring`` or ``_html4_parse`` returns
        (presumably an lxml element -- confirm against those helpers).
    """
    raw = '<div>%s</div>' % raw
    root = parse(raw, keep_doctype=False, namespace_elements=False,
                 maybe_xhtml=False, sanitize_names=True)
    root = next(root.iterdescendants('div'))
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding='unicode')
    try:
        return safe_xml_fromstring(raw, recover=False)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        # Presumably prefixed attributes with undeclared namespaces are what
        # break the strict parse, so drop them and try again.
        for elem in root.iterdescendants():
            # Snapshot the names first: attrib must not change size while
            # it is being iterated.
            for name in [attr for attr in elem.attrib if ':' in attr]:
                del elem.attrib[name]
        raw = etree.tostring(root, encoding='unicode')
        try:
            return safe_xml_fromstring(raw, recover=False)
        except Exception:
            # Last resort: the fallback parser, imported lazily so the
            # import only happens on this path.
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
def html_to_lxml(raw):
    """Convert an HTML fragment into a strictly parsed XML tree.

    The fragment is wrapped in a ``<div>`` and parsed with
    ``html.fragment_fromstring``. The resulting element is given an
    ``xmlns`` attribute naming the XHTML namespace and serialized with
    ``etree.tostring(..., encoding=None)``, then re-parsed as strict XML
    (``recover=False``). If that fails, every attribute whose name contains
    ``:`` is removed from the element's descendants and the strict parse is
    retried. If it fails again, the markup is handed to calibre's
    ``_html4_parse``.

    :param raw: HTML markup; interpolated into the wrapper with ``%s``.
    :return: Whatever ``safe_xml_fromstring`` or ``_html4_parse`` returns
        (presumably an lxml element -- confirm against those helpers).
    """
    raw = '<div>%s</div>' % raw
    root = html.fragment_fromstring(raw)
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding=None)
    try:
        return safe_xml_fromstring(raw, recover=False)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        # Presumably prefixed attributes with undeclared namespaces are what
        # break the strict parse, so drop them and try again.
        for elem in root.iterdescendants():
            # Snapshot the names first: attrib must not change size while
            # it is being iterated.
            for name in [attr for attr in elem.attrib if ':' in attr]:
                del elem.attrib[name]
        raw = etree.tostring(root, encoding=None)
        try:
            return safe_xml_fromstring(raw, recover=False)
        except Exception:
            # Last resort: the fallback parser, imported lazily so the
            # import only happens on this path.
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)
def html_to_lxml(raw):
    """Convert an HTML fragment into a strictly parsed XML tree.

    The fragment is wrapped in a ``<div>`` and parsed with
    ``html.fragment_fromstring``. The resulting element is given an
    ``xmlns`` attribute naming the XHTML namespace and serialized with
    ``etree.tostring(..., encoding=None)``, then re-parsed with
    ``etree.fromstring``. If that fails, every attribute whose name contains
    ``:`` is removed from the element's descendants and the parse is
    retried. If it fails again, the markup is handed to calibre's
    ``_html4_parse``.

    :param raw: HTML markup; interpolated into the wrapper with ``%s``.
    :return: Whatever ``etree.fromstring`` or ``_html4_parse`` returns
        (presumably an lxml element -- confirm against those helpers).
    """
    raw = u'<div>%s</div>' % raw
    root = html.fragment_fromstring(raw)
    root.set('xmlns', "http://www.w3.org/1999/xhtml")
    raw = etree.tostring(root, encoding=None)
    try:
        return etree.fromstring(raw)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        # Presumably prefixed attributes with undeclared namespaces are what
        # break the XML parse, so drop them and try again.
        for elem in root.iterdescendants():
            # Snapshot the names first: attrib must not change size while
            # it is being iterated.
            for name in [attr for attr in elem.attrib if ':' in attr]:
                del elem.attrib[name]
        raw = etree.tostring(root, encoding=None)
        try:
            return etree.fromstring(raw)
        except Exception:
            # Last resort: the fallback parser, imported lazily so the
            # import only happens on this path.
            from calibre.ebooks.oeb.parse_utils import _html4_parse
            return _html4_parse(raw)