def html5_parse(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True,
                linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = replace_chars.sub('', raw)
    from html5_parser import parse
    root = parse(raw, maybe_xhtml=not discard_namespaces,
                 line_number_attr=linenumber_attribute, keep_doctype=False)
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
        raise ValueError(
            'Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None,
          replace_entities=True, force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes; preserve the original line numbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break
    raw = strip_encoding_declarations(raw)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
    try:
        parser = XMLParser(no_network=True)
        ans = fromstring(raw, parser=parser)
        if ans.tag != '{%s}html' % html_ns:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
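# A hedged usage sketch for parse() above, not part of the original module:
# the file name and attribute name are made up, and the surrounding calibre
# imports are assumed. Well-formed XHTML takes the strict lxml path; tag
# soup falls back to parse_html5().
with open('page.xhtml', 'rb') as f:  # hypothetical input file
    root = parse(f.read(), linenumber_attribute='data-lnum')
print(root.tag)               # '{http://www.w3.org/1999/xhtml}html'
print(root.get('data-lnum'))  # source line of the <html> start tag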
def fget(self):
    ans = u''
    try:
        check = unicode(self.page().mainFrame().toPlainText()).strip()
        raw = unicode(self.page().mainFrame().toHtml())
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        if not check and '<img' not in raw.lower():
            return ans
        try:
            root = html.fromstring(raw)
        except Exception:
            root = fromstring(raw)
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [html.tostring(x, encoding=unicode) for x in body
                      if x.tag not in ('script', 'style')]
        if len(elems) > 1:
            ans = u'<div>%s</div>' % (u''.join(elems))
        else:
            ans = u''.join(elems)
        if not ans.startswith('<'):
            ans = '<p>%s</p>' % ans
        ans = xml_replace_entities(ans)
    except Exception:
        import traceback
        traceback.print_exc()
    return ans
def _read_opf(self):
    data = self.oeb.container.read(None)
    data = self.oeb.decode(data)
    data = XMLDECL_RE.sub('', data)
    data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)', OPF1_NS, data)
    try:
        opf = etree.fromstring(data)
    except etree.XMLSyntaxError:
        data = xml_replace_entities(clean_xml_chars(data), encoding=None)
        try:
            opf = etree.fromstring(data)
            self.logger.warn('OPF contains invalid HTML named entities')
        except etree.XMLSyntaxError:
            data = re.sub(r'(?is)<tours>.+</tours>', '', data)
            data = data.replace('<dc-metadata>',
                                '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
            try:
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid tours section')
            except etree.XMLSyntaxError:
                from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
                opf = etree.fromstring(data, parser=RECOVER_PARSER)
                self.logger.warn('OPF contains invalid markup, trying to parse it anyway')
    ns = namespace(opf.tag)
    if ns not in ('', OPF1_NS, OPF2_NS):
        raise OEBError('Invalid namespace %r for OPF document' % ns)
    opf = self._clean_opf(opf)
    return opf
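# _read_opf() above escalates through progressively more forgiving parses.
# A standalone sketch of that escalation pattern using plain lxml (the
# broken OPF snippet is invented; RECOVER_PARSER in calibre is a similar
# recovering XMLParser):
from lxml import etree

broken = '<package><metadata>Caf&eacute;</metadata></package>'  # HTML entity, invalid in XML
try:
    root = etree.fromstring(broken)
except etree.XMLSyntaxError:
    # Last resort: a recovering parser that keeps whatever it can
    root = etree.fromstring(broken, parser=etree.XMLParser(recover=True))
print(etree.tostring(root))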
def html(self):
    raw = original_html = self.toHtml()
    check = self.toPlainText().strip()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    raw = self.comments_pat.sub('', raw)
    if not check and '<img' not in raw.lower():
        return ''
    root = parse(raw, maybe_xhtml=False, sanitize_names=True)
    if root.xpath('//meta[@name="calibre-dont-sanitize"]'):
        # Bypass cleanup if special meta tag exists
        return original_html
    try:
        cleanup_qt_markup(root)
    except Exception:
        import traceback
        traceback.print_exc()
    elems = []
    for body in root.xpath('//body'):
        if body.text:
            elems.append(body.text)
        elems += [html.tostring(x, encoding='unicode') for x in body
                  if x.tag not in ('script', 'style')]
    if len(elems) > 1:
        ans = '<div>%s</div>' % (''.join(elems))
    else:
        ans = ''.join(elems)
    if not ans.startswith('<'):
        ans = '<p>%s</p>' % ans
    return xml_replace_entities(ans)
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True,
                linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = fix_self_closing_cdata_tags(raw)  # TODO: Handle this in the parser
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = replace_chars.sub('', raw)

    stream_class = partial(FastStream, track_position=line_numbers)
    stream = stream_class(raw)
    builder = partial(NoNamespaceTreeBuilder if discard_namespaces else TreeBuilder,
                      linenumber_attribute=linenumber_attribute)
    while True:
        try:
            parser = HTMLParser(tree=builder, track_positions=line_numbers,
                                namespaceHTMLElements=not discard_namespaces)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore', category=DataLossWarning)
                try:
                    parser.parse(stream, parseMeta=False, useChardet=False)
                finally:
                    parser.tree.proxy_cache = None
        except NamespacedHTMLPresent as err:
            raw = re.sub(r'<\s*/{0,1}(%s:)' % err.prefix,
                         lambda m: m.group().replace(m.group(1), ''), raw, flags=re.I)
            stream = stream_class(raw)
            continue
        break
    root = parser.tree.getDocument()
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and (root.tag != '{%s}%s' % (namespaces['html'], 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True,
                linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces,
                              line_number_attr=linenumber_attribute,
                              keep_doctype=False, sanitize_names=True)
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and (root.tag != '{{{}}}{}'.format(XHTML_NS, 'html') or root.prefix)):
        raise ValueError(
            'Failed to parse correctly, root has tag: {} and prefix: {}'.format(root.tag, root.prefix))
    return root
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True,
                linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)
    root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces,
                              line_number_attr=linenumber_attribute,
                              keep_doctype=False, sanitize_names=True)
    if (discard_namespaces and root.tag != 'html') or (
            not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
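# The two parse_html5() variants above differ only in string-formatting
# style; both delegate to the html5-parser package. A minimal standalone
# call with the same keyword arguments (the sample markup is invented):
import html5_parser

doc = html5_parser.parse(
    '<p>unclosed paragraph<br>',   # tag soup parses fine
    maybe_xhtml=False,             # plain HTML5, no namespace handling
    line_number_attr='data-lnum',  # annotate elements with source lines
    keep_doctype=False,
    sanitize_names=True,           # rename invalid tag/attribute names
)
print(doc.tag)  # 'html' (un-namespaced, since maybe_xhtml=False)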
def search(self, text, index, backwards=False):
    text = prepare_string_for_xml(text.lower())
    pmap = [(i, path) for i, path in enumerate(self.spine)]
    if backwards:
        pmap.reverse()
    for i, path in pmap:
        if (backwards and i < index) or (not backwards and i > index):
            with open(path, 'rb') as f:
                raw = f.read().decode(path.encoding)
            try:
                raw = xml_replace_entities(raw)
            except Exception:
                pass
            if text in raw.lower():
                return i
def fget(self):
    ans = u''
    try:
        if not self.page().mainFrame().documentElement().findFirst(
                'meta[name="calibre-dont-sanitize"]').isNull():
            # Bypass cleanup if special meta tag exists
            return unicode_type(self.page().mainFrame().toHtml())
        check = unicode_type(self.page().mainFrame().toPlainText()).strip()
        raw = unicode_type(self.page().mainFrame().toHtml())
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        if not check and '<img' not in raw.lower():
            return ans
        try:
            root = html.fromstring(raw)
        except Exception:
            root = fromstring(raw)
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [html.tostring(x, encoding=unicode_type) for x in body
                      if x.tag not in ('script', 'style')]
        if len(elems) > 1:
            ans = u'<div>%s</div>' % (u''.join(elems))
        else:
            ans = u''.join(elems)
        if not ans.startswith('<'):
            ans = '<p>%s</p>' % ans
        ans = xml_replace_entities(ans)
    except Exception:
        import traceback
        traceback.print_exc()
    return ans
def html(self):
    ans = u''
    try:
        if not self.page().mainFrame().documentElement().findFirst(
                'meta[name="calibre-dont-sanitize"]').isNull():
            # Bypass cleanup if special meta tag exists
            return unicode_type(self.page().mainFrame().toHtml())
        check = unicode_type(self.page().mainFrame().toPlainText()).strip()
        raw = unicode_type(self.page().mainFrame().toHtml())
        raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
        raw = self.comments_pat.sub('', raw)
        if not check and '<img' not in raw.lower():
            return ans
        try:
            root = html.fromstring(raw)
        except Exception:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        elems = []
        for body in root.xpath('//body'):
            if body.text:
                elems.append(body.text)
            elems += [html.tostring(x, encoding='unicode') for x in body
                      if x.tag not in ('script', 'style')]
        if len(elems) > 1:
            ans = u'<div>%s</div>' % (u''.join(elems))
        else:
            ans = u''.join(elems)
        if not ans.startswith('<'):
            ans = '<p>%s</p>' % ans
        ans = xml_replace_entities(ans)
    except Exception:
        import traceback
        traceback.print_exc()
    return ans
def parse_html(data, log=None, decoder=None, preprocessor=None,
               filename="<string>", non_html_file_tags=frozenset()):
    if log is None:
        from calibre.utils.logging import default_log
        log = default_log

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, unicode):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)
    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace("\0", "")

    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ""
    idx = data.find("<html")
    if idx == -1:
        idx = data.find("<HTML")
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if "<!DOCTYPE" in pre:  # Handle user defined entities
            has_html4_doctype = re.search(r"<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>", pre) is not None
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            user_entities = {}
            for match in re.finditer(r"<!ENTITY\s+(\S+)\s+([^>]+)", pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(r"&(%s);" % ("|".join(user_entities.keys())))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    data = raw = clean_word_doc(data, log)

    # Setting huge_tree=True causes crashes in windows with large files
    parser = etree.XMLParser(no_network=True)
    # Try with more & more drastic measures to parse
    try:
        data = etree.fromstring(data, parser=parser)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug("Initial parse failed, using more forgiving parsers")
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data, parser=parser)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug("Parsing %s as HTML" % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception("HTML 5 parsing failed, falling back to older parsers")
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == "HTML":
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in list(x.attrib.iteritems()):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except Exception:
                pass

    if barename(data.tag) != "html":
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn("File %r does not appear to be (X)HTML" % filename)
        nroot = etree.fromstring("<html></html>")
        has_body = False
        for child in list(data):
            if isinstance(child.tag, (unicode, str)) and barename(child.tag) == "body":
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn("File %r appears to be a HTML fragment" % filename)
            nroot = etree.fromstring("<html><body/></html>")
            parent = nroot[0]
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn("Forcing", filename, "into XHTML namespace")
        data.attrib["xmlns"] = XHTML_NS
        data = etree.tostring(data, encoding=unicode)
        try:
            data = etree.fromstring(data, parser=parser)
        except Exception:
            data = data.replace(":=", "=").replace(":>", ">")
            data = data.replace("<http:/>", "")
            try:
                data = etree.fromstring(data, parser=parser)
            except etree.XMLSyntaxError:
                log.warn("Stripping comments from %s" % filename)
                data = re.compile(r"<!--.*?-->", re.DOTALL).sub("", data)
                data = data.replace("<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", "")
                data = data.replace("<?xml version='1.0' encoding='utf-8'??>", "")
                try:
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
                except etree.XMLSyntaxError:
                    log.warn("Stripping meta tags from %s" % filename)
                    data = re.sub(r"<meta\s+[^>]+?>", "", data)
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
    elif namespace(data.tag) != XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML("html"), nsmap={None: XHTML_NS}, attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, basestring) and namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    fnsmap = {k: v for k, v in data.nsmap.iteritems() if v != XHTML_NS}
    fnsmap[None] = XHTML_NS
    if fnsmap != dict(data.nsmap):
        # Remove non default prefixes referring to the XHTML namespace
        data = clone_element(data, nsmap=fnsmap, in_context=False)

    data = merge_multiple_html_heads_and_bodies(data, log)

    # Ensure has a <head/>
    head = xpath(data, "/h:html/h:head")
    head = head[0] if head else None
    if head is None:
        log.warn("File %s missing <head/> element" % filename)
        head = etree.Element(XHTML("head"))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML("title"))
        title.text = _("Unknown")
    elif not xpath(data, "/h:html/h:head/h:title"):
        title = etree.SubElement(head, XHTML("title"))
        title.text = _("Unknown")

    # Ensure <title> is not empty
    title = xpath(data, "/h:html/h:head/h:title")[0]
    if not title.text or not title.text.strip():
        title.text = _("Unknown")

    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML("meta"),
                            attrib={"http-equiv": "Content-Type"})
    meta.set("content", "text/html; charset=utf-8")  # Ensure content is second attribute

    # Ensure has a <body/>
    if not xpath(data, "/h:html/h:body"):
        body = xpath(data, "//h:body")
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn("File %s missing <body/> element" % filename)
            etree.SubElement(data, XHTML("body"))

    # Remove microsoft office markup
    r = [x for x in data.iterdescendants(etree.Element) if "microsoft-com" in x.tag]
    for x in r:
        x.tag = XHTML("span")

    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ""
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ""
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, "//h:a[@href]|//h:i|//h:b|//h:u"):
        if a.get("id", None) is None and a.get("name", None) is None \
                and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle them
    for br in xpath(data, "//h:br"):
        if len(br) > 0 or br.text:
            br.tag = XHTML("div")

    # Remove any stray text in the <head> section and format it nicely
    data.text = "\n  "
    head = xpath(data, "//h:head")
    if head:
        head = head[0]
        head.text = "\n    "
        head.tail = "\n  "
        for child in head:
            child.tail = "\n    "
        child.tail = "\n  "

    return data
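# A hypothetical call into the parse_html() above (Python 2 era, matching
# its iteritems()/unicode usage); the input file is made up. Roots listed
# in non_html_file_tags raise NotHTML instead of being coerced:
with open('chapter.html', 'rb') as f:  # hypothetical input file
    doc = parse_html(f.read(), filename='chapter.html',
                     non_html_file_tags=frozenset(['ncx']))
# doc is a normalized lxml tree in the XHTML namespace with guaranteed
# <head>, <title> and <body> elements.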
def parse_html(data, log=None, decoder=None, preprocessor=None,
               filename='<string>', non_html_file_tags=frozenset()):
    if log is None:
        from calibre.utils.logging import default_log
        log = default_log

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, unicode):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)
    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')

    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    data = raw = clean_word_doc(data, log)

    # Setting huge_tree=True causes crashes in windows with large files
    parser = etree.XMLParser(no_network=True)
    # Try with more & more drastic measures to parse
    try:
        data = etree.fromstring(data, parser=parser)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data, parser=parser)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception('HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == 'HTML' or (
            len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in list(x.attrib.iteritems()):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except Exception:
                pass

    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML' % filename)
        nroot = etree.fromstring('<html></html>')
        has_body = False
        for child in list(data):
            if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn('File %r appears to be a HTML fragment' % filename)
            nroot = etree.fromstring('<html><body/></html>')
            parent = nroot[0]
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing', filename, 'into XHTML namespace')
        data.attrib['xmlns'] = XHTML_NS
        data = etree.tostring(data, encoding=unicode)
        try:
            data = etree.fromstring(data, parser=parser)
        except Exception:
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
                data = etree.fromstring(data, parser=parser)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s' % filename)
                data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
                data = data.replace("<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", '')
                data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
                try:
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
                except etree.XMLSyntaxError:
                    log.warn('Stripping meta tags from %s' % filename)
                    data = re.sub(r'<meta\s+[^>]+?>', '', data)
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
    elif namespace(data.tag) != XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML('html'), nsmap={None: XHTML_NS}, attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, basestring) and namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    fnsmap = {k: v for k, v in data.nsmap.iteritems() if v != XHTML_NS}
    fnsmap[None] = XHTML_NS
    if fnsmap != dict(data.nsmap):
        # Remove non default prefixes referring to the XHTML namespace
        data = clone_element(data, nsmap=fnsmap, in_context=False)

    data = merge_multiple_html_heads_and_bodies(data, log)

    # Ensure has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')

    # Ensure <title> is not empty
    title = xpath(data, '/h:html/h:head/h:title')[0]
    if not title.text or not title.text.strip():
        title.text = _('Unknown')

    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML('meta'),
                            attrib={'http-equiv': 'Content-Type'})
    meta.set('content', 'text/html; charset=utf-8')  # Ensure content is second attribute

    # Ensure has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))

    # Remove microsoft office markup
    r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
    for x in r:
        x.tag = XHTML('span')

    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None \
                and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')

    # Remove any stray text in the <head> section and format it nicely
    data.text = '\n  '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n    '
        head.tail = '\n  '
        for child in head:
            child.tail = '\n    '
        child.tail = '\n  '

    return data
def fget(self):
    ans = unicode(self.editor.toPlainText())
    if self.syntax == 'html':
        ans = xml_replace_entities(ans)
    return ans.encode('utf-8')
def parse_html(data, log=None, decoder=None, preprocessor=None,
               filename='<string>', non_html_file_tags=frozenset()):
    if log is None:
        from calibre.utils.logging import default_log
        log = default_log

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, unicode):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)

    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # Process private entities
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    data = raw = clean_word_doc(data, log)

    # Setting huge_tree=True causes crashes in windows with large files
    parser = etree.XMLParser(no_network=True)
    # Try with more & more drastic measures to parse
    try:
        data = etree.fromstring(data, parser=parser)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data, parser=parser)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception('HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == 'HTML' or (
            len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in list(x.attrib.iteritems()):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except Exception:
                pass

    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML' % filename)
        nroot = etree.fromstring('<html></html>')
        has_body = False
        for child in list(data):
            if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn('File %r appears to be a HTML fragment' % filename)
            nroot = etree.fromstring('<html><body/></html>')
            parent = nroot[0]
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing', filename, 'into XHTML namespace')
        data.attrib['xmlns'] = XHTML_NS
        data = etree.tostring(data, encoding=unicode)
        try:
            data = etree.fromstring(data, parser=parser)
        except Exception:
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
                data = etree.fromstring(data, parser=parser)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s' % filename)
                data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
                data = data.replace("<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", '')
                data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
                try:
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
                except etree.XMLSyntaxError:
                    log.warn('Stripping meta tags from %s' % filename)
                    data = re.sub(r'<meta\s+[^>]+?>', '', data)
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
    elif namespace(data.tag) != XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML('html'), nsmap={None: XHTML_NS}, attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, basestring) and namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    # Remove non default prefixes referring to the XHTML namespace
    data = ensure_namespace_prefixes(data, {None: XHTML_NS})

    data = merge_multiple_html_heads_and_bodies(data, log)

    # Ensure has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')

    # Ensure <title> is not empty
    title = xpath(data, '/h:html/h:head/h:title')[0]
    if not title.text or not title.text.strip():
        title.text = _('Unknown')

    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML('meta'),
                            attrib={'http-equiv': 'Content-Type'})
    meta.set('content', 'text/html; charset=utf-8')  # Ensure content is second attribute

    # Ensure has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))

    # Remove microsoft office markup
    r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
    for x in r:
        x.tag = XHTML('span')

    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None \
                and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')

    # Remove any stray text in the <head> section and format it nicely
    data.text = '\n  '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n    '
        head.tail = '\n  '
        for child in head:
            child.tail = '\n    '
        child.tail = '\n  '

    return data
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir
    '''
    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
               '-nodrm', a(pdfsrc), a(index)]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd, stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(_('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        out = lopen(logf.name, 'rb').read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8', 'replace')
                raw = flip_images(raw)
                raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n<head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                raw = xml_replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')
                i.write(raw.encode('utf-8'))

            cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8', '-noframes',
                   '-p', '-nomerge', '-nodrm', '-q', '-stdout', a(pdfsrc)]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

    try:
        os.remove(pdfsrc)
    except Exception:
        pass
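# A hedged usage sketch for pdftohtml() above (the input path is made up;
# assumes the poppler pdftohtml binary and calibre's helpers are available):
import tempfile

outdir = tempfile.mkdtemp()
pdftohtml(outdir, 'input.pdf', no_images=False)
# outdir now holds index.html, already post-processed: entities decoded via
# xml_replace_entities(), anchors rewritten, and images extracted alongside.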
def fget(self):
    ans = self.get_raw_data()
    if self.syntax == "html":
        ans = xml_replace_entities(ans)
    return ans.encode("utf-8")