def beautify_text(raw, syntax):
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations
    if syntax == 'xml':
        root = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(root)
    elif syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_css_parser_serialization
        from css_parser import CSSParser, log
        setup_css_parser_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        parser = CSSParser(loglevel=logging.WARNING,
                           # We don't care about @import rules
                           fetcher=lambda x: (None, None),
                           log=_css_logger)
        data = parser.parseString(raw, href='<string>', validate=False)
        return serialize(data, 'text/css')
    else:
        root = parse(raw, line_numbers=False)
        pretty_html_tree(None, root)
    return etree.tostring(root, encoding=unicode)

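# Usage sketch for beautify_text above -- the markup string is invented for
# illustration. The 'css' branch additionally needs the editor's tprefs
# settings object to be in scope, so only the 'html' path is exercised here.
raw = '<html><head><title>t</title></head><body><p>Hello</p></body></html>'
print(beautify_text(raw, 'html'))
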
def read_url(storage, url, timeout=60):
    with read_url_lock:
        if not storage:
            storage.append(Overseer())
        scraper = storage[0]
    from calibre.ebooks.chardet import strip_encoding_declarations
    return strip_encoding_declarations(scraper.fetch_url(url, timeout=timeout))

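# Hypothetical call to read_url above: storage is a mutable list that caches
# the shared Overseer scraper between calls, so callers keep a module-level
# list around. The URL is illustrative.
scraper_storage = []
html_text = read_url(scraper_storage, 'https://example.com/article', timeout=30)
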
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None,
          replace_entities=True, force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
    try:
        parser = XMLParser(no_network=True)
        ans = fromstring(raw, parser=parser)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)

def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None,
          replace_entities=True, force_html5_parse=False):
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause problems,
    # especially doctypes, preserve the original linenumbers by inserting
    # newlines at the start
    pre = raw[:2048]
    for match in re.finditer(r'<\s*html', pre, flags=re.I):
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]
        break

    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
    try:
        parser = XMLParser(no_network=True)
        ans = fromstring(raw, parser=parser)
        if ans.tag != '{%s}html' % XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for elem in ans.iter(LxmlElement):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, unicode_type(elem.sourceline))
        return ans
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)

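# Sketch exercising parse above (both snapshots behave the same from the
# caller's side): strict XML is tried first, then the html5 tag-soup
# fallback. The attribute name is chosen for illustration only.
root = parse(b'<html><body><p>one<p>two', linenumber_attribute='data-lnum')
for p in root.iter('{http://www.w3.org/1999/xhtml}p'):
    print(p.get('data-lnum'), p.text)
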
def smarten_punctuation(container, report):
    from calibre.ebooks.conversion.preprocess import smarten_punctuation
    smartened = False
    for path in container.spine_items:
        name = container.abspath_to_name(path)
        changed = False
        with container.open(name, 'r+b') as f:
            html = container.decode(f.read())
            newhtml = smarten_punctuation(html, container.log)
            if newhtml != html:
                changed = True
                report(_('Smartened punctuation in: %s') % name)
                newhtml = strip_encoding_declarations(newhtml)
                f.seek(0)
                f.truncate()
                f.write(codecs.BOM_UTF8 + newhtml.encode('utf-8'))
        if changed:
            # Add an encoding declaration (it will be added automatically when
            # serialized)
            root = container.parsed(name)
            for m in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'):
                m.getparent().remove(m)
            container.dirty(name)
            smartened = True
    if not smartened:
        report(_('No punctuation that could be smartened found'))
    return smartened

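# Sketch of driving smarten_punctuation above through calibre's polish
# container API; the book path is illustrative and report is a stand-in
# for a real progress callback.
from calibre.ebooks.oeb.polish.container import get_container

def report(msg):
    print(msg)

container = get_container('/tmp/book.epub')
if smarten_punctuation(container, report):
    container.commit()
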
def beautify_text(raw, syntax):
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations
    if syntax == 'xml':
        root = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(root)
    elif syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_cssutils_serialization
        from cssutils import CSSParser, log
        setup_cssutils_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        parser = CSSParser(loglevel=logging.WARNING,
                           # We don't care about @import rules
                           fetcher=lambda x: (None, None),
                           log=_css_logger)
        data = parser.parseString(raw, href='<string>', validate=False)
        return serialize(data, 'text/css')
    else:
        root = parse(raw, line_numbers=False)
        pretty_html_tree(None, root)
    return etree.tostring(root, encoding=unicode)

def generate_html(comments):
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='')
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace('#', '_')
            args[key] = escape(val)
            args[key + '_label'] = escape(display_name)
        except:
            pass

    # Used in the comment describing use of custom columns in templates
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    generated_html = P('jacket/template.xhtml', data=True).decode('utf-8').format(**args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class': 'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class': 'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class': 'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(soup.renderContents('utf-8').decode('utf-8'))

def get_social_metadata(title, authors, publisher, isbn, username=None, password=None):
    from calibre.ebooks.metadata import MetaInformation
    mi = MetaInformation(title, authors)
    if isbn:
        br = get_browser()
        try:
            login(br, username, password)
            raw = br.open_novisit('http://www.librarything.com/isbn/' + isbn).read()
        except:
            return mi
        if '/wiki/index.php/HelpThing:Verify' in raw:
            raise Exception('LibraryThing is blocking calibre.')
        if not raw:
            return mi
        raw = raw.decode('utf-8', 'replace')
        raw = strip_encoding_declarations(raw)
        root = html.fromstring(raw)
        h1 = root.xpath('//div[@class="headsummary"]/h1')
        if h1 and not mi.title:
            mi.title = html.tostring(h1[0], method='text', encoding=unicode)
        h2 = root.xpath('//div[@class="headsummary"]/h2/a')
        if h2 and not mi.authors:
            mi.authors = [html.tostring(x, method='text', encoding=unicode) for x in h2]
        h3 = root.xpath('//div[@class="headsummary"]/h3/a')
        if h3:
            match = None
            for h in h3:
                series = html.tostring(h, method='text', encoding=unicode)
                match = re.search(r'(.+) \((.+)\)', series)
                if match is not None:
                    break
            if match is not None:
                mi.series = match.group(1).strip()
                match = re.search(r'[0-9.]+', match.group(2))
                si = 1.0
                if match is not None:
                    si = float(match.group())
                mi.series_index = si
        # tags = root.xpath('//div[@class="tags"]/span[@class="tag"]/a')
        # if tags:
        #     mi.tags = [html.tostring(x, method='text', encoding=unicode) for x
        #                in tags]
        span = root.xpath('//table[@class="wsltable"]/tr[@class="wslcontent"]/td[4]//span')
        if span:
            raw = html.tostring(span[0], method='text', encoding=unicode)
            match = re.search(r'([0-9.]+)', raw)
            if match is not None:
                rating = float(match.group())
                if rating > 0 and rating <= 5:
                    mi.rating = rating
    return mi

def parse_html(markup):
    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode, substitute_entites
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(markup, unicode_type):
        markup = strip_encoding_declarations(markup)
        markup = substitute_entites(markup)
    else:
        markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    markup = clean_xml_chars(markup)
    from html5_parser.soup import parse
    return parse(markup, return_root=False)

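# Sketch of calling parse_html above: bytes go through xml_to_unicode, so no
# explicit decode is needed; with return_root=False, html5_parser.soup yields
# a BeautifulSoup-style tree. The markup is invented and malformed on purpose
# to show the lenient parse.
soup = parse_html(b'<meta charset="utf-8"><p>caf\xc3\xa9 & <b>bold')
print(soup.find('p'))
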
def save_html(browser, output_dir, postprocess_html, url, recursion_level):
    html = strip_encoding_declarations(browser.html)
    import html5lib
    root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
    root = postprocess_html(root, url, recursion_level)
    if root is None:
        # user wants this page to be aborted
        raise AbortFetch('%s was aborted during postprocess' % url)
    with open(os.path.join(output_dir, 'index.html'), 'wb') as f:
        from lxml.html import tostring
        f.write(tostring(root, include_meta_content_type=True, encoding='utf-8', pretty_print=True))
        return f.name

def save_html(browser, output_dir, postprocess_html, url, recursion_level):
    import html5lib
    from calibre.utils.cleantext import clean_xml_chars
    html = strip_encoding_declarations(browser.html)
    if isinstance(html, unicode):
        html = clean_xml_chars(html)
    root = html5lib.parse(html, treebuilder='lxml', namespaceHTMLElements=False).getroot()
    root = postprocess_html(root, url, recursion_level)
    if root is None:
        # user wants this page to be aborted
        raise AbortFetch('%s was aborted during postprocess' % url)
    with open(os.path.join(output_dir, 'index.html'), 'wb') as f:
        from lxml.html import tostring
        f.write(tostring(root, include_meta_content_type=True, encoding='utf-8', pretty_print=True))
        return f.name

def search(self, query, max_results=10, timeout=60):
    url = 'http://woblink.com/katalog-ebooki?query=' + urllib.quote_plus(query.encode('utf-8'))
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    counter = max_results

    try:
        results = fork_job(js_browser, 'get_results', (url, timeout,), module_is_source_code=True)
    except WorkerError as e:
        raise Exception('Could not get results: %s' % e.orig_tb)
    doc = html.fromstring(strip_encoding_declarations(results['result']))

    for data in doc.xpath('//div[@class="nw_katalog_lista_ksiazka"]'):
        if counter <= 0:
            break

        id = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
        if not id:
            continue

        cover_url = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
        title = ''.join(data.xpath('.//h2[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
        author = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
        price = ''.join(data.xpath('.//div[@class="nw_opcjezakupu_cena"]/text()'))
        formats = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'))

        s = SearchResult()
        s.cover_url = 'http://woblink.com' + cover_url
        s.title = title.strip()
        s.author = author.strip()
        s.price = price + ' zł'
        s.detail_item = id.strip()
        s.formats = formats

        if 'DRM' in formats:
            s.drm = SearchResult.DRM_LOCKED
        else:
            s.drm = SearchResult.DRM_UNLOCKED
        counter -= 1
        yield s

def expand_mobi8_markup(mobi8_reader, resource_map, log):
    # First update all internal links that are based on offsets
    parts = update_internal_links(mobi8_reader, log)

    # Remove pointless markup inserted by kindlegen
    remove_kindlegen_markup(parts, mobi8_reader.aid_anchor_suffix, mobi8_reader.linked_aids)

    # Handle substitutions for the flows pieces first as they may
    # be inlined into the xhtml text
    flows = update_flow_links(mobi8_reader, resource_map, log)

    # Insert inline flows into the markup
    insert_flows_into_markup(parts, flows, mobi8_reader, log)

    # Insert raster images into markup
    insert_images_into_markup(parts, resource_map, log)

    # Perform general markup cleanups
    upshift_markup(parts)

    # Update the parts and flows stored in the reader
    mobi8_reader.parts = parts
    mobi8_reader.flows = flows

    # write out the parts and file flows
    os.mkdir('text')  # directory containing all parts
    spine = []
    for i, part in enumerate(parts):
        pi = mobi8_reader.partinfo[i]
        with open(os.path.join(pi.type, pi.filename), 'wb') as f:
            part = strip_encoding_declarations(part)
            part = part.replace('<head>', '<head><meta charset="UTF-8"/>', 1)
            f.write(part.encode('utf-8'))
            spine.append(f.name)

    for i, flow in enumerate(flows):
        fi = mobi8_reader.flowinfo[i]
        if fi.format == 'file':
            if not os.path.exists(fi.dir):
                os.mkdir(fi.dir)
            with open(os.path.join(fi.dir, fi.fname), 'wb') as f:
                if fi.fname.endswith('.css') and '@media' in flow:
                    flow = handle_media_queries(flow)
                f.write(flow.encode('utf-8'))

    return spine

def index_to_soup(self, url_or_raw, raw=False):
    '''
    Convenience method that takes a URL to the index page and returns
    a parsed lxml tree representation of it. See http://lxml.de/tutorial.html

    `url_or_raw`: Either a URL or the downloaded index page as a string
    '''
    if re.match(r'\w+://', url_or_raw):
        self.jsbrowser.start_load(url_or_raw)
        html = self.jsbrowser.html
    else:
        html = url_or_raw
        if isinstance(html, bytes):
            html = xml_to_unicode(html)[0]
    html = strip_encoding_declarations(html)
    if raw:
        return html
    import html5lib
    root = html5lib.parse(clean_xml_chars(html), treebuilder='lxml', namespaceHTMLElements=False).getroot()
    return root

def search(self, query, max_results=10, timeout=60):
    url = 'http://woblink.com/ebooki-kategorie?query=' + urllib.quote_plus(query.encode('utf-8'))
    if max_results > 10:
        if max_results > 20:
            url += '&limit=30'
        else:
            url += '&limit=20'
    counter = max_results

    try:
        results = fork_job(js_browser, 'get_results', (url, timeout,), module_is_source_code=True)
    except WorkerError as e:
        raise Exception('Could not get results: %s' % e.orig_tb)
    doc = html.fromstring(strip_encoding_declarations(results['result']))

    for data in doc.xpath('//div[@class="nw_katalog_lista_ksiazka "]'):
        if counter <= 0:
            break

        id = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/@href'))
        if not id:
            continue

        cover_url = ''.join(data.xpath('.//div[@class="nw_katalog_lista_ksiazka_okladka nw_okladka"]/a[1]/img/@src'))
        title = ''.join(data.xpath('.//h3[@class="nw_katalog_lista_ksiazka_detale_tytul"]/a[1]/text()'))
        author = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_autor"]/a/text()'))
        price = ''.join(data.xpath('.//div[@class="nw_opcjezakupu_cena"]/text()'))
        formats = ', '.join(data.xpath('.//p[@class="nw_katalog_lista_ksiazka_detale_format"]/span/text()'))

        s = SearchResult()
        s.cover_url = 'http://woblink.com' + cover_url
        s.title = title.strip()
        s.author = author.strip()
        s.price = price + ' zł'
        s.detail_item = id.strip()
        s.formats = formats

        if 'DRM' in formats:
            s.drm = SearchResult.DRM_LOCKED
        else:
            s.drm = SearchResult.DRM_UNLOCKED
        counter -= 1
        yield s

def generate_html(comments):
    display = Attributes()
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                identifiers=Identifiers(mi.identifiers),
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=ngettext('Series', 'Series', 1), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='',
                display=display,
                searchable_tags=' '.join(escape(t) + 'ttt' for t in tags.tags_list),
                )
    for key in mi.custom_field_keys():
        m = mi.get_user_metadata(key, False) or {}
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            dkey = key.replace('#', '_')
            dt = m.get('datatype')
            if dt == 'series':
                args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
            elif dt == 'rating':
                args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
            elif dt == 'comments':
                val = val or ''
                ctype = m.get('display', {}).get('interpret_as') or 'html'
                if ctype == 'long-text':
                    val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                elif ctype == 'short-text':
                    val = '<span>%s</span>' % escape(val)
                elif ctype == 'markdown':
                    val = markdown(val)
                else:
                    val = comments_to_html(val)
                args[dkey] = val
            else:
                args[dkey] = escape(val)
            args[dkey + '_label'] = escape(display_name)
            setattr(display, dkey, 'none' if mi.is_null(key) else 'initial')
        except Exception:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith('_') and not key.endswith('_label'):
                print(" {}: {}".format('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    has_data['series'] = bool(series)
    has_data['tags'] = bool(tags)
    has_data['rating'] = bool(rating)
    has_data['pubdate'] = bool(pubdate)
    for k, v in has_data.items():
        setattr(display, k, 'initial' if v else 'none')
    display.title = 'initial'
    if mi.identifiers:
        display.identifiers = 'initial'

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    return strip_encoding_declarations(generated_html)

def parse_html(data, log=None, decoder=None, preprocessor=None, filename="<string>", non_html_file_tags=frozenset()):
    if log is None:
        from calibre.utils.logging import default_log
        log = default_log

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, unicode):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)
    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace("\0", "")

    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ""
    idx = data.find("<html")
    if idx == -1:
        idx = data.find("<HTML")
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if "<!DOCTYPE" in pre:  # Handle user defined entities
            has_html4_doctype = re.search(r"<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>", pre) is not None
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            user_entities = {}
            for match in re.finditer(r"<!ENTITY\s+(\S+)\s+([^>]+)", pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(r"&(%s);" % ("|".join(user_entities.keys())))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    data = raw = clean_word_doc(data, log)

    # Setting huge_tree=True causes crashes in windows with large files
    parser = etree.XMLParser(no_network=True)

    # Try with more & more drastic measures to parse
    try:
        data = etree.fromstring(data, parser=parser)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug("Initial parse failed, using more"
                  " forgiving parsers")
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data, parser=parser)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug("Parsing %s as HTML" % filename)
            data = raw
            try:
                data = html5_parse(data)
            except:
                log.exception("HTML 5 parsing failed, falling back to older parsers")
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == "HTML":
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in list(x.attrib.iteritems()):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except:
                pass

    if barename(data.tag) != "html":
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn("File %r does not appear to be (X)HTML" % filename)
        nroot = etree.fromstring("<html></html>")
        has_body = False
        for child in list(data):
            if isinstance(child.tag, (unicode, str)) and barename(child.tag) == "body":
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn("File %r appears to be a HTML fragment" % filename)
            nroot = etree.fromstring("<html><body/></html>")
            parent = nroot[0]
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn("Forcing", filename, "into XHTML namespace")
        data.attrib["xmlns"] = XHTML_NS
        data = etree.tostring(data, encoding=unicode)
        try:
            data = etree.fromstring(data, parser=parser)
        except:
            data = data.replace(":=", "=").replace(":>", ">")
            data = data.replace("<http:/>", "")
            try:
                data = etree.fromstring(data, parser=parser)
            except etree.XMLSyntaxError:
                log.warn("Stripping comments from %s" % filename)
                data = re.compile(r"<!--.*?-->", re.DOTALL).sub("", data)
                data = data.replace("<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", "")
                data = data.replace("<?xml version='1.0' encoding='utf-8'??>", "")
                try:
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
                except etree.XMLSyntaxError:
                    log.warn("Stripping meta tags from %s" % filename)
                    data = re.sub(r"<meta\s+[^>]+?>", "", data)
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
    elif namespace(data.tag) != XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML("html"), nsmap={None: XHTML_NS}, attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, basestring) and namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    fnsmap = {k: v for k, v in data.nsmap.iteritems() if v != XHTML_NS}
    fnsmap[None] = XHTML_NS
    if fnsmap != dict(data.nsmap):
        # Remove non default prefixes referring to the XHTML namespace
        data = clone_element(data, nsmap=fnsmap, in_context=False)

    data = merge_multiple_html_heads_and_bodies(data, log)
    # Ensure has a <head/>
    head = xpath(data, "/h:html/h:head")
    head = head[0] if head else None
    if head is None:
        log.warn("File %s missing <head/> element" % filename)
        head = etree.Element(XHTML("head"))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML("title"))
        title.text = _("Unknown")
    elif not xpath(data, "/h:html/h:head/h:title"):
        title = etree.SubElement(head, XHTML("title"))
        title.text = _("Unknown")
    # Ensure <title> is not empty
    title = xpath(data, "/h:html/h:head/h:title")[0]
    if not title.text or not title.text.strip():
        title.text = _("Unknown")
    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML("meta"), attrib={"http-equiv": "Content-Type"})
    meta.set("content", "text/html; charset=utf-8")  # Ensure content is second attribute

    # Ensure has a <body/>
    if not xpath(data, "/h:html/h:body"):
        body = xpath(data, "//h:body")
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn("File %s missing <body/> element" % filename)
            etree.SubElement(data, XHTML("body"))

    # Remove microsoft office markup
    r = [x for x in data.iterdescendants(etree.Element) if "microsoft-com" in x.tag]
    for x in r:
        x.tag = XHTML("span")

    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ""
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ""
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, "//h:a[@href]|//h:i|//h:b|//h:u"):
        if a.get("id", None) is None and a.get("name", None) is None and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle
    # them
    for br in xpath(data, "//h:br"):
        if len(br) > 0 or br.text:
            br.tag = XHTML("div")

    # Remove any stray text in the <head> section and format it nicely
    data.text = "\n "
    head = xpath(data, "//h:head")
    if head:
        head = head[0]
        head.text = "\n "
        head.tail = "\n "
        for child in head:
            child.tail = "\n "
        child.tail = "\n "

    return data

def parse_html(data, log=None, decoder=None, preprocessor=None, filename='<string>', non_html_file_tags=frozenset()):
    if log is None:
        from calibre.utils.logging import default_log
        log = default_log

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, unicode):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)
    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')

    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    data = raw = clean_word_doc(data, log)

    # Setting huge_tree=True causes crashes in windows with large files
    parser = etree.XMLParser(no_network=True)

    # Try with more & more drastic measures to parse
    try:
        data = etree.fromstring(data, parser=parser)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more'
                  ' forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data, parser=parser)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception('HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == 'HTML' or (len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in list(x.attrib.iteritems()):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except:
                pass

    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML' % filename)
        nroot = etree.fromstring('<html></html>')
        has_body = False
        for child in list(data):
            if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn('File %r appears to be a HTML fragment' % filename)
            nroot = etree.fromstring('<html><body/></html>')
            parent = nroot[0]
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing', filename, 'into XHTML namespace')
        data.attrib['xmlns'] = XHTML_NS
        data = etree.tostring(data, encoding=unicode)
        try:
            data = etree.fromstring(data, parser=parser)
        except:
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
                data = etree.fromstring(data, parser=parser)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s' % filename)
                data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
                data = data.replace("<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", '')
                data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
                try:
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
                except etree.XMLSyntaxError:
                    log.warn('Stripping meta tags from %s' % filename)
                    data = re.sub(r'<meta\s+[^>]+?>', '', data)
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
    elif namespace(data.tag) != XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML('html'), nsmap={None: XHTML_NS}, attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, basestring) and namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    fnsmap = {k: v for k, v in data.nsmap.iteritems() if v != XHTML_NS}
    fnsmap[None] = XHTML_NS
    if fnsmap != dict(data.nsmap):
        # Remove non default prefixes referring to the XHTML namespace
        data = clone_element(data, nsmap=fnsmap, in_context=False)

    data = merge_multiple_html_heads_and_bodies(data, log)
    # Ensure has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    # Ensure <title> is not empty
    title = xpath(data, '/h:html/h:head/h:title')[0]
    if not title.text or not title.text.strip():
        title.text = _('Unknown')
    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML('meta'), attrib={'http-equiv': 'Content-Type'})
    meta.set('content', 'text/html; charset=utf-8')  # Ensure content is second attribute

    # Ensure has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))

    # Remove microsoft office markup
    r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
    for x in r:
        x.tag = XHTML('span')

    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle
    # them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')

    # Remove any stray text in the <head> section and format it nicely
    data.text = '\n '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n '
        head.tail = '\n '
        for child in head:
            child.tail = '\n '
        child.tail = '\n '

    return data

def generate_html(comments):
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='',
                searchable_tags=' '.join(escape(t) + 'ttt' for t in tags.tags_list),
                )
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace('#', '_')
            args[key] = escape(val)
            args[key + '_label'] = escape(display_name)
        except:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith('_') and not key.endswith('_label'):
                print(" %s: %s" % ('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class': 'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class': 'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class': 'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(soup.renderContents('utf-8').decode('utf-8'))

def generate_html(comments):
    args = dict(
        xmlns=XHTML_NS,
        title_str=title_str,
        css=css,
        title=title,
        author=author,
        publisher=publisher,
        pubdate_label=_("Published"),
        pubdate=pubdate,
        series_label=_("Series"),
        series=series,
        rating_label=_("Rating"),
        rating=rating,
        tags_label=_("Tags"),
        tags=tags,
        comments=comments,
        footer="",
    )
    for key in mi.custom_field_keys():
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            key = key.replace("#", "_")
            args[key] = escape(val)
            args[key + "_label"] = escape(display_name)
        except:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith("_") and not key.endswith("_label"):
                print(" %s: %s" % ("#" + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args["_genre_label"] = args.get("_genre_label", "{_genre_label}")
    args["_genre"] = args.get("_genre", "{_genre}")

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={"class": "cbj_series"})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={"class": "cbj_rating"})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={"class": "cbj_tags"})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={"class": "cbj_pubdata"})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != "kindle":
        hr_tag = soup.find("hr", attrs={"class": "cbj_kindle_banner_hr"})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(soup.renderContents("utf-8").decode("utf-8"))

def extract_content(self, output_dir, parse_cache):
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><', self.processed_html)
    self.processed_html = self.processed_html.replace(u'\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '', self.processed_html)

    self.processed_html = strip_encoding_declarations(self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, self.processed_html)
    self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()

    self.log.debug('Parsing HTML...')
    self.processed_html = clean_xml_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c', '').replace('\x14', ''))
    except Exception:
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from html5_parser import parse
        self.log.warning('Malformed markup, parsing using html5-parser')
        self.processed_html = strip_encoding_declarations(self.processed_html)
        try:
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot

    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head), root.append(body)

    for x in root.xpath('//script'):
        x.getparent().remove(x)

    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
    head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css', 'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type', 'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'

    self.upshift_markup(root)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
    except AttributeError:
        pass

    def write_as_utf8(path, data):
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')
        with lopen(path, 'wb') as f:
            f.write(data)

    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = io.BytesIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    opf.render(lopen(self.created_opf_path, 'wb'), ncx, ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        write_as_utf8(ncx_path, ncx)

    css = [self.base_css_rules, '\n\n']
    for cls, rule in self.tag_css_rules.items():
        css.append('.%s { %s }\n\n' % (cls, rule))
    write_as_utf8('styles.css', ''.join(css))

    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx, ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)

def extract_content(self, output_dir, parse_cache):
    output_dir = os.path.abspath(output_dir)
    self.check_for_drm()
    processed_records = self.extract_text()
    if self.debug is not None:
        parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
    self.add_anchors()
    self.processed_html = self.processed_html.decode(self.book_header.codec, 'ignore')
    self.processed_html = self.processed_html.replace('</</', '</')
    self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><', self.processed_html)
    self.processed_html = self.processed_html.replace('\ufeff', '')
    # Remove tags of the form <xyz: ...> as they can cause issues further
    # along the pipeline
    self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '', self.processed_html)

    self.processed_html = strip_encoding_declarations(self.processed_html)
    self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, self.processed_html)
    image_name_map = self.extract_images(processed_records, output_dir)
    self.replace_page_breaks()
    self.cleanup_html()

    self.log.debug('Parsing HTML...')
    self.processed_html = clean_xml_chars(self.processed_html)
    try:
        root = html.fromstring(self.processed_html)
        if len(root.xpath('//html')) > 5:
            root = html.fromstring(self.processed_html.replace('\x0c', '').replace('\x14', ''))
    except Exception:
        self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
        self.processed_html = self.remove_random_bytes(self.processed_html)
        root = html.fromstring(self.processed_html)
    if root.xpath('descendant::p/descendant::p'):
        from html5_parser import parse
        self.log.warning('Malformed markup, parsing using html5-parser')
        self.processed_html = strip_encoding_declarations(self.processed_html)
        # These trip up the html5 parser causing all content to be placed
        # under the <guide> tag
        self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
        self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
        try:
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
        except Exception:
            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
            self.processed_html = self.remove_random_bytes(self.processed_html)
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
        if len(root.xpath('body/descendant::*')) < 1:
            # There are probably stray </html>s in the markup
            self.processed_html = self.processed_html.replace('</html>', '')
            root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)

    if root.tag != 'html':
        self.log.warn('File does not have opening <html> tag')
        nroot = html.fromstring('<html><head></head><body></body></html>')
        bod = nroot.find('body')
        for child in list(root):
            child.getparent().remove(child)
            bod.append(child)
        root = nroot

    htmls = list(root.xpath('//html'))
    if len(htmls) > 1:
        self.log.warn('Markup contains multiple <html> tags, merging.')
        # Merge all <head> and <body> sections
        for h in htmls:
            p = h.getparent()
            if hasattr(p, 'remove'):
                p.remove(h)
        bodies, heads = root.xpath('//body'), root.xpath('//head')
        for x in root:
            root.remove(x)
        head, body = map(root.makeelement, ('head', 'body'))
        for h in heads:
            for x in h:
                h.remove(x)
                head.append(x)
        for b in bodies:
            for x in b:
                b.remove(x)
                body.append(x)
        root.append(head), root.append(body)

    for x in root.xpath('//script'):
        x.getparent().remove(x)

    head = root.xpath('//head')
    if head:
        head = head[0]
    else:
        head = root.makeelement('head', {})
        root.insert(0, head)
    head.text = '\n\t'
    link = head.makeelement('link', {'type':'text/css', 'href':'styles.css', 'rel':'stylesheet'})
    head.insert(0, link)
    link.tail = '\n\t'
    title = head.xpath('descendant::title')
    m = head.makeelement('meta', {'http-equiv':'Content-Type', 'content':'text/html; charset=utf-8'})
    head.insert(0, m)
    if not title:
        title = head.makeelement('title', {})
        try:
            title.text = self.book_header.title
        except ValueError:
            title.text = clean_ascii_chars(self.book_header.title)
        title.tail = '\n\t'
        head.insert(0, title)
        head.text = '\n\t'

    self.upshift_markup(root, image_name_map)
    guides = root.xpath('//guide')
    guide = guides[0] if guides else None
    metadata_elems = root.xpath('//metadata')
    if metadata_elems and self.book_header.exth is None:
        self.read_embedded_metadata(root, metadata_elems[0], guide)
    for elem in guides + metadata_elems:
        elem.getparent().remove(elem)
    htmlfile = os.path.join(output_dir, 'index.html')
    try:
        for ref in guide.xpath('descendant::reference'):
            if 'href' in ref.attrib:
                ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
    except AttributeError:
        pass

    def write_as_utf8(path, data):
        if isinstance(data, unicode_type):
            data = data.encode('utf-8')
        with lopen(path, 'wb') as f:
            f.write(data)

    parse_cache[htmlfile] = root
    self.htmlfile = htmlfile
    ncx = io.BytesIO()
    opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
    self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
    opf.render(lopen(self.created_opf_path, 'wb'), ncx, ncx_manifest_entry=ncx_manifest_entry)
    ncx = ncx.getvalue()
    if ncx:
        ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
        write_as_utf8(ncx_path, ncx)

    css = [self.base_css_rules, '\n\n']
    for cls, rule in self.tag_css_rules.items():
        css.append('.%s { %s }\n\n' % (cls, rule))
    write_as_utf8('styles.css', ''.join(css))

    if self.book_header.exth is not None or self.embedded_mi is not None:
        self.log.debug('Creating OPF...')
        ncx = io.BytesIO()
        opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
        opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx, ncx_manifest_entry)
        ncx = ncx.getvalue()
        if ncx:
            write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)

def parse_html(data, log=None, decoder=None, preprocessor=None, filename='<string>', non_html_file_tags=frozenset()):
    if log is None:
        from calibre.utils.logging import default_log
        log = default_log

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, unicode):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)
    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # Process private entities
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    data = raw = clean_word_doc(data, log)

    # Setting huge_tree=True causes crashes in windows with large files
    parser = etree.XMLParser(no_network=True)

    # Try with more & more drastic measures to parse
    try:
        data = etree.fromstring(data, parser=parser)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more'
                  ' forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data, parser=parser)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception('HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == 'HTML' or (len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in list(x.attrib.iteritems()):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except:
                pass

    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML' % filename)
        nroot = etree.fromstring('<html></html>')
        has_body = False
        for child in list(data):
            if isinstance(child.tag, (unicode, str)) and barename(child.tag) == 'body':
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn('File %r appears to be a HTML fragment' % filename)
            nroot = etree.fromstring('<html><body/></html>')
            parent = nroot[0]
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing', filename, 'into XHTML namespace')
        data.attrib['xmlns'] = XHTML_NS
        data = etree.tostring(data, encoding=unicode)
        try:
            data = etree.fromstring(data, parser=parser)
        except:
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
                data = etree.fromstring(data, parser=parser)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s' % filename)
                data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
                data = data.replace("<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", '')
                data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
                try:
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
                except etree.XMLSyntaxError:
                    log.warn('Stripping meta tags from %s' % filename)
                    data = re.sub(r'<meta\s+[^>]+?>', '', data)
                    data = etree.fromstring(data, parser=RECOVER_PARSER)
    elif namespace(data.tag) != XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML('html'), nsmap={None: XHTML_NS}, attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, basestring) and namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    # Remove non default prefixes referring to the XHTML namespace
    data = ensure_namespace_prefixes(data, {None: XHTML_NS})

    data = merge_multiple_html_heads_and_bodies(data, log)
    # Ensure has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    # Ensure <title> is not empty
    title = xpath(data, '/h:html/h:head/h:title')[0]
    if not title.text or not title.text.strip():
        title.text = _('Unknown')
    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML('meta'), attrib={'http-equiv': 'Content-Type'})
    meta.set('content', 'text/html; charset=utf-8')  # Ensure content is second attribute

    # Ensure has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))

    # Remove microsoft office markup
    r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
    for x in r:
        x.tag = XHTML('span')

    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail

    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <b>, <u> and <i> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle
    # them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')

    # Remove any stray text in the <head> section and format it nicely
    data.text = '\n '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n '
        head.tail = '\n '
        for child in head:
            child.tail = '\n '
        child.tail = '\n '

    return data

def generate_html(comments):
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='',
                searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                )
    for key in mi.custom_field_keys():
        m = mi.get_user_metadata(key, False) or {}
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            dkey = key.replace('#', '_')
            dt = m.get('datatype')
            if dt == 'series':
                args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
            elif dt == 'rating':
                args[dkey] = rating_to_stars(mi.get(key),
                        m.get('display', {}).get('allow_half_stars', False))
            elif dt == 'comments':
                val = val or ''
                display = m.get('display', {})
                ctype = display.get('interpret_as') or 'html'
                if ctype == 'long-text':
                    val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                elif ctype == 'short-text':
                    val = '<span>%s</span>' % escape(val)
                elif ctype == 'markdown':
                    val = markdown(val)
                else:
                    val = comments_to_html(val)
                args[dkey] = val
            else:
                args[dkey] = escape(val)
            args[dkey+'_label'] = escape(display_name)
        except Exception:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith('_') and not key.endswith('_label'):
                print(" %s: %s" % ('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class': 'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class': 'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class': 'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))
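The post-processing pass above works by detaching the empty jacket rows with BeautifulSoup's extract(). A standalone sketch of that pattern, using the pip-installable bs4 as a stand-in for the BeautifulSoup build calibre bundles:

from bs4 import BeautifulSoup  # stand-in for calibre's bundled BeautifulSoup

html = '<div><p class="cbj_series"></p><p class="cbj_tags">fiction</p></div>'
soup = BeautifulSoup(html, 'html.parser')

tag = soup.find(attrs={'class': 'cbj_series'})
if tag is not None:
    tag.extract()  # detach the empty header item from the tree

print(soup)  # <div><p class="cbj_tags">fiction</p></div>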
def generate_html(comments):
    args = dict(xmlns=XHTML_NS,
                title_str=title_str,
                css=css,
                title=title,
                author=author,
                publisher=publisher,
                pubdate_label=_('Published'), pubdate=pubdate,
                series_label=_('Series'), series=series,
                rating_label=_('Rating'), rating=rating,
                tags_label=_('Tags'), tags=tags,
                comments=comments,
                footer='',
                searchable_tags=' '.join(escape(t)+'ttt' for t in tags.tags_list),
                )
    for key in mi.custom_field_keys():
        m = mi.get_user_metadata(key, False) or {}
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            dkey = key.replace('#', '_')
            dt = m.get('datatype')
            if dt == 'series':
                args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
            elif dt == 'rating':
                args[dkey] = rating_to_stars(mi.get(key),
                        m.get('display', {}).get('allow_half_stars', False))
            elif dt == 'comments':
                val = val or ''
                display = m.get('display', {})
                ctype = display.get('interpret_as') or 'html'
                if ctype == 'long-text':
                    val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                elif ctype == 'short-text':
                    val = '<span>%s</span>' % escape(val)
                elif ctype == 'markdown':
                    val = markdown(val)
                else:
                    val = comments_to_html(val)
                args[dkey] = val
            else:
                args[dkey] = escape(val)
            args[dkey+'_label'] = escape(display_name)
        except Exception:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith('_') and not key.endswith('_label'):
                print(" %s: %s" % ('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    has_data['series'] = bool(series)
    has_data['tags'] = bool(tags)
    has_data['rating'] = bool(rating)
    has_data['pubdate'] = bool(pubdate)

    return strip_encoding_declarations(generated_html)
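This revision drops the DOM surgery entirely: it only records which jacket fields are empty in a has_data dict from the enclosing scope and leaves removal to the caller. A hypothetical caller-side sketch of that intent (the real caller in calibre may handle the flags differently, e.g. in the template itself):

# Hypothetical caller, illustrating the intent of the has_data flags.
has_data = {}                    # closed over by generate_html above
html = generate_html(comments)

# Hide the jacket rows for empty fields instead of editing the markup.
# The .cbj_* class names match those used in the snippets above.
extra_css = '\n'.join(
    '.cbj_%s { display: none; }' % field
    for field, present in has_data.items() if not present)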
def generate_html(comments):
    args = dict(
        xmlns=XHTML_NS,
        title_str=title_str,
        css=css,
        title=title,
        author=author,
        publisher=publisher,
        pubdate_label=_('Published'),
        pubdate=pubdate,
        series_label=_('Series'),
        series=series,
        rating_label=_('Rating'),
        rating=rating,
        tags_label=_('Tags'),
        tags=tags,
        comments=comments,
        footer='',
        searchable_tags=' '.join(
            escape(t) + 'ttt' for t in tags.tags_list),
    )
    for key in mi.custom_field_keys():
        m = mi.get_user_metadata(key, False) or {}
        try:
            display_name, val = mi.format_field_extended(key)[:2]
            dkey = key.replace('#', '_')
            dt = m.get('datatype')
            if dt == 'series':
                args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
            elif dt == 'rating':
                args[dkey] = rating_to_stars(
                    mi.get(key),
                    m.get('display', {}).get('allow_half_stars', False))
            else:
                args[dkey] = escape(val)
            args[dkey + '_label'] = escape(display_name)
        except Exception:
            # if the val (custom column contents) is None, don't add to args
            pass

    if False:
        print("Custom column values available in jacket template:")
        for key in args.keys():
            if key.startswith('_') and not key.endswith('_label'):
                print(" %s: %s" % ('#' + key[1:], args[key]))

    # Used in the comment describing use of custom columns in templates
    # Don't change this unless you also change it in template.xhtml
    args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
    args['_genre'] = args.get('_genre', '{_genre}')

    formatter = SafeFormatter()
    generated_html = formatter.format(template, **args)

    # Post-process the generated html to strip out empty header items
    soup = BeautifulSoup(generated_html)
    if not series:
        series_tag = soup.find(attrs={'class': 'cbj_series'})
        if series_tag is not None:
            series_tag.extract()
    if not rating:
        rating_tag = soup.find(attrs={'class': 'cbj_rating'})
        if rating_tag is not None:
            rating_tag.extract()
    if not tags:
        tags_tag = soup.find(attrs={'class': 'cbj_tags'})
        if tags_tag is not None:
            tags_tag.extract()
    if not pubdate:
        pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
        if pubdate_tag is not None:
            pubdate_tag.extract()
    if output_profile.short_name != 'kindle':
        hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
        if hr_tag is not None:
            hr_tag.extract()

    return strip_encoding_declarations(
        soup.renderContents('utf-8').decode('utf-8'))
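All three revisions format the jacket through a SafeFormatter so that placeholders the metadata doesn't supply cannot crash str.format; that is also why _genre defaults to the literal string '{_genre}', keeping the placeholder visible in the documentation comment in template.xhtml. A minimal stand-in sketch (calibre's actual SafeFormatter may behave differently):

import string

class SafeFormatter(string.Formatter):
    """Like str.format, but unknown fields yield a fallback value instead
    of raising KeyError. A minimal stand-in, not calibre's implementation."""

    def get_value(self, key, args, kwargs):
        try:
            return super(SafeFormatter, self).get_value(key, args, kwargs)
        except (KeyError, IndexError):
            return 'Unknown'

fmt = SafeFormatter()
print(fmt.format('{title} ({_genre})', title='Dune'))
# -> 'Dune (Unknown)' -- no KeyError for the missing _genre field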