def test_polish_parser(self):
    ' Test parsing with the HTML5 parser used for polishing '
    for test in basic_checks:
        test(self, parse)

    # Namespaced content inside <svg> must be hoisted out when namespaces
    # are discarded
    root = parse('<html><p><svg><image /><b></svg> \n<b>xxx', discard_namespaces=True)
    self.assertTrue(root.xpath('//b'), 'Namespaces not discarded')
    self.assertFalse(root.xpath('//svg/b'), 'The <b> was not moved out of <svg>')

    # Source line numbers must be tracked correctly both with and without
    # namespace discarding. NOTE: .items() replaces the Python-2-only
    # .iteritems(), matching the Python 3 style used elsewhere in this file.
    for ds in (False, True):
        src = '\n<html>\n<p>\n<svg><image />\n<b></svg> '
        root = parse(src, discard_namespaces=ds)
        for tag, lnum in {'html':2, 'head':3, 'body':3, 'p':3, 'svg':4, 'image':4, 'b':5}.items():
            elem = root.xpath('//*[local-name()="%s"]' % tag)[0]
            self.assertEqual(lnum, elem.sourceline, 'Line number incorrect for %s, source: %s:' % (tag, src))

    # Attribute order must be preserved by the parser
    for ds in (False, True):
        src = '\n<html>\n<p b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8><svg b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8>\n'
        root = parse(src, discard_namespaces=ds)
        for tag in ('p', 'svg'):
            for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()):
                self.assertEqual(i+1, int(v))

    # The xml namespace declaration must be stripped, other extra namespace
    # declarations must survive
    root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:xml="http://www.w3.org/XML/1998/namespace"><body/></html>')
    self.assertNotIn('xmlnsU0003Axml', root.attrib, 'xml namespace declaration not removed')
    root = parse('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" xmlns:extra="extra"><body/></html>')
    self.assertIn('extra', root.nsmap, 'Extra namespace declaration on <html> tag not preserved')
def search(self, text, index, backwards=False):
    '''Return the spine index of the first file beyond *index* (in the given
    direction) whose rendered text contains *text*, or None if not found.'''
    from calibre.ebooks.oeb.polish.parsing import parse
    candidates = list(enumerate(self.spine))
    if backwards:
        candidates.reverse()
    needle = text.lower()
    for pos, path in candidates:
        # Only consider files strictly past the starting index in the
        # direction of travel
        if (pos >= index) if backwards else (pos <= index):
            continue
        with open(path, 'rb') as f:
            markup = f.read().decode(path.encoding)
        root = parse(markup)
        chunks = []

        def collect(elem):
            # Accumulate lower-cased text/tail, skipping subtrees that are
            # not rendered as searchable text
            if elem.text:
                chunks.append(elem.text.lower())
            if elem.tail:
                chunks.append(elem.tail.lower())
            for child in elem.iterchildren():
                tag = getattr(child, 'tag', None)
                # Comments/PIs have non-string tags, hence the rpartition check
                if hasattr(tag, 'rpartition') and tag.rpartition('}')[-1] not in {'script', 'style', 'del'}:
                    collect(child)
                elif getattr(child, 'tail', None):
                    chunks.append(child.tail.lower())

        for body in root.xpath('//*[local-name() = "body"]'):
            body.tail = None
            collect(body)
        if needle in ''.join(chunks):
            return pos
def beautify_text(raw, syntax):
    '''Pretty-print source text according to *syntax* ('xml', 'css' or HTML).

    Returns unicode text. For CSS the text is re-serialized via cssutils;
    for XML/HTML a parsed tree is prettified and re-serialized.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations
    if syntax == 'xml':
        # Encoding declarations must be stripped before etree will accept
        # already-decoded unicode text
        root = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(root)
    elif syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_cssutils_serialization
        from cssutils import CSSParser, log
        # NOTE: this mutates global cssutils state (serializer prefs and log
        # level) — order of these calls matters
        setup_cssutils_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        parser = CSSParser(loglevel=logging.WARNING,
                           # We dont care about @import rules
                           fetcher=lambda x: (None, None),
                           log=_css_logger)
        data = parser.parseString(raw, href='<string>', validate=False)
        return serialize(data, 'text/css')
    else:
        # Anything else is treated as HTML
        root = parse(raw, line_numbers=False)
        pretty_html_tree(None, root)
    # encoding=unicode makes lxml return text rather than bytes (Python 2 era)
    return etree.tostring(root, encoding=unicode)
def beautify_text(raw, syntax):
    '''Pretty-print source text according to *syntax* ('xml', 'css' or HTML).

    Returns unicode text. CSS is re-serialized through css_parser; XML/HTML
    is parsed, prettified in-tree and re-serialized.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations
    if syntax == 'xml':
        # Strip encoding declarations: etree rejects them in unicode input
        root = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(root)
    elif syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_css_parser_serialization
        from css_parser import CSSParser, log
        # NOTE: mutates global css_parser state (serializer prefs, log level)
        setup_css_parser_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        parser = CSSParser(loglevel=logging.WARNING,
                           # We dont care about @import rules
                           fetcher=lambda x: (None, None),
                           log=_css_logger)
        data = parser.parseString(raw, href='<string>', validate=False)
        return serialize(data, 'text/css')
    else:
        root = parse(raw, line_numbers=False)
        pretty_html_tree(None, root)
    # NOTE(review): `unicode` is a Python-2 builtin; on Python 3 this needs a
    # module-level alias (e.g. unicode = str) to avoid a NameError — confirm
    # against the file's imports
    return etree.tostring(root, encoding=unicode)
def test_cfi_decode(self):
    'Exercise decoding of EPUB CFI expressions against a known document.'
    from calibre.ebooks.oeb.polish.parsing import parse
    root = parse('''
<html>
<head></head>
<body id="body01">
<p>…</p>
<p>…</p>
<p>…</p>
<p>…</p>
<p id="para05">xxx<em>yyy</em>0123456789</p>
<p>…</p>
<p>…</p>
<img id="svgimg" src="foo.svg" alt="…"/>
<p>…</p>
<p><span>hello</span><span>goodbye</span>text here<em>adieu</em>text there</p>
</body>
</html>
''', line_numbers=True, linenumber_attribute='data-lnum')
    body = root[-1]

    def check(cfi, expected):
        # The decoded CFI must resolve to the exact element object
        self.assertIs(decode_cfi(root, cfi), expected)

    # Correct, assertion-carrying and wrong-number forms all hit <body>
    for cfi in ('/4', '/4[body01]', '/900[body01]', '/2[body01]'):
        check(cfi, body)
    # Even child indices address the element children in order
    for idx, child in enumerate(body):
        check('/4/{}'.format((idx + 1) * 2), child)
    para = body[4]
    # A wrong index with a correct id assertion resolves via the id
    check('/4/999[para05]', para)
    check('/4/999[para05]/2', para[0])
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    '''
    Create an empty book in the specified format at the specified location.

    :param mi: Metadata object supplying title/authors/language
    :param path: Absolute or relative output file path
    :param fmt: 'epub' (default) or 'azw3'
    '''
    path = os.path.abspath(path)
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    # Use the first non-empty <dc:language> as the book language
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)

    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
'''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')

    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)
    ).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)
    ).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors))
    ).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8', xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)

    if fmt == 'azw3':
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            # mimetype must be the first, uncompressed entry per the EPUB spec
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            # FIX: use the 0o755 octal literal — the legacy 0755 form is a
            # SyntaxError on Python 3 (and 0o755 is valid on Python 2.6+ too)
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
def search(self, text, index, backwards=False):
    'Locate the next spine item containing *text*, scanning from *index* in the given direction.'
    from calibre.ebooks.oeb.polish.parsing import parse
    order = list(enumerate(self.spine))
    if backwards:
        order = order[::-1]
    query = text.lower()

    def text_of(elem):
        # Generate lower-cased text fragments, skipping script/style/del
        # subtrees (comments and PIs have non-string tags)
        if elem.text:
            yield elem.text.lower()
        if elem.tail:
            yield elem.tail.lower()
        for child in elem.iterchildren():
            tag = getattr(child, 'tag', None)
            if hasattr(tag, 'rpartition') and tag.rpartition('}')[-1] not in {'script', 'style', 'del'}:
                for frag in text_of(child):
                    yield frag
            elif getattr(child, 'tail', None):
                yield child.tail.lower()

    for i, path in order:
        if (backwards and i < index) or (not backwards and i > index):
            with open(path, 'rb') as f:
                raw = f.read().decode(path.encoding)
            root = parse(raw)
            pieces = []
            for body in root.xpath('//*[local-name() = "body"]'):
                body.tail = None
                pieces.extend(text_of(body))
            if query in ''.join(pieces):
                return i
def test_cfi_decode(self):
    'Verify that CFI expressions resolve to the expected elements.'
    from calibre.ebooks.oeb.polish.parsing import parse
    root = parse('''
<html>
<head></head>
<body id="body01">
<p>…</p>
<p>…</p>
<p>…</p>
<p>…</p>
<p id="para05">xxx<em>yyy</em>0123456789</p>
<p>…</p>
<p>…</p>
<img id="svgimg" src="foo.svg" alt="…"/>
<p>…</p>
<p><span>hello</span><span>goodbye</span>text here<em>adieu</em>text there</p>
</body>
</html>
''', line_numbers=True, linenumber_attribute='data-lnum')
    body = root[-1]

    def assert_target(cfi, node):
        self.assertIs(decode_cfi(root, cfi), node)

    # All of these resolve to <body>: bare step, id assertion, and
    # out-of-range steps rescued by the id assertion
    for spec in '/4 /4[body01] /900[body01] /2[body01]'.split():
        assert_target(spec, body)
    # Step 2*(n+1) addresses the n-th element child
    n = 0
    while n < len(body):
        assert_target('/4/{}'.format((n + 1) * 2), body[n])
        n += 1
    p = body[4]
    assert_target('/4/999[para05]', p)
    assert_target('/4/999[para05]/2', p[0])
def parse_html(raw):
    'Parse *raw* HTML with line-number annotations and return it re-serialized as UTF-8 bytes.'
    root = parse(raw, decoder=lambda x: x.decode('utf-8'),
                 line_numbers=True, linenumber_attribute='data-lnum')
    html = serialize(root, 'text/html')
    # serialize() may yield text or bytes; normalize to bytes
    return html if isinstance(html, bytes) else html.encode('utf-8')
def convert_epub3_nav(self, nav_path, opf, log):
    '''Generate an NCX table of contents from an EPUB 3 nav document.

    Reads the nav file at *nav_path*, extracts its epub:type="toc" list,
    builds an equivalent NCX tree, writes it next to the nav file and
    registers it in *opf*'s manifest/spine. Returns None; does nothing if
    no toc nav element is found.
    '''
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX
    from calibre.ebooks.oeb.polish.toc import first_child
    from tempfile import NamedTemporaryFile
    with lopen(nav_path, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    root = parse(raw, log=log)
    ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
    navmap = ncx[0]
    et = '{%s}type' % EPUB_NS  # qualified epub:type attribute name
    bn = os.path.basename(nav_path)

    def add_from_li(li, parent):
        # Convert one <li> into a navPoint under *parent*; label text comes
        # from the first <a>/<span>, falling back to descendant @title values
        href = text = None
        for x in li.iterchildren(XHTML('a'), XHTML('span')):
            text = etree.tostring(x, method='text', encoding=unicode, with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
            href = x.get('href')
            if href:
                # Bare fragment links are relative to the nav file itself
                if href.startswith('#'):
                    href = bn + href
                break
        np = parent.makeelement(NCX('navPoint'))
        parent.append(np)
        np.append(np.makeelement(NCX('navLabel')))
        np[0].append(np.makeelement(NCX('text')))
        np[0][0].text = text
        if href:
            np.append(np.makeelement(NCX('content'), attrib={'src':href}))
        return np

    def process_nav_node(node, toc_parent):
        # Recursively mirror the nested <ol>/<li> structure into navPoints
        for li in node.iterchildren(XHTML('li')):
            child = add_from_li(li, toc_parent)
            ol = first_child(li, XHTML('ol'))
            if child is not None and ol is not None:
                process_nav_node(ol, child)

    for nav in root.iterdescendants(XHTML('nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, XHTML('ol'))
            if ol is not None:
                process_nav_node(ol, navmap)
                break
    else:
        # No toc nav found: nothing to convert
        return

    # Persist the NCX next to the nav document and hook it into the OPF
    with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
        f.write(etree.tostring(ncx, encoding='utf-8'))
    ncx_id = opf.add_path_to_manifest(f.name, NCX_MIME)
    for spine in opf.root.xpath('//*[local-name()="spine"]'):
        spine.set('toc', ncx_id)
def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
    'Tally words found in HTML markup that is stored escaped inside *text*.'
    markup = '<html><body><div>%s</div></body></html>' % replace_entities(text)
    root = parse(markup, decoder=lambda x: x.decode('utf-8'))
    found = defaultdict(list)
    found[None] = 0  # slot for the total word count
    read_words_from_html(root, found, file_name, locale)
    words[None] += found.pop(None)
    for word, locations in iteritems(found):
        # Re-point each location at the original node/attribute the escaped
        # markup came from
        for location in locations:
            location.location_node = node
            location.node_item = (False, attr)
        words[word].extend(locations)
def test_polish_parser(self):
    ' Test parsing with the HTML5 parser used for polishing '
    for test in basic_checks:
        test(self, parse)

    # Content illegally nested in <svg> must be hoisted out when namespaces
    # are discarded
    root = parse('<html><p><svg><image /><b></svg> \n<b>xxx', discard_namespaces=True)
    self.assertTrue(root.xpath('//b'), 'Namespaces not discarded')
    self.assertFalse(root.xpath('//svg/b'), 'The <b> was not moved out of <svg>')

    # FIX: .items() replaces the Python-2-only .iteritems(), matching the
    # Python 3 style used by other blocks in this file
    for ds in (False, True):
        src = '\n<html>\n<p>\n<svg><image />\n<b></svg> '
        root = parse(src, discard_namespaces=ds)
        for tag, lnum in {'html':2, 'head':3, 'body':3, 'p':3, 'svg':4, 'image':4, 'b':5}.items():
            elem = root.xpath('//*[local-name()="%s"]' % tag)[0]
            self.assertEqual(lnum, elem.sourceline, 'Line number incorrect for %s, source: %s:' % (tag, src))

    # Attribute order must be preserved by the parser
    for ds in (False, True):
        src = '\n<html>\n<p b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8><svg b=1 a=2 c=3 d=4 e=5 f=6 g=7 h=8>\n'
        root = parse(src, discard_namespaces=ds)
        for tag in ('p', 'svg'):
            for i, (k, v) in enumerate(root.xpath('//*[local-name()="%s"]' % tag)[0].items()):
                self.assertEqual(i+1, int(v))
def find_first_matching_rule(container, html_file_name, raw_html, class_data, lnum_attr='data-lnum'):
    '''Find the first CSS rule in the book that applies *class_data['class']*
    to the tag identified by *class_data['sourceline_address']* in
    *raw_html*.

    Searches inline <style> tags and linked stylesheets in document order.
    Raises NoMatchingTagFound if the addressed tag is missing and
    NoMatchingRuleFound if no applicable rule exists.
    '''
    lnum, tags = class_data['sourceline_address']
    class_name = class_data['class']
    # Re-parse with line-number attributes so the source address can be
    # resolved back to an element
    root = parse(raw_html, decoder=lambda x: x.decode('utf-8'), line_numbers=True, linenumber_attribute=lnum_attr)
    tags_on_line = root.xpath(f'//*[@{lnum_attr}={lnum}]')
    barenames = [barename(tag.tag) for tag in tags_on_line]
    # The recorded tag-name prefix must match what is actually on that line
    if barenames[:len(tags)] != tags:
        raise NoMatchingTagFound(
            f'No tag matching the specification was found in {html_file_name}')
    target_elem = tags_on_line[len(tags) - 1]
    select = Select(root, ignore_inappropriate_pseudo_classes=True)
    for tag in root.iter('*'):
        tn = barename(tag.tag)
        if tn == 'style' and tag.text and tag.get('type', 'text/css') == 'text/css':
            # Inline stylesheet: parse and search its rules
            try:
                sheet = container.parse_css(tag.text)
            except Exception:
                continue  # ignore malformed stylesheets
            res = find_first_rule_that_matches_elem(container, target_elem, select, class_name, sheet.cssRules, html_file_name)
            if res is not None:
                # Record where the containing <style> tag lives in the HTML
                return res._replace(style_tag_address=(int(tag.get(lnum_attr)), ['style']))
        elif tn == 'link' and tag.get('href') and tag.get('rel') == 'stylesheet':
            # Linked stylesheet: resolve to a book-internal name and reuse
            # the container's cached parse
            sname = container.href_to_name(tag.get('href'), html_file_name)
            try:
                sheet = container.parsed(sname)
            except Exception:
                continue
            if not hasattr(sheet, 'cssRules'):
                continue  # not actually CSS
            res = find_first_rule_that_matches_elem(container, target_elem, select, class_name, sheet.cssRules, sname)
            if res is not None:
                return res
    raise NoMatchingRuleFound(
        f'No CSS rules that apply to the specified tag in {html_file_name} with the class {class_name} found'
    )
def complete_anchor(name, data_conn):
    'Return (sorted anchor names, anchor map, {}) for the named file, caching parses.'
    if name not in file_cache:
        raw = get_data(data_conn, 'file_data', name)
        entry = raw
        if isinstance(raw, unicode_type):
            try:
                root = parse(raw, decoder=lambda x: x.decode('utf-8'))
            except Exception:
                # Unparseable markup: cache the raw text so we do not retry
                pass
            else:
                entry = (root, create_anchor_map(root))
        file_cache[name] = entry
    entry = file_cache[name]
    if isinstance(entry, tuple) and len(entry) > 1 and isinstance(entry[1], dict):
        anchor_map = entry[1]
        names = tuple(sorted(frozenset(anchor_map), key=numeric_sort_key))
        return names, anchor_map, {}
def complete_anchor(name, data_conn):
    '''Return (anchor names, anchor map, {}) for the named file.

    Parse results are cached in file_cache; on parse failure the raw data is
    cached instead so the file is not re-parsed on every call.
    '''
    if name not in file_cache:
        data = raw = get_data(data_conn, 'file_data', name)
        # FIX: isinstance(raw, str) replaces the obscure isinstance(raw,
        # type('')) — identical check, idiomatic spelling
        if isinstance(raw, str):
            try:
                root = parse(raw, decoder=lambda x:x.decode('utf-8'))
            except Exception:
                pass  # leave `data` as the raw text
            else:
                data = (root, create_anchor_map(root))
        file_cache[name] = data
    data = file_cache[name]
    if isinstance(data, tuple) and len(data) > 1 and isinstance(data[1], dict):
        return frozenset(data[1]), data[1], {}
def render_jacket(mi, output_profile, alt_title=_('Unknown'), alt_tags=[],
                  alt_comments='', alt_publisher='', rescale_fonts=False, alt_authors=None):
    '''Render a metadata "jacket" page for the book described by *mi*.

    Fills the jacket XHTML template with title, authors, publisher, series,
    rating, tags, comments and custom-column values, then returns the parsed
    and prettified lxml tree. The alt_* parameters supply fallbacks for
    missing metadata.

    NOTE(review): alt_tags=[] is a mutable default argument; it is only read
    here, never mutated, so it is harmless — but worth confirming before
    relying on it.
    '''
    css = P('jacket/stylesheet.css', data=True).decode('utf-8')
    template = P('jacket/template.xhtml', data=True).decode('utf-8')
    # Strip comments from the template/stylesheet before filling them in
    template = re.sub(r'<!--.*?-->', '', template, flags=re.DOTALL)
    css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)

    try:
        title_str = alt_title if mi.is_null('title') else mi.title
    except:
        title_str = _('Unknown')
    title_str = escape(title_str)
    title = '<span class="title">%s</span>' % title_str

    series = Series(mi.series, mi.series_index)
    try:
        publisher = mi.publisher if not mi.is_null('publisher') else alt_publisher
    except:
        publisher = ''
    publisher = escape(publisher)

    try:
        if is_date_undefined(mi.pubdate):
            pubdate = ''
        else:
            # Only the year is shown on the jacket
            dt = as_local_time(mi.pubdate)
            pubdate = strftime('%Y', dt.timetuple())
    except:
        pubdate = ''

    rating = get_rating(mi.rating, output_profile.ratings_char, output_profile.empty_ratings_char)

    tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)

    comments = mi.comments if mi.comments else alt_comments
    comments = comments.strip()
    if comments:
        comments = comments_to_html(comments)

    # Temporarily substitute fallback authors so format_authors() works,
    # then restore the original value — mi is shared with the caller
    orig = mi.authors
    if mi.is_null('authors'):
        mi.authors = list(alt_authors or (_('Unknown'),))
    try:
        author = mi.format_authors()
    except:
        author = ''
    mi.authors = orig
    author = escape(author)
    has_data = {}

    def generate_html(comments):
        # Build the template substitution dict and render the jacket markup
        display = Attributes()
        args = dict(
            xmlns=XHTML_NS,
            title_str=title_str,
            identifiers=Identifiers(mi.identifiers),
            css=css,
            title=title,
            author=author,
            publisher=publisher,
            pubdate_label=_('Published'),
            pubdate=pubdate,
            series_label=ngettext('Series', 'Series', 1),
            series=series,
            rating_label=_('Rating'),
            rating=rating,
            tags_label=_('Tags'),
            tags=tags,
            comments=comments,
            footer='',
            display=display,
            # 'ttt' suffix marks tag tokens for searching
            searchable_tags=' '.join(
                escape(t) + 'ttt' for t in tags.tags_list),
        )
        # Expose custom column values as _<name> / _<name>_label entries
        for key in mi.custom_field_keys():
            m = mi.get_user_metadata(key, False) or {}
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                dkey = key.replace('#', '_')
                dt = m.get('datatype')
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(
                        mi.get(key), m.get('display', {}).get('allow_half_stars', False))
                elif dt == 'comments':
                    val = val or ''
                    ctype = m.get('display', {}).get('interpret_as') or 'html'
                    if ctype == 'long-text':
                        val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                    elif ctype == 'short-text':
                        val = '<span>%s</span>' % escape(val)
                    elif ctype == 'markdown':
                        val = markdown(val)
                    else:
                        val = comments_to_html(val)
                    args[dkey] = val
                else:
                    args[dkey] = escape(val)
                args[dkey + '_label'] = escape(display_name)
                setattr(display, dkey, 'none' if mi.is_null(key) else 'initial')
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass
        if False:
            # Debugging aid: list the custom-column template variables
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith('_') and not key.endswith('_label'):
                    print(" {}: {}".format('#' + key[1:], args[key]))
        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')
        has_data['series'] = bool(series)
        has_data['tags'] = bool(tags)
        has_data['rating'] = bool(rating)
        has_data['pubdate'] = bool(pubdate)
        for k, v in has_data.items():
            setattr(display, k, 'initial' if v else 'none')
        display.title = 'initial'
        if mi.identifiers:
            display.identifiers = 'initial'

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        return strip_encoding_declarations(generated_html)

    from calibre.ebooks.oeb.polish.parsing import parse
    raw = generate_html(comments)
    root = parse(raw, line_numbers=False, force_html5_parse=True)

    if rescale_fonts:
        # We ensure that the conversion pipeline will set the font sizes for
        # text in the jacket to the same size as the font sizes for the rest of
        # the text in the book. That means that as long as the jacket uses
        # relative font sizes (em or %), the post conversion font size will be
        # the same as for text in the main book. So text with size x em will
        # be rescaled to the same value in both the jacket and the main content.
        #
        # We cannot use data-calibre-rescale 100 on the body tag as that will just
        # give the body tag a font size of 1em, which is useless.
        for body in root.xpath('//*[local-name()="body"]'):
            fw = body.makeelement(XHTML('div'))
            fw.set('data-calibre-rescale', '100')
            for child in body:
                fw.append(child)
            body.append(fw)

    postprocess_jacket(root, output_profile, has_data)

    from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
    pretty_html_tree(None, root)
    return root
def test_lxml_tostring(self):
    ' Test for bug in some versions of lxml that causes incorrect serialization of sub-trees'
    from html5_parser import parse
    tree = parse('<p>a<p>b<p>c')
    first_p = tree.xpath('//p')[0]
    # Serializing just the first <p> must not drag in its siblings
    serialized = etree.tostring(first_p, encoding=str)
    self.assertEqual(serialized, '<p>a</p>')
def parse_html(raw):
    """Parse *raw* HTML, annotating elements with source line numbers in
    data-lnum attributes, and return the document re-serialized as UTF-8
    bytes."""
    root = parse(raw, decoder=lambda x: x.decode("utf-8"), line_numbers=True, linenumber_attribute="data-lnum")
    ans = serialize(root, "text/html")
    # FIX: serialize() may return bytes, on which .encode() would raise
    # AttributeError — only encode when it returned text (matches the
    # defensive parse_html variants elsewhere in this file)
    if not isinstance(ans, bytes):
        ans = ans.encode("utf-8")
    return ans
def convert_epub3_nav(self, nav_path, opf, log, opts):
    '''Create an NCX table of contents from an EPUB 3 nav document and wire
    it into *opf*; also records the nav location/tree on *opts* and marks a
    removed cover titlepage link in the nav, if any.'''
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
    from calibre.ebooks.oeb.polish.toc import first_child
    from tempfile import NamedTemporaryFile
    with lopen(nav_path, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    root = parse(raw, log=log)
    ncx = etree.fromstring(
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>'
    )
    navmap = ncx[0]
    et = '{%s}type' % EPUB_NS  # qualified epub:type attribute name
    bn = os.path.basename(nav_path)

    def add_from_li(li, parent):
        # Convert one <li> into a navPoint; label text comes from the first
        # <a>/<span>, falling back to descendant @title values
        href = text = None
        for x in li.iterchildren(XHTML('a'), XHTML('span')):
            text = etree.tostring(
                x, method='text', encoding=unicode,
                with_tail=False).strip() or ' '.join(
                    x.xpath('descendant-or-self::*/@title')).strip()
            href = x.get('href')
            if href:
                # Bare fragments are relative to the nav file itself
                if href.startswith('#'):
                    href = bn + href
                break
        np = parent.makeelement(NCX('navPoint'))
        parent.append(np)
        np.append(np.makeelement(NCX('navLabel')))
        np[0].append(np.makeelement(NCX('text')))
        np[0][0].text = text
        if href:
            np.append(np.makeelement(NCX('content'), attrib={'src': href}))
        return np

    def process_nav_node(node, toc_parent):
        # Recursively mirror nested <ol>/<li> lists into navPoints
        for li in node.iterchildren(XHTML('li')):
            child = add_from_li(li, toc_parent)
            ol = first_child(li, XHTML('ol'))
            if child is not None and ol is not None:
                process_nav_node(ol, child)

    for nav in root.iterdescendants(XHTML('nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, XHTML('ol'))
            if ol is not None:
                process_nav_node(ol, navmap)
                break
    else:
        # No toc nav element: nothing to convert
        return

    with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
        f.write(etree.tostring(ncx, encoding='utf-8'))
    ncx_href = os.path.relpath(f.name, os.getcwdu()).replace(os.sep, '/')
    ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
    for spine in opf.root.xpath('//*[local-name()="spine"]'):
        spine.set('toc', ncx_id)
    opts.epub3_nav_href = urlnormalize(
        os.path.relpath(nav_path).replace(os.sep, '/'))
    opts.epub3_nav_parsed = root
    if getattr(self, 'removed_cover', None):
        # Tag links pointing at the removed cover titlepage so later stages
        # can skip them, rewriting the nav file if anything changed
        changed = False
        base_path = os.path.dirname(nav_path)
        for elem in root.xpath('//*[@href]'):
            href, frag = elem.get('href').partition('#')[::2]
            link_path = os.path.relpath(
                os.path.join(base_path, urlunquote(href)), base_path)
            abs_href = urlnormalize(link_path)
            if abs_href == self.removed_cover:
                changed = True
                elem.set('data-calibre-removed-titlepage', '1')
        if changed:
            with open(nav_path, 'wb') as f:
                f.write(serialize(root, 'application/xhtml+xml'))
def parse_html(raw):
    'Round-trip *raw* HTML through the parser, tagging elements with data-lnum line numbers.'
    tree = parse(raw, decoder=lambda b: b.decode('utf-8'),
                 line_numbers=True, linenumber_attribute='data-lnum')
    serialized = serialize(tree, 'text/html')
    # Normalize the serializer's output (text or bytes) to UTF-8 bytes
    if isinstance(serialized, bytes):
        return serialized
    return serialized.encode('utf-8')
def parse_html(raw):
    '''Parse *raw* HTML with data-lnum line-number annotations and return
    the re-serialized document as UTF-8 bytes.'''
    root = parse(raw, decoder=lambda x:x.decode('utf-8'), line_numbers=True, linenumber_attribute='data-lnum')
    ans = serialize(root, 'text/html')
    # FIX: serialize() may return bytes; calling .encode() on bytes raises
    # AttributeError, so only encode when it returned text
    if not isinstance(ans, bytes):
        ans = ans.encode('utf-8')
    return ans
def parse_html(raw):
    '''Parse *raw* HTML with data-lnum line-number annotations and return
    the re-serialized document as UTF-8 bytes.'''
    root = parse(raw, decoder=lambda x: x.decode('utf-8'), line_numbers=True, linenumber_attribute='data-lnum')
    ans = serialize(root, 'text/html')
    # FIX: serialize() may return bytes; unconditional .encode() would raise
    # AttributeError in that case — encode only when it returned text
    if not isinstance(ans, bytes):
        ans = ans.encode('utf-8')
    return ans
def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale):
    'Count characters in escaped HTML markup by parsing it inside a wrapper document.'
    unescaped = replace_entities(text)
    wrapper = '<html><body><div>%s</div></body></html>' % unescaped
    root = parse(wrapper, decoder=lambda b: b.decode('utf-8'))
    count_chars_in_html(root, counter, file_name, locale)
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    '''
    Create an empty book in the specified format at the specified location.

    :param mi: Metadata object supplying title/authors/language
    :param path: Output file path
    :param fmt: One of valid_empty_formats ('txt', 'docx', 'epub', 'azw3', ...)
    :raises ValueError: if *fmt* is not a supported empty-book format
    '''
    if fmt not in valid_empty_formats:
        raise ValueError('Cannot create empty book in the %s format' % fmt)
    if fmt == 'txt':
        with open(path, 'wb') as f:
            if not mi.is_null('title'):
                # FIX: encode explicitly — the file is opened in binary mode,
                # so writing text would fail on Python 3 (and relied on
                # implicit ASCII encoding on Python 2)
                f.write(mi.title.encode('utf-8'))
        return
    if fmt == 'docx':
        from calibre.ebooks.conversion.plumber import Plumber
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.utils.logging import default_log
        p = Plumber('a.docx', 'b.docx', default_log)
        p.setup_options()
        # Use the word default of one inch page margins
        for x in 'left right top bottom'.split():
            setattr(p.opts, 'margin_' + x, 72)
        DOCX(p.opts, default_log).write(path, mi, create_empty_document=True)
        return
    path = os.path.abspath(path)
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    # Use the first non-empty <dc:language> as the book language
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)

    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
'''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')

    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)
    ).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)
    ).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors))
    ).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8', xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)

    if fmt == 'azw3':
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            # mimetype must be the first, uncompressed entry per the EPUB spec
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            # FIX: 0o755 octal literal — legacy 0755 is a SyntaxError on
            # Python 3 (the newest create_book variant already uses 0o755)
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
def fix_html(raw):
    'Normalize arbitrary HTML by round-tripping it through the HTML5 parser.'
    return serialize(parse(raw), 'text/html').decode('utf-8')
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    '''
    Create an empty book in the specified format at the specified location.

    :param mi: Metadata object supplying title/authors/language
    :param path: Output file path
    :param fmt: One of valid_empty_formats ('txt', 'docx', 'epub', 'azw3', ...)
    :raises ValueError: if *fmt* is not a supported empty-book format
    '''
    if fmt not in valid_empty_formats:
        raise ValueError('Cannot create empty book in the %s format' % fmt)
    if fmt == 'txt':
        # Plain text: just write the title (as bytes), if any
        with open(path, 'wb') as f:
            if not mi.is_null('title'):
                f.write(as_bytes(mi.title))
        return
    if fmt == 'docx':
        from calibre.ebooks.conversion.plumber import Plumber
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.utils.logging import default_log
        p = Plumber('a.docx', 'b.docx', default_log)
        p.setup_options()
        # Use the word default of one inch page margins
        for x in 'left right top bottom'.split():
            setattr(p.opts, 'margin_' + x, 72)
        DOCX(p.opts, default_log).write(path, mi, create_empty_document=True)
        return
    path = os.path.abspath(path)
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    # Use the first non-empty <dc:language> as the book language
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    opfns = OPF_NAMESPACES['opf']
    # Build a minimal manifest (start page + NCX) and a one-item spine
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)

    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
'''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')

    # Fill in the start-page template and prettify it
    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors))).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8', xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)

    if fmt == 'azw3':
        # Write the pieces to a temp dir and convert them to AZW3
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            # mimetype must be the first, uncompressed entry per the EPUB spec
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
def convert_epub3_nav(self, nav_path, opf, log, opts):
    '''Build an NCX table of contents from an EPUB 3 nav document, register
    it in *opf*, record nav details on *opts*, and mark links to a removed
    cover titlepage in the nav file.'''
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
    from calibre.ebooks.oeb.polish.toc import first_child
    from tempfile import NamedTemporaryFile
    with lopen(nav_path, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    root = parse(raw, log=log)
    ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
    navmap = ncx[0]
    et = '{%s}type' % EPUB_NS  # qualified epub:type attribute name
    bn = os.path.basename(nav_path)

    def add_from_li(li, parent):
        # One <li> becomes one navPoint; label comes from the first <a>/<span>
        # (falling back to descendant @title values)
        href = text = None
        for x in li.iterchildren(XHTML('a'), XHTML('span')):
            text = etree.tostring(x, method='text', encoding=unicode, with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
            href = x.get('href')
            if href:
                # Bare fragment links are relative to the nav file itself
                if href.startswith('#'):
                    href = bn + href
                break
        np = parent.makeelement(NCX('navPoint'))
        parent.append(np)
        np.append(np.makeelement(NCX('navLabel')))
        np[0].append(np.makeelement(NCX('text')))
        np[0][0].text = text
        if href:
            np.append(np.makeelement(NCX('content'), attrib={'src':href}))
        return np

    def process_nav_node(node, toc_parent):
        # Recursively mirror nested <ol>/<li> lists into navPoints
        for li in node.iterchildren(XHTML('li')):
            child = add_from_li(li, toc_parent)
            ol = first_child(li, XHTML('ol'))
            if child is not None and ol is not None:
                process_nav_node(ol, child)

    for nav in root.iterdescendants(XHTML('nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, XHTML('ol'))
            if ol is not None:
                process_nav_node(ol, navmap)
                break
    else:
        # No toc nav element: nothing to convert
        return

    with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
        f.write(etree.tostring(ncx, encoding='utf-8'))
    ncx_href = os.path.relpath(f.name, os.getcwdu()).replace(os.sep, '/')
    ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
    for spine in opf.root.xpath('//*[local-name()="spine"]'):
        spine.set('toc', ncx_id)
    opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
    opts.epub3_nav_parsed = root
    if getattr(self, 'removed_cover', None):
        # Tag links pointing at the removed cover titlepage so later stages
        # can skip them; rewrite the nav file only if something changed
        changed = False
        base_path = os.path.dirname(nav_path)
        for elem in root.xpath('//*[@href]'):
            href, frag = elem.get('href').partition('#')[::2]
            link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
            abs_href = urlnormalize(link_path)
            if abs_href == self.removed_cover:
                changed = True
                elem.set('data-calibre-removed-titlepage', '1')
        if changed:
            with open(nav_path, 'wb') as f:
                f.write(serialize(root, 'application/xhtml+xml'))