def _toc_from_navpoint(self, item, toc, navpoint):
    '''
    Recursively turn the ``ncx:navPoint`` children of `navpoint` into
    entries of `toc`.

    :param item: Object whose ``abshref()`` is used to resolve the
        ``ncx:content/@src`` references found in the navPoints.
    :param toc: Node that receives the new entries via ``toc.add()``.
    :param navpoint: Element whose direct ``ncx:navPoint`` children are read.

    navPoints without a title are skipped and their children are attached to
    `toc` directly. navPoints that have neither a target nor children are
    dropped, as are childless navPoints whose target is not in the manifest.
    '''
    manifest_hrefs = self.oeb.manifest.hrefs
    for child in xpath(navpoint, 'ncx:navPoint'):
        title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
        if not title:
            # Untitled entry: hoist its children into the current level
            self._toc_from_navpoint(item, toc, child)
            continue

        src = xpath(child, 'ncx:content/@src')
        has_target = bool(src and src[0])
        has_children = bool(xpath(child, 'ncx:navPoint'))
        if not has_target and not has_children:
            # This node is useless
            continue

        href = item.abshref(urlnormalize(src[0])) if has_target else ''
        path, _ = urldefrag(href)
        if path and path not in manifest_hrefs:
            # Retry the lookup with the normalized form of the path
            path = urlnormalize(path)
        if href and path not in manifest_hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            if not has_children:
                # This node is useless
                continue

        try:
            po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
        except (TypeError, ValueError):
            # Non-numeric playOrder attribute: fall back to the next free one
            po = self.oeb.toc.next_play_order()

        author_meta = xpath(child, 'descendant::calibre:meta[@name = "author"]')
        author = author_meta[0].text if author_meta else None

        description = None
        description_meta = xpath(
            child, 'descendant::calibre:meta[@name = "description"]')
        if description_meta:
            description = etree.tostring(
                description_meta[0], method='text', encoding=unicode).strip()
            if not description:
                description = None

        thumb_meta = xpath(
            child, 'descendant::calibre:meta[@name = "toc_thumbnail"]')
        toc_thumbnail = thumb_meta[0].text if thumb_meta else None
        if not toc_thumbnail or not toc_thumbnail.strip():
            toc_thumbnail = None

        node = toc.add(
            title, href, id=child.get('id'),
            klass=child.get('class', 'chapter'), play_order=po,
            description=description, author=author,
            toc_thumbnail=toc_thumbnail)
        self._toc_from_navpoint(item, node, child)
def serialize_elem(self, elem, item, nsrmap=NSRMAP):
    '''
    Write `elem` and its subtree to ``self.buf`` as markup.

    Elements whose tag is not a string, or whose namespace is not in
    `nsrmap`, are skipped. An ``id`` attribute is removed from the element
    and its buffer offset is remembered in ``self.id_offsets`` (first
    occurrence wins). ``href`` attributes are handed to
    ``self.serialize_href`` and ``src`` attributes that resolve to a known
    image are written as ``recindex`` references.

    :param elem: The element to serialize.
    :param item: Object providing ``href`` and ``abshref()`` for resolving
        references inside `elem`.
    :param nsrmap: Namespace map used to prefix tag and attribute names.
    '''
    out = self.buf
    raw_tag = elem.tag
    if not isinstance(raw_tag, basestring) or namespace(raw_tag) not in nsrmap:
        return
    tag = prefixname(raw_tag, nsrmap)

    # Previous layers take care of @name
    elem_id = elem.attrib.pop('id', None)
    if elem_id:
        anchor = '#'.join((item.href, elem_id))
        offset = self.anchor_offset or out.tell()
        # Only record an offset for this id if it wasn't previously seen
        self.id_offsets.setdefault(urlnormalize(anchor), offset)

    # An empty <a> directly following a recorded anchor position is dropped
    is_bare_anchor = (
        tag == 'a' and not elem.attrib and not len(elem) and not elem.text)
    if self.anchor_offset is not None and is_bare_anchor:
        return

    self.anchor_offset = out.tell()
    out.write(b'<')
    out.write(tag.encode('utf-8'))
    for attr, val in elem.attrib.items():
        if namespace(attr) not in nsrmap:
            continue
        name = prefixname(attr, nsrmap)
        out.write(b' ')
        if name == 'href' and self.serialize_href(val, item):
            continue
        if name == 'src':
            image_href = urlnormalize(item.abshref(val))
            if image_href in self.images:
                index = self.images[image_href]
                self.used_images.add(image_href)
                out.write(b'recindex="%05d"' % index)
                continue
        out.write(name.encode('utf-8'))
        out.write(b'="')
        self.serialize_text(val, quot=True)
        out.write(b'"')
    out.write(b'>')

    if elem.text:
        # Text output invalidates the pending anchor position
        self.anchor_offset = None
        self.serialize_text(elem.text)
    for child in elem:
        self.serialize_elem(child, item)
        if child.tail:
            self.anchor_offset = None
            self.serialize_text(child.tail)
    out.write(b'</' + tag.encode('utf-8') + b'>')
def __call__(self, oeb, context):
    '''
    Remove from ``oeb.manifest`` every item that is not reachable.

    Reachability starts from items named in the metadata (by href or id),
    items referenced by the guide, and every spine item. Links found in
    document/XML items and URLs found in CSS items are then followed until
    no new manifest items turn up.

    :param oeb: The book object whose manifest is trimmed in place.
    :param context: Stored on ``self.opts``.
    '''
    import cssutils
    oeb.logger.info('Trimming unused files from manifest...')
    self.opts = context
    manifest_hrefs = oeb.manifest.hrefs

    used = set()
    for term in oeb.metadata:
        for entry in oeb.metadata[term]:
            if entry.value in manifest_hrefs:
                used.add(manifest_hrefs[entry.value])
            elif entry.value in oeb.manifest.ids:
                used.add(oeb.manifest.ids[entry.value])
    for ref in oeb.guide.values():
        path, _ = urldefrag(ref.href)
        if path in manifest_hrefs:
            used.add(manifest_hrefs[path])
    # TOC items are required to be in the spine
    used.update(oeb.spine)

    unchecked = set(used)
    while unchecked:
        new = set()
        for item in unchecked:
            is_markup = (item.media_type in OEB_DOCS or
                         item.media_type[-4:] in ('/xml', '+xml'))
            if is_markup and item.data is not None:
                for href in [link[2] for link in iterlinks(item.data)]:
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    try:
                        href = item.abshref(urlnormalize(href))
                    except Exception:
                        # Unresolvable link: ignore it, but let
                        # KeyboardInterrupt/SystemExit propagate
                        continue
                    if href in manifest_hrefs:
                        found = manifest_hrefs[href]
                        if found not in used:
                            new.add(found)
            elif item.media_type == CSS_MIME:
                for href in cssutils.getUrls(item.data):
                    href = item.abshref(urlnormalize(href))
                    if href in manifest_hrefs:
                        found = manifest_hrefs[href]
                        if found not in used:
                            new.add(found)
        used.update(new)
        unchecked = new

    # Iterate over a snapshot: removing items mutates the manifest
    for item in list(oeb.manifest.values()):
        if item not in used:
            oeb.logger.info('Trimming %r from manifest' % item.href)
            oeb.manifest.remove(item)
def handle_embedded_fonts(self):
    '''
    Make sure all fonts are embeddable.

    Every ``@font-face`` rule in every stylesheet of the manifest is
    inspected. The font file it points to is passed through
    ``remove_embed_restriction`` once; if that changes the data, the
    manifest item is updated and the new data is written to the container.
    Rules whose ``src`` cannot be read, or fonts that cannot be processed,
    are skipped.
    '''
    from calibre.ebooks.oeb.base import urlnormalize
    from calibre.utils.fonts.utils import remove_embed_restriction
    processed = set()
    for item in list(self.oeb.manifest):
        if not hasattr(item.data, 'cssRules'):
            continue
        for rule in item.data.cssRules:
            if rule.type != rule.FONT_FACE_RULE:
                continue
            try:
                src = rule.style.getProperty('src').propertyValue[0].uri
            except Exception:
                # No usable src descriptor on this rule
                continue
            path = item.abshref(src)
            ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
            if ff is None or path in processed:
                continue
            processed.add(path)
            raw = ff.data
            try:
                nraw = remove_embed_restriction(raw)
            except Exception:
                # Font could not be processed; leave it untouched
                continue
            if nraw != raw:
                ff.data = nraw
                self.oeb.container.write(path, nraw)
def _spine_add_extra(self):
    '''
    Add documents that are linked from spine documents, but are not
    themselves in the spine, to the spine as non-linear items.

    Links are followed transitively through newly discovered documents. When
    the major version in ``self.oeb.version`` is 2 or more, a warning is
    logged for each added file.
    '''
    manifest = self.oeb.manifest
    spine = self.oeb.spine
    find_links = XPath('h:body//h:a/@href')
    extras = set()
    pending = set(spine)
    while pending:
        discovered = set()
        for doc in pending:
            if doc.media_type not in OEB_DOCS:
                # TODO: handle fallback chains
                continue
            for link in find_links(doc.data):
                target, _ = urldefrag(link)
                if not target:
                    continue
                try:
                    target = doc.abshref(urlnormalize(target))
                except ValueError:
                    # Malformed URL
                    continue
                if target not in manifest.hrefs:
                    continue
                found = manifest.hrefs[target]
                if (found.media_type in OEB_DOCS and found not in spine
                        and found not in extras):
                    discovered.add(found)
        extras.update(discovered)
        pending = discovered

    warn_missing = int(self.oeb.version[0]) >= 2
    for extra in sorted(extras):
        if warn_missing:
            self.logger.warn(
                'Spine-referenced file %r not in spine' % extra.href)
        spine.add(extra, linear=False)
def find_previous_calibre_inline_toc(oeb):
    """
    Return the manifest item the guide's "toc" entry points to, provided its
    body carries the id ``calibre_generated_inline_toc``.

    Returns None when there is no "toc" guide entry, when it does not resolve
    to a manifest item, or when that item is not such a document.
    """
    if "toc" not in oeb.guide:
        return None
    # Drop any fragment before looking the file up in the manifest
    toc_path = oeb.guide["toc"].href.partition("#")[0]
    href = urlnormalize(toc_path)
    if href not in oeb.manifest.hrefs:
        return None
    candidate = oeb.manifest.hrefs[href]
    if not hasattr(candidate.data, "xpath"):
        return None
    if XPath('//h:body[@id="calibre_generated_inline_toc"]')(candidate.data):
        return candidate
    return None
def _toc_from_html(self, opf):
    '''
    Build the TOC from the links in the HTML file that the guide's 'toc'
    entry points to.

    If the guide href has a fragment, only the nearest enclosing element
    (starting at the element with that id or name) that contains links is
    scanned. Links whose target file is not in the manifest are ignored, and
    the titles of several links to the same href are joined with spaces.

    :param opf: Not used by this method.
    :return: False if the guide has no 'toc' entry, True otherwise.
    '''
    guide = self.oeb.guide
    if 'toc' not in guide:
        return False
    self.log.debug('Reading TOC from HTML...')
    itempath, frag = urldefrag(guide['toc'].href)
    item = self.oeb.manifest.hrefs[itempath]
    scope = item.data
    if frag:
        matches = (xpath(scope, './/*[@id="%s"]' % frag) or
                   xpath(scope, './/*[@name="%s"]' % frag))
        node = matches[0] if matches else scope
        # Walk up until an element that actually contains links is found
        while node != scope and not xpath(node, './/h:a[@href]'):
            node = node.getparent()
        scope = node

    titles = defaultdict(list)
    order = []
    for anchor in xpath(scope, './/h:a[@href]'):
        target = item.abshref(urlnormalize(anchor.attrib['href']))
        path, _frag = urldefrag(target)
        if path not in self.oeb.manifest.hrefs:
            continue
        text = COLLAPSE_RE.sub(' ', xml2text(anchor).strip())
        if target not in titles:
            order.append(target)
        titles[target].append(text)

    toc = self.oeb.toc
    for target in order:
        toc.add(' '.join(titles[target]), target)
    return True
def serialize_href(self, href, base=None):
    '''
    Serialize the href attribute of an <a> or <reference> tag.

    It is written to self.buf as ``filepos=`` followed by ten zeros, and the
    buffer position of those zeros is appended to self.href_offsets under
    the resolved href so that the correct value can be filled in at the end.

    :param href: The link target to serialize.
    :param base: Optional object whose ``abshref()`` resolves relative paths
        and whose ``href`` is used when `href` has no path component.
    :return: True if a placeholder was written, False if the URL cannot be
        parsed, is not in the manifest, or its item has no spine position.
    '''
    manifest_hrefs = self.oeb.manifest.hrefs
    try:
        path, frag = urldefrag(urlnormalize(href))
    except ValueError:
        # Unparseable URL
        return False
    if path and base:
        path = base.abshref(path)

    target = None
    if path:
        if path not in manifest_hrefs:
            return False
        target = manifest_hrefs[path]
    if target and target.spine_position is None:
        return False

    resolved = target.href if target else base.href
    key = '#'.join((resolved, frag)) if frag else resolved
    out = self.buf
    out.write(b'filepos=')
    self.href_offsets[key].append(out.tell())
    out.write(b'0000000000')
    return True
def __call__(self, oeb, opts):
    '''
    Rewrite links in every manifest item, in the guide and in the TOC.

    Element-tree items are processed with ``rewrite_links`` and items that
    expose ``cssText`` with ``cssutils.replaceUrls``, both using
    ``self.url_replacer``. Guide references whose file part is a key of
    ``self.rename_map`` are pointed at the mapped name, keeping any fragment.

    :param oeb: The book object; stored on ``self.oeb``.
    :param opts: Stored on ``self.opts``.
    '''
    import cssutils
    self.log = oeb.logger
    self.opts = opts
    self.oeb = oeb

    for item in oeb.manifest.items:
        # url_replacer is expected to consult self.current_item
        self.current_item = item
        if etree.iselement(item.data):
            rewrite_links(self.current_item.data, self.url_replacer)
        elif hasattr(item.data, 'cssText'):
            cssutils.replaceUrls(item.data, self.url_replacer)

    if self.oeb.guide:
        for ref in self.oeb.guide.values():
            path, frag = urldefrag(urlnormalize(ref.href))
            renamed = self.rename_map.get(path, None)
            if renamed is None:
                continue
            if frag:
                renamed += '#' + frag
            ref.href = renamed

    if self.oeb.toc:
        self.fix_toc_entry(self.oeb.toc)
def find_previous_calibre_inline_toc(oeb):
    '''
    Look up the document referenced by the guide's 'toc' entry and return
    its manifest item if the document's body has the id
    ``calibre_generated_inline_toc``; otherwise return None.
    '''
    guide = oeb.guide
    if 'toc' not in guide:
        return None
    # The guide href may carry a fragment; the manifest is keyed by file
    file_part, _sep, _frag = guide['toc'].href.partition('#')
    href = urlnormalize(file_part)
    hrefs = oeb.manifest.hrefs
    if href in hrefs:
        item = hrefs[href]
        is_tree = hasattr(item.data, 'xpath')
        if is_tree and XPath(
                '//h:body[@id="calibre_generated_inline_toc"]')(item.data):
            return item
    return None
def process_fonts(self):
    '''
    Make sure all fonts are embeddable.

    For each ``@font-face`` rule found in the manifest's stylesheets, the
    referenced font file is run through ``remove_embed_restriction`` (once
    per font path). Changed font data is stored back on the manifest item
    and written to the container. Rules without a readable ``src`` and fonts
    that fail processing are skipped.
    '''
    from calibre.ebooks.oeb.base import urlnormalize
    from calibre.utils.fonts.utils import remove_embed_restriction
    processed = set()
    manifest = self.oeb.manifest
    for item in list(manifest):
        if not hasattr(item.data, 'cssRules'):
            continue
        font_face_rules = (
            rule for rule in item.data.cssRules
            if rule.type == rule.FONT_FACE_RULE)
        for rule in font_face_rules:
            try:
                src = rule.style.getProperty('src').propertyValue[0].uri
            except Exception:
                # Rule has no usable src descriptor
                continue
            path = item.abshref(src)
            ff = manifest.hrefs.get(urlnormalize(path), None)
            if ff is None:
                continue
            if path in processed:
                # Already handled via an earlier rule
                continue
            processed.add(path)
            raw = ff.data
            try:
                nraw = remove_embed_restriction(raw)
            except Exception:
                # Could not process this font; keep the original data
                continue
            if nraw != raw:
                ff.data = nraw
                self.oeb.container.write(path, nraw)
def find_font_face_rules(sheet, oeb):
    '''
    Find all @font-face rules in the given sheet and extract the relevant
    info from them. sheet can be either a ManifestItem or a CSSStyleSheet.

    :return: A list with one properties mapping per usable rule. Each one
        additionally holds the font's manifest item ('item'), the numeric
        'weight', the 'rule' itself and an empty 'chars' set.
    '''
    # A manifest item keeps the stylesheet in .data; a bare sheet does not
    try:
        rules = sheet.data.cssRules
    except AttributeError:
        rules = sheet.cssRules

    found = []
    for rule in rules:
        if rule.type != rule.FONT_FACE_RULE:
            continue
        props = get_font_properties(rule, default='normal')
        if not (props['font-family'] and props['src']):
            continue
        src = props['src']
        try:
            path = sheet.abshref(src)
        except AttributeError:
            # Bare stylesheets cannot resolve relative paths
            path = src
        font_item = oeb.manifest.hrefs.get(urlnormalize(path), None)
        if not font_item:
            continue
        props['item'] = font_item
        # Relative weights are treated as the normal weight
        if props['font-weight'] in {'bolder', 'lighter'}:
            props['font-weight'] = '400'
        props['weight'] = int(props['font-weight'])
        props['rule'] = rule
        props['chars'] = set()
        found.append(props)
    return found
def find_embedded_fonts(self):
    """
    Collect the @font-face rules of all stylesheets in the manifest into
    self.embedded_fonts.

    Each entry is the rule's property mapping, extended with the font's
    manifest item ("item"), a numeric "weight", an empty "chars" set and the
    "rule" itself. Rules lacking a font-family or src, or pointing to a file
    that is not in the manifest, are left out.
    """
    fonts = self.embedded_fonts = []
    manifest = self.oeb.manifest
    for sheet_item in manifest:
        if not hasattr(sheet_item.data, "cssRules"):
            continue
        for rule in sheet_item.data.cssRules:
            if rule.type != rule.FONT_FACE_RULE:
                continue
            props = self.get_font_properties(rule, default="normal")
            if not (props["font-family"] and props["src"]):
                continue
            font_path = sheet_item.abshref(props["src"])
            font_item = manifest.hrefs.get(urlnormalize(font_path), None)
            if not font_item:
                continue
            props["item"] = font_item
            # Relative weights are mapped to the normal weight
            if props["font-weight"] in {"bolder", "lighter"}:
                props["font-weight"] = "400"
            props["weight"] = int(props["font-weight"])
            props["chars"] = set()
            props["rule"] = rule
            fonts.append(props)
def find_embedded_fonts(self):
    '''
    Find all @font-face rules and extract the relevant info from them.

    The result is stored in self.embedded_fonts as a list of property
    mappings, one per rule that names a font-family and a src resolving to a
    manifest item. The keys 'item', 'weight', 'chars' and 'rule' are added to
    each mapping.
    '''
    self.embedded_fonts = []
    hrefs = self.oeb.manifest.hrefs
    for item in self.oeb.manifest:
        rules = getattr(item.data, 'cssRules', None) \
            if hasattr(item.data, 'cssRules') else ()
        for rule in rules:
            if rule.type != rule.FONT_FACE_RULE:
                continue
            props = self.get_font_properties(rule, default='normal')
            family, src = props['font-family'], props['src']
            if not family or not src:
                continue
            ff = hrefs.get(urlnormalize(item.abshref(src)), None)
            if not ff:
                continue
            props['item'] = ff
            weight = props['font-weight']
            if weight in {'bolder', 'lighter'}:
                # Relative weights are treated as normal weight
                weight = props['font-weight'] = '400'
            props['weight'] = int(weight)
            props['chars'] = set()
            props['rule'] = rule
            self.embedded_fonts.append(props)
def _spine_add_extra(self):
    '''
    Find documents reachable through links from the spine that are missing
    from it, and append them to the spine with ``linear=False``.

    Discovery repeats on each batch of newly found documents until nothing
    new is found. A warning per file is logged when the major version taken
    from ``self.oeb.version`` is at least 2.
    '''
    hrefs = self.oeb.manifest.hrefs
    spine = self.oeb.spine
    selector = XPath('h:body//h:a/@href')
    extras = set()

    def linked_documents(item):
        # Yield manifest documents linked from `item` that are not yet known
        for raw in selector(item.data):
            href, _ = urldefrag(raw)
            if not href:
                continue
            try:
                href = item.abshref(urlnormalize(href))
            except ValueError:
                # Malformed URL
                continue
            if href not in hrefs:
                continue
            found = hrefs[href]
            if found.media_type not in OEB_DOCS:
                continue
            if found in spine or found in extras:
                continue
            yield found

    unchecked = set(spine)
    while unchecked:
        new = set()
        for item in unchecked:
            # TODO: handle fallback chains
            if item.media_type in OEB_DOCS:
                new.update(linked_documents(item))
        extras.update(new)
        unchecked = new

    version = int(self.oeb.version[0])
    for item in sorted(extras):
        if version >= 2:
            self.logger.warn('Spine-referenced file %r not in spine' % item.href)
        spine.add(item, linear=False)
def __init__(self, oeb, opts):
    '''
    Validate an existing guide 'toc' entry or generate an in-line ToC.

    An existing guide 'toc' entry is kept when it resolves to a manifest
    document containing hyperlinks (the document is added to the spine if
    missing), and nothing is generated. Otherwise the entry is removed when
    it does not resolve, or when the book has a ToC of its own. A new ToC
    document is generated unless the book has no usable ToC, a guide 'toc'
    entry remains, or ``opts.no_inline_toc`` / ``opts.mobi_passthrough`` is
    set.

    :param oeb: The book object, modified in place.
    :param opts: Options; ``toc_title``, ``mobi_toc_at_start``,
        ``no_inline_toc``, ``extra_css`` and optionally ``mobi_passthrough``
        are read.
    '''
    self.oeb = oeb
    self.opts = opts
    self.log = oeb.log
    self.title = opts.toc_title or DEFAULT_TITLE
    self.at_start = opts.mobi_toc_at_start
    self.generated_item = None
    self.added_toc_guide_entry = False
    self.has_toc = oeb.toc and oeb.toc.count() > 1

    if 'toc' in oeb.guide:
        # Remove spurious toc entry from guide if it is not in spine or it
        # does not have any hyperlinks
        toc_href = urlnormalize(oeb.guide['toc'].href.partition('#')[0])
        if toc_href not in oeb.manifest.hrefs:
            oeb.guide.remove('toc')
        else:
            toc_item = oeb.manifest.hrefs[toc_href]
            if (hasattr(toc_item.data, 'xpath') and
                    XPath('//h:a[@href]')(toc_item.data)):
                if oeb.spine.index(toc_item) < 0:
                    oeb.spine.add(toc_item, linear=False)
                return
            if self.has_toc:
                oeb.guide.remove('toc')

    if (not self.has_toc or 'toc' in oeb.guide or opts.no_inline_toc or
            getattr(opts, 'mobi_passthrough', False)):
        return

    self.log.info('\tGenerating in-line ToC')

    embed_css = ''
    font_rules = getattr(oeb, 'store_embed_font_rules', None)
    if getattr(font_rules, 'body_font_family', None):
        parts = [rule.cssText for rule in font_rules.rules]
        parts.append('body { font-family: %s }' % font_rules.body_font_family)
        embed_css = '\n\n'.join(parts)

    markup = TEMPLATE.format(
        xhtmlns=XHTML_NS, title=self.title, embed_css=embed_css,
        extra_css=(opts.extra_css or ''))
    root = etree.fromstring(markup)
    toc_list = XPath('//h:ul')(root)[0]
    toc_list.text = '\n\t'
    for node in self.oeb.toc:
        self.process_toc_node(node, toc_list)

    item_id, href = oeb.manifest.generate('contents', 'contents.xhtml')
    generated = oeb.manifest.add(item_id, href, XHTML_MIME, data=root)
    self.generated_item = generated
    if self.at_start:
        oeb.spine.insert(0, generated, linear=True)
    else:
        oeb.spine.add(generated, linear=False)

    oeb.guide.add('toc', 'Table of Contents', href)
def __call__(self, oeb, context):
    '''
    Make sure the book has an in-line HTML TOC that is part of the spine.

    If the guide already has a 'toc' entry resolving to a manifest document
    with hyperlinks, that document is placed in the spine (if missing) and
    nothing else is done. A guide entry that does not resolve is removed, as
    is one without hyperlinks when the book has TOC nodes. If the book has
    TOC nodes, a stylesheet and a contents page are then generated, added to
    the manifest and spine, and registered in the guide.

    :param oeb: The book object, modified in place.
    :param context: Not used by this method.
    '''
    def add_to_spine(spine_item):
        # Place the TOC document according to self.position
        if self.position == 'end':
            oeb.spine.add(spine_item, linear=False)
        else:
            oeb.spine.insert(0, spine_item, linear=True)

    has_toc = getattr(getattr(oeb, 'toc', False), 'nodes', False)

    if 'toc' in oeb.guide:
        # Ensure toc pointed to in <guide> is in spine
        from calibre.ebooks.oeb.base import urlnormalize
        # The guide href may carry a fragment, which must be dropped before
        # the manifest lookup or the entry would never be found
        href = urlnormalize(oeb.guide['toc'].href.partition('#')[0])
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if (hasattr(item.data, 'xpath') and
                    XPath('//h:a[@href]')(item.data)):
                if oeb.spine.index(item) < 0:
                    add_to_spine(item)
                return
            elif has_toc:
                oeb.guide.remove('toc')
        else:
            oeb.guide.remove('toc')

    if not has_toc:
        return

    oeb.logger.info('Generating in-line TOC...')
    title = self.title or oeb.translate(DEFAULT_TITLE)
    style = self.style
    if style not in STYLE_CSS:
        oeb.logger.error('Unknown TOC style %r' % style)
        style = 'nested'
    css_id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
    oeb.manifest.add(css_id, css_href, CSS_MIME, data=STYLE_CSS[style])

    language = unicode_type(oeb.metadata.language[0])
    contents = element(None, XHTML('html'), nsmap={None: XHTML_NS},
                       attrib={XML('lang'): language})
    head = element(contents, XHTML('head'))
    htitle = element(head, XHTML('title'))
    htitle.text = title
    element(head, XHTML('link'), rel='stylesheet', type=CSS_MIME,
            href=css_href)
    body = element(contents, XHTML('body'),
                   attrib={'class': 'calibre_toc'})
    header = element(body, XHTML('h2'),
                     attrib={'class': 'calibre_toc_header'})
    header.text = title
    self.add_toc_level(body, oeb.toc)

    toc_id, href = oeb.manifest.generate('contents', 'contents.xhtml')
    item = oeb.manifest.add(toc_id, href, XHTML_MIME, data=contents)
    add_to_spine(item)
    oeb.guide.add('toc', 'Table of Contents', href)
def find_previous_calibre_inline_toc(oeb):
    '''
    Return the manifest item referenced by the guide's 'toc' entry when its
    body element has the id ``calibre_generated_inline_toc``.

    None is returned if the guide has no 'toc' entry, the entry's file is not
    in the manifest, or the item's data has no ``xpath`` attribute or no
    matching body.
    '''
    toc_ref = oeb.guide['toc'] if 'toc' in oeb.guide else None
    if toc_ref is None:
        return None
    # Only the file part of the href is a manifest key
    href = urlnormalize(toc_ref.href.partition('#')[0])
    if href not in oeb.manifest.hrefs:
        return None
    item = oeb.manifest.hrefs[href]
    marker = '//h:body[@id="calibre_generated_inline_toc"]'
    if hasattr(item.data, 'xpath') and XPath(marker)(item.data):
        return item
    return None
def rewrite_link(self, url, page=None):
    '''
    Map `url` to its replacement, resolved relative to `page`.

    :param url: The link as it appears in the document.
    :param page: Object whose ``abshref()`` resolves `url`; when falsy the
        url is returned unchanged.
    :return: ``images/<name>`` for a known image, the mapped link for a
        known link, otherwise `url` itself.
    '''
    if page:
        resolved = page.abshref(urlnormalize(url))
        if resolved in self.images:
            return 'images/%s' % self.images[resolved]
        if resolved in self.links:
            return self.links[resolved]
    return url
def rasterize_item(self, item):
    '''
    Rasterize the SVG content used by the document `item`.

    ``<img src>`` and SVG-typed ``<object data>`` elements that reference an
    SVG manifest item are passed to ``self.rasterize_external``; inline
    ``<svg>`` elements are passed to ``self.rasterize_inline``.
    '''
    html = item.data
    hrefs = self.oeb.manifest.hrefs
    # (query, attribute holding the reference), processed in this order
    external_sources = (
        ('//h:img[@src]', 'src'),
        ('//h:object[@type="%s" and @data]' % SVG_MIME, 'data'),
    )
    for query, attr in external_sources:
        for elem in xpath(html, query):
            reference = urlnormalize(elem.attrib[attr])
            image = hrefs.get(item.abshref(reference), None)
            if not image or image.media_type != SVG_MIME:
                continue
            style = self.stylizer(item).style(elem)
            self.rasterize_external(elem, style, item, image)

    for elem in xpath(html, '//svg:svg'):
        style = self.stylizer(item).style(elem)
        self.rasterize_inline(elem, style, item)
def _toc_from_navpoint(self, item, toc, navpoint):
    '''
    Recursively add the ``ncx:navPoint`` children of `navpoint` to `toc`.

    :param item: Object whose ``abshref()`` resolves the
        ``ncx:content/@src`` value of each navPoint.
    :param toc: Node that receives new entries through ``toc.add()``.
    :param navpoint: Element whose direct ``ncx:navPoint`` children are read.

    Untitled navPoints are skipped, with their children attached to `toc`
    instead. navPoints with neither a target nor children are dropped, and so
    are childless navPoints whose target file is not in the manifest.
    '''
    manifest_hrefs = self.oeb.manifest.hrefs
    for child in xpath(navpoint, 'ncx:navPoint'):
        title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
        if not title:
            # Untitled entry: attach its children to the current level
            self._toc_from_navpoint(item, toc, child)
            continue

        src = xpath(child, 'ncx:content/@src')
        has_target = bool(src and src[0])
        has_children = bool(xpath(child, 'ncx:navPoint'))
        if not has_target and not has_children:
            # This node is useless
            continue

        href = item.abshref(urlnormalize(src[0])) if has_target else ''
        path, _ = urldefrag(href)
        if href and path not in manifest_hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            if not has_children:
                # This node is useless
                continue

        try:
            po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
        except (TypeError, ValueError):
            # Non-numeric playOrder attribute: use the next free value
            po = self.oeb.toc.next_play_order()

        author_meta = xpath(child, 'descendant::calibre:meta[@name = "author"]')
        author = author_meta[0].text if author_meta else None

        description = None
        description_meta = xpath(
            child, 'descendant::calibre:meta[@name = "description"]')
        if description_meta:
            description = etree.tostring(
                description_meta[0], method='text', encoding=unicode).strip()
            if not description:
                description = None

        thumb_meta = xpath(
            child, 'descendant::calibre:meta[@name = "toc_thumbnail"]')
        toc_thumbnail = thumb_meta[0].text if thumb_meta else None
        if not toc_thumbnail or not toc_thumbnail.strip():
            toc_thumbnail = None

        node = toc.add(
            title, href, id=child.get('id'),
            klass=child.get('class', 'chapter'), play_order=po,
            description=description, author=author,
            toc_thumbnail=toc_thumbnail)
        self._toc_from_navpoint(item, node, child)
def inspect_cover(self, href):
    '''
    Return the first two values reported by ``identify_data`` for the
    manifest item whose href equals the normalized `href`.

    :param href: Reference to the cover image.
    :return: A 2-tuple taken from ``identify_data``; ``(None, None)`` if no
        manifest item matches or the image data cannot be identified (the
        failure is logged).
    '''
    from calibre.ebooks.oeb.base import urlnormalize
    # Normalize once instead of on every loop iteration
    target = urlnormalize(href)
    for x in self.oeb.manifest:
        if x.href != target:
            continue
        try:
            return identify_data(x.data)[:2]
        except Exception:
            self.log.exception('Failed to read image dimensions')
    return None, None
def __init__(self, oeb, opts):
    '''
    Check the guide's 'toc' entry and, if needed, generate an in-line ToC.

    A guide 'toc' entry that resolves to a manifest document with hyperlinks
    is kept; the document is added to the spine if absent and construction
    ends there. Entries that do not resolve are removed, and entries without
    hyperlinks are removed when the book has its own ToC. A new ToC page is
    then generated from ``oeb.toc`` unless there is no usable ToC, a guide
    entry is still present, or ``opts.no_inline_toc`` /
    ``opts.mobi_passthrough`` is set.

    :param oeb: The book object, modified in place.
    :param opts: Options object; see the attributes read below.
    '''
    self.oeb, self.opts, self.log = oeb, opts, oeb.log
    self.title = opts.toc_title or DEFAULT_TITLE
    self.at_start = opts.mobi_toc_at_start
    self.generated_item = None
    self.added_toc_guide_entry = False
    self.has_toc = oeb.toc and oeb.toc.count() > 1
    guide, manifest, spine = oeb.guide, oeb.manifest, oeb.spine

    if 'toc' in guide:
        # Remove spurious toc entry from guide if it is not in spine or it
        # does not have any hyperlinks
        file_part = guide['toc'].href.partition('#')[0]
        href = urlnormalize(file_part)
        if href in manifest.hrefs:
            existing = manifest.hrefs[href]
            linked = (hasattr(existing.data, 'xpath') and
                      XPath('//h:a[@href]')(existing.data))
            if linked:
                if spine.index(existing) < 0:
                    spine.add(existing, linear=False)
                return
            if self.has_toc:
                guide.remove('toc')
        else:
            guide.remove('toc')

    if not self.has_toc or 'toc' in guide:
        return
    if opts.no_inline_toc or getattr(opts, 'mobi_passthrough', False):
        return

    self.log('\tGenerating in-line ToC')

    embed_css = ''
    s = getattr(oeb, 'store_embed_font_rules', None)
    if getattr(s, 'body_font_family', None):
        body_rule = 'body { font-family: %s }' % s.body_font_family
        embed_css = '\n\n'.join([x.cssText for x in s.rules] + [body_rule])

    root = etree.fromstring(TEMPLATE.format(
        xhtmlns=XHTML_NS, title=self.title, embed_css=embed_css,
        extra_css=(opts.extra_css or '')))
    parent = XPath('//h:ul')(root)[0]
    parent.text = '\n\t'
    for child in self.oeb.toc:
        self.process_toc_node(child, parent)

    new_id, new_href = manifest.generate('contents', 'contents.xhtml')
    self.generated_item = manifest.add(new_id, new_href, XHTML_MIME, data=root)
    if self.at_start:
        spine.insert(0, self.generated_item, linear=True)
    else:
        spine.add(self.generated_item, linear=False)

    guide.add('toc', 'Table of Contents', new_href)
def handle_embedded_fonts(self):
    '''
    On windows, Qt uses GDI which does not support OpenType
    (CFF) fonts, so we need to nuke references to OpenType
    fonts. Qt's directwrite text backend is not mature.
    Also make sure all fonts are embeddable.

    Each font referenced by an ``@font-face`` rule is passed through
    ``remove_embed_restriction`` once; changed data is stored on the manifest
    item and written to the container. When ``iswindows`` is true, rules
    whose font is detected as CFF are removed from their stylesheet and a
    warning is logged once per font path.
    '''
    from calibre.ebooks.oeb.base import urlnormalize
    from calibre.utils.fonts.utils import remove_embed_restriction
    from PyQt5.Qt import QByteArray, QRawFont
    font_warnings = set()
    processed = set()
    is_cff = {}
    for item in list(self.oeb.manifest):
        if not hasattr(item.data, 'cssRules'):
            continue
        remove = set()
        for i, rule in enumerate(item.data.cssRules):
            if rule.type != rule.FONT_FACE_RULE:
                continue
            try:
                src = rule.style.getProperty('src').propertyValue[0].uri
            except Exception:
                # Rule has no usable src descriptor
                continue
            path = item.abshref(src)
            ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
            if ff is None:
                continue

            raw = nraw = ff.data
            if path not in processed:
                processed.add(path)
                try:
                    nraw = remove_embed_restriction(raw)
                except Exception:
                    # Font could not be processed; skip this rule
                    continue
                if nraw != raw:
                    ff.data = nraw
                    self.oeb.container.write(path, nraw)

            if iswindows:
                if path not in is_cff:
                    f = QRawFont(QByteArray(nraw), 12)
                    # A valid font with an empty 'head' table is treated
                    # as CFF
                    is_cff[path] = f.isValid() and len(
                        f.fontTable('head')) == 0
                if is_cff[path]:
                    if path not in font_warnings:
                        font_warnings.add(path)
                        self.log.warn(
                            'CFF OpenType fonts are not supported on windows, ignoring: %s'
                            % path)
                    remove.add(i)
        # Pop from the end so the remaining indices stay valid
        for i in sorted(remove, reverse=True):
            item.data.cssRules.pop(i)
def __call__(self, oeb, context):
    '''
    Ensure the book has an in-line HTML TOC that is in the spine.

    An existing guide 'toc' entry that resolves to a manifest document with
    hyperlinks is kept: the document is put in the spine if it is missing and
    processing stops. An entry that does not resolve is removed; one without
    hyperlinks is removed when the book has TOC nodes. When the book has TOC
    nodes, a stylesheet and a contents page are generated, added to manifest
    and spine, and registered in the guide.

    :param oeb: The book object, modified in place.
    :param context: Not used by this method.
    '''
    def place_in_spine(doc):
        # Honour self.position for both existing and generated TOC pages
        if self.position == 'end':
            oeb.spine.add(doc, linear=False)
        else:
            oeb.spine.insert(0, doc, linear=True)

    has_toc = getattr(getattr(oeb, 'toc', False), 'nodes', False)

    if 'toc' in oeb.guide:
        # Ensure toc pointed to in <guide> is in spine
        from calibre.ebooks.oeb.base import urlnormalize
        # Strip any fragment: manifest hrefs are file paths, so an href
        # with a fragment would otherwise never match
        file_part = oeb.guide['toc'].href.partition('#')[0]
        href = urlnormalize(file_part)
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if (hasattr(item.data, 'xpath') and
                    XPath('//h:a[@href]')(item.data)):
                if oeb.spine.index(item) < 0:
                    place_in_spine(item)
                return
            elif has_toc:
                oeb.guide.remove('toc')
        else:
            oeb.guide.remove('toc')

    if not has_toc:
        return

    oeb.logger.info('Generating in-line TOC...')
    title = self.title or oeb.translate(DEFAULT_TITLE)
    style = self.style
    if style not in STYLE_CSS:
        oeb.logger.error('Unknown TOC style %r' % style)
        style = 'nested'
    css_id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
    oeb.manifest.add(css_id, css_href, CSS_MIME, data=STYLE_CSS[style])

    language = str(oeb.metadata.language[0])
    contents = element(None, XHTML('html'), nsmap={None: XHTML_NS},
                       attrib={XML('lang'): language})
    head = element(contents, XHTML('head'))
    htitle = element(head, XHTML('title'))
    htitle.text = title
    element(head, XHTML('link'), rel='stylesheet', type=CSS_MIME,
            href=css_href)
    body = element(contents, XHTML('body'),
                   attrib={'class': 'calibre_toc'})
    header = element(body, XHTML('h2'),
                     attrib={'class': 'calibre_toc_header'})
    header.text = title
    self.add_toc_level(body, oeb.toc)

    toc_id, href = oeb.manifest.generate('contents', 'contents.xhtml')
    item = oeb.manifest.add(toc_id, href, XHTML_MIME, data=contents)
    place_in_spine(item)
    oeb.guide.add('toc', 'Table of Contents', href)
def rename_requested(self, oldname, newname):
    """
    Rename `oldname` to `newname` after asking for confirmation where needed.

    Confirmation is requested when the guessed file type changes and when
    `newname` differs from its URL-normalized form. If either is declined,
    nothing is renamed. Otherwise a savepoint is added and the rename is
    started as a blocking job.
    """
    self.commit_all_editors_to_container()

    if guess_type(oldname) != guess_type(newname):
        name_parts = os.path.splitext(oldname) + os.path.splitext(newname)
        type_warning = _(
            "You are changing the file type of {0}<b>{1}</b> to {2}<b>{3}</b>."
            " Doing so can cause problems, are you sure?"
        ).format(*name_parts)
        type_change_ok = confirm(
            type_warning,
            "confirm-filetype-change",
            parent=self.gui,
            title=_("Are you sure?"),
            config_set=tprefs,
        )
        if not type_change_ok:
            return

    normalized = urlnormalize(newname)
    if normalized != newname:
        unsafe_warning = _(
            "The name you have chosen {0} contains special characters, internally"
            " it will look like: {1}Try to use only the English alphabet [a-z], numbers [0-9],"
            " hyphens and underscores for file names. Other characters can cause problems for "
            " different ebook viewers. Are you sure you want to proceed?"
        ).format("<pre>%s</pre>" % newname, "<pre>%s</pre>" % normalized)
        unsafe_name_ok = confirm(
            unsafe_warning,
            "confirm-urlunsafe-change",
            parent=self.gui,
            title=_("Are you sure?"),
            config_set=tprefs,
        )
        if not unsafe_name_ok:
            return

    self.add_savepoint(_("Rename %s") % oldname)
    name_map = {oldname: newname}
    on_done = partial(self.rename_done, name_map)
    self.gui.blocking_job(
        "rename_file",
        _("Renaming and updating links..."),
        on_done,
        rename_files,
        current_container(),
        name_map,
    )
def pointer(item, oref):
    '''
    Translate the reference `oref` found in `item` into a ``kindle:embed:``
    link when it points to a known resource; otherwise return it unchanged.

    Resources whose record does not start with ``FONT`` are treated as
    images: they are recorded in ``self.used_images`` and their link carries
    a ``mime`` parameter.
    '''
    ref = urlnormalize(item.abshref(oref))
    resources = self.resources
    idx = resources.item_map.get(ref, None)
    if idx is None:
        return oref
    # Records are addressed with a zero-based index, item_map is one-based
    is_font = resources.records[idx-1][:4] == b'FONT'
    encoded = to_ref(idx)
    if is_font:
        return 'kindle:embed:%s'%encoded
    self.used_images.add(ref)
    return 'kindle:embed:%s?mime=%s'%(encoded, resources.mime_map[ref])
def pointer(item, oref):
    '''
    Return the ``kindle:embed:`` link for the resource that `oref` (a
    reference inside `item`) resolves to, or `oref` itself when the resource
    is not in ``self.resources.item_map``.

    Any resource whose record header is not ``FONT`` counts as an image: it
    is added to ``self.used_images`` and its MIME type is appended to the
    link.
    '''
    ref = urlnormalize(item.abshref(oref))
    idx = self.resources.item_map.get(ref, None)
    if idx is not None:
        header = self.resources.records[idx - 1][:4]
        link = 'kindle:embed:%s' % to_ref(idx)
        if header != b'FONT':
            self.used_images.add(ref)
            link = '%s?mime=%s' % (link, self.resources.mime_map[ref])
        return link
    return oref
def process_fonts(self):
    '''
    Make sure all fonts are embeddable. Also remove some fonts that cause
    problems.

    Fonts referenced by ``@font-face`` rules are passed through
    ``remove_embed_restriction`` once per font path; changed data is stored
    on the manifest item and written to the container. When ``iswindows`` is
    true, the ``courier`` family is filtered out of every style rule's
    ``font-family`` (falling back to ``monospace`` if nothing is left) and a
    warning is logged once.
    '''
    from calibre.ebooks.oeb.base import urlnormalize
    from calibre.utils.fonts.utils import remove_embed_restriction
    processed = set()
    for item in list(self.oeb.manifest):
        if not hasattr(item.data, 'cssRules'):
            continue
        for rule in item.data.cssRules:
            if rule.type == rule.FONT_FACE_RULE:
                try:
                    src = rule.style.getProperty('src').propertyValue[0].uri
                except Exception:
                    # Rule has no usable src descriptor
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None or path in processed:
                    continue
                processed.add(path)
                raw = ff.data
                try:
                    nraw = remove_embed_restriction(raw)
                except Exception:
                    # Font could not be processed; leave it untouched
                    continue
                if nraw != raw:
                    ff.data = nraw
                    self.oeb.container.write(path, nraw)
            elif iswindows and rule.type == rule.STYLE_RULE:
                from tinycss.fonts3 import parse_font_family, serialize_font_family
                f = rule.style.getProperty(u'font-family')
                if f is None:
                    continue
                font_families = parse_font_family(f.propertyValue.cssText)
                kept = [
                    x for x in font_families if x.lower() != u'courier'
                ]
                if len(kept) == len(font_families):
                    continue
                if 'courier' not in self.filtered_font_warnings:
                    # See https://bugs.launchpad.net/bugs/1665835
                    self.filtered_font_warnings.add(u'courier')
                    self.log.warn(
                        u'Removing courier font family as it does not render on windows'
                    )
                f.propertyValue.cssText = serialize_font_family(
                    kept or [u'monospace'])
def fix_toc_entry(self, toc):
    '''
    Apply ``self.rename_map`` to the href of `toc` and, recursively, to the
    hrefs of all entries below it.

    The file part of the normalized href is looked up in the map; when a
    replacement exists it becomes the new href, with the original fragment
    re-attached.
    '''
    if toc.href:
        path, frag = urldefrag(urlnormalize(toc.href))
        renamed = self.rename_map.get(path, None)
        if renamed is not None:
            toc.href = '#'.join((renamed, frag)) if frag else renamed
    for entry in toc:
        self.fix_toc_entry(entry)
def handle_embedded_fonts(self):
    """
    On windows, Qt uses GDI which does not support OpenType
    (CFF) fonts, so we need to nuke references to OpenType
    fonts. Qt's directwrite text backend is not mature.
    Also make sure all fonts are embeddable.

    Fonts referenced from ``@font-face`` rules are run through
    ``remove_embed_restriction`` once per path, and changed data is saved to
    the manifest item and the container. With ``iswindows`` set, rules that
    reference a font detected as CFF are deleted from their stylesheet, with
    one warning per font path.
    """
    from calibre.ebooks.oeb.base import urlnormalize
    from calibre.utils.fonts.utils import remove_embed_restriction
    from PyQt5.Qt import QByteArray, QRawFont

    font_warnings = set()
    processed = set()
    is_cff = {}
    hrefs = self.oeb.manifest.hrefs
    for item in list(self.oeb.manifest):
        if not hasattr(item.data, "cssRules"):
            continue
        doomed = set()
        for i, rule in enumerate(item.data.cssRules):
            if rule.type != rule.FONT_FACE_RULE:
                continue
            try:
                src = rule.style.getProperty("src").propertyValue[0].uri
            except Exception:
                # Rule has no usable src descriptor
                continue
            path = item.abshref(src)
            ff = hrefs.get(urlnormalize(path), None)
            if ff is None:
                continue

            raw = nraw = ff.data
            if path not in processed:
                processed.add(path)
                try:
                    nraw = remove_embed_restriction(raw)
                except Exception:
                    # Font could not be processed; skip this rule
                    continue
                if nraw != raw:
                    ff.data = nraw
                    self.oeb.container.write(path, nraw)

            if not iswindows:
                continue
            if path not in is_cff:
                f = QRawFont(QByteArray(nraw), 12)
                # A valid font with an empty 'head' table is treated as CFF
                is_cff[path] = f.isValid() and len(f.fontTable("head")) == 0
            if is_cff[path]:
                if path not in font_warnings:
                    font_warnings.add(path)
                    self.log.warn("CFF OpenType fonts are not supported on windows, ignoring: %s" % path)
                doomed.add(i)
        # Delete from the highest index down so earlier indices stay valid
        for i in sorted(doomed, reverse=True):
            item.data.cssRules.pop(i)
def rename_requested(self, oldname, newname):
    '''
    Rename `oldname` to `newname`, asking the user to confirm first if the
    guessed file type changes or if `newname` is altered by URL
    normalization.

    When both checks pass (or are confirmed), a savepoint is added and the
    rename runs as a blocking job that reports to ``self.rename_done``.
    '''
    self.commit_all_editors_to_container()

    def confirmed(message, key):
        # Both prompts share the same title, parent and preference store
        return confirm(message, key, parent=self.gui,
                       title=_('Are you sure?'), config_set=tprefs)

    if guess_type(oldname) != guess_type(newname):
        args = os.path.splitext(oldname) + os.path.splitext(newname)
        message = _(
            'You are changing the file type of {0}<b>{1}</b> to {2}<b>{3}</b>.'
            ' Doing so can cause problems, are you sure?').format(*args)
        if not confirmed(message, 'confirm-filetype-change'):
            return

    if urlnormalize(newname) != newname:
        message = _(
            'The name you have chosen {0} contains special characters, internally'
            ' it will look like: {1}Try to use only the English alphabet [a-z], numbers [0-9],'
            ' hyphens and underscores for file names. Other characters can cause problems for '
            ' different ebook viewers. Are you sure you want to proceed?').format(
                '<pre>%s</pre>'%newname, '<pre>%s</pre>' % urlnormalize(newname))
        if not confirmed(message, 'confirm-urlunsafe-change'):
            return

    self.add_savepoint(_('Rename %s') % oldname)
    callback = partial(self.rename_done, oldname, newname)
    self.gui.blocking_job(
        'rename_file', _('Renaming and updating links...'), callback,
        rename_files, current_container(), {oldname: newname})
def __call__(self, oeb, context):
    """
    Ensure the book has an in-line HTML TOC that is part of the spine.

    If the guide's "toc" entry resolves to a manifest document containing
    hyperlinks, that document is put in the spine when missing and nothing
    more is done. A guide entry that does not resolve is removed, as is one
    without hyperlinks when the book has TOC nodes. When the book has TOC
    nodes, a stylesheet and a contents page are generated, added to the
    manifest and spine, and registered in the guide.

    :param oeb: The book object, modified in place.
    :param context: Not used by this method.
    """
    def position_in_spine(doc):
        # Apply self.position to an existing or a generated TOC page
        if self.position == "end":
            oeb.spine.add(doc, linear=False)
        else:
            oeb.spine.insert(0, doc, linear=True)

    has_toc = getattr(getattr(oeb, "toc", False), "nodes", False)

    if "toc" in oeb.guide:
        # Ensure toc pointed to in <guide> is in spine
        from calibre.ebooks.oeb.base import urlnormalize

        # Manifest hrefs carry no fragment, so drop it before the lookup;
        # otherwise a valid entry such as "toc.html#start" is never found
        href = urlnormalize(oeb.guide["toc"].href.partition("#")[0])
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if hasattr(item.data, "xpath") and XPath("//h:a[@href]")(item.data):
                if oeb.spine.index(item) < 0:
                    position_in_spine(item)
                return
            elif has_toc:
                oeb.guide.remove("toc")
        else:
            oeb.guide.remove("toc")

    if not has_toc:
        return

    oeb.logger.info("Generating in-line TOC...")
    title = self.title or oeb.translate(DEFAULT_TITLE)
    style = self.style
    if style not in STYLE_CSS:
        oeb.logger.error("Unknown TOC style %r" % style)
        style = "nested"
    css_id, css_href = oeb.manifest.generate("tocstyle", "tocstyle.css")
    oeb.manifest.add(css_id, css_href, CSS_MIME, data=STYLE_CSS[style])

    language = str(oeb.metadata.language[0])
    contents = element(None, XHTML("html"), nsmap={None: XHTML_NS}, attrib={XML("lang"): language})
    head = element(contents, XHTML("head"))
    htitle = element(head, XHTML("title"))
    htitle.text = title
    element(head, XHTML("link"), rel="stylesheet", type=CSS_MIME, href=css_href)
    body = element(contents, XHTML("body"), attrib={"class": "calibre_toc"})
    h1 = element(body, XHTML("h1"), attrib={"class": "calibre_toc_header"})
    h1.text = title
    self.add_toc_level(body, oeb.toc)

    toc_id, href = oeb.manifest.generate("contents", "contents.xhtml")
    item = oeb.manifest.add(toc_id, href, XHTML_MIME, data=contents)
    position_in_spine(item)
    oeb.guide.add("toc", "Table of Contents", href)
def replace_internal_links_with_placeholders(self):
    """Swap links between spine items for ``kindle:pos:fid`` placeholders.

    Every ``<a href>`` in every spine item is counted. When the link's
    target (with the fragment split off and the path normalized) is the href
    of a spine item, the anchor's ``href`` is replaced by a placeholder built
    from the running count, and ``self.link_map`` records
    ``placeholder -> (href, fragment)``.

    A target that ``urlnormalize`` rejects with ``ValueError`` is compared
    in its un-normalized form instead of aborting the whole pass.
    """
    self.link_map = {}
    count = 0
    hrefs = {item.href for item in self.oeb.spine}
    for item in self.oeb.spine:
        root = self.data(item)
        for a in XPath('//h:a[@href]')(root):
            # The counter advances for every link, replaced or not, so each
            # placeholder offset is unique across the whole book.
            count += 1
            ref = item.abshref(a.get('href'))
            href, _, frag = ref.partition('#')
            try:
                href = urlnormalize(href)
            except ValueError:
                # A URL that cannot be interpreted (e.g. not UTF-8 quoted):
                # pass it through as-is rather than failing on one bad link.
                pass
            if href in hrefs:
                placeholder = 'kindle:pos:fid:0000:off:%s' % to_href(count)
                self.link_map[placeholder] = (href, frag)
                a.set('href', placeholder)
def __init__(self, oeb, opts):
    """Store conversion state and, when required, generate an inline ToC.

    If the guide's ``toc`` entry points at a manifest item that contains
    hyperlinks, that item is added to the spine (when absent) and
    construction ends there. A guide entry whose target is missing from the
    manifest is removed; one whose target has no hyperlinks is removed only
    when the book has a ToC. A ``contents.xhtml`` page is generated unless
    the book has no ToC, a guide ``toc`` entry remains, or
    ``opts.no_inline_toc`` is set.
    """
    self.oeb = oeb
    self.opts = opts
    self.log = oeb.log
    self.title = opts.toc_title or DEFAULT_TITLE
    self.at_start = opts.mobi_toc_at_start
    self.generated_item = None
    self.added_toc_guide_entry = False
    self.has_toc = oeb.toc and oeb.toc.count() > 1

    if 'toc' in oeb.guide:
        # Remove spurious toc entry from guide if it is not in spine or it
        # does not have any hyperlinks
        guide_href = urlnormalize(oeb.guide['toc'].href)
        if guide_href not in oeb.manifest.hrefs:
            oeb.guide.remove('toc')
        else:
            guide_item = oeb.manifest.hrefs[guide_href]
            if (hasattr(guide_item.data, 'xpath') and
                    XPath('//h:a[@href]')(guide_item.data)):
                # Existing ToC page is usable; make sure it is in the spine.
                if oeb.spine.index(guide_item) < 0:
                    oeb.spine.add(guide_item, linear=False)
                return
            if self.has_toc:
                oeb.guide.remove('toc')

    if not self.has_toc:
        return
    if 'toc' in oeb.guide:
        return
    if opts.no_inline_toc:
        return

    self.log('\tGenerating in-line ToC')

    markup = TEMPLATE.format(xhtmlns=XHTML_NS, title=self.title)
    root = etree.fromstring(markup)
    toc_list = XPath('//h:ul')(root)[0]
    toc_list.text = '\n\t'
    for node in self.oeb.toc:
        self.process_toc_node(node, toc_list)

    item_id, item_href = oeb.manifest.generate('contents', 'contents.xhtml')
    generated = oeb.manifest.add(item_id, item_href, XHTML_MIME, data=root)
    self.generated_item = generated
    if self.at_start:
        oeb.spine.insert(0, generated, linear=True)
    else:
        oeb.spine.add(generated, linear=False)
    oeb.guide.add('toc', 'Table of Contents', item_href)
def rewrite_links(self, url):
    """Return ``url`` redirected through ``self.map``, or unchanged.

    The URL is split into path and fragment, the path is resolved against
    ``self.current_item`` and normalized, and the result is looked up in
    ``self.map``. On a hit, the entry's anchor map is indexed by the
    fragment (``None`` when there is none) and the resulting href is made
    relative to the current item again; a fragment, if present, is
    re-appended to the unquoted href.

    URLs that cannot be resolved or normalized (``ValueError``) and URLs
    whose target is not in ``self.map`` are returned exactly as given.
    """
    href, frag = urldefrag(url)
    try:
        href = self.current_item.abshref(href)
    except ValueError:
        # Unparseable URL
        return url
    try:
        href = urlnormalize(href)
    except ValueError:
        # Cannot be normalized (e.g. not UTF-8 quoted): pass it through
        # untouched, the same way unparseable URLs are handled above.
        return url
    if href not in self.map:
        return url
    anchor_map = self.map[href]
    nhref = anchor_map[frag or None]
    nhref = self.current_item.relhref(nhref)
    if frag:
        nhref = '#'.join((urlunquote(nhref), frag))
    return nhref
def dataize_svg(self, item, svg=None):
    """Inline manifest resources referenced from an SVG tree as data: URIs.

    ``svg`` defaults to ``item.data``. For each SVG element carrying an
    ``xlink:href``, the normalized, fragment-less path is resolved against
    ``item``; when it names a manifest entry, the attribute is replaced by
    ``data:<media type>;base64,<encoded bytes>``. References with an empty
    path or an unknown target are left untouched. The (modified) tree is
    returned.
    """
    if svg is None:
        svg = item.data
    manifest_hrefs = self.oeb.manifest.hrefs
    for node in xpath(svg, '//svg:*[@xl:href]'):
        attr = XLINK('href')
        target = urldefrag(urlnormalize(node.attrib[attr]))[0]
        if not target:
            # Pure fragment reference, nothing to embed
            continue
        resolved = item.abshref(target)
        if resolved in manifest_hrefs:
            linked = manifest_hrefs[resolved]
            payload = base64.encodestring(str(linked))
            node.attrib[attr] = "data:%s;base64,%s" % (linked.media_type, payload)
    return svg
def remove_images(self, item, limit=1):
    """Remove up to ``limit`` manifest-backed ``<img>`` elements from ``item``.

    Each ``<img src>`` is resolved against ``item`` and looked up in the
    manifest, first as-is and then in normalized form. When found, the
    image is removed from the manifest, guide references to its href are
    dropped, and the element is detached from its parent. Images that are
    not in the manifest are skipped and do not count towards the limit.

    Returns the number of images removed.
    """
    manifest = self.oeb.manifest
    removed = 0
    for img in XPath('//h:img[@src]')(item.data):
        if removed >= limit:
            break
        href = item.abshref(img.get('src'))
        image = manifest.hrefs.get(href)
        if image is None:
            # Retry with the normalized form of the href
            href = urlnormalize(href)
            image = manifest.hrefs.get(href)
        if image is None:
            continue
        manifest.remove(image)
        self.oeb.guide.remove_by_href(href)
        img.getparent().remove(img)
        removed += 1
    return removed
def url_replacer(self, orig_url):
    """Map a local URL through ``self.rename_map``; leave others untouched.

    URLs with a scheme are returned as given. Otherwise the path part is
    resolved against the current item (or against its entry in
    ``self.renamed_items_map`` when that map is non-empty), translated via
    ``self.rename_map`` (falling back to itself), made relative to the
    current item again, and any fragment is re-appended.
    """
    url = urlnormalize(orig_url)
    if urlparse(url).scheme:
        # Only rewrite local URLs
        return orig_url

    path, frag = urldefrag(url)

    # Links were written relative to the item's original location, so
    # resolve them against the pre-rename item when one is recorded.
    base_item = self.current_item
    if self.renamed_items_map:
        base_item = self.renamed_items_map.get(self.current_item.href, self.current_item)

    target = base_item.abshref(path)
    new_target = self.rename_map.get(target, target)
    replacement = self.current_item.relhref(new_target)
    if frag:
        replacement += '#' + frag
    return replacement
def __init__(self, oeb, opts):
    """Record conversion state and build an inline ToC page if one is needed.

    A guide ``toc`` entry that points at a manifest item containing
    hyperlinks is kept: the item is added to the spine when it is not there
    yet, and initialization stops. A guide entry whose target is missing
    from the manifest is removed; one whose target has no hyperlinks is
    removed only when the book has a ToC. Generation of ``contents.xhtml``
    is skipped when the book has no ToC, when a guide ``toc`` entry is still
    present, or when ``opts.no_inline_toc`` is set.
    """
    self.oeb = oeb
    self.opts = opts
    self.log = oeb.log
    self.title = opts.toc_title or DEFAULT_TITLE
    self.at_start = opts.mobi_toc_at_start
    self.generated_item = None
    self.added_toc_guide_entry = False
    self.has_toc = oeb.toc and oeb.toc.count() > 1

    if 'toc' in oeb.guide:
        # Remove spurious toc entry from guide if it is not in spine or it
        # does not have any hyperlinks
        toc_href = urlnormalize(oeb.guide['toc'].href)
        if toc_href in oeb.manifest.hrefs:
            toc_item = oeb.manifest.hrefs[toc_href]
            usable = (hasattr(toc_item.data, 'xpath') and
                      XPath('//h:a[@href]')(toc_item.data))
            if usable:
                if oeb.spine.index(toc_item) < 0:
                    oeb.spine.add(toc_item, linear=False)
                return
            if self.has_toc:
                oeb.guide.remove('toc')
        else:
            oeb.guide.remove('toc')

    wanted = self.has_toc and 'toc' not in oeb.guide and not opts.no_inline_toc
    if not wanted:
        return

    self.log('\tGenerating in-line ToC')

    root = etree.fromstring(
        TEMPLATE.format(xhtmlns=XHTML_NS, title=self.title))
    entries = XPath('//h:ul')(root)[0]
    entries.text = '\n\t'
    for toc_node in self.oeb.toc:
        self.process_toc_node(toc_node, entries)

    new_id, new_href = oeb.manifest.generate('contents', 'contents.xhtml')
    self.generated_item = oeb.manifest.add(new_id, new_href, XHTML_MIME, data=root)
    if self.at_start:
        oeb.spine.insert(0, self.generated_item, linear=True)
    else:
        oeb.spine.add(self.generated_item, linear=False)
    oeb.guide.add('toc', 'Table of Contents', new_href)
def _guide_from_opf(self, opf):
    """Populate ``self.oeb.guide`` from the ``<guide>`` element of an OPF.

    Each ``<reference>`` whose normalized, fragment-less href is in the
    manifest is added under its ``type``. When the exact path is missing, a
    manifest href that matches case-insensitively is used instead; if none
    matches, a warning is logged and the reference is skipped. Only the
    first reference of a given type is kept.
    """
    guide = self.oeb.guide
    manifest = self.oeb.manifest
    for reference in xpath(opf, '/o2:package/o2:guide/o2:reference'):
        ref_href = reference.get('href')
        path = urlnormalize(urldefrag(ref_href)[0])
        if path not in manifest.hrefs:
            # Fall back to a case-insensitive match against the manifest
            wanted = path.lower()
            for candidate in manifest.hrefs:
                if candidate.lower() == wanted:
                    ref_href = candidate
                    break
            else:
                self.logger.warn(u'Guide reference %r not found' % ref_href)
                continue
        ref_type = reference.get('type')
        if ref_type not in guide:
            guide.add(ref_type, reference.get('title'), ref_href)
def url_replacer(self, orig_url):
    """Rewrite a local URL according to ``self.rename_map``.

    A URL that has a scheme is returned unchanged. For any other URL the
    path is resolved against the current item, or against the item recorded
    for it in ``self.renamed_items_map`` when that map is non-empty. The
    resolved href is replaced by its ``self.rename_map`` entry if there is
    one, expressed relative to the current item, and the original fragment
    (if any) is appended after ``#``.
    """
    normalized = urlnormalize(orig_url)
    has_scheme = urlparse(normalized).scheme
    if has_scheme:
        # Only rewrite local URLs
        return orig_url

    path, fragment = urldefrag(normalized)

    if self.renamed_items_map:
        source_item = self.renamed_items_map.get(
            self.current_item.href, self.current_item)
    else:
        source_item = self.current_item

    old_href = source_item.abshref(path)
    result = self.current_item.relhref(self.rename_map.get(old_href, old_href))
    if not fragment:
        return result
    result += '#' + fragment
    return result
def _pages_from_page_map(self, opf):
    """Fill ``self.oeb.pages`` from the book's page map, if it has one.

    Returns False when ``self._find_page_map`` yields nothing, True
    otherwise. Every ``<page>`` with a non-empty ``href`` is added with its
    whitespace-collapsed name and an href resolved against the page-map
    item. The page type is ``special`` for an empty name, ``front`` for a
    name made up solely of the letters ``ivxlcdm`` (in either case), and
    ``normal`` for anything else.
    """
    item = self._find_page_map(opf)
    if item is None:
        return False

    pages = self.oeb.pages
    for page in xpath(item.data, 'o2:page'):
        href = page.get('href')
        if not href:
            continue
        label = COLLAPSE_RE.sub(' ', page.get('name', '').strip())
        href = item.abshref(urlnormalize(href))
        if not label:
            kind = 'special'
        elif label.lower().strip('ivxlcdm') == '':
            # Nothing remains once roman-numeral letters are stripped
            kind = 'front'
        else:
            kind = 'normal'
        pages.add(label, href, type=kind)
    return True
def dataize_svg(self, item, svg=None):
    """Point ``xlink:href`` attributes of an SVG tree at temporary files.

    ``svg`` defaults to ``item.data``. For each SVG element with an
    ``xlink:href`` whose normalized, fragment-less path resolves to a
    manifest entry, the entry's string form is written to a
    ``PersistentTemporaryFile``, the file name is appended to
    ``self.temp_files``, and the attribute is set to that file name.
    Other references are left alone. The (modified) tree is returned.
    """
    if svg is None:
        svg = item.data
    manifest_hrefs = self.oeb.manifest.hrefs
    for node in xpath(svg, '//svg:*[@xl:href]'):
        attr = XLINK('href')
        target = urldefrag(urlnormalize(node.attrib[attr]))[0]
        if not target:
            continue
        resolved = item.abshref(target)
        if resolved not in manifest_hrefs:
            continue
        raw = str(manifest_hrefs[resolved])
        # what() is asked to identify the data; with no answer the
        # extension defaults to jpg.
        suffix = '.' + (what(None, raw) or 'jpg')
        with PersistentTemporaryFile(suffix=suffix) as tmp:
            tmp.write(raw)
            self.temp_files.append(tmp.name)
        node.attrib[attr] = tmp.name
    return svg
def replace_internal_links_with_placeholders(self):
    """Replace links that target spine items with ``kindle:pos:fid`` stubs.

    ``self.link_map`` is reset, then every ``<a href>`` in every spine item
    is visited and counted. A link whose fragment-less target (normalized
    when possible) equals the href of a spine item gets a placeholder href
    derived from the running count, and ``self.link_map`` stores
    ``placeholder -> (target href, fragment)``.
    """
    self.link_map = {}
    spine_hrefs = {item.href for item in self.oeb.spine}
    link_index = 0
    find_links = XPath('//h:a[@href]')
    for item in self.oeb.spine:
        for anchor in find_links(self.data(item)):
            # Counted for every link, whether or not it ends up replaced
            link_index += 1
            target, _, fragment = item.abshref(anchor.get('href')).partition('#')
            try:
                target = urlnormalize(target)
            except ValueError:
                # a non utf-8 quoted url? Since we cannot interpret it, pass it through.
                pass
            if target not in spine_hrefs:
                continue
            placeholder = 'kindle:pos:fid:0000:off:%s' % to_href(link_index)
            self.link_map[placeholder] = (target, fragment)
            anchor.set('href', placeholder)
def serialize_toc_level(tocref, href=None):
    # Write one level of the table of contents (tocref and its direct
    # child nodes) to the output stream as an HTML list.
    #
    # tocref: TOC node; its klass, title and nodes attributes are read.
    # href:   optional; when given, a page break is emitted and the current
    #         stream position is recorded in self.id_offsets under the
    #         normalized href (e.g. feed_0/index.html).
    #
    # ``buf`` and ``self`` are not parameters; they are taken from the
    # enclosing scope.
    if href is not None:
        # resolve the section url in id_offsets
        buf.write('<mbp:pagebreak />')
        self.id_offsets[urlnormalize(href)] = buf.tell()
    if tocref.klass == "periodical":
        # Top level of a periodical: no heading, only spacing
        buf.write('<div> <div height="1em"></div>')
    else:
        t = tocref.title
        # NOTE(review): a truncated note at this point credits "arroz" with
        # modifying this branch; what was removed is not stated.
        if isinstance(t, unicode):
            # The buffer receives byte strings, so encode unicode titles
            t = t.encode('utf-8')
        buf.write(
            '<div></div> <div> <h2 height="1em"><font size="+2"><b>')
        buf.write(t)
        buf.write('</b></font></h2> <div height="1em"></div>')
    buf.write('<ul>')
    for tocitem in tocref.nodes:
        buf.write('<li><a filepos=')
        itemhref = tocitem.href
        if tocref.klass == 'periodical':
            # This is a section node.
            # For periodical tocs, the section urls are like r'feed_\d+/index.html'
            # We don't want to point to the start of the first article
            # so we change the href.
            itemhref = re.sub(r'article_\d+/', '', itemhref)
        # Remember where the ten-digit filepos placeholder starts for this
        # href (presumably patched with the real offset later -- confirm
        # against the code that consumes href_offsets).
        self.href_offsets[itemhref].append(buf.tell())
        buf.write('0000000000')
        buf.write(' ><font size="+1"><b><u>')
        t = tocitem.title
        if isinstance(t, unicode):
            t = t.encode('utf-8')
        buf.write(t)
        buf.write('</u></b></font></a></li>')
    # Closes the <ul> and the <div> opened by either branch above
    buf.write('</ul><div height="1em"></div></div><mbp:pagebreak />')
def _toc_from_tour(self, opf):
    """Build the TOC from the first ``<tour>`` element of an OPF.

    Returns False when the OPF has no tour, True otherwise. The TOC title
    is taken from the tour, and each ``<site>`` that has both a title and an
    href becomes a TOC entry, provided its normalized, fragment-less path
    is in the manifest; sites pointing elsewhere are logged and skipped.
    """
    tours = xpath(opf, 'o2:tours/o2:tour')
    if not tours:
        return False
    self.log.debug('Reading TOC from tour...')
    first_tour = tours[0]
    toc = self.oeb.toc
    toc.title = first_tour.get('title')
    for site in xpath(first_tour, 'o2:site'):
        title = site.get('title')
        href = site.get('href')
        if not (title and href):
            continue
        # Only the path part is checked against the manifest; the entry
        # itself keeps the href exactly as written in the OPF.
        path = urldefrag(urlnormalize(href))[0]
        if path in self.oeb.manifest.hrefs:
            toc.add(title, href, id=site.get('id'))
        else:
            self.logger.warn('TOC reference %r not found' % href)
    return True