def __init__(self, oeb, opts):
    """Add an in-line (HTML) Table of Contents to the book for MOBI output.

    :param oeb: the OEBBook being converted (provides toc, guide, manifest,
        spine and log).
    :param opts: conversion options; reads toc_title, mobi_toc_at_start,
        no_inline_toc, extra_css and (optionally) mobi_passthrough.
    """
    self.oeb, self.opts, self.log = oeb, opts, oeb.log
    self.title = opts.toc_title or DEFAULT_TITLE
    self.at_start = opts.mobi_toc_at_start
    self.generated_item = None
    self.added_toc_guide_entry = False
    # A ToC with a single entry is useless; treat it as absent.
    self.has_toc = oeb.toc and oeb.toc.count() > 1

    if 'toc' in oeb.guide:
        # Remove spurious toc entry from guide if it is not in spine or it
        # does not have any hyperlinks
        href = urlnormalize(oeb.guide['toc'].href.partition('#')[0])
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if (hasattr(item.data, 'xpath') and
                    XPath('//h:a[@href]')(item.data)):
                # Existing inline ToC looks usable; make sure it is in the
                # spine and keep it instead of generating a new one.
                if oeb.spine.index(item) < 0:
                    oeb.spine.add(item, linear=False)
                return
            elif self.has_toc:
                oeb.guide.remove('toc')
        else:
            oeb.guide.remove('toc')

    if (not self.has_toc or 'toc' in oeb.guide or opts.no_inline_toc or
            getattr(opts, 'mobi_passthrough', False)):
        return

    self.log('\tGenerating in-line ToC')

    # Carry any embedded-font CSS into the generated ToC page so it renders
    # with the same fonts as the rest of the book.
    embed_css = ''
    s = getattr(oeb, 'store_embed_font_rules', None)
    if getattr(s, 'body_font_family', None):
        css = [x.cssText for x in s.rules] + [
            'body { font-family: %s }'%s.body_font_family]
        embed_css = '\n\n'.join(css)

    root = etree.fromstring(TEMPLATE.format(xhtmlns=XHTML_NS,
        title=self.title, embed_css=embed_css,
        extra_css=(opts.extra_css or '')))
    parent = XPath('//h:ul')(root)[0]
    parent.text = '\n\t'
    for child in self.oeb.toc:
        self.process_toc_node(child, parent)

    id, href = oeb.manifest.generate('contents', 'contents.xhtml')
    item = self.generated_item = oeb.manifest.add(id, href, XHTML_MIME,
            data=root)
    # Kindle convention: ToC at the end unless the user asked for it first.
    if self.at_start:
        oeb.spine.insert(0, item, linear=True)
    else:
        oeb.spine.add(item, linear=False)

    oeb.guide.add('toc', 'Table of Contents', href)
def __init__(self, oeb, opts):
    """Add an in-line (HTML) Table of Contents to the book for MOBI output.

    :param oeb: the OEBBook being converted (provides toc, guide, manifest,
        spine and log).
    :param opts: conversion options; reads toc_title, mobi_toc_at_start and
        no_inline_toc.
    """
    self.oeb, self.opts, self.log = oeb, opts, oeb.log
    self.title = opts.toc_title or DEFAULT_TITLE
    self.at_start = opts.mobi_toc_at_start
    self.generated_item = None
    self.added_toc_guide_entry = False
    # A ToC with a single entry is useless; treat it as absent.
    self.has_toc = oeb.toc and oeb.toc.count() > 1

    if 'toc' in oeb.guide:
        # Remove spurious toc entry from guide if it is not in spine or it
        # does not have any hyperlinks
        # FIX: strip any fragment before the manifest lookup; guide hrefs
        # like 'toc.html#start' would otherwise never match manifest.hrefs
        # and a perfectly valid ToC entry would be removed.
        href = urlnormalize(oeb.guide['toc'].href.partition('#')[0])
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if (hasattr(item.data, 'xpath') and
                    XPath('//h:a[@href]')(item.data)):
                # Existing inline ToC looks usable; make sure it is in the
                # spine and keep it instead of generating a new one.
                if oeb.spine.index(item) < 0:
                    oeb.spine.add(item, linear=False)
                return
            elif self.has_toc:
                oeb.guide.remove('toc')
        else:
            oeb.guide.remove('toc')

    if not self.has_toc or 'toc' in oeb.guide or opts.no_inline_toc:
        return

    self.log('\tGenerating in-line ToC')

    root = etree.fromstring(TEMPLATE.format(xhtmlns=XHTML_NS,
        title=self.title))
    parent = XPath('//h:ul')(root)[0]
    parent.text = '\n\t'
    for child in self.oeb.toc:
        self.process_toc_node(child, parent)

    id, href = oeb.manifest.generate('contents', 'contents.xhtml')
    item = self.generated_item = oeb.manifest.add(id, href, XHTML_MIME,
            data=root)
    # Kindle convention: ToC at the end unless the user asked for it first.
    if self.at_start:
        oeb.spine.insert(0, item, linear=True)
    else:
        oeb.spine.add(item, linear=False)

    oeb.guide.add('toc', 'Table of Contents', href)
def item_at_top(elem):
    """Return True if *elem* is effectively at the top of its document body.

    Walks the body's descendants in document order up to *elem*; if any
    earlier element is an image or contains non-whitespace text (or a
    non-ancestor has non-whitespace tail text), *elem* is not at the top.

    :param elem: an lxml element inside an XHTML tree.
    :return: bool; False also when the tree has no usable body.
    """
    try:
        body = XPath('//h:body')(elem.getroottree().getroot())[0]
    except (TypeError, IndexError, KeyError, AttributeError):
        return False
    tree = body.getroottree()
    path = tree.getpath(elem)
    for el in body.iterdescendants(etree.Element):
        epath = tree.getpath(el)
        if epath == path:
            # Reached elem itself: nothing visible precedes it.
            break
        try:
            if el.tag.endswith('}img') or (el.text and el.text.strip()):
                return False
        except Exception:
            # FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; narrow to Exception. Odd tag
            # objects (comments/PIs) can raise here, so stay conservative.
            return False
        if not path.startswith(epath):
            # Only check tail of non-parent elements
            if el.tail and el.tail.strip():
                return False
    return True
def frag_is_at_top(root, frag):
    """Return True if the anchor named *frag* sits at the top of the body.

    Scans the body's descendants in document order until the element whose
    id/name equals *frag* is reached; any preceding non-whitespace text (or
    tail text on a non-ancestor) means the anchor is not at the top.
    """
    bodies = XPath('//h:body')(root)
    if not bodies:
        return False
    body = bodies[0]

    targets = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
    if not targets:
        return False
    target = targets[0]

    tree = body.getroottree()
    target_path = tree.getpath(target)

    for node in body.iterdescendants():
        node_path = tree.getpath(node)
        if node_path == target_path:
            # Reached the anchor without seeing any content first.
            break
        if node.text and node.text.strip():
            return False
        # Only check tail of non-parent elements
        is_ancestor = target_path.startswith(node_path)
        if not is_ancestor and node.tail and node.tail.strip():
            return False
    return True
class OEBReader(object):
    """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""

    # Pre-compiled XPaths used when hunting for a cover inside an HTML page.
    COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')

    Container = DirContainer
    """Container type used to access book files.  Override in sub-classes."""

    DEFAULT_PROFILE = 'PRS505'
    """Default renderer profile for content read with this Reader."""

    TRANSFORMS = []
    """List of transforms to apply to content read with this Reader."""

    @classmethod
    def config(cls, cfg):
        """Add any book-reading options to the :class:`Config` object
        :param:`cfg`.
        """
        return

    @classmethod
    def generate(cls, opts):
        """Generate a Reader instance from command-line options."""
        return cls()

    def __call__(self, oeb, path):
        """Read the book at :param:`path` into the :class:`OEBBook` object
        :param:`oeb`.
        """
        self.oeb = oeb
        self.logger = self.log = oeb.logger
        oeb.container = self.Container(path, self.logger)
        oeb.container.log = oeb.log
        opf = self._read_opf()
        self._all_from_opf(opf)
        return oeb

    def _clean_opf(self, opf):
        # Normalize an OPF 1.x/2.0 tree into a canonical OPF 2.0 tree:
        # move everything into the OPF2 namespace, rewrite Dublin Core
        # elements to the DC 1.1 namespace and flatten dc-/x-metadata.
        nsmap = {}
        for elem in opf.iter(tag=etree.Element):
            nsmap.update(elem.nsmap)
        for elem in opf.iter(tag=etree.Element):
            if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag):
                elem.tag = OPF(barename(elem.tag))
        nsmap.update(OPF2_NSMAP)
        attrib = dict(opf.attrib)
        nroot = etree.Element(OPF('package'),
            nsmap={None: OPF2_NS}, attrib=attrib)
        metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
        ignored = (OPF('dc-metadata'), OPF('x-metadata'))
        for elem in xpath(opf, 'o2:metadata//*'):
            if elem.tag in ignored:
                continue
            if namespace(elem.tag) in DC_NSES:
                tag = barename(elem.tag).lower()
                elem.tag = '{%s}%s' % (DC11_NS, tag)
            if elem.tag.startswith('dc:'):
                tag = elem.tag.partition(':')[-1].lower()
                elem.tag = '{%s}%s' % (DC11_NS, tag)
            metadata.append(elem)
        for element in xpath(opf, 'o2:metadata//o2:meta'):
            metadata.append(element)
        for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
            for element in xpath(opf, tag):
                nroot.append(element)
        return nroot

    def _read_opf(self):
        # Read and parse the OPF, applying progressively more aggressive
        # repairs on XMLSyntaxError: entity replacement, dropping broken
        # <tours>, declaring the dc: prefix, and finally a recovering parser.
        data = self.oeb.container.read(None)
        data = self.oeb.decode(data)
        data = XMLDECL_RE.sub('', data)
        data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                OPF1_NS, data)
        try:
            opf = etree.fromstring(data)
        except etree.XMLSyntaxError:
            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
            try:
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid HTML named entities')
            except etree.XMLSyntaxError:
                data = re.sub(r'(?is)<tours>.+</tours>', '', data)
                data = data.replace('<dc-metadata>',
                    '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
                try:
                    opf = etree.fromstring(data)
                    self.logger.warn('OPF contains invalid tours section')
                except etree.XMLSyntaxError:
                    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
                    opf = etree.fromstring(data, parser=RECOVER_PARSER)
                    self.logger.warn('OPF contains invalid markup, trying to parse it anyway')
        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
            raise OEBError('Invalid namespace %r for OPF document' % ns)
        opf = self._clean_opf(opf)
        return opf

    def _metadata_from_opf(self, opf):
        # Convert OPF metadata into oeb.metadata, filling in defaults for
        # language, book producer, uuid identifier, title and author.
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        # NOTE(review): cStringIO is Python-2 only; rest of the file mixes
        # py2/py3 compatibility helpers — confirm target interpreter.
        stream = cStringIO.StringIO(etree.tostring(opf, xml_declaration=True, encoding='utf-8'))
        o = OPF(stream)
        pwm = o.primary_writing_mode
        if pwm:
            self.oeb.metadata.primary_writing_mode = pwm
        mi = o.to_book_metadata()
        if not mi.language:
            mi.language = get_lang().replace('_', '-')
        self.oeb.metadata.add('language', mi.language)
        if not mi.book_producer:
            mi.book_producer = '%(a)s (%(v)s) [http://%(a)s-ebook.com]'%\
                dict(a=__appname__, v=__version__)
        meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)
        m = self.oeb.metadata
        m.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid')
        self.oeb.uid = self.oeb.metadata.identifier[-1]
        if not m.title:
            m.add('title', self.oeb.translate(__('Unknown')))
        has_aut = False
        for x in m.creator:
            if getattr(x, 'role', '').lower() in ('', 'aut'):
                has_aut = True
                break
        if not has_aut:
            m.add('creator', self.oeb.translate(__('Unknown')), role='aut')

    def _manifest_prune_invalid(self):
        '''
        Remove items from manifest that contain invalid data. This prevents
        catastrophic conversion failure, when a few files contain corrupted
        data.
        '''
        bad = []
        check = OEB_DOCS.union(OEB_STYLES)
        for item in list(self.oeb.manifest.values()):
            if item.media_type in check:
                try:
                    # Accessing .data triggers parsing; failures mean the
                    # file is corrupt and must be dropped.
                    item.data
                except KeyboardInterrupt:
                    raise
                except:
                    # NOTE(review): bare except (pre-py3 style); anything
                    # other than KeyboardInterrupt marks the item bad.
                    self.logger.exception('Failed to parse content in %s'%
                            item.href)
                    bad.append(item)
                    self.oeb.manifest.remove(item)
        return bad

    def _manifest_add_missing(self, invalid):
        # Walk all parseable manifest items, collect hrefs they reference,
        # and add any file present in the container but absent from the
        # manifest.  Iterates until no new items are discovered.
        import css_parser
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = OEB_DOCS|OEB_STYLES
        # NOTE(review): the `invalid` parameter is immediately shadowed by
        # a fresh set here, so the argument is effectively unused.
        invalid = set()
        while unchecked:
            new = set()
            for item in unchecked:
                data = None
                if (item.media_type in cdoc or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    try:
                        data = item.data
                    except:
                        self.oeb.log.exception(u'Failed to read from manifest '
                                u'entry with id: %s, ignoring'%item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue

                if (item.media_type in OEB_DOCS or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in iterlinks(data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urldefrag(href)
                        if not href:
                            continue
                        try:
                            href = item.abshref(urlnormalize(href))
                            scheme = urlparse(href).scheme
                        except:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r'%href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in OEB_STYLES:
                    try:
                        urls = list(css_parser.getUrls(data))
                    except:
                        urls = []
                    for url in urls:
                        href, _ = urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                        if not scheme and href not in known:
                            new.add(href)
            unchecked.clear()
            warned = set([])
            for href in new:
                known.add(href)
                is_invalid = False
                for item in invalid:
                    if href == item.abshref(urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
                    continue
                if not self.oeb.container.exists(href):
                    if href not in warned:
                        self.logger.warn('Referenced file %r not found' %
                                href)
                        warned.add(href)
                    continue
                if href not in warned:
                    self.logger.warn('Referenced file %r not in manifest' %
                            href)
                    warned.add(href)
                id, _ = manifest.generate(id='added')
                guessed = guess_type(href)[0]
                media_type = guessed or BINARY_MIME
                added = manifest.add(id, href, media_type)
                unchecked.add(added)
        for item in invalid:
            self.oeb.manifest.remove(item)

    def _manifest_from_opf(self, opf):
        # Populate the manifest from <item> elements, guessing/normalizing
        # media types and skipping duplicates and missing files.
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
            id = elem.get('id')
            href = elem.get('href')
            media_type = elem.get('media-type', None)
            if media_type is None:
                media_type = elem.get('mediatype', None)
            if not media_type or media_type == 'text/xml':
                guessed = guess_type(href)[0]
                media_type = guessed or media_type or BINARY_MIME
            if hasattr(media_type, 'lower'):
                media_type = media_type.lower()
            fallback = elem.get('fallback')
            if href in manifest.hrefs:
                self.logger.warn(u'Duplicate manifest entry for %r' % href)
                continue
            if not self.oeb.container.exists(href):
                self.logger.warn(u'Manifest item %r not found' % href)
                continue
            if id in manifest.ids:
                self.logger.warn(u'Duplicate manifest id %r' % id)
                id, href = manifest.generate(id, href)
            manifest.add(id, href, media_type, fallback)
        invalid = self._manifest_prune_invalid()
        self._manifest_add_missing(invalid)

    def _spine_add_extra(self):
        # Transitively add documents reachable via hyperlinks from spine
        # items but not themselves in the spine (as non-linear items).
        manifest = self.oeb.manifest
        spine = self.oeb.spine
        unchecked = set(spine)
        selector = XPath('h:body//h:a/@href')
        extras = set()
        while unchecked:
            new = set()
            for item in unchecked:
                if item.media_type not in OEB_DOCS:
                    # TODO: handle fallback chains
                    continue
                for href in selector(item.data):
                    href, _ = urldefrag(href)
                    if not href:
                        continue
                    try:
                        href = item.abshref(urlnormalize(href))
                    except ValueError:  # Malformed URL
                        continue
                    if href not in manifest.hrefs:
                        continue
                    found = manifest.hrefs[href]
                    if found.media_type not in OEB_DOCS or \
                       found in spine or found in extras:
                        continue
                    new.add(found)
            extras.update(new)
            unchecked = new
        version = int(self.oeb.version[0])
        removed_items_to_ignore = getattr(self.oeb,
                'removed_items_to_ignore', ())
        for item in sorted(extras):
            if item.href in removed_items_to_ignore:
                continue
            if version >= 2:
                # OPF 2 requires all linked documents to be in the spine.
                self.logger.warn(
                    'Spine-referenced file %r not in spine' % item.href)
            spine.add(item, linear=False)

    def _spine_from_opf(self, opf):
        # Build the spine from <itemref> elements, repairing items whose
        # declared media type disagrees with their parsed content.
        spine = self.oeb.spine
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            idref = elem.get('idref')
            if idref not in manifest.ids:
                self.logger.warn(u'Spine item %r not found' % idref)
                continue
            item = manifest.ids[idref]
            if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath') and not getattr(item.data, 'tag', '').endswith('}ncx'):
                spine.add(item, elem.get('linear'))
            else:
                if hasattr(item.data, 'tag') and item.data.tag and item.data.tag.endswith('}html'):
                    # Mislabelled media type but the content really is HTML.
                    item.media_type = XHTML_MIME
                    spine.add(item, elem.get('linear'))
                else:
                    self.oeb.log.warn('The item %s is not a XML document.'
                        ' Removing it from spine.'%item.href)
        if len(spine) == 0:
            raise OEBError("Spine is empty")
        self._spine_add_extra()
        for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'):
            if val in {'ltr', 'rtl'}:
                spine.page_progression_direction = val

    def _guide_from_opf(self, opf):
        # Populate the guide from <reference> elements, tolerating hrefs
        # that differ from the manifest only by letter case.
        guide = self.oeb.guide
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
            ref_href = elem.get('href')
            path = urlnormalize(urldefrag(ref_href)[0])
            if path not in manifest.hrefs:
                corrected_href = None
                for href in manifest.hrefs:
                    if href.lower() == path.lower():
                        corrected_href = href
                        break
                if corrected_href is None:
                    self.logger.warn(u'Guide reference %r not found' % ref_href)
                    continue
                ref_href = corrected_href
            typ = elem.get('type')
            if typ not in guide:
                guide.add(typ, elem.get('title'), ref_href)

    def _find_ncx(self, opf):
        # Locate the NCX item (via the spine's toc attribute, else by media
        # type), remove it from the manifest and return it (or None).
        result = xpath(opf, '/o2:package/o2:spine/@toc')
        if result:
            id = result[0]
            if id not in self.oeb.manifest.ids:
                return None
            item = self.oeb.manifest.ids[id]
            self.oeb.manifest.remove(item)
            return item
        for item in self.oeb.manifest.values():
            if item.media_type == NCX_MIME:
                self.oeb.manifest.remove(item)
                return item
        return None

    def _toc_from_navpoint(self, item, toc, navpoint):
        # Recursively convert one NCX navPoint subtree into ToC nodes,
        # extracting calibre-specific author/description/thumbnail metadata.
        children = xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            href = xpath(child, 'ncx:content/@src')
            if not title:
                # Untitled node: splice its children directly into `toc`.
                self._toc_from_navpoint(item, toc, child)
                continue
            if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'):
                # This node is useless
                continue
            href = item.abshref(urlnormalize(href[0])) if href and href[0] else ''
            path, _ = urldefrag(href)
            if path and path not in self.oeb.manifest.hrefs:
                path = urlnormalize(path)
            if href and path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                gc = xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
            id = child.get('id')
            klass = child.get('class', 'chapter')

            try:
                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
            except:
                po = self.oeb.toc.next_play_order()

            authorElement = xpath(child,
                    'descendant::calibre:meta[@name = "author"]')
            if authorElement:
                author = authorElement[0].text
            else:
                author = None

            descriptionElement = xpath(child,
                    'descendant::calibre:meta[@name = "description"]')
            if descriptionElement:
                description = etree.tostring(descriptionElement[0],
                method='text', encoding=unicode_type).strip()
                if not description:
                    description = None
            else:
                description = None

            index_image = xpath(child,
                    'descendant::calibre:meta[@name = "toc_thumbnail"]')
            toc_thumbnail = (index_image[0].text if index_image else None)
            if not toc_thumbnail or not toc_thumbnail.strip():
                toc_thumbnail = None

            node = toc.add(title, href, id=id, klass=klass,
                    play_order=po, description=description, author=author,
                    toc_thumbnail=toc_thumbnail)

            self._toc_from_navpoint(item, node, child)

    def _toc_from_ncx(self, item):
        # Build the ToC from an NCX document; returns True on success.
        if (item is None) or (item.data is None):
            return False
        self.log.debug('Reading TOC from NCX...')
        ncx = item.data
        title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
        title = title or unicode_type(self.oeb.metadata.title[0])
        toc = self.oeb.toc
        toc.title = title
        navmaps = xpath(ncx, 'ncx:navMap')
        for navmap in navmaps:
            self._toc_from_navpoint(item, toc, navmap)
        return True

    def _toc_from_tour(self, opf):
        # Build the ToC from an OEB 1.x <tours> section; True on success.
        result = xpath(opf, 'o2:tours/o2:tour')
        if not result:
            return False
        self.log.debug('Reading TOC from tour...')
        tour = result[0]
        toc = self.oeb.toc
        toc.title = tour.get('title')
        sites = xpath(tour, 'o2:site')
        for site in sites:
            title = site.get('title')
            href = site.get('href')
            if not title or not href:
                continue
            path, _ = urldefrag(urlnormalize(href))
            if path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                continue
            id = site.get('id')
            toc.add(title, href, id=id)
        return True

    def _toc_from_html(self, opf):
        # Build a flat ToC from hyperlinks in the guide's 'toc' HTML page;
        # returns True on success.
        if 'toc' not in self.oeb.guide:
            return False
        self.log.debug('Reading TOC from HTML...')
        itempath, frag = urldefrag(self.oeb.guide['toc'].href)
        item = self.oeb.manifest.hrefs[itempath]
        html = item.data
        if frag:
            elems = xpath(html, './/*[@id="%s"]' % frag)
            if not elems:
                elems = xpath(html, './/*[@name="%s"]' % frag)
            elem = elems[0] if elems else html
            # Climb until a subtree containing at least one link is found.
            while elem != html and not xpath(elem, './/h:a[@href]'):
                elem = elem.getparent()
            html = elem
        titles = defaultdict(list)
        order = []
        for anchor in xpath(html, './/h:a[@href]'):
            href = anchor.attrib['href']
            href = item.abshref(urlnormalize(href))
            path, frag = urldefrag(href)
            if path not in self.oeb.manifest.hrefs:
                continue
            title = xml2text(anchor)
            title = COLLAPSE_RE.sub(' ', title.strip())
            if href not in titles:
                order.append(href)
            titles[href].append(title)
        toc = self.oeb.toc
        for href in order:
            toc.add(' '.join(titles[href]), href)
        return True

    def _toc_from_spine(self, opf):
        # Fallback: synthesize a ToC from spine item titles (or first
        # headings when titles are not unique).
        self.log.warn('Generating default TOC from spine...')
        toc = self.oeb.toc
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = COLLAPSE_RE.sub(' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        # NOTE(review): `titles` only gets entries for items with a
        # non-empty <title>, while `headers` gets one per item — the zip
        # below can misalign titles with spine items when some titles are
        # empty.  Confirm against upstream before changing.
        for title, item in izip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)
        return True

    def _toc_from_opf(self, opf, item):
        # Try ToC sources in preference order: NCX, HTML guide, tour, spine.
        self.oeb.auto_generated_toc = False
        if self._toc_from_ncx(item):
            return
        # Prefer HTML to tour based TOC, since several LIT files
        # have good HTML TOCs but bad tour based TOCs
        if self._toc_from_html(opf):
            return
        if self._toc_from_tour(opf):
            return
        self._toc_from_spine(opf)
        self.oeb.auto_generated_toc = True

    def _pages_from_ncx(self, opf, item):
        # Read the page map from the NCX pageList; True on success.
        if item is None:
            return False
        ncx = item.data
        if ncx is None:
            return False
        ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
        if not ptargets:
            return False
        pages = self.oeb.pages
        for ptarget in ptargets:
            name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
            name = COLLAPSE_RE.sub(' ', name.strip())
            href = xpath(ptarget, 'ncx:content/@src')
            if not href:
                continue
            href = item.abshref(urlnormalize(href[0]))
            id = ptarget.get('id')
            type = ptarget.get('type', 'normal')
            klass = ptarget.get('class')
            pages.add(name, href, type=type, id=id, klass=klass)
        return True

    def _find_page_map(self, opf):
        # Locate an Adobe page-map item (spine attribute or media type),
        # remove it from the manifest and return it (or None).
        result = xpath(opf, '/o2:package/o2:spine/@page-map')
        if result:
            id = result[0]
            if id not in self.oeb.manifest.ids:
                return None
            item = self.oeb.manifest.ids[id]
            self.oeb.manifest.remove(item)
            return item
        for item in self.oeb.manifest.values():
            if item.media_type == PAGE_MAP_MIME:
                self.oeb.manifest.remove(item)
                return item
        return None

    def _pages_from_page_map(self, opf):
        # Read pages from an Adobe page-map document; True on success.
        item = self._find_page_map(opf)
        if item is None:
            return False
        pmap = item.data
        pages = self.oeb.pages
        for page in xpath(pmap, 'o2:page'):
            name = page.get('name', '')
            href = page.get('href')
            if not href:
                continue
            name = COLLAPSE_RE.sub(' ', name.strip())
            href = item.abshref(urlnormalize(href))
            type = 'normal'
            if not name:
                type = 'special'
            elif name.lower().strip('ivxlcdm') == '':
                # Purely roman-numeral page names mark front matter.
                type = 'front'
            pages.add(name, href, type=type)
        return True

    def _pages_from_opf(self, opf, item):
        # Page map sources in preference order: NCX pageList, page-map doc.
        if self._pages_from_ncx(opf, item):
            return
        if self._pages_from_page_map(opf):
            return
        return

    def _cover_from_html(self, hcover):
        # Render the HTML cover page to a raster image and add it to the
        # manifest as cover.jpg.
        from calibre.ebooks import render_html_svg_workaround
        with TemporaryDirectory('_html_cover') as tdir:
            writer = OEBWriter()
            writer(self.oeb, tdir)
            path = os.path.join(tdir, urlunquote(hcover.href))
            data = render_html_svg_workaround(path, self.logger)
            if not data:
                data = ''
        id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
        item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
        return item

    def _locate_cover_image(self):
        # Find the cover image: metadata @id, guide entries, embedded SVG
        # or <object>, finally by rendering the first HTML page.
        if self.oeb.metadata.cover:
            id = unicode_type(self.oeb.metadata.cover[0])
            item = self.oeb.manifest.ids.get(id, None)
            if item is not None and item.media_type in OEB_IMAGES:
                return item
            else:
                self.logger.warn('Invalid cover image @id %r' % id)
        hcover = self.oeb.spine[0]
        if 'cover' in self.oeb.guide:
            href = self.oeb.guide['cover'].href
            item = self.oeb.manifest.hrefs[href]
            media_type = item.media_type
            if media_type in OEB_IMAGES:
                return item
            elif media_type in OEB_DOCS:
                hcover = item
        html = hcover.data
        if MS_COVER_TYPE in self.oeb.guide:
            href = self.oeb.guide[MS_COVER_TYPE].href
            item = self.oeb.manifest.hrefs.get(href, None)
            if item is not None and item.media_type in OEB_IMAGES:
                return item
        if self.COVER_SVG_XP(html):
            svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
            href = os.path.splitext(hcover.href)[0] + '.svg'
            id, href = self.oeb.manifest.generate(hcover.id, href)
            item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg)
            return item
        if self.COVER_OBJECT_XP(html):
            object = self.COVER_OBJECT_XP(html)[0]
            href = hcover.abshref(object.get('data'))
            item = self.oeb.manifest.hrefs.get(href, None)
            if item is not None and item.media_type in OEB_IMAGES:
                return item
        return self._cover_from_html(hcover)

    def _ensure_cover_image(self):
        # Guarantee metadata has a cover entry pointing at a real image.
        cover = self._locate_cover_image()
        if self.oeb.metadata.cover:
            self.oeb.metadata.cover[0].value = cover.id
            return
        self.oeb.metadata.add('cover', cover.id)

    def _manifest_remove_duplicates(self):
        # Drop manifest items whose href duplicates another item's, keeping
        # any copy that is referenced from the spine.
        seen = set()
        dups = set()
        for item in self.oeb.manifest:
            if item.href in seen:
                dups.add(item.href)
            seen.add(item.href)
        for href in dups:
            items = [x for x in self.oeb.manifest if x.href == href]
            for x in items:
                if x not in self.oeb.spine:
                    self.oeb.log.warn('Removing duplicate manifest item with id:', x.id)
                    self.oeb.manifest.remove_duplicate_item(x)

    def _all_from_opf(self, opf):
        # Orchestrate the full read: metadata, manifest, spine, guide,
        # NCX-based ToC and page maps, in dependency order.
        self.oeb.version = opf.get('version', '1.2')
        self._metadata_from_opf(opf)
        self._manifest_from_opf(opf)
        self._spine_from_opf(opf)
        self._manifest_remove_duplicates()
        self._guide_from_opf(opf)
        item = self._find_ncx(opf)
        self._toc_from_opf(opf, item)
        self._pages_from_opf(opf, item)
def css_data(container, book_locale, result_data, *args):
    """Build a report of CSS rule usage across the book.

    Parses every stylesheet and inline <style> in the container, matches
    each rule's selector against the spine documents, and records per-class
    usage in ``result_data['classes']``.

    :param container: the book container (mime_map, spine_names, parsed, ...).
    :param book_locale: unused here; kept for the reporting-API signature.
    :param result_data: dict; gains a 'classes' key (list of ClassEntry).
    :return: list of CSSEntry, one per rule, with match locations and counts.
    """
    import tinycss
    from tinycss.css21 import RuleSet, ImportRule

    def css_rules(file_name, rules, sourceline=0):
        # Flatten parsed rules into CSSRule records; @import entries are
        # represented by the imported sheet's name for later resolution.
        ans = []
        for rule in rules:
            if isinstance(rule, RuleSet):
                selector = rule.selector.as_css()
                ans.append(CSSRule(selector,
                    RuleLocation(file_name, sourceline + rule.line, rule.column)))
            elif isinstance(rule, ImportRule):
                import_name = safe_href_to_name(container, rule.uri, file_name)
                if import_name and container.exists(import_name):
                    ans.append(import_name)
            elif getattr(rule, 'rules', False):
                # Nested rules (e.g. inside @media).
                ans.extend(css_rules(file_name, rule.rules, sourceline))
        return ans

    parser = tinycss.make_full_parser()
    importable_sheets = {}
    html_sheets = {}
    spine_names = {name for name, is_linear in container.spine_names}
    style_path, link_path = XPath('//h:style'), XPath('//h:link/@href')
    for name, mt in iteritems(container.mime_map):
        if mt in OEB_STYLES:
            importable_sheets[name] = css_rules(name,
                parser.parse_stylesheet(container.raw_data(name)).rules)
        elif mt in OEB_DOCS and name in spine_names:
            html_sheets[name] = []
            for style in style_path(container.parsed(name)):
                if style.get('type', 'text/css') == 'text/css' and style.text:
                    html_sheets[name].append(css_rules(
                        name,
                        parser.parse_stylesheet(
                            force_unicode(style.text, 'utf-8')).rules,
                        style.sourceline - 1))

    rule_map = defaultdict(lambda: defaultdict(list))

    def rules_in_sheet(sheet):
        for rule in sheet:
            if isinstance(rule, CSSRule):
                yield rule
            else:  # @import rule
                isheet = importable_sheets.get(rule)
                if isheet is not None:
                    yield from rules_in_sheet(isheet)

    def sheets_for_html(name, root):
        # Yield the external sheets linked from this document.
        for href in link_path(root):
            tname = safe_href_to_name(container, href, name)
            sheet = importable_sheets.get(tname)
            if sheet is not None:
                yield sheet

    tt_cache = {}

    def tag_text(elem):
        # Human-readable rendering of an element's start tag; attribute-less
        # renderings are cached.
        ans = tt_cache.get(elem)
        if ans is None:
            tag = elem.tag.rpartition('}')[-1]
            if elem.attrib:
                attribs = ' '.join('{}="{}"'.format(
                    k, prepare_string_for_xml(elem.get(k, ''), True))
                    for k in elem.keys())
                return f'<{tag} {attribs}>'
            ans = tt_cache[elem] = '<%s>' % tag
        # FIX: this return was missing, so cache hits and attribute-less
        # elements yielded None instead of the '<tag>' text.
        return ans

    def matches_for_selector(selector, select, class_map, rule):
        # Return MatchLocations for selector in this doc, and credit the
        # rule to every class name the selector mentions on matched
        # elements (and their ancestors).
        lsel = selector.lower()
        try:
            matches = tuple(select(selector))
        except SelectorError:
            return ()
        seen = set()

        def get_elem_and_ancestors(elem):
            p = elem
            while p is not None:
                if p not in seen:
                    yield p
                    seen.add(p)
                p = p.getparent()

        for e in matches:
            for elem in get_elem_and_ancestors(e):
                for cls in elem.get('class', '').split():
                    if '.' + cls.lower() in lsel:
                        class_map[cls][elem].append(rule)

        return (MatchLocation(tag_text(elem), elem.sourceline)
                for elem in matches)

    class_map = defaultdict(lambda: defaultdict(list))

    for name, inline_sheets in iteritems(html_sheets):
        root = container.parsed(name)
        cmap = defaultdict(lambda: defaultdict(list))
        # Seed with every class used in the document so unused classes
        # still show up (with an empty rule list).
        for elem in root.xpath('//*[@class]'):
            for cls in elem.get('class', '').split():
                cmap[cls][elem] = []
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        for sheet in chain(sheets_for_html(name, root), inline_sheets):
            for rule in rules_in_sheet(sheet):
                rule_map[rule][name].extend(
                    matches_for_selector(rule.selector, select, cmap, rule))
        for cls, elem_map in iteritems(cmap):
            class_elements = class_map[cls][name]
            for elem, usage in iteritems(elem_map):
                class_elements.append(ClassElement(
                    name, elem.sourceline, elem.get('class'),
                    tag_text(elem), tuple(usage)))

    # First pass of `ans` is captured into result_data['classes'] ...
    result_data['classes'] = ans = []
    for cls, name_map in iteritems(class_map):
        la = tuple(
            ClassFileMatch(name, tuple(class_elements), numeric_sort_key(name))
            for name, class_elements in iteritems(name_map) if class_elements)
        num_of_matches = sum(
            sum(len(ce.matched_rules) for ce in cfm.class_elements)
            for cfm in la)
        ans.append(ClassEntry(cls, num_of_matches, la, numeric_sort_key(cls)))

    # ... then `ans` is rebound to collect the per-rule CSS entries that
    # this function returns.
    ans = []
    for rule, loc_map in iteritems(rule_map):
        la = tuple(
            CSSFileMatch(name, tuple(locations), numeric_sort_key(name))
            for name, locations in iteritems(loc_map) if locations)
        count = sum(len(fm.locations) for fm in la)
        ans.append(CSSEntry(rule, count, la, numeric_sort_key(rule.selector)))

    return ans
def extract_css_into_flows(self):
    """Pull all CSS (external sheets and inline <style> tags) into KF8
    flows, replacing references with kindle:flow: URLs.

    External stylesheets become flows directly; inline styles are
    de-duplicated by raw text and replaced by <link> elements pointing at
    the shared flow.  @import rules are rewritten to flow URLs too.
    """
    inlines = defaultdict(list)  # Ensure identical <style>s not repeated
    sheets = {}

    # Turn every manifest stylesheet into a flow, remembering its index.
    for item in self.oeb.manifest:
        if item.media_type in OEB_STYLES:
            sheet = self.data(item)
            if not self.opts.expand_css and hasattr(item.data, 'cssText'):
                condense_sheet(sheet)
            sheets[item.href] = len(self.flows)
            self.flows.append(sheet)

    def fix_import_rules(sheet):
        # NOTE: closes over the loop variable `item` of whichever loop is
        # active when it is called — the @import href is resolved relative
        # to that item.
        changed = False
        for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
            if rule.href:
                href = item.abshref(rule.href)
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    rule.href = 'kindle:flow:%s?mime=text/css' % idx
                    changed = True
        return changed

    for item in self.oeb.spine:
        root = self.data(item)

        # Rewrite <link href> to point at the stylesheet's flow.
        for link in XPath('//h:link[@href]')(root):
            href = item.abshref(link.get('href'))
            idx = sheets.get(href, None)
            if idx is not None:
                idx = to_ref(idx)
                link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

        # Replace each non-empty <style> with a placeholder <link>; the
        # actual flow index is assigned after de-duplication below.
        for tag in XPath('//h:style')(root):
            p = tag.getparent()
            idx = p.index(tag)
            raw = tag.text
            if not raw or not raw.strip():
                extract(tag)
                continue
            sheet = cssutils.parseString(raw, validate=False)
            if fix_import_rules(sheet):
                raw = force_unicode(sheet.cssText, 'utf-8')

            repl = etree.Element(XHTML('link'), type='text/css',
                    rel='stylesheet')
            repl.tail = '\n'
            p.insert(idx, repl)
            extract(tag)
            inlines[raw].append(repl)

    # FIX: was `inlines.iteritems()`, which only exists on Python 2 dicts
    # and raises AttributeError on Python 3; .items() works on both.
    for raw, elems in inlines.items():
        idx = to_ref(len(self.flows))
        self.flows.append(raw)
        for link in elems:
            link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

    # Second pass so @imports between external sheets are also rewritten.
    for item in self.oeb.manifest:
        if item.media_type in OEB_STYLES:
            sheet = self.data(item)
            if hasattr(sheet, 'cssRules'):
                fix_import_rules(sheet)

    # Serialize any parsed sheets still stored as objects.
    for i, sheet in enumerate(tuple(self.flows)):
        if hasattr(sheet, 'cssText'):
            self.flows[i] = force_unicode(sheet.cssText, 'utf-8')
def read_inline_toc(self, href, frag):
    """Parse an in-book (HTML) Table of Contents into a TOC tree.

    :param href: path (using '/' separators) of the ToC HTML file.
    :param frag: optional fragment id at which the ToC content starts.
    :return: a TOC instance whose nesting mirrors the HTML nesting depth
        of the <a> elements.
    """
    ans = TOC()
    base_href = '/'.join(href.split('/')[:-1])
    with open(href.replace('/', os.sep), 'rb') as f:
        raw = f.read().decode(self.header.codec)
    root = parse_html(raw, log=self.log)
    body = XPath('//h:body')(root)
    # `reached` flips True once iteration passes the start element; with no
    # body at all, every element counts from the beginning.
    reached = False
    if body:
        start = body[0]
    else:
        start = None
        reached = True
    if frag:
        elems = XPath('//*[@id="%s"]'%frag)(root)
        if elems:
            start = elems[0]

    def node_depth(elem):
        # Number of ancestors between elem and the tree root.
        ans = 0
        parent = elem.getparent()
        while parent is not None:
            parent = parent.getparent()
            ans += 1
        return ans

    # Layer the ToC based on nesting order in the source HTML
    current_depth = None
    parent = ans
    seen = set()
    links = []
    # First pass: collect unique (text, href, frag) links after the start
    # point, together with their HTML nesting depth.
    for elem in root.iterdescendants(etree.Element):
        if reached and elem.tag == XHTML('a') and elem.get('href', False):
            href = elem.get('href')
            href, frag = urldefrag(href)
            href = base_href + '/' + href
            text = xml2text(elem).strip()
            if (text, href, frag) in seen:
                continue
            seen.add((text, href, frag))
            links.append((text, href, frag, node_depth(elem)))
        elif elem is start:
            reached = True

    # Second pass: map raw depths onto consecutive levels, then walk the
    # links keeping a cursor (`parent`, `current_depth`) into the tree.
    depths = sorted(set(x[-1] for x in links))
    depth_map = {x:i for i, x in enumerate(depths)}
    for text, href, frag, depth in links:
        depth = depth_map[depth]
        if current_depth is None:
            # First link establishes level 0.
            current_depth = 0
            parent.add_item(href, frag, text)
        elif current_depth == depth:
            parent.add_item(href, frag, text)
        elif current_depth < depth:
            # Deeper link: descend into the last-added node.  Note the
            # cursor moves down only one level regardless of the jump size.
            parent = parent[-1] if len(parent) > 0 else parent
            parent.add_item(href, frag, text)
            current_depth += 1
        else:
            # Shallower link: climb back up the difference.
            delta = current_depth - depth
            while delta > 0 and parent.parent is not None:
                parent = parent.parent
                delta -= 1
            parent.add_item(href, frag, text)
            current_depth = depth
    return ans
def merge_html(container, names, master):
    """Merge the HTML files in ``names`` into ``master``, in place.

    Bodies of the merged files are appended to master's last <body>; their
    stylesheets are linked from master's <head>; clashing anchor ids are
    renamed and all links across the container are fixed up afterwards.
    """
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    # Per-file map of old anchor id -> renamed id ('' maps to the file's
    # insertion point in master)
    anchor_map = {n: {} for n in names if n != master}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'), rel='stylesheet',
                                        type='text/css',
                                        href=container.name_to_href(sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        children = []
        # Collect body leading text and all body children; whitespace-only
        # leading text is replaced by a paragraph separator
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        first_child = ''
        for first_child in children:
            if not isinstance(first_child, basestring):
                break
        if isinstance(first_child, basestring):
            # body contained only text, no tags
            first_child = body.makeelement(XHTML('p'))
            first_child.text, children[0] = children[0], first_child

        amap = anchor_map[name]
        remove_name_attributes(root)

        # Rename ids that clash with anchors already present in master
        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
            else:
                seen_anchors.add(val)

        # Guarantee an id on the first merged element, so links that pointed
        # at the file itself (no fragment) can be redirected to it
        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        for child in children:
            if isinstance(child, basestring):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in container.mime_map.iteritems():
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)
def virtualize_resources(self):
    """Rewrite all intra-book links in HTML/CSS/SVG files to virtualized
    ``link_uid|encoded|`` placeholder URLs used by the in-browser viewer.

    Also records, in ``book_render_data['link_to_map']``, which files link to
    which anchors, and marks every modified file dirty.
    """
    changed = set()
    link_uid = self.book_render_data['link_uid']
    resource_template = link_uid + '|{}|'
    xlink_xpath = XPath('//*[@xl:href]')
    link_xpath = XPath('//h:a[@href]')
    res_link_xpath = XPath('//h:link[@href]')

    def link_replacer(base, url):
        # Map a single URL found in file ``base`` to its virtualized form.
        # Leaves external/absolute/query URLs untouched.
        if url.startswith('#'):
            frag = urlunquote(url[1:])
            if not frag:
                return url
            changed.add(base)
            return resource_template.format(encode_url(base, frag))
        purl = urlparse(url)
        if purl.netloc or purl.query:
            return url
        if purl.scheme and purl.scheme != 'file':
            return url
        if not purl.path or purl.path.startswith('/'):
            return url
        url, frag = purl.path, purl.fragment
        name = self.href_to_name(url, base)
        if name:
            if self.has_name_and_is_not_empty(name):
                frag = urlunquote(frag)
                url = resource_template.format(encode_url(name, frag))
            else:
                # Target does not exist (or is empty); tag it so the viewer
                # can report a missing resource instead of a broken fetch
                if isinstance(name, unicode_type):
                    name = name.encode('utf-8')
                url = 'missing:' + force_unicode(quote(name), 'utf-8')
            changed.add(base)
        return url

    ltm = self.book_render_data['link_to_map']

    for name, mt in iteritems(self.mime_map):
        mt = mt.lower()
        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
        elif mt in OEB_DOCS:
            self.virtualized_names.add(name)
            root = self.parsed(name)
            for link in res_link_xpath(root):
                ltype = (link.get('type') or 'text/css').lower()
                rel = (link.get('rel') or 'stylesheet').lower()
                if ltype != 'text/css' or rel != 'stylesheet':
                    # This link will not be loaded by the browser anyway
                    # and will causes the resource load check to hang
                    link.attrib.clear()
                    changed.add(name)
            rewrite_links(root, partial(link_replacer, name))
            for a in link_xpath(root):
                href = a.get('href')
                if href.startswith(link_uid):
                    # Internal link: neutralize the href and stash the
                    # destination in a data attribute for the viewer JS
                    a.set('href', 'javascript:void(0)')
                    parts = decode_url(href.split('|')[1])
                    lname, lfrag = parts[0], parts[1]
                    ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
                    a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
                else:
                    # External link: open in a new tab, safely
                    a.set('target', '_blank')
                    a.set('rel', 'noopener noreferrer')
                changed.add(name)
        elif mt == 'image/svg+xml':
            self.virtualized_names.add(name)
            changed.add(name)
            xlink = XLINK('href')
            for elem in xlink_xpath(self.parsed(name)):
                elem.set(xlink, link_replacer(name, elem.get(xlink)))

    for name, amap in iteritems(ltm):
        for k, v in tuple(iteritems(amap)):
            amap[k] = tuple(v)  # needed for JSON serialization

    tuple(map(self.dirty, changed))
def is_current_jacket(root):
    """Return True if *root* contains the <meta> marker identifying a calibre jacket page."""
    marker_query = XPath('//h:meta[@name="calibre-content" and @content="jacket"]')
    markers = marker_query(root)
    return len(markers) > 0
def case_insensitive_element_names(test, parse_function):
    """Verify the parser lower-cases tag names: <HTML>/<P> must match //h:p."""
    source = '<HTML><P> </p>'
    parsed = parse_function(source)
    message = 'case sensitive parsing, parsed markup:\n' + etree.tostring(
        parsed, encoding='unicode')
    paragraphs = XPath('//h:p')(parsed)
    test.assertEqual(len(paragraphs), 1, message)
def epubify_markup(self, root, log): from calibre.ebooks.oeb.base import XPath, XHTML # Fix empty title tags for t in XPath('//h:title')(root): if not t.text: t.text = u' ' # Fix <p><div> constructs as the asinine epubchecker complains # about them pdiv = XPath('//h:p/h:div') for div in pdiv(root): div.getparent().tag = XHTML('div') # Remove the position:relative as it causes problems with some epub # renderers. Remove display: block on an image inside a div as it is # redundant and prevents text-align:center from working in ADE # Also ensure that the img is contained in its containing div imgpath = XPath('//h:div/h:img[@style]') for img in imgpath(root): div = img.getparent() if len(div) == 1: style = div.attrib.get('style', '') if style and not style.endswith(';'): style = style + ';' style += 'position:static' # Ensures position of containing div is static # Ensure that the img is always contained in its frame div.attrib['style'] = style img.attrib['style'] = 'max-width: 100%; max-height: 100%' # Handle anchored images. The default markup + CSS produced by # odf2xhtml works with WebKit but not with ADE. So we convert the # common cases of left/right/center aligned block images to work on # both webkit and ADE. We detect the case of setting the side margins # to auto and map it to an appropriate text-align directive, which # works in both WebKit and ADE. 
# https://bugs.launchpad.net/bugs/1063207 # https://bugs.launchpad.net/calibre/+bug/859343 imgpath = XPath('descendant::h:div/h:div/h:img') for img in imgpath(root): div2 = img.getparent() div1 = div2.getparent() if (len(div1), len(div2)) != (1, 1): continue cls = div1.get('class', '') first_rules = filter( None, [self.get_css_for_class(x) for x in cls.split()]) has_align = False for r in first_rules: if r.style.getProperty(u'text-align') is not None: has_align = True ml = mr = None if not has_align: aval = None cls = div2.get(u'class', u'') rules = filter( None, [self.get_css_for_class(x) for x in cls.split()]) for r in rules: ml = r.style.getPropertyCSSValue(u'margin-left') or ml mr = r.style.getPropertyCSSValue(u'margin-right') or mr ml = getattr(ml, 'value', None) mr = getattr(mr, 'value', None) if ml == mr == u'auto': aval = u'center' elif ml == u'auto' and mr != u'auto': aval = 'right' elif ml != u'auto' and mr == u'auto': aval = 'left' if aval is not None: style = div1.attrib.get('style', '').strip() if style and not style.endswith(';'): style = style + ';' style += 'text-align:%s' % aval has_align = True div1.attrib['style'] = style if has_align: # This is needed for ADE, without it the text-align has no # effect style = div2.attrib['style'] div2.attrib['style'] = 'display:inline;' + style
def from_xpaths(container, xpaths):
    '''
    Generate a Table of Contents from a list of XPath expressions. Each
    expression in the list corresponds to a level of the generate ToC. For
    example: :code:`['//h:h1', '//h:h2', '//h:h3']` will generate a
    three level Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>``
    tags.
    '''
    tocroot = TOC()
    xpaths = [XPath(xp) for xp in xpaths]

    # Find those levels that have no elements in all spine items
    maps = OrderedDict()
    empty_levels = {i+1 for i, xp in enumerate(xpaths)}
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        level_item_map = maps[name] = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)}
        for lvl, elems in level_item_map.items():
            if elems:
                empty_levels.discard(lvl)
    # Remove empty levels from all level_maps
    if empty_levels:
        for name, lmap in tuple(maps.items()):
            lmap = {lvl:items for lvl, items in lmap.items() if lvl not in empty_levels}
            lmap = sorted(iter(lmap.items()), key=itemgetter(0))
            # Renumber remaining levels so they are consecutive from 1
            lmap = {i+1:items for i, (l, items) in enumerate(lmap)}
            maps[name] = lmap

    # Maps each created TOC node to its level, so the right parent for a new
    # node can be found by walking down the last-child chain
    node_level_map = {tocroot: 0}

    def parent_for_level(child_level):
        # Return the deepest node on the last-child chain whose level is
        # exactly child_level - 1 (or the closest shallower node)
        limit = child_level - 1

        def process_node(node):
            child = node.last_child
            if child is None:
                return node
            lvl = node_level_map[child]
            return node if lvl > limit else child if lvl == limit else process_node(child)

        return process_node(tocroot)

    for name, level_item_map in maps.items():
        root = container.parsed(name)
        item_level_map = {e:i for i, elems in level_item_map.items() for e in elems}
        item_dirtied = False
        all_ids = set(root.xpath('//*/@id'))

        # Document order traversal guarantees ToC entries appear in reading order
        for item in root.iterdescendants(etree.Element):
            lvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            text = elem_to_toc_text(item)
            parent = parent_for_level(lvl)
            if item_at_top(item):
                # Element at the very top of the file: link to the file itself
                dirtied, elem_id = False, None
            else:
                dirtied, elem_id = ensure_id(item, all_ids)
            item_dirtied = dirtied or item_dirtied
            toc = parent.add(text, name, elem_id)
            node_level_map[toc] = lvl
            toc.dest_exists = True

        if item_dirtied:
            container.commit_item(name, keep_parsed=True)

    return tocroot
def validate_xpath_selector(val):
    """Return an error message if *val* is not a valid XPath selector, else None."""
    try:
        XPath(val)
    except Exception:
        return _('{} is not a valid XPath selector').format(val)
    return None
def __init__(self, oeb, opts, replace_previous_inline_toc=False, ignore_existing_toc=False):
    """Generate (or refresh) an in-line HTML Table of Contents for a MOBI book.

    Mutates ``oeb`` in place: may add/remove the 'toc' guide entry, add the
    generated ToC page to the manifest and spine.
    NOTE(review): ``replace_previous_inline_toc`` appears unused here; reuse of
    a previous ToC is gated on ``find_previous_calibre_inline_toc`` instead —
    confirm against callers.
    """
    self.oeb, self.opts, self.log = oeb, opts, oeb.log
    self.title = opts.toc_title or DEFAULT_TITLE
    self.at_start = opts.mobi_toc_at_start
    self.generated_item = None
    self.added_toc_guide_entry = False
    # Only consider generating a ToC if there is more than one entry
    self.has_toc = oeb.toc and oeb.toc.count() > 1
    self.tocitem = tocitem = None
    if find_previous_calibre_inline_toc:
        tocitem = self.tocitem = find_previous_calibre_inline_toc(oeb)
    if ignore_existing_toc and "toc" in oeb.guide:
        oeb.guide.remove("toc")
    if "toc" in oeb.guide:
        # Remove spurious toc entry from guide if it is not in spine or it
        # does not have any hyperlinks
        href = urlnormalize(oeb.guide["toc"].href.partition("#")[0])
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if hasattr(item.data, "xpath") and XPath("//h:a[@href]")(item.data):
                # Existing ToC looks usable; ensure it is reachable via spine
                if oeb.spine.index(item) < 0:
                    oeb.spine.add(item, linear=False)
                return
            elif self.has_toc:
                oeb.guide.remove("toc")
        else:
            oeb.guide.remove("toc")
    if not self.has_toc or "toc" in oeb.guide or opts.no_inline_toc or getattr(opts, "mobi_passthrough", False):
        return
    self.log("\tGenerating in-line ToC")
    embed_css = ""
    # Carry over embedded font rules, if any, so the ToC page matches the book
    s = getattr(oeb, "store_embed_font_rules", None)
    if getattr(s, "body_font_family", None):
        css = [x.cssText for x in s.rules] + ["body { font-family: %s }" % s.body_font_family]
        embed_css = "\n\n".join(css)
    root = etree.fromstring(
        TEMPLATE.format(xhtmlns=XHTML_NS, title=self.title, embed_css=embed_css, extra_css=(opts.extra_css or ""))
    )
    parent = XPath("//h:ul")(root)[0]
    parent.text = "\n\t"
    for child in self.oeb.toc:
        self.process_toc_node(child, parent)
    if tocitem is not None:
        # Reuse the previously generated ToC item, replacing its content
        href = tocitem.href
        if oeb.spine.index(tocitem) > -1:
            oeb.spine.remove(tocitem)
        tocitem.data = root
    else:
        id, href = oeb.manifest.generate("contents", "contents.xhtml")
        tocitem = self.generated_item = oeb.manifest.add(id, href, XHTML_MIME, data=root)
    if self.at_start:
        oeb.spine.insert(0, tocitem, linear=True)
    else:
        oeb.spine.add(tocitem, linear=False)
    oeb.guide.add("toc", "Table of Contents", href)
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id):
    """Recursively flatten computed CSS for *node* into generated classes.

    Converts presentational attributes (align, font size/face/color, bgcolor)
    to CSS, rescales font sizes relative to the parent size *psize*, and
    replaces class/style attributes with shared generated class names stored
    in *styles*/*pseudo_styles*.
    """
    if not isinstance(node.tag, basestring) \
       or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except:
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    if tag == 'body' and isinstance(font_size, (int, float)):
        stylizer.body_font_size = font_size
    # Map the legacy align attribute to CSS
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
        else:
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    # Replace <font> with div/span depending on whether it contains blocks
    if node.tag == XHTML('font'):
        tags = ['descendant::h:%s' % x for x in ('p', 'div', 'table', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
        tag = 'div' if XPath('|'.join(tags))(node) else 'span'
        node.tag = XHTML(tag)
        if 'size' in node.attrib:
            def force_int(raw):
                return int(re.search(r'([0-9+-]+)', raw).group(1))
            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt' % font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    if 'color' in node.attrib:
        try:
            cssdict['color'] = Property('color', node.attrib['color']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = Property(
                'background-color', node.attrib['bgcolor']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

    fsize = font_size
    # A single floated character with explicit font-size is treated as a
    # drop cap and exempted from font rescaling
    is_drop_cap = (cssdict.get('float', None) == 'left' and 'font-size' in
            cssdict and len(node) == 0 and node.text and len(node.text) == 1)
    # Detect drop caps generated by the docx input plugin
    if (node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and
            len(node.text.strip()) == 1 and not node.tail and 'line-height' in cssdict and
            'font-size' in cssdict):
        dp = node.getparent()
        if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        dyn_rescale = dynamic_rescale_factor(node)
        if dyn_rescale is not None:
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem' % (fsize / psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                # Convert to em relative to the parent font size
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                cssdict['font-size'] = '%.1fpt' % fsize
            psize = fsize
    try:
        minlh = self.context.minimum_line_height / 100.
        if not is_drop_cap and style['line-height'] < minlh * fsize:
            cssdict['line-height'] = str(minlh)
    except:
        self.oeb.logger.exception('Failed to set minimum line-height')

    if cssdict:
        # Drop properties the user asked to filter out
        for x in self.filter_css:
            cssdict.pop(x, None)

    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        if self.unfloat and 'float' in cssdict \
           and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
           and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
           and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or
            self.context.insert_blank_line) and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s' % (prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            # A negative indent size means keep the original indents
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents and
                    cssdict.get('text-align', None) not in ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size

    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()

        if cssdict:
            # Deduplicate identical declaration blocks via the shared
            # css-text -> class-name map in *styles*
            items = sorted(cssdict.items())
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            klass = ascii_text(STRIPNUM.sub('',
                classes.split()[0].replace('_', '')))
            if css in styles:
                match = styles[css]
            else:
                match = klass + str(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.add(match)

        for psel, cssdict in pseudo_classes.iteritems():
            items = sorted(cssdict.iteritems())
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + str(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)

        node.attrib['class'] = ' '.join(keep_classes)
    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']

    for child in node:
        self.flatten_node(child, stylizer, names, styles, pseudo_styles,
                psize, item_id)
def __init__(self):
    # Compile the selector for all XHTML descendant elements once, so it
    # can be reused without re-parsing the expression.
    all_xhtml_elements = XPath('descendant::h:*')
    self.html_tags = all_xhtml_elements
def __call__(self, oeb, context):
    """Run the unsmarten transform over the <body> of every document in the book."""
    body_selector = XPath('//h:body')
    for item in oeb.manifest.items:
        if item.media_type not in OEB_DOCS:
            continue
        for body in body_selector(item.data):
            self.unsmarten(body)
def attribute_replacement(test, parse_function):
    """Check that lower-cased SVG attributes (viewbox) are normalized to viewBox."""
    source = '<html><body><svg viewbox="0"></svg><svg xmlns="%s" viewbox="1">' % SVG_NS
    parsed = parse_function(source)
    message = 'SVG attributes not normalized, parsed markup:\n' + etree.tostring(
        parsed, encoding='unicode')
    normalized = XPath('//svg:svg[@viewBox]')(parsed)
    test.assertEqual(len(normalized), 2, message)
def __init__(self, oeb, opts, replace_previous_inline_toc=False, ignore_existing_toc=False):
    """Generate (or refresh) an in-line HTML Table of Contents for a MOBI book.

    Mutates ``oeb`` in place: may add/remove the 'toc' guide entry, add the
    generated ToC page to the manifest and spine.
    NOTE(review): ``replace_previous_inline_toc`` appears unused here; reuse of
    a previous ToC is gated on ``find_previous_calibre_inline_toc`` instead —
    confirm against callers.
    """
    self.oeb, self.opts, self.log = oeb, opts, oeb.log
    self.title = opts.toc_title or DEFAULT_TITLE
    self.at_start = opts.mobi_toc_at_start
    self.generated_item = None
    self.added_toc_guide_entry = False
    # Only consider generating a ToC if there is more than one entry
    self.has_toc = oeb.toc and oeb.toc.count() > 1
    self.tocitem = tocitem = None
    if find_previous_calibre_inline_toc:
        tocitem = self.tocitem = find_previous_calibre_inline_toc(oeb)
    if ignore_existing_toc and 'toc' in oeb.guide:
        oeb.guide.remove('toc')
    if 'toc' in oeb.guide:
        # Remove spurious toc entry from guide if it is not in spine or it
        # does not have any hyperlinks
        href = urlnormalize(oeb.guide['toc'].href.partition('#')[0])
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if (hasattr(item.data, 'xpath') and XPath('//h:a[@href]')(item.data)):
                # Existing ToC looks usable; ensure it is reachable via spine
                if oeb.spine.index(item) < 0:
                    oeb.spine.add(item, linear=False)
                return
            elif self.has_toc:
                oeb.guide.remove('toc')
        else:
            oeb.guide.remove('toc')
    if (not self.has_toc or 'toc' in oeb.guide or opts.no_inline_toc or
            getattr(opts, 'mobi_passthrough', False)):
        return
    self.log('\tGenerating in-line ToC')
    embed_css = ''
    # Carry over embedded font rules, if any, so the ToC page matches the book
    s = getattr(oeb, 'store_embed_font_rules', None)
    if getattr(s, 'body_font_family', None):
        css = [x.cssText for x in s.rules
               ] + ['body { font-family: %s }' % s.body_font_family]
        embed_css = '\n\n'.join(css)
    root = etree.fromstring(
        TEMPLATE.format(xhtmlns=XHTML_NS, title=self.title,
                        embed_css=embed_css, extra_css=(opts.extra_css or '')))
    parent = XPath('//h:ul')(root)[0]
    parent.text = '\n\t'
    for child in self.oeb.toc:
        self.process_toc_node(child, parent)
    if tocitem is not None:
        # Reuse the previously generated ToC item, replacing its content
        href = tocitem.href
        if oeb.spine.index(tocitem) > -1:
            oeb.spine.remove(tocitem)
        tocitem.data = root
    else:
        id, href = oeb.manifest.generate('contents', 'contents.xhtml')
        tocitem = self.generated_item = oeb.manifest.add(id, href, XHTML_MIME,
                                                         data=root)
    if self.at_start:
        oeb.spine.insert(0, tocitem, linear=True)
    else:
        oeb.spine.add(tocitem, linear=False)
    oeb.guide.add('toc', 'Table of Contents', href)
def comments(test, parse_function):
    """Ensure a comment containing runs of dashes neither breaks parsing nor is dropped."""
    source = '<html><!-- -- ---><body/></html>'
    parsed = parse_function(source)
    bodies = XPath('//h:body')(parsed)
    test.assertEqual(len(bodies), 1,
        'Failed to parse with comment containing dashes')
    comment_nodes = tuple(parsed.iterdescendants(etree.Comment))
    test.assertEqual(len(comment_nodes), 1)
def all_stylesheets(container, name):
    """Yield the container name of every CSS stylesheet linked from the
    <head> of the document *name*.

    :param container: the book container (provides parsing and href resolution)
    :param name: container name of the HTML document whose links to inspect
    """
    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
        # Resolve relative to the containing document. The previous code
        # rebound ``name`` here, so every href after the first was resolved
        # relative to the previously yielded stylesheet's directory instead
        # of the document's — wrong when sheets live in another directory.
        sheet_name = container.href_to_name(link.get('href'), name)
        typ = link.get('type', 'text/css')
        if typ == 'text/css':
            yield sheet_name
def namespaces(test, parse_function):
    """Exercise namespace handling of *parse_function*: XHTML default
    namespace, prefixed and default SVG/XLink, auto-created namespaces,
    multiple <html> tags, arbitrary namespaces and xml:lang conversion.
    """
    ae = test.assertEqual

    def match_and_prefix(root, xpath, prefix, err=''):
        # Assert exactly one match for xpath, carrying the expected prefix
        matches = XPath(xpath)(root)
        ae(len(matches), 1, err)
        ae(matches[0].prefix, prefix, err)

    markup = '''
    <html xmlns="{xhtml}"><head><body id="test"></html>
    '''.format(xhtml=XHTML_NS)
    root = parse_function(markup)
    ae(
        len(XPath('//h:body[@id="test"]')(root)), 1,
        'Incorrect parsing, parsed markup:\n' + etree.tostring(root, encoding='unicode'))
    match_and_prefix(root, '//h:body[@id="test"]', None)

    # Prefixed SVG and XLink namespaces should be preserved with prefixes
    markup = '''
    <html xmlns="{xhtml}"><head><body id="test">
    <svg:svg xmlns:svg="{svg}"><svg:image xmlns:xlink="{xlink}" xlink:href="xxx"/></svg:svg>
    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    match_and_prefix(root, '//h:body[@id="test"]', None, err)
    match_and_prefix(root, '//svg:svg', 'svg', err)
    match_and_prefix(root, '//svg:image[@xl:href]', 'svg', err)

    # Default (unprefixed) SVG namespace
    markup = '''
    <html xmlns="{xhtml}"><head><body id="test">
    <svg xmlns="{svg}" xmlns:xlink="{xlink}" ><image xlink:href="xxx"/></svg>
    '''.format(xhtml=XHTML_NS, svg=SVG_NS, xlink=XLINK_NS)
    root = parse_function(markup)
    err = 'Incorrect parsing, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    match_and_prefix(root, '//h:body[@id="test"]', None, err)
    match_and_prefix(root, '//svg:svg', None, err)
    match_and_prefix(root, '//svg:image[@xl:href]', None, err)

    # Namespaces must be created automatically for bare <svg>/<image>
    markup = '<html><body><svg><image xlink:href="xxx"></svg>'
    root = parse_function(markup)
    err = 'Namespaces not created, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    match_and_prefix(root, '//svg:svg', None, err)
    match_and_prefix(root, '//svg:image[@xl:href]', None, err)
    if parse_function is parse:
        image = XPath('//svg:image')(root)[0]
        ae(image.nsmap, {'xlink': XLINK_NS, None: SVG_NS})

    # Multiple <html> tags must be merged into a single root
    root = parse_function('<html id="a"><p><html xmlns:x="y" lang="en"><p>')
    err = 'Multiple HTML tags not handled, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    match_and_prefix(root, '//h:html', None, err)
    match_and_prefix(root, '//h:html[@lang]', None, err)
    match_and_prefix(root, '//h:html[@id]', None, err)

    # if parse_function is not html5_parse:
    #     markup = '<html:html xmlns:html="{html}" id="a"><html:body><html:p></html:p></html:body></html>'.format(html=XHTML_NS)
    #     root = parse_function(markup)
    #     err = 'HTML namespace prefixed, parsed markup:\n' + etree.tostring(root, encoding='unicode')
    #     match_and_prefix(root, '//h:html', None, err)

    # Arbitrary (non-HTML) namespaces must survive, including on attributes
    markup = '<html><body><ns1:tag1 xmlns:ns1="NS"><ns2:tag2 xmlns:ns2="NS" ns1:id="test"/><ns1:tag3 xmlns:ns1="NS2" ns1:id="test"/></ns1:tag1>'
    root = parse_function(markup)
    err = 'Arbitrary namespaces not preserved, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')

    def xpath(expr):
        return etree.XPath(expr, namespaces={'ns1': 'NS', 'ns2': 'NS2'})(root)
    ae(len(xpath('//ns1:tag1')), 1, err)
    ae(len(xpath('//ns1:tag2')), 1, err)
    ae(len(xpath('//ns2:tag3')), 1, err)
    ae(len(xpath('//ns1:tag2[@ns1:id="test"]')), 1, err)
    ae(len(xpath('//ns2:tag3[@ns2:id="test"]')), 1, err)
    # for tag in root.iter():
    #     if 'NS' in tag.tag:
    #         ae('ns1', tag.prefix)

    # xml:lang must be converted to plain lang (and not override existing lang)
    markup = '<html xml:lang="en"><body><p lang="de"><p xml:lang="es"><p lang="en" xml:lang="de">'
    root = parse_function(markup)
    err = 'xml:lang not converted to lang, parsed markup:\n' + etree.tostring(
        root, encoding='unicode')
    ae(len(root.xpath('//*[@lang="en"]')), 2, err)
    ae(len(root.xpath('//*[@lang="de"]')), 1, err)
    ae(len(root.xpath('//*[@lang="es"]')), 1, err)
def do_split(split_point, log, before=True):
    '''
    Split tree into a *before* and an *after* tree at ``split_point``.

    :param split_point: The Element at which to split
    :param before: If True tree is split before split_point, otherwise after split_point
    :return: before_tree, after_tree
    '''
    if before:
        # We cannot adjust for after since moving an after split point to a
        # parent will cause breakage if the parent contains any content
        # after the original split point
        split_point = adjust_split_point(split_point, log)
    tree = split_point.getroottree()
    path = tree.getpath(split_point)

    # Work on two full copies; locate the split point in each via its XPath
    tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
    root, root2 = tree.getroot(), tree2.getroot()
    body, body2 = map(get_body, (root, root2))
    split_point = root.xpath(path)[0]
    split_point2 = root2.xpath(path)[0]

    def nix_element(elem, top=True):
        # Remove elem unless top is False in which case replace elem by its
        # children
        parent = elem.getparent()
        if top:
            parent.remove(elem)
        else:
            index = parent.index(elem)
            parent[index:index + 1] = list(elem.iterchildren())

    # Tree 1: keep everything up to the split point, drop the rest
    hit_split_point = False
    keep_descendants = False
    split_point_descendants = frozenset(split_point.iterdescendants())
    for elem in tuple(body.iterdescendants()):
        if elem is split_point:
            hit_split_point = True
            if before:
                nix_element(elem)
            else:
                # We want to keep the descendants of the split point in
                # Tree 1
                keep_descendants = True
                # We want the split point element, but not its tail
                elem.tail = '\n'
            continue
        if hit_split_point:
            if keep_descendants:
                if elem in split_point_descendants:
                    # elem is a descendant keep it
                    continue
                else:
                    # We are out of split_point, so prevent further set
                    # lookups of split_point_descendants
                    keep_descendants = False
            nix_element(elem)

    # Tree 2: drop everything up to (and including) the split point, keep the rest
    ancestors = frozenset(XPath('ancestor::*')(split_point2))
    for elem in tuple(body2.iterdescendants()):
        if elem is split_point2:
            if not before:
                # Keep the split point element's tail, if it contains non-whitespace
                # text
                tail = elem.tail
                if tail and not tail.isspace():
                    parent = elem.getparent()
                    idx = parent.index(elem)
                    if idx == 0:
                        parent.text = (parent.text or '') + tail
                    else:
                        sib = parent[idx - 1]
                        sib.tail = (sib.tail or '') + tail
            # Remove the element itself
            nix_element(elem)
            break
        if elem in ancestors:
            # We have to preserve the ancestors as they could have CSS
            # styles that are inherited/applicable, like font or
            # width. So we only remove the text, if any.
            elem.text = '\n'
        else:
            nix_element(elem, top=False)
    body2.text = '\n'

    return tree, tree2
def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize,
                 item_id, recurse=True):
    """Recursively flatten computed CSS for *node* into generated classes.

    Converts presentational attributes (align, valign, font size/face/color,
    bgcolor) to CSS, rescales font sizes relative to the parent size *psize*,
    and replaces class/style attributes with shared generated class names
    stored in *styles*/*pseudo_styles*. Set ``recurse=False`` to process only
    this node.
    """
    if not isinstance(node.tag, string_or_bytes) \
       or namespace(node.tag) != XHTML_NS:
        return
    tag = barename(node.tag)
    style = stylizer.style(node)
    cssdict = style.cssdict()
    try:
        font_size = style['font-size']
    except:
        font_size = self.sbase if self.sbase is not None else \
            self.context.source.fbase
    if tag == 'body' and isinstance(font_size, numbers.Number):
        stylizer.body_font_size = font_size
    # Map the legacy align attribute to CSS
    if 'align' in node.attrib:
        if tag != 'img':
            cssdict['text-align'] = node.attrib['align']
            if cssdict['text-align'] == 'center':
                # align=center causes tables to be center aligned,
                # which text-align does not. And the ever trustworthy Word
                # uses this construct in its HTML output. See
                # https://bugs.launchpad.net/bugs/1569583
                if tag == 'table':
                    if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                        cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
                else:
                    for table in node.iterchildren(XHTML("table")):
                        ts = stylizer.style(table)
                        if ts.get('margin-left') is None and ts.get('margin-right') is None:
                            ts.set('margin-left', 'auto')
                            ts.set('margin-right', 'auto')
        else:
            val = node.attrib['align']
            if val in ('middle', 'bottom', 'top'):
                cssdict['vertical-align'] = val
            elif val in ('left', 'right'):
                cssdict['float'] = val
        del node.attrib['align']
    if 'valign' in node.attrib and tag == 'td':
        if cssdict.get('vertical-align') == 'inherit':
            cssdict['vertical-align'] = node.attrib['valign']
        del node.attrib['valign']
    # Replace <font> with div/span depending on whether it contains blocks
    if node.tag == XHTML('font'):
        tags = ['descendant::h:%s' % x for x in ('p', 'div', 'table', 'h1',
            'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
        tag = 'div' if XPath('|'.join(tags))(node) else 'span'
        node.tag = XHTML(tag)
        if 'size' in node.attrib:
            def force_int(raw):
                return int(re.search(r'([0-9+-]+)', raw).group(1))
            size = node.attrib['size'].strip()
            if size:
                fnums = self.context.source.fnums
                if size[0] in ('+', '-'):
                    # Oh, the warcrimes
                    try:
                        esize = 3 + force_int(size)
                    except:
                        esize = 3
                    if esize < 1:
                        esize = 1
                    if esize > 7:
                        esize = 7
                    font_size = fnums[esize]
                else:
                    try:
                        font_size = fnums[force_int(size)]
                    except:
                        font_size = fnums[3]
                cssdict['font-size'] = '%.1fpt' % font_size
            del node.attrib['size']
        if 'face' in node.attrib:
            cssdict['font-family'] = node.attrib['face']
            del node.attrib['face']
    if 'color' in node.attrib:
        try:
            cssdict['color'] = Property('color', node.attrib['color']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['color']
    if 'bgcolor' in node.attrib:
        try:
            cssdict['background-color'] = Property(
                'background-color', node.attrib['bgcolor']).value
        except (ValueError, SyntaxErr):
            pass
        del node.attrib['bgcolor']
    if tag == 'ol' and 'type' in node.attrib:
        del node.attrib['type']
    if cssdict.get('font-weight', '').lower() == 'medium':
        cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

    fsize = font_size
    # A single floated character with explicit font-size is treated as a
    # drop cap and exempted from font rescaling (a leading char in the
    # General Punctuation block, e.g. a quote, is also allowed)
    is_drop_cap = (
        cssdict.get('float', None) == 'left' and 'font-size' in cssdict and
        len(node) == 0 and node.text and
        (len(node.text) == 1 or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
    # Detect drop caps generated by the docx input plugin
    if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
            not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
        dp = node.getparent()
        if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
            if stylizer.style(dp).cssdict().get('float', None) == 'left':
                is_drop_cap = True
    if not self.context.disable_font_rescaling and not is_drop_cap:
        _sbase = self.sbase if self.sbase is not None else \
            self.context.source.fbase
        dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
        if dyn_rescale is not None:
            # Explicit percentage rescale requested by an input plugin
            try:
                dyn_rescale = float(dyn_rescale) / 100
            except Exception:
                dyn_rescale = 1
            fsize = self.fmap[_sbase]
            fsize *= dyn_rescale
            cssdict['font-size'] = '%0.5fem' % (fsize / psize)
            psize = fsize
        elif 'font-size' in cssdict or tag == 'body':
            fsize = self.fmap[font_size]
            try:
                # Convert to em relative to the parent font size
                cssdict['font-size'] = "%0.5fem" % (fsize / psize)
            except ZeroDivisionError:
                cssdict['font-size'] = '%.1fpt' % fsize
            psize = fsize

    try:
        minlh = self.context.minimum_line_height / 100.
        slh = style['line-height']
        if not is_drop_cap and isinstance(slh, numbers.Number) and slh < minlh * fsize:
            cssdict['line-height'] = unicode_type(minlh)
    except Exception:
        self.oeb.logger.exception('Failed to set minimum line-height')

    if cssdict:
        # Drop filtered properties, but keep font-family when it matches the
        # embedded body font family (that family must survive filtering)
        for x in self.filter_css:
            popval = cssdict.pop(x, None)
            if self.body_font_family and popval and x == 'font-family' \
               and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                cssdict[x] = popval

    if cssdict:
        if self.lineh and self.fbase and tag != 'body':
            self.clean_edges(cssdict, style, psize)
        if 'display' in cssdict and cssdict['display'] == 'in-line':
            cssdict['display'] = 'inline'
        if self.unfloat and 'float' in cssdict \
           and cssdict.get('display', 'none') != 'none':
            del cssdict['display']
        if self.untable and 'display' in cssdict \
           and cssdict['display'].startswith('table'):
            display = cssdict['display']
            if display == 'table-cell':
                cssdict['display'] = 'inline'
            else:
                cssdict['display'] = 'block'
        if 'vertical-align' in cssdict \
           and cssdict['vertical-align'] == 'sup':
            cssdict['vertical-align'] = 'super'
    if self.lineh and 'line-height' not in cssdict:
        lineh = self.lineh / psize
        cssdict['line-height'] = "%0.5fem" % lineh

    if (self.context.remove_paragraph_spacing or
            self.context.insert_blank_line) and tag in ('p', 'div'):
        if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
            for prop in ('margin', 'padding', 'border'):
                for edge in ('top', 'bottom'):
                    cssdict['%s-%s' % (prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            # A negative indent size means keep the original indents
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents and
                    cssdict.get('text-align', None) not in ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size

    pseudo_classes = style.pseudo_classes(self.filter_css)
    if cssdict or pseudo_classes:
        keep_classes = set()

        if cssdict:
            # Deduplicate identical declaration blocks via the shared
            # css-text -> class-name map in *styles*
            items = sorted(iteritems(cssdict))
            css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            classes_list = classes.split()
            # lower() because otherwise if the document uses the same class
            # name with different case, both cases will apply, leading
            # to incorrect results.
            klass = ascii_text(STRIPNUM.sub('', classes_list[0])).lower().strip().replace(' ', '_')
            if css in styles:
                match = styles[css]
            else:
                match = klass + unicode_type(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
            keep_classes.add(match)

        for psel, cssdict in iteritems(pseudo_classes):
            items = sorted(iteritems(cssdict))
            css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
            pstyles = pseudo_styles[psel]
            if css in pstyles:
                match = pstyles[css]
            else:
                # We have to use a different class for each psel as
                # otherwise you can have incorrect styles for a situation
                # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                # If the pcalibre class for a:hover and a:link is the same,
                # then the class attribute for a.x tags will contain both
                # that class and the class for a.x:hover, which is wrong.
                klass = 'pcalibre'
                match = klass + unicode_type(names[klass] or '')
                pstyles[css] = match
                names[klass] += 1
            keep_classes.add(match)

        node.attrib['class'] = ' '.join(keep_classes)
    elif 'class' in node.attrib:
        del node.attrib['class']
    if 'style' in node.attrib:
        del node.attrib['style']
    if recurse:
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles,
                    psize, item_id)
def match_and_prefix(root, xpath, prefix, err=''):
    # Assert that exactly one node in root matches the given XPath and
    # that this node carries the expected namespace prefix. `err` is the
    # failure message passed through to the assertion helper.
    nodes = XPath(xpath)(root)
    ae(len(nodes), 1, err)
    node = nodes[0]
    ae(node.prefix, prefix, err)
def from_xpaths(container, xpaths):
    '''
    Build a Table of Contents tree by evaluating a list of XPath
    expressions against every spine item of ``container``.

    ``xpaths[0]`` defines level-1 entries, ``xpaths[1]`` level-2 entries,
    and so on. Levels that match nothing anywhere in the book are
    removed and deeper levels are promoted to fill the gap. Returns the
    root ``TOC`` node of the generated tree.
    '''
    tocroot = TOC()
    xpaths = [XPath(xp) for xp in xpaths]
    # level_prev[lvl] is the most recently created ToC node at that
    # level; it deliberately persists across spine items so that a
    # heading in a later file attaches to the correct ancestor.
    level_prev = {i + 1: None for i in xrange(len(xpaths))}
    level_prev[0] = tocroot
    # Find those levels that have no elements in all spine items
    maps = OrderedDict()
    empty_levels = {i + 1 for i, xp in enumerate(xpaths)}
    for spinepath in container.spine_items:
        name = container.abspath_to_name(spinepath)
        root = container.parsed(name)
        # For this file, map each level number to the (frozen) set of
        # elements matched by that level's XPath.
        level_item_map = maps[name] = {
            i + 1: frozenset(xp(root)) for i, xp in enumerate(xpaths)}
        for lvl, elems in level_item_map.iteritems():
            if elems:
                empty_levels.discard(lvl)
    # Remove empty levels from all level_maps, renumbering the surviving
    # levels so they stay contiguous starting from 1.
    if empty_levels:
        for name, lmap in tuple(maps.iteritems()):
            lmap = {lvl: items for lvl, items in lmap.iteritems()
                    if lvl not in empty_levels}
            lmap = sorted(lmap.iteritems(), key=itemgetter(0))
            lmap = {i + 1: items for i, (l, items) in enumerate(lmap)}
            maps[name] = lmap
    for name, level_item_map in maps.iteritems():
        root = container.parsed(name)
        # Invert the map: element -> its ToC level.
        item_level_map = {
            e: i for i, elems in level_item_map.iteritems() for e in elems}
        item_dirtied = False
        # Walk the document in order so ToC entries come out in reading
        # order regardless of which level each heading belongs to.
        for item in root.iterdescendants(etree.Element):
            lvl = plvl = item_level_map.get(item, None)
            if lvl is None:
                continue
            # Find the nearest existing ancestor node; if the expected
            # parent level has no node yet, attach one level higher.
            parent = None
            while parent is None:
                plvl -= 1
                parent = level_prev[plvl]
            lvl = plvl + 1
            if item_at_top(item):
                # Heading at the very top of the file: link to the file
                # itself, no fragment needed (presumably what
                # item_at_top checks — confirm against its definition).
                dirtied, elem_id = False, None
            else:
                # ensure_id appears to assign an id attribute when the
                # element lacks one, reporting whether it modified the
                # tree -- TODO confirm against its definition.
                dirtied, elem_id = ensure_id(item)
            text = elem_to_toc_text(item)
            item_dirtied = dirtied or item_dirtied
            toc = parent.add(text, name, elem_id)
            toc.dest_exists = True
            level_prev[lvl] = toc
            # A new node at this level invalidates all deeper "previous"
            # nodes: subsequent deeper headings belong under this one.
            for i in xrange(lvl + 1, len(xpaths) + 1):
                level_prev[i] = None
        if item_dirtied:
            container.commit_item(name, keep_parsed=True)
    return tocroot
def workaround_ade_quirks(self):  # {{{
    '''
    Perform various markup transforms to get the output to render
    correctly in the quirky ADE (Adobe Digital Editions).

    Mutates the spine documents and the main stylesheet in place.
    '''
    from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote
    stylesheet = self.oeb.manifest.main_stylesheet
    # ADE cries big wet tears when it encounters an invalid fragment
    # identifier in the NCX toc.
    frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
    for node in self.oeb.toc.iter():
        href = getattr(node, 'href', None)
        if hasattr(href, 'partition'):
            base, _, frag = href.partition('#')
            frag = urlunquote(frag)
            if frag and frag_pat.match(frag) is None:
                self.log.warn(
                    'Removing invalid fragment identifier %r from TOC' % frag)
                node.href = base

    for x in self.oeb.spine:
        root = x.data
        body = XPath('//h:body')(root)
        if body:
            body = body[0]

        # If there is no <body>, `body` is an empty list and fails the
        # hasattr check below, skipping the body-only fixups.
        if hasattr(body, 'xpath'):
            # remove <img> tags with empty src elements
            bad = []
            for x in XPath('//h:img')(body):
                src = x.get('src', '').strip()
                if src in ('', '#') or src.startswith('http:'):
                    bad.append(x)
            for img in bad:
                img.getparent().remove(img)

            # Add id attribute to <a> tags that have name
            for x in XPath('//h:a[@name]')(body):
                if not x.get('id', False):
                    x.set('id', x.get('name'))
                # The delightful epubcheck has started complaining about <a>
                # tags that have name attributes.
                x.attrib.pop('name')

            # Replace <br> that are children of <body> as ADE doesn't handle them
            for br in XPath('./h:br')(body):
                if br.getparent() is None:
                    continue
                try:
                    # Python 2 iterator protocol (.next()); consistent
                    # with the rest of this file's py2 idioms.
                    prior = br.itersiblings(preceding=True).next()
                    priortag = barename(prior.tag)
                    priortext = prior.tail
                except:
                    # No preceding sibling: treat <body> itself as prior.
                    priortag = 'body'
                    priortext = body.text
                if priortext:
                    priortext = priortext.strip()
                # Turn the <br> into an empty paragraph (&nbsp;) styled
                # to simulate the line break.
                br.tag = XHTML('p')
                br.text = u'\u00a0'
                # NOTE: relies on py2 filter() returning a list (append
                # below would fail on a py3 iterator).
                style = br.get('style', '').split(';')
                style = filter(None, map(lambda x: x.strip(), style))
                style.append('margin:0pt; border:0pt')
                # If the prior tag is a block (including a <br> we replaced)
                # then this <br> replacement should have a 1-line height.
                # Otherwise it should have no height.
                if not priortext and priortag in block_level_tags:
                    style.append('height:1em')
                else:
                    style.append('height:0pt')
                br.set('style', '; '.join(style))

        # Strip embedded objects ADE cannot render (SVG objects are kept).
        for tag in XPath('//h:embed')(root):
            tag.getparent().remove(tag)
        for tag in XPath('//h:object')(root):
            if tag.get('type', '').lower().strip() in {
                    'image/svg+xml', 'application/svg+xml'}:
                continue
            tag.getparent().remove(tag)

        # Empty <title>/<style> elements are removed outright.
        for tag in XPath('//h:title|//h:style')(root):
            if not tag.text:
                tag.getparent().remove(tag)

        # Drop scripts, except inline ones with content, external ones,
        # and MathJax config blocks; then drop any script inside <body>.
        for tag in XPath('//h:script')(root):
            if (not tag.text and not tag.get('src', False) and
                    tag.get('type', None) != 'text/x-mathjax-config'):
                tag.getparent().remove(tag)
        for tag in XPath('//h:body/descendant::h:script')(root):
            tag.getparent().remove(tag)

        # Remove real forms (those with form controls); demote decorative
        # <form> wrappers to <div>.
        formchildren = XPath('./h:input|./h:button|./h:textarea|'
                             './h:label|./h:fieldset|./h:legend')
        for tag in XPath('//h:form')(root):
            if formchildren(tag):
                tag.getparent().remove(tag)
            else:
                # Not a real form
                tag.tag = XHTML('div')

        # Replace deprecated <center> with a styled <div>.
        for tag in XPath('//h:center')(root):
            tag.tag = XHTML('div')
            tag.set('style', 'text-align:center')

        # ADE can't handle & in an img url
        for tag in XPath('//h:img[@src]')(root):
            tag.set('src', tag.get('src', '').replace('&', ''))

        # ADE whimpers in fright when it encounters a <td> outside a
        # <table>
        in_table = XPath('ancestor::h:table')
        for tag in XPath('//h:td|//h:tr|//h:th')(root):
            if not in_table(tag):
                tag.tag = XHTML('div')

        # Strip zero-width space / soft hyphen and replace non-breaking
        # hyphen (U+2011) with a plain hyphen in all text and tails.
        special_chars = re.compile(u'[\u200b\u00ad]')
        for elem in root.iterdescendants():
            if getattr(elem, 'text', False):
                elem.text = special_chars.sub('', elem.text)
                elem.text = elem.text.replace(u'\u2011', '-')
            if getattr(elem, 'tail', False):
                elem.tail = special_chars.sub('', elem.tail)
                elem.tail = elem.tail.replace(u'\u2011', '-')

        if stylesheet is not None:
            # ADE doesn't render lists correctly if they have left margins
            from cssutils.css import CSSRule
            for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                sel = '.' + lb.get('class')
                for rule in stylesheet.data.cssRules.rulesOfType(
                        CSSRule.STYLE_RULE):
                    if sel == rule.selectorList.selectorText:
                        rule.style.removeProperty('margin-left')
                        # padding-left breaks rendering in webkit and gecko
                        rule.style.removeProperty('padding-left')

            # Change whitespace:pre to pre-wrap to accommodate readers that
            # cannot scroll horizontally
            for rule in stylesheet.data.cssRules.rulesOfType(
                    CSSRule.STYLE_RULE):
                style = rule.style
                ws = style.getPropertyValue('white-space')
                if ws == 'pre':
                    style.setProperty('white-space', 'pre-wrap')
def subset_all_fonts(container, font_stats, report):
    '''
    Subset every embedded font in ``container`` in place, keeping only
    the glyphs actually used in the book.

    :param container: The book container; its font files are rewritten
        on disk and unused fonts are removed from the manifest.
    :param font_stats: Mapping of font name to the set of characters
        used from that font (per the ``.get(name, set())`` usage below).
    :param report: Callable invoked with human-readable progress strings.
    '''
    remove = set()
    total_old = total_new = 0
    for name, mt in container.mime_map.iteritems():
        # Treat anything with a font mimetype OR an otf/ttf extension as
        # a font (some books mislabel font mimetypes).
        if mt in OEB_FONTS or name.rpartition('.')[-1].lower() in {
                'otf', 'ttf'}:
            chars = font_stats.get(name, set())
            path = container.name_path_map[name]
            total_old += os.path.getsize(path)
            if not chars:
                # Font is referenced by nothing: drop it entirely.
                remove.add(name)
                report('Removed unused font: %s' % name)
                continue
            # r+b: read the original bytes, then overwrite in place below.
            with open(path, 'r+b') as f:
                raw = f.read()
                font_name = get_font_names(raw)[-1]
                warnings = []
                container.log('Subsetting font: %s' % (font_name or name))
                try:
                    nraw, old_sizes, new_sizes = subset(
                        raw, chars, warnings=warnings)
                except UnsupportedFont as e:
                    # Best effort: leave unsupported fonts untouched.
                    container.log.warning(
                        'Unsupported font: %s, ignoring. Error: %s' % (
                            name, as_unicode(e)))
                    continue
                for w in warnings:
                    container.log.warn(w)
                olen = sum(old_sizes.itervalues())
                nlen = sum(new_sizes.itervalues())
                total_new += len(nraw)
                if nlen == olen:
                    report('The font %s was already subset' % font_name)
                else:
                    # NOTE(review): percentage math assumes true division
                    # (a `from __future__ import division` at the top of
                    # the file, not visible here) -- otherwise py2
                    # integer division would always yield 0. TODO confirm.
                    report(
                        'Decreased the font %s to %.1f%% of its original size'
                        % (font_name, nlen / olen * 100))
                f.seek(0), f.truncate(), f.write(nraw)
    for name in remove:
        container.remove_item(name)
    if remove:
        # Purge @font-face rules referencing the removed fonts from both
        # standalone stylesheets and inline <style> blocks.
        for name, mt in container.mime_map.iteritems():
            if mt in OEB_STYLES:
                sheet = container.parsed(name)
                if remove_font_face_rules(container, sheet, remove, name):
                    container.dirty(name)
            elif mt in OEB_DOCS:
                for style in XPath('//h:style')(container.parsed(name)):
                    if style.get('type', 'text/css') == 'text/css' \
                            and style.text:
                        sheet = container.parse_css(style.text, name)
                        if remove_font_face_rules(
                                container, sheet, remove, name):
                            style.text = sheet.cssText
                            container.dirty(name)
    if total_old > 0:
        report('Reduced total font size to %.1f%% of original' % (
            total_new / total_old * 100))
    else:
        report('No embedded fonts found')