def serialize_elem(self, elem, item, nsrmap=NSRMAP):
    buf = self.buf
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) not in nsrmap:
        return
    tag = base.prefixname(elem.tag, nsrmap)
    # Previous layers take care of @name
    id_ = elem.attrib.pop('id', None)
    if id_:
        href = '#'.join((item.href, id_))
        offset = self.anchor_offset or buf.tell()
        key = base.urlnormalize(href)
        # Only set this id_offset if it wasn't previously seen
        self.id_offsets[key] = self.id_offsets.get(key, offset)
    if self.anchor_offset is not None and \
            tag == 'a' and not elem.attrib and \
            not len(elem) and not elem.text:
        return
    self.anchor_offset = buf.tell()
    buf.write(b'<')
    buf.write(tag.encode('utf-8'))
    if elem.attrib:
        for attr, val in elem.attrib.items():
            if parse_utils.namespace(attr) not in nsrmap:
                continue
            attr = base.prefixname(attr, nsrmap)
            buf.write(b' ')
            if attr == 'href':
                if self.serialize_href(val, item):
                    continue
            elif attr == 'src':
                href = base.urlnormalize(item.abshref(val))
                if href in self.images:
                    index = self.images[href]
                    self.used_images.add(href)
                    buf.write(b'recindex="%05d"' % index)
                    continue
            buf.write(attr.encode('utf-8'))
            buf.write(b'="')
            self.serialize_text(val, quot=True)
            buf.write(b'"')
    buf.write(b'>')
    if elem.text or len(elem) > 0:
        if elem.text:
            self.anchor_offset = None
            self.serialize_text(elem.text)
        for child in elem:
            self.serialize_elem(child, item)
            if child.tail:
                self.anchor_offset = None
                self.serialize_text(child.tail)
    buf.write(('</%s>' % tag).encode('utf-8'))

def __call__(self, oeb, context):
    import css_parser
    oeb.logger.info('Trimming unused files from manifest...')
    self.opts = context
    used = set()
    for term in oeb.metadata:
        for item in oeb.metadata[term]:
            if item.value in oeb.manifest.hrefs:
                used.add(oeb.manifest.hrefs[item.value])
            elif item.value in oeb.manifest.ids:
                used.add(oeb.manifest.ids[item.value])
    for ref in oeb.guide.values():
        path, _ = urllib.parse.urldefrag(ref.href)
        if path in oeb.manifest.hrefs:
            used.add(oeb.manifest.hrefs[path])
    # TOC items are required to be in the spine
    for item in oeb.spine:
        used.add(item)
    unchecked = used
    while unchecked:
        new = set()
        for item in unchecked:
            if (item.media_type in OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')) and \
                    item.data is not None:
                hrefs = [r[2] for r in iterlinks(item.data)]
                for href in hrefs:
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    try:
                        href = item.abshref(urlnormalize(href))
                    except:
                        continue
                    if href in oeb.manifest.hrefs:
                        found = oeb.manifest.hrefs[href]
                        if found not in used:
                            new.add(found)
            elif item.media_type == CSS_MIME:
                for href in css_parser.getUrls(item.data):
                    href = item.abshref(urlnormalize(href))
                    if href in oeb.manifest.hrefs:
                        found = oeb.manifest.hrefs[href]
                        if found not in used:
                            new.add(found)
        used.update(new)
        unchecked = new
    for item in oeb.manifest.values():
        if item not in used:
            oeb.logger.info('Trimming %r from manifest' % item.href)
            oeb.manifest.remove(item)

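# The trimming pass above is a reachability walk: seed the "used" set with
# items referenced by metadata, guide and spine, then repeatedly add anything
# those items link to, and finally drop whatever was never reached. A minimal
# standalone sketch of the same mark phase, over a hypothetical callable that
# maps an item to the items it links to:
def reachable(seeds, links_of):
    used = set(seeds)
    unchecked = set(seeds)
    while unchecked:
        # Only follow links out of items discovered in the previous round
        new = {target for item in unchecked
               for target in links_of(item) if target not in used}
        used |= new
        unchecked = new
    return used
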
def _toc_from_html(self, opf):
    if 'toc' not in self.oeb.guide:
        return False
    self.log.debug('Reading TOC from HTML...')
    itempath, frag = urllib.parse.urldefrag(self.oeb.guide['toc'].href)
    item = self.oeb.manifest.hrefs[itempath]
    html = item.data
    if frag:
        elems = base.xpath(html, './/*[@id="%s"]' % frag)
        if not elems:
            elems = base.xpath(html, './/*[@name="%s"]' % frag)
        elem = elems[0] if elems else html
        while elem != html and not base.xpath(elem, './/h:a[@href]'):
            elem = elem.getparent()
        html = elem
    titles = collections.defaultdict(list)
    order = []
    for anchor in base.xpath(html, './/h:a[@href]'):
        href = anchor.attrib['href']
        href = item.abshref(base.urlnormalize(href))
        path, frag = urllib.parse.urldefrag(href)
        if path not in self.oeb.manifest.hrefs:
            continue
        title = base.xml2text(anchor)
        title = base.COLLAPSE_RE.sub(' ', title.strip())
        if href not in titles:
            order.append(href)
        titles[href].append(title)
    toc = self.oeb.toc
    for href in order:
        toc.add(' '.join(titles[href]), href)
    return True

def process_fonts(self):
    ''' Make sure all fonts are embeddable '''
    from ebook_converter.ebooks.oeb.base import urlnormalize
    from ebook_converter.utils.fonts.utils import remove_embed_restriction

    processed = set()
    for item in list(self.oeb.manifest):
        if not hasattr(item.data, 'cssRules'):
            continue
        for i, rule in enumerate(item.data.cssRules):
            if rule.type == rule.FONT_FACE_RULE:
                try:
                    s = rule.style
                    src = s.getProperty('src').propertyValue[0].uri
                except:
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None:
                    continue
                raw = nraw = ff.data
                if path not in processed:
                    processed.add(path)
                    try:
                        nraw = remove_embed_restriction(raw)
                    except:
                        continue
                    if nraw != raw:
                        ff.data = nraw
                        self.oeb.container.write(path, nraw)

def __call__(self, oeb, opts):
    import css_parser
    self.log = oeb.logger
    self.opts = opts
    self.oeb = oeb

    for item in oeb.manifest.items:
        self.current_item = item
        if etree.iselement(item.data):
            rewrite_links(self.current_item.data, self.url_replacer)
        elif hasattr(item.data, 'cssText'):
            css_parser.replaceUrls(item.data, self.url_replacer)

    if self.oeb.guide:
        for ref in self.oeb.guide.values():
            href = urlnormalize(ref.href)
            href, frag = urllib.parse.urldefrag(href)
            replacement = self.rename_map.get(href, None)
            if replacement is not None:
                nhref = replacement
                if frag:
                    nhref += '#' + frag
                ref.href = nhref

    if self.oeb.toc:
        self.fix_toc_entry(self.oeb.toc)

def serialize_href(self, href, _base=None):
    """
    Serialize the href attribute of an <a> or <reference> tag. It is
    serialized as filepos="000000000" and a pointer to its location is
    stored in self.href_offsets so that the correct value can be filled
    in at the end.
    """
    hrefs = self.oeb.manifest.hrefs
    try:
        path, frag = urllib.parse.urldefrag(base.urlnormalize(href))
    except ValueError:
        # Unparseable URL
        return False
    if path and _base:
        path = _base.abshref(path)
    if path and path not in hrefs:
        return False
    buf = self.buf
    item = hrefs[path] if path else None
    if item and item.spine_position is None:
        return False
    path = item.href if item else _base.href
    href = '#'.join((path, frag)) if frag else path
    buf.write(b'filepos=')
    self.href_offsets[href].append(buf.tell())
    buf.write(b'0000000000')
    return True

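# Standalone sketch (not part of the serializer above) of the placeholder
# technique serialize_href relies on: write a fixed-width filepos value,
# remember where it was written, and patch the real offset in later once the
# target's position is known. All names below are illustrative only.
import io

def placeholder_backfill_demo():
    buf = io.BytesIO()
    pending = []                   # offsets of placeholders awaiting a value
    buf.write(b'<a filepos=')
    pending.append(buf.tell())
    buf.write(b'0000000000')       # same width as the final value
    buf.write(b' >link</a>')
    target = buf.tell()            # position the link should point at
    buf.write(b'<p>target</p>')
    for pos in pending:            # backfill without changing the length
        buf.seek(pos)
        buf.write(b'%010d' % target)
    return buf.getvalue()
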
def find_font_face_rules(sheet, oeb):
    '''
    Find all @font-face rules in the given sheet and extract the relevant
    info from them. sheet can be either a ManifestItem or a CSSStyleSheet.
    '''
    ans = []
    try:
        rules = sheet.data.cssRules
    except AttributeError:
        rules = sheet.cssRules

    for i, rule in enumerate(rules):
        if rule.type != rule.FONT_FACE_RULE:
            continue
        props = get_font_properties(rule, default='normal')
        if not props['font-family'] or not props['src']:
            continue
        try:
            path = sheet.abshref(props['src'])
        except AttributeError:
            path = props['src']
        ff = oeb.manifest.hrefs.get(urlnormalize(path), None)
        if not ff:
            continue
        props['item'] = ff
        if props['font-weight'] in {'bolder', 'lighter'}:
            props['font-weight'] = '400'
        props['weight'] = int(props['font-weight'])
        props['rule'] = rule
        props['chars'] = set()
        ans.append(props)

    return ans

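# find_font_face_rules coerces the relative keywords 'bolder'/'lighter' (which
# have no fixed numeric value) to 400 before the int() conversion. A minimal
# sketch of that normalization, assuming the usual CSS keyword mapping for
# 'normal' and 'bold'; the helper name is ours, not from the source:
def numeric_font_weight(weight):
    if weight in {'bolder', 'lighter'}:
        return 400
    return {'normal': 400, 'bold': 700}.get(weight) or int(weight)
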
def rewrite_link(self, url, page=None):
    if not page:
        return url
    abs_url = page.abshref(base.urlnormalize(url))
    if abs_url in self.images:
        return 'images/%s' % self.images[abs_url]
    if abs_url in self.links:
        return self.links[abs_url]
    return url

def rasterize_item(self, item):
    html = item.data
    hrefs = self.oeb.manifest.hrefs
    for elem in xpath(html, '//h:img[@src]'):
        src = urlnormalize(elem.attrib['src'])
        image = hrefs.get(item.abshref(src), None)
        if image and image.media_type == SVG_MIME:
            style = self.stylizer(item).style(elem)
            self.rasterize_external(elem, style, item, image)
    for elem in xpath(html, '//h:object[@type="%s" and @data]' % SVG_MIME):
        data = urlnormalize(elem.attrib['data'])
        image = hrefs.get(item.abshref(data), None)
        if image and image.media_type == SVG_MIME:
            style = self.stylizer(item).style(elem)
            self.rasterize_external(elem, style, item, image)
    for elem in xpath(html, '//svg:svg'):
        style = self.stylizer(item).style(elem)
        self.rasterize_inline(elem, style, item)

def inspect_cover(self, href):
    from ebook_converter.ebooks.oeb.base import urlnormalize
    for x in self.oeb.manifest:
        if x.href == urlnormalize(href):
            try:
                raw = x.data
                return identify(raw)[1:]
            except Exception:
                self.log.exception('Failed to read cover image dimensions')
    return -1, -1

def __call__(self, oeb, context):
    has_toc = getattr(getattr(oeb, 'toc', False), 'nodes', False)

    if 'toc' in oeb.guide:
        # Ensure toc pointed to in <guide> is in spine
        from ebook_converter.ebooks.oeb.base import urlnormalize
        href = urlnormalize(oeb.guide['toc'].href)
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if (hasattr(item.data, 'xpath') and
                    base.XPath('//h:a[@href]')(item.data)):
                if oeb.spine.index(item) < 0:
                    if self.position == 'end':
                        oeb.spine.add(item, linear=False)
                    else:
                        oeb.spine.insert(0, item, linear=True)
                return
            elif has_toc:
                oeb.guide.remove('toc')
        else:
            oeb.guide.remove('toc')

    if not has_toc:
        return
    oeb.logger.info('Generating in-line TOC...')
    title = self.title or oeb.translate(DEFAULT_TITLE)
    style = self.style
    if style not in STYLE_CSS:
        oeb.logger.error('Unknown TOC style %r', style)
        style = 'nested'
    id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
    oeb.manifest.add(id, css_href, base.CSS_MIME, data=STYLE_CSS[style])
    language = str(oeb.metadata.language[0])
    contents = base.element(None, base.tag('xhtml', 'html'),
                            nsmap={None: const.XHTML_NS},
                            attrib={base.tag('xml', 'lang'): language})
    head = base.element(contents, base.tag('xhtml', 'head'))
    htitle = base.element(head, base.tag('xhtml', 'title'))
    htitle.text = title
    base.element(head, base.tag('xhtml', 'link'), rel='stylesheet',
                 type=base.CSS_MIME, href=css_href)
    body = base.element(contents, base.tag('xhtml', 'body'),
                        attrib={'class': 'calibre_toc'})
    h1 = base.element(body, base.tag('xhtml', 'h2'),
                      attrib={'class': 'calibre_toc_header'})
    h1.text = title
    self.add_toc_level(body, oeb.toc)
    id, href = oeb.manifest.generate('contents', 'contents.xhtml')
    item = oeb.manifest.add(id, href, base.XHTML_MIME, data=contents)
    if self.position == 'end':
        oeb.spine.add(item, linear=False)
    else:
        oeb.spine.insert(0, item, linear=True)
    oeb.guide.add('toc', 'Table of Contents', href)

def fix_toc_entry(self, toc):
    if toc.href:
        href = urlnormalize(toc.href)
        href, frag = urllib.parse.urldefrag(href)
        replacement = self.rename_map.get(href, None)
        if replacement is not None:
            nhref = replacement
            if frag:
                nhref = '#'.join((nhref, frag))
            toc.href = nhref
    for x in toc:
        self.fix_toc_entry(x)

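# Illustrative sketch of the rename pattern shared by fix_toc_entry and the
# guide fix-up above: strip the fragment, look the bare path up in a rename
# map, then re-attach the fragment. The rename map and helper name below are
# hypothetical, for demonstration only.
import urllib.parse

def rename_href_demo(href, rename_map):
    path, frag = urllib.parse.urldefrag(href)
    replacement = rename_map.get(path)
    if replacement is None:
        return href
    return '#'.join((replacement, frag)) if frag else replacement

# rename_href_demo('old/ch1.html#s2', {'old/ch1.html': 'index_split_001.html'})
# -> 'index_split_001.html#s2'
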
def remove_images(self, item, limit=1):
    path = XPath('//h:img[@src]')
    removed = 0
    for img in path(item.data):
        if removed >= limit:
            break
        href = item.abshref(img.get('src'))
        image = self.oeb.manifest.hrefs.get(href)
        if image is None:
            href = urlnormalize(href)
            image = self.oeb.manifest.hrefs.get(href)
        if image is not None:
            self.oeb.manifest.remove(image)
            self.oeb.guide.remove_by_href(href)
            img.getparent().remove(img)
            removed += 1
    return removed

def _guide_from_opf(self, opf):
    guide = self.oeb.guide
    manifest = self.oeb.manifest
    for elem in base.xpath(opf, '/o2:package/o2:guide/o2:reference'):
        ref_href = elem.get('href')
        path = base.urlnormalize(urllib.parse.urldefrag(ref_href)[0])
        if path not in manifest.hrefs:
            corrected_href = None
            for href in manifest.hrefs:
                if href.lower() == path.lower():
                    corrected_href = href
                    break
            if corrected_href is None:
                self.logger.warn('Guide reference %r not found' % ref_href)
                continue
            ref_href = corrected_href
        typ = elem.get('type')
        if typ not in guide:
            guide.add(typ, elem.get('title'), ref_href)

def url_replacer(self, orig_url):
    url = urlnormalize(orig_url)
    parts = urllib.parse.urlparse(url)
    if parts.scheme:
        # Only rewrite local URLs
        return orig_url
    path, frag = urllib.parse.urldefrag(url)
    if self.renamed_items_map:
        orig_item = self.renamed_items_map.get(self.current_item.href,
                                               self.current_item)
    else:
        orig_item = self.current_item
    href = orig_item.abshref(path)
    replacement = self.current_item.relhref(
        self.rename_map.get(href, href))
    if frag:
        replacement += '#' + frag
    return replacement

def dataize_svg(self, item, svg=None):
    if svg is None:
        svg = item.data
    hrefs = self.oeb.manifest.hrefs
    for elem in xpath(svg, '//svg:*[@xl:href]'):
        href = urlnormalize(elem.attrib[base.tag('xlink', 'href')])
        path = urllib.parse.urldefrag(href)[0]
        if not path:
            continue
        abshref = item.abshref(path)
        if abshref not in hrefs:
            continue
        linkee = hrefs[abshref]
        data = linkee.bytes_representation
        ext = what(None, data) or 'jpg'
        with PersistentTemporaryFile(suffix='.' + ext) as pt:
            pt.write(data)
            self.temp_files.append(pt.name)
        elem.attrib[base.tag('xlink', 'href')] = pt.name
    return svg

def _pages_from_page_map(self, opf):
    item = self._find_page_map(opf)
    if item is None:
        return False
    pmap = item.data
    pages = self.oeb.pages
    for page in base.xpath(pmap, 'o2:page'):
        name = page.get('name', '')
        href = page.get('href')
        if not href:
            continue
        name = base.COLLAPSE_RE.sub(' ', name.strip())
        href = item.abshref(base.urlnormalize(href))
        type = 'normal'
        if not name:
            type = 'special'
        elif name.lower().strip('ivxlcdm') == '':
            type = 'front'
        pages.add(name, href, type=type)
    return True

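# The 'front' classification above rests on a small heuristic: a page name
# made up only of roman-numeral letters is treated as front matter. A
# standalone illustration (the function name is ours, not from the source):
def classify_page_name(name):
    name = name.strip()
    if not name:
        return 'special'
    if name.lower().strip('ivxlcdm') == '':
        return 'front'      # e.g. 'iv', 'XII'
    return 'normal'         # e.g. '7', 'A-3'
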
def _toc_from_tour(self, opf):
    result = base.xpath(opf, 'o2:tours/o2:tour')
    if not result:
        return False
    self.log.debug('Reading TOC from tour...')
    tour = result[0]
    toc = self.oeb.toc
    toc.title = tour.get('title')
    sites = base.xpath(tour, 'o2:site')
    for site in sites:
        title = site.get('title')
        href = site.get('href')
        if not title or not href:
            continue
        path, _ = urllib.parse.urldefrag(base.urlnormalize(href))
        if path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            continue
        id = site.get('id')
        toc.add(title, href, id=id)
    return True

def rewrite_links(self, url):
    href, frag = urllib.parse.urldefrag(url)
    try:
        href = self.current_item.abshref(href)
    except ValueError:
        # Unparseable URL
        return url
    try:
        href = base.urlnormalize(href)
    except ValueError:
        # href has non utf-8 quoting
        return url
    if href in self.map:
        anchor_map = self.map[href]
        nhref = anchor_map[frag if frag else None]
        nhref = self.current_item.relhref(nhref)
        if frag:
            nhref = '#'.join((polyglot.unquote(nhref), frag))
        return nhref
    return url

def _pages_from_ncx(self, opf, item):
    if item is None:
        return False
    ncx = item.data
    if ncx is None:
        return False
    ptargets = base.xpath(ncx, 'ncx:pageList/ncx:pageTarget')
    if not ptargets:
        return False
    pages = self.oeb.pages
    for ptarget in ptargets:
        name = ''.join(base.xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
        name = base.COLLAPSE_RE.sub(' ', name.strip())
        href = base.xpath(ptarget, 'ncx:content/@src')
        if not href:
            continue
        href = item.abshref(base.urlnormalize(href[0]))
        id = ptarget.get('id')
        type = ptarget.get('type', 'normal')
        klass = ptarget.get('class')
        pages.add(name, href, type=type, id=id, klass=klass)
    return True

def _spine_add_extra(self):
    manifest = self.oeb.manifest
    spine = self.oeb.spine
    unchecked = set(spine)
    selector = base.XPath('h:body//h:a/@href')
    extras = set()
    while unchecked:
        new = set()
        for item in unchecked:
            if item.media_type not in base.OEB_DOCS:
                # TODO: handle fallback chains
                continue
            for href in selector(item.data):
                href, _ = urllib.parse.urldefrag(href)
                if not href:
                    continue
                try:
                    href = item.abshref(base.urlnormalize(href))
                except ValueError:
                    # Malformed URL
                    continue
                if href not in manifest.hrefs:
                    continue
                found = manifest.hrefs[href]
                if found.media_type not in base.OEB_DOCS or \
                        found in spine or found in extras:
                    continue
                new.add(found)
        extras.update(new)
        unchecked = new
    version = int(self.oeb.version[0])
    removed_items_to_ignore = getattr(self.oeb,
                                      'removed_items_to_ignore', ())
    for item in sorted(extras):
        if item.href in removed_items_to_ignore:
            continue
        if version >= 2:
            self.logger.warn('Spine-referenced file %r not in spine'
                             % item.href)
        spine.add(item, linear=False)

def serialize_toc_level(tocref, href=None):
    # Add the provided TOC level to the output stream. If href is provided,
    # add a link ref to the TOC level output (e.g. feed_0/index.html).
    if href is not None:
        # resolve the section url in id_offsets
        buf.write(b'<mbp:pagebreak />')
        self.id_offsets[base.urlnormalize(href)] = buf.tell()

    if tocref.klass == "periodical":
        buf.write(b'<div> <div height="1em"></div>')
    else:
        t = tocref.title
        if isinstance(t, str):
            t = t.encode('utf-8')
        buf.write(
            b'<div></div> <div> <h2 height="1em"><font size="+2"><b>' +
            t + b'</b></font></h2> <div height="1em"></div>')

    buf.write(b'<ul>')

    for tocitem in tocref.nodes:
        buf.write(b'<li><a filepos=')
        itemhref = tocitem.href
        if tocref.klass == 'periodical':
            # This is a section node.
            # For periodical tocs, the section urls are like
            # r'feed_\d+/index.html'. We don't want to point to the start
            # of the first article, so we change the href.
            itemhref = re.sub(r'article_\d+/', '', itemhref)
        self.href_offsets[itemhref].append(buf.tell())
        buf.write(b'0000000000')
        buf.write(b' ><font size="+1"><b><u>')
        t = tocitem.title
        if isinstance(t, str):
            t = t.encode('utf-8')
        buf.write(t)
        buf.write(b'</u></b></font></a></li>')

    buf.write(b'</ul><div height="1em"></div></div><mbp:pagebreak />')

def serialize_item(self, item):
    '''
    Serialize an individual item from the spine of the input document.
    A reference to this item is stored in self.href_offsets
    '''
    buf = self.buf
    if not item.linear:
        self.breaks.append(buf.tell() - 1)
    self.id_offsets[base.urlnormalize(item.href)] = buf.tell()
    if item.is_section_start:
        buf.write(b'<a ></a> ')
    if item.is_article_start:
        buf.write(b'<a ></a> <a ></a>')
    for elem in item.data.find(base.tag('xhtml', 'body')):
        self.serialize_elem(elem, item)
    if self.write_page_breaks_after_item:
        buf.write(b'<mbp:pagebreak/>')
    if item.is_article_end:
        # Kindle periodical article end marker
        buf.write(b'<a ></a> <a ></a>')
    if item.is_section_end:
        buf.write(b' <a ></a>')
    self.anchor_offset = None

def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
    """
    This function is intended to be used in a recursive manner. dump_text
    will run through all elements in the elem_tree and call itself on each
    element.

    self.image_hrefs will be populated by calling this function.

    @param elem_tree: etree representation of XHTML content to be
                      transformed.
    @param stylizer: Used to track the style of elements within the tree.
    @param page: OEB page used to determine absolute urls.
    @param tag_stack: List of open FB2 tags to take into account.

    @return: List of strings representing the XHTML converted to FB2 markup.
    """
    elem = elem_tree

    # Ensure what we are converting is not a string and that the first tag
    # is part of the XHTML namespace.
    if (not isinstance(elem_tree.tag, (str, bytes)) or
            parse_utils.namespace(elem_tree.tag) != const.XHTML_NS):
        p = elem.getparent()
        if (p is not None and isinstance(p.tag, (str, bytes)) and
                parse_utils.namespace(p.tag) == const.XHTML_NS and
                elem.tail):
            return [elem.tail]
        return []

    style = stylizer.style(elem_tree)
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return []

    # FB2 generated output.
    fb2_out = []
    # FB2 tags in the order they are opened. This will be used to close
    # the tags.
    tags = []
    # First tag in tree
    tag = parse_utils.barename(elem_tree.tag)
    # Number of blank lines above tag
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems < 0:
            ems = 0
    except Exception:
        ems = 0

    # Convert TOC entries to <title>s and add <section>s
    if self.opts.sectionize == 'toc':
        # A section cannot be a child of any other element than another
        # section, so leave the tag alone if there are parents
        if not tag_stack:
            # There are two reasons to start a new section here: the TOC
            # pointed to this page (then we use the first non-<body> on
            # the page as a <title>), or the TOC pointed to a specific
            # element
            newlevel = 0
            toc_entry = self.toc.get(page.href, None)
            if toc_entry is not None:
                if None in toc_entry:
                    if (tag != 'body' and hasattr(elem_tree, 'text') and
                            elem_tree.text):
                        newlevel = 1
                        self.toc[page.href] = None
                if (not newlevel and
                        elem_tree.attrib.get('id', None) is not None):
                    newlevel = toc_entry.get(
                        elem_tree.attrib.get('id', None), None)

            # Start a new section if necessary
            if newlevel:
                while newlevel <= self.section_level:
                    fb2_out.append('</section>')
                    self.section_level -= 1
                fb2_out.append('<section>')
                self.section_level += 1
                fb2_out.append('<title>')
                tags.append('title')
        if self.section_level == 0:
            # If none of the prior processing made a section, make one now
            # to be FB2 spec compliant
            fb2_out.append('<section>')
            self.section_level += 1

    # Process the XHTML tag and styles. Converted to an FB2 tag.
    # Use individual if statements, not if/else. There can be only one
    # XHTML tag but it can have multiple styles.
    if tag == 'img' and elem_tree.attrib.get('src', None):
        # Only write the image tag if it is in the manifest.
        ihref = base.urlnormalize(page.abshref(elem_tree.attrib['src']))
        if ihref in self.oeb_book.manifest.hrefs:
            if ihref not in self.image_hrefs:
                self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs)
            p_txt, p_tag = self.ensure_p()
            fb2_out += p_txt
            tags += p_tag
            fb2_out.append('<image l:href="#%s"/>' %
                           self.image_hrefs[ihref])
        else:
            self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
    if tag in ('br', 'hr') or ems >= 1:
        if ems < 1:
            multiplier = 1
        else:
            multiplier = ems
        if self.in_p:
            closed_tags = []
            open_tags = tag_stack + tags
            open_tags.reverse()
            for t in open_tags:
                fb2_out.append('</%s>' % t)
                closed_tags.append(t)
                if t == 'p':
                    break
            fb2_out.append('<empty-line/>' * multiplier)
            closed_tags.reverse()
            for t in closed_tags:
                fb2_out.append('<%s>' % t)
        else:
            fb2_out.append('<empty-line/>' * multiplier)
    if tag in ('div', 'li', 'p'):
        p_text, added_p = self.close_open_p(tag_stack + tags)
        fb2_out += p_text
        if added_p:
            tags.append('p')
    if tag == 'a' and elem_tree.attrib.get('href', None):
        # Handle only external links for now
        if urllib.parse.urlparse(elem_tree.attrib['href']).netloc:
            p_txt, p_tag = self.ensure_p()
            fb2_out += p_txt
            tags += p_tag
            fb2_out.append('<a l:href="%s">' %
                           base.urlnormalize(elem_tree.attrib['href']))
            tags.append('a')
    if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
        s_out, s_tags = self.handle_simple_tag('strong', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'i' or style['font-style'] == 'italic':
        s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if (tag in ('del', 'strike') or
            style['text-decoration'] == 'line-through'):
        s_out, s_tags = self.handle_simple_tag('strikethrough',
                                               tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'sub':
        s_out, s_tags = self.handle_simple_tag('sub', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'sup':
        s_out, s_tags = self.handle_simple_tag('sup', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags

    # Process element text.
    if hasattr(elem_tree, 'text') and elem_tree.text:
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.text))
        if not self.in_p:
            fb2_out.append('</p>')

    # Process sub-elements.
    for item in elem_tree:
        fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close open FB2 tags.
    tags.reverse()
    fb2_out += self.close_tags(tags)

    # Process element text that comes after the close of the XHTML tag but
    # before the next XHTML tag.
    if hasattr(elem_tree, 'tail') and elem_tree.tail:
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.tail))
        if not self.in_p:
            fb2_out.append('</p>')

    return fb2_out

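# Sketch of the blank-line computation dump_text performs above: the top
# margin is divided by the element's font size, minus one, rounded and
# clamped at zero, and that many '<empty-line/>' tags are emitted before the
# block. The helper name and values are illustrative only.
def blank_lines_above(margin_top_pt, font_size_pt):
    try:
        ems = int(round((float(margin_top_pt) / font_size_pt) - 1))
    except Exception:
        return 0
    return max(ems, 0)

# blank_lines_above(36, 12) -> 2, i.e. '<empty-line/>' written twice
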
def serialize_body(self):
    '''
    Serialize all items in the spine of the document. Non-linear items are
    moved to the end.
    '''
    buf = self.buf

    def serialize_toc_level(tocref, href=None):
        # Add the provided TOC level to the output stream. If href is
        # provided, add a link ref to the TOC level output
        # (e.g. feed_0/index.html).
        if href is not None:
            # resolve the section url in id_offsets
            buf.write(b'<mbp:pagebreak />')
            self.id_offsets[base.urlnormalize(href)] = buf.tell()

        if tocref.klass == "periodical":
            buf.write(b'<div> <div height="1em"></div>')
        else:
            t = tocref.title
            if isinstance(t, str):
                t = t.encode('utf-8')
            buf.write(
                b'<div></div> <div> <h2 height="1em"><font size="+2"><b>' +
                t + b'</b></font></h2> <div height="1em"></div>')

        buf.write(b'<ul>')

        for tocitem in tocref.nodes:
            buf.write(b'<li><a filepos=')
            itemhref = tocitem.href
            if tocref.klass == 'periodical':
                # This is a section node.
                # For periodical tocs, the section urls are like
                # r'feed_\d+/index.html'. We don't want to point to the
                # start of the first article, so we change the href.
                itemhref = re.sub(r'article_\d+/', '', itemhref)
            self.href_offsets[itemhref].append(buf.tell())
            buf.write(b'0000000000')
            buf.write(b' ><font size="+1"><b><u>')
            t = tocitem.title
            if isinstance(t, str):
                t = t.encode('utf-8')
            buf.write(t)
            buf.write(b'</u></b></font></a></li>')

        buf.write(b'</ul><div height="1em"></div></div><mbp:pagebreak />')

    self.anchor_offset = buf.tell()
    buf.write(b'<body>')
    self.body_start_offset = buf.tell()

    if self.is_periodical:
        top_toc = self.oeb.toc.nodes[0]
        serialize_toc_level(top_toc)

    spine = [item for item in self.oeb.spine if item.linear]
    spine.extend([item for item in self.oeb.spine if not item.linear])

    for item in spine:
        if self.is_periodical and item.is_section_start:
            for section_toc in top_toc.nodes:
                if base.urlnormalize(item.href) == section_toc.href:
                    # create section url of the form r'feed_\d+/index.html'
                    section_url = re.sub(r'article_\d+/', '',
                                         section_toc.href)
                    serialize_toc_level(section_toc, section_url)
                    section_toc.href = section_url
                    break

        self.serialize_item(item)

    self.body_end_offset = buf.tell()
    buf.write(b'</body>')

def _toc_from_navpoint(self, item, toc, navpoint):
    children = base.xpath(navpoint, 'ncx:navPoint')
    for child in children:
        title = ''.join(base.xpath(child, 'ncx:navLabel/ncx:text/text()'))
        title = base.COLLAPSE_RE.sub(' ', title.strip())
        href = base.xpath(child, 'ncx:content/@src')
        if not title:
            self._toc_from_navpoint(item, toc, child)
            continue
        if (not href or not href[0]) and \
                not base.xpath(child, 'ncx:navPoint'):
            # This node is useless
            continue
        if href and href[0]:
            href = item.abshref(base.urlnormalize(href[0]))
        else:
            href = ''
        path, _ = urllib.parse.urldefrag(href)
        if path and path not in self.oeb.manifest.hrefs:
            path = base.urlnormalize(path)
        if href and path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            gc = base.xpath(child, 'ncx:navPoint')
            if not gc:
                # This node is useless
                continue
        id = child.get('id')
        klass = child.get('class', 'chapter')

        try:
            po = int(child.get('playOrder',
                               self.oeb.toc.next_play_order()))
        except Exception:
            po = self.oeb.toc.next_play_order()

        authorElement = base.xpath(
            child, 'descendant::calibre:meta[@name = "author"]')
        if authorElement:
            author = authorElement[0].text
        else:
            author = None

        descriptionElement = base.xpath(
            child, 'descendant::calibre:meta[@name = "description"]')
        if descriptionElement:
            description = etree.tostring(descriptionElement[0],
                                         method='text',
                                         encoding='unicode').strip()
            if not description:
                description = None
        else:
            description = None

        index_image = base.xpath(
            child, 'descendant::calibre:meta[@name = "toc_thumbnail"]')
        toc_thumbnail = (index_image[0].text if index_image else None)
        if not toc_thumbnail or not toc_thumbnail.strip():
            toc_thumbnail = None

        node = toc.add(title, href, id=id, klass=klass, play_order=po,
                       description=description, author=author,
                       toc_thumbnail=toc_thumbnail)

        self._toc_from_navpoint(item, node, child)

def create_oebbook(self, htmlpath, basedir, opts, log, mi):
    import uuid
    from ebook_converter.ebooks.conversion.plumber import create_oebbook
    from ebook_converter.ebooks.oeb.base import (DirContainer,
        rewrite_links, urlnormalize, BINARY_MIME, OEB_STYLES, xpath,
        urlquote)
    from ebook_converter.ebooks.oeb.transforms.metadata import \
        meta_info_to_oeb_metadata
    from ebook_converter.ebooks.html.input import get_filelist
    from ebook_converter.ebooks.metadata import string_to_authors
    from ebook_converter.utils.localization import canonicalize_lang
    import css_parser, logging
    css_parser.log.setLevel(logging.WARN)
    self.OEB_STYLES = OEB_STYLES
    oeb = create_oebbook(log, None, opts, self,
                         encoding=opts.input_encoding, populate=False)
    self.oeb = oeb

    metadata = oeb.metadata
    meta_info_to_oeb_metadata(mi, metadata, log)
    if not metadata.language:
        l = canonicalize_lang(getattr(opts, 'language', None))
        if not l:
            oeb.logger.warn('Language not specified')
            l = get_lang().replace('_', '-')
        metadata.add('language', l)
    if not metadata.creator:
        a = getattr(opts, 'authors', None)
        if a:
            a = string_to_authors(a)
        if not a:
            oeb.logger.warn('Creator not specified')
            a = [self.oeb.translate('Unknown')]
        for aut in a:
            metadata.add('creator', aut)
    if not metadata.title:
        oeb.logger.warn('Title not specified')
        metadata.add('title', self.oeb.translate('Unknown'))
    bookid = str(uuid.uuid4())
    metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
    for ident in metadata.identifier:
        if 'id' in ident.attrib:
            self.oeb.uid = metadata.identifier[0]
            break

    filelist = get_filelist(htmlpath, basedir, opts, log)
    filelist = [f for f in filelist if not f.is_binary]
    htmlfile_map = {}
    for f in filelist:
        path = f.path
        oeb.container = DirContainer(os.path.dirname(path), log,
                                     ignore_opf=True)
        bname = os.path.basename(path)
        id, href = oeb.manifest.generate(id='html',
                                         href=sanitize_file_name(bname))
        htmlfile_map[path] = href
        item = oeb.manifest.add(id, href, 'text/html')
        if path == htmlpath and '%' in path:
            bname = urlquote(bname)
            item.html_input_href = bname
        oeb.spine.add(item, True)

    self.added_resources = {}
    self.log = log
    self.log('Normalizing filename cases')
    for path, href in htmlfile_map.items():
        self.added_resources[path] = href
    self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
    self.urldefrag = urllib.parse.urldefrag
    self.BINARY_MIME = BINARY_MIME

    self.log('Rewriting HTML links')
    for f in filelist:
        path = f.path
        dpath = os.path.dirname(path)
        oeb.container = DirContainer(dpath, log, ignore_opf=True)
        href = htmlfile_map[path]
        try:
            item = oeb.manifest.hrefs[href]
        except KeyError:
            item = oeb.manifest.hrefs[urlnormalize(href)]
        rewrite_links(item.data,
                      functools.partial(self.resource_adder, base=dpath))

    for item in oeb.manifest.values():
        if item.media_type in self.OEB_STYLES:
            dpath = None
            for path, href in self.added_resources.items():
                if href == item.href:
                    dpath = os.path.dirname(path)
                    break
            css_parser.replaceUrls(
                item.data,
                functools.partial(self.resource_adder, base=dpath))

    toc = self.oeb.toc
    self.oeb.auto_generated_toc = True
    titles = []
    headers = []
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
        title = re.sub(r'\s+', ' ', title.strip())
        if title:
            titles.append(title)
        headers.append('(unlabled)')
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            expr = '/h:html/h:body//h:%s[position()=1]/text()'
            header = ''.join(xpath(html, expr % tag))
            header = re.sub(r'\s+', ' ', header.strip())
            if header:
                headers[-1] = header
                break
    use = titles
    if len(titles) > len(set(titles)):
        use = headers
    for title, item in zip(use, self.oeb.spine):
        if not item.linear:
            continue
        toc.add(title, item.href)

    oeb.container = DirContainer(os.getcwd(), oeb.log, ignore_opf=True)
    return oeb

def convert_epub3_nav(self, nav_path, opf, log, opts):
    from lxml import etree
    from ebook_converter.ebooks.chardet import xml_to_unicode
    from ebook_converter.ebooks.oeb.polish.parsing import parse
    from ebook_converter.ebooks.oeb.base import serialize
    from ebook_converter.ebooks.oeb.polish.toc import first_child
    from tempfile import NamedTemporaryFile
    with open(nav_path, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         assume_utf8=True)[0]
    root = parse(raw, log=log)
    ncx = etree.fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/'
                           'ncx/" version="2005-1" xml:lang="eng">'
                           '<navMap/></ncx>')
    navmap = ncx[0]
    et = '{%s}type' % const.EPUB_NS
    bn = os.path.basename(nav_path)

    def add_from_li(li, parent):
        href = text = None
        for x in li.iterchildren(base.tag('xhtml', 'a'),
                                 base.tag('xhtml', 'span')):
            text = etree.tostring(
                x, method='text', encoding='unicode',
                with_tail=False).strip() or ' '.join(
                    x.xpath('descendant-or-self::*/@title')).strip()
            href = x.get('href')
            if href:
                if href.startswith('#'):
                    href = bn + href
                break
        np = parent.makeelement(base.tag('ncx', 'navPoint'))
        parent.append(np)
        np.append(np.makeelement(base.tag('ncx', 'navLabel')))
        np[0].append(np.makeelement(base.tag('ncx', 'text')))
        np[0][0].text = text
        if href:
            np.append(np.makeelement(base.tag('ncx', 'content'),
                                     attrib={'src': href}))
        return np

    def process_nav_node(node, toc_parent):
        for li in node.iterchildren(base.tag('xhtml', 'li')):
            child = add_from_li(li, toc_parent)
            ol = first_child(li, base.tag('xhtml', 'ol'))
            if child is not None and ol is not None:
                process_nav_node(ol, child)

    for nav in root.iterdescendants(base.tag('xhtml', 'nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, base.tag('xhtml', 'ol'))
            if ol is not None:
                process_nav_node(ol, navmap)
                break
    else:
        return

    with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path),
                            delete=False) as f:
        f.write(etree.tostring(ncx, encoding='utf-8'))
    ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/')
    ncx_id = opf.create_manifest_item(ncx_href, base.NCX_MIME,
                                      append=True).get('id')
    for spine in opf.root.xpath('//*[local-name()="spine"]'):
        spine.set('toc', ncx_id)
    url = os.path.relpath(nav_path).replace(os.sep, '/')
    opts.epub3_nav_href = base.urlnormalize(url)
    opts.epub3_nav_parsed = root
    if getattr(self, 'removed_cover', None):
        changed = False
        base_path = os.path.dirname(nav_path)
        for elem in root.xpath('//*[@href]'):
            href, frag = elem.get('href').partition('#')[::2]
            link_path = os.path.relpath(
                os.path.join(base_path, urllib.parse.unquote(href)),
                base_path)
            abs_href = base.urlnormalize(link_path)
            if abs_href == self.removed_cover:
                changed = True
                elem.set('data-calibre-removed-titlepage', '1')
        if changed:
            with open(nav_path, 'wb') as f:
                f.write(base.serialize(root, 'application/xhtml+xml'))

def mobimlize_elem(self, elem, stylizer, bstate, istates,
                   ignore_valign=False):
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) != const.XHTML_NS:
        return
    style = stylizer.style(elem)
    # <mbp:frame-set/> does not exist lalalala
    if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or
            style['visibility'] == 'hidden') and
            elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
        id_ = elem.get('id', None)
        if id_:
            # Keep anchors so people can use display:none
            # to generate hidden TOCs
            tail = elem.tail
            elem.clear()
            elem.text = None
            elem.set('id', id_)
            elem.tail = tail
            elem.tag = base.tag('xhtml', 'a')
        else:
            return
    tag = parse_utils.barename(elem.tag)
    istate = copy.copy(istates[-1])
    istate.rendered = False
    istate.list_num = 0
    if tag == 'ol' and 'start' in elem.attrib:
        try:
            istate.list_num = int(elem.attrib['start'])-1
        except:
            pass
    istates.append(istate)
    left = 0
    display = style['display']
    if display == 'table-cell':
        display = 'inline'
    elif display.startswith('table'):
        display = 'block'
    isblock = (not display.startswith('inline') and
               style['display'] != 'none')
    isblock = isblock and style['float'] == 'none'
    isblock = isblock and tag != 'br'
    if isblock:
        bstate.para = None
        istate.halign = style['text-align']
        rawti = style._get('text-indent')
        istate.indent = style['text-indent']
        if hasattr(rawti, 'strip') and '%' in rawti:
            # We have a percentage text indent, these can come out looking
            # too large if the user chooses a wide output profile like
            # tablet
            istate.indent = min(style._unit_convert(rawti, base=500),
                                istate.indent)
        if style['margin-left'] == 'auto' \
                and style['margin-right'] == 'auto':
            istate.halign = 'center'
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        if tag != 'body':
            left = margin + padding
        istate.left += left
        vmargin = asfloat(style['margin-top'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-top'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    elif not istate.href:
        margin = asfloat(style['margin-left'])
        padding = asfloat(style['padding-left'])
        lspace = margin + padding
        if lspace > 0:
            spaces = int(round((lspace * 3) / style['font-size']))
            elem.text = ('\xa0' * spaces) + (elem.text or '')
        margin = asfloat(style['margin-right'])
        padding = asfloat(style['padding-right'])
        rspace = margin + padding
        if rspace > 0:
            spaces = int(round((rspace * 3) / style['font-size']))
            if len(elem) == 0:
                elem.text = (elem.text or '') + ('\xa0' * spaces)
            else:
                last = elem[-1]
                last.text = (last.text or '') + ('\xa0' * spaces)
    if bstate.content and style['page-break-before'] in PAGE_BREAKS:
        bstate.pbreak = True
    istate.fsize = self.mobimlize_font(style['font-size'])
    istate.italic = True if style['font-style'] == 'italic' else False
    weight = style['font-weight']
    istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
    istate.preserve = style['white-space'] == 'pre'
    istate.pre_wrap = style['white-space'] == 'pre-wrap'
    istate.bgcolor = style['background-color']
    istate.fgcolor = style['color']
    istate.strikethrough = \
        style.effective_text_decoration == 'line-through'
    istate.underline = style.effective_text_decoration == 'underline'
    ff = style['font-family'].lower() \
        if hasattr(style['font-family'], 'lower') else ''
    if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
        istate.family = 'monospace'
    elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
            'arial' in ff or 'helvetica' in ff):
        istate.family = 'sans-serif'
    else:
        istate.family = 'serif'
    if 'id' in elem.attrib:
        istate.ids.add(elem.attrib['id'])
    if 'name' in elem.attrib:
        istate.ids.add(elem.attrib['name'])
    if tag == 'a' and 'href' in elem.attrib:
        istate.href = elem.attrib['href']
    istate.attrib.clear()
    if tag == 'img' and 'src' in elem.attrib:
        istate.attrib['src'] = elem.attrib['src']
        istate.attrib['align'] = 'baseline'
        cssdict = style.cssdict()
        valign = cssdict.get('vertical-align', None)
        if valign in ('top', 'bottom', 'middle'):
            istate.attrib['align'] = valign
        for prop in ('width', 'height'):
            if cssdict[prop] != 'auto':
                value = style[prop]
                if value == getattr(self.profile, prop):
                    result = '100%'
                else:
                    # Amazon's renderer does not support
                    # img sizes in units other than px
                    # See #7520 for test case
                    try:
                        pixs = int(round(float(value) /
                                         (72/self.profile.dpi)))
                    except:
                        continue
                    result = str(pixs)
                istate.attrib[prop] = result
        if 'width' not in istate.attrib or 'height' not in istate.attrib:
            href = self.current_spine_item.abshref(elem.attrib['src'])
            try:
                item = self.oeb.manifest.hrefs[base.urlnormalize(href)]
            except:
                self.oeb.logger.warn('Failed to find image:', href)
            else:
                try:
                    width, height = identify(item.data)[1:]
                except Exception:
                    self.oeb.logger.warn('Invalid image:', href)
                else:
                    if 'width' not in istate.attrib and 'height' not in \
                            istate.attrib:
                        istate.attrib['width'] = str(width)
                        istate.attrib['height'] = str(height)
                    else:
                        ar = width / height
                        if 'width' not in istate.attrib:
                            try:
                                width = int(istate.attrib['height'])*ar
                            except:
                                pass
                            istate.attrib['width'] = str(int(width))
                        else:
                            try:
                                height = int(istate.attrib['width'])/ar
                            except:
                                pass
                            istate.attrib['height'] = str(int(height))
                    item.unload_data_from_memory()
    elif tag == 'hr' and asfloat(style['width']) > 0 and \
            style._get('width') not in {'100%', 'auto'}:
        raww = style._get('width')
        if hasattr(raww, 'strip') and '%' in raww:
            istate.attrib['width'] = raww
        else:
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
    elif display == 'table':
        tag = 'table'
    elif display == 'table-row':
        tag = 'tr'
    elif display == 'table-cell':
        tag = 'td'
    if tag in TABLE_TAGS and self.ignore_tables:
        tag = 'span' if tag == 'td' else 'div'

    if tag in ('table', 'td', 'tr'):
        col = style.backgroundColor
        if col:
            elem.set('bgcolor', col)
        css = style.cssdict()
        if 'border' in css or 'border-width' in css:
            elem.set('border', '1')
    if tag in TABLE_TAGS:
        for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                     'bgcolor'):
            if attr in elem.attrib:
                istate.attrib[attr] = elem.attrib[attr]
    if tag == 'q':
        t = elem.text
        if not t:
            t = ''
        elem.text = '\u201c' + t
        t = elem.tail
        if not t:
            t = ''
        elem.tail = '\u201d' + t
    text = None
    if elem.text:
        if istate.preserve or istate.pre_wrap:
            text = elem.text
        elif (len(elem) > 0 and isspace(elem.text) and
                hasattr(elem[0].tag, 'rpartition') and
                elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
            text = None
        else:
            text = COLLAPSE.sub(' ', elem.text)
    valign = style['vertical-align']
    not_baseline = valign in ('super', 'sub', 'text-top', 'text-bottom',
                              'top', 'bottom') or (
        isinstance(valign, numbers.Number) and abs(valign) != 0)
    issup = valign in ('super', 'text-top', 'top') or (
        isinstance(valign, numbers.Number) and valign > 0)
    vtag = 'sup' if issup else 'sub'
    if not_baseline and not ignore_valign and tag not in NOT_VTAGS and \
            not isblock:
        nroot = etree.Element(base.tag('xhtml', 'html'),
                              nsmap=MOBI_NSMAP)
        vbstate = BlockState(etree.SubElement(nroot,
                                              base.tag('xhtml', 'body')))
        vbstate.para = etree.SubElement(vbstate.body,
                                        base.tag('xhtml', 'p'))
        self.mobimlize_elem(elem, stylizer, vbstate, istates,
                            ignore_valign=True)
        if len(istates) > 0:
            istates.pop()
        if len(istates) == 0:
            istates.append(FormatState())
        at_start = bstate.para is None
        if at_start:
            self.mobimlize_content('span', '', bstate, istates)
        parent = bstate.para if bstate.inline is None else bstate.inline
        if parent is not None:
            vtag = etree.SubElement(parent, base.tag('xhtml', vtag))
            vtag = etree.SubElement(vtag, base.tag('xhtml', 'small'))
            # Add anchors
            for child in vbstate.body:
                if child is not vbstate.para:
                    vtag.append(child)
                else:
                    break
            if vbstate.para is not None:
                if vbstate.para.text:
                    vtag.text = vbstate.para.text
                for child in vbstate.para:
                    vtag.append(child)
        return

    if tag == 'blockquote':
        old_mim = self.opts.mobi_ignore_margins
        self.opts.mobi_ignore_margins = False

    if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
            # We have an id but no text and no children, the id should
            # still be added.
            istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
            len(elem) == 0)):
        if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
            try:
                value = int(elem.attrib['value'])
                istates[-2].list_num = value - 1
            except:
                pass
        self.mobimlize_content(tag, text, bstate, istates)
    for child in elem:
        self.mobimlize_elem(child, stylizer, bstate, istates)
        tail = None
        if child.tail:
            if istate.preserve or istate.pre_wrap:
                tail = child.tail
            elif bstate.para is None and isspace(child.tail):
                tail = None
            else:
                tail = COLLAPSE.sub(' ', child.tail)
        if tail:
            self.mobimlize_content(tag, tail, bstate, istates)

    if tag == 'blockquote':
        self.opts.mobi_ignore_margins = old_mim

    if bstate.content and style['page-break-after'] in PAGE_BREAKS:
        bstate.pbreak = True
    if isblock:
        para = bstate.para
        if para is not None and para.text == '\xa0' and len(para) < 1:
            if style.height > 2:
                para.getparent().replace(
                    para, etree.Element(base.tag('xhtml', 'br')))
            else:
                # This is too small to be rendered effectively, drop it
                para.getparent().remove(para)
        bstate.para = None
        bstate.istate = None
        vmargin = asfloat(style['margin-bottom'])
        bstate.vmargin = max((bstate.vmargin, vmargin))
        vpadding = asfloat(style['padding-bottom'])
        if vpadding > 0:
            bstate.vpadding += bstate.vmargin
            bstate.vmargin = 0
            bstate.vpadding += vpadding
    if bstate.nested and bstate.nested[-1].tag == elem.tag:
        bstate.nested.pop()
    istates.pop()

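# The image sizing branch in mobimlize_elem converts CSS lengths (resolved to
# points by the stylizer) into pixels, since the Kindle renderer only honours
# px. A standalone sketch of that conversion; profile_dpi is a stand-in for
# self.profile.dpi and the helper name is ours:
def points_to_pixels(value_pt, profile_dpi):
    try:
        return int(round(float(value_pt) / (72 / profile_dpi)))
    except (TypeError, ValueError, ZeroDivisionError):
        return None

# points_to_pixels(72, 166) -> 166, i.e. one inch at a 166 DPI output profile
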
def __init__(self, tree, path, oeb, opts, profile=None,
             extra_css='', user_css='', base_css=''):
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from ebook_converter.customize.ui import output_profiles
        for x in output_profiles():
            if x.short_name == 'default':
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    if base_css:
        stylesheets.append(parseString(base_css, validate=False))
    style_tags = base.xpath(
        tree, '//*[local-name()="style" or local-name()="link"]')

    # Add css_parser parsing profiles from output_profile
    for profile in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(profile['name'], profile['props'],
                               profile['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file,
                       log=logging.getLogger('calibre.css'))
    for elem in style_tags:
        if (elem.tag == base.tag('xhtml', 'style') and
                elem.get('type', base.CSS_MIME) in base.OEB_STYLES and
                media_ok(elem.get('media'))):
            text = elem.text if elem.text else ''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += '\n\n' + uenc.force_unicode(t, 'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += '\n\n' + uenc.force_unicode(t, 'utf-8')
            if text:
                text = oeb.css_preprocessor(text)
                # We handle @import rules separately
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname,
                                                validate=False)
                parser.setFetcher(self._fetch_css_file)
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        if not media_ok(rule.media.mediaText):
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warning('Ignoring missing '
                                                'stylesheet in @import '
                                                'rule: %s', rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in base.OEB_STYLES:
                            self.logger.warning('CSS @import of non-CSS '
                                                'file %r', rule.href)
                            continue
                        stylesheets.append(sitem.data)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif (elem.tag == base.tag('xhtml', 'link') and elem.get('href') and
                elem.get('rel', 'stylesheet').lower() == 'stylesheet' and
                elem.get('type',
                         base.CSS_MIME).lower() in base.OEB_STYLES and
                media_ok(elem.get('media'))):
            href = base.urlnormalize(elem.attrib['href'])
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warning('Stylesheet %r referenced by file %r '
                                    'not in manifest', path, item.href)
                continue
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warning('Stylesheet %r referenced by file %r '
                                    'is not CSS', path, item.href)
                continue
            stylesheets.append(sitem.data)
    csses = {'extra_css': extra_css, 'user_css': user_css}
    for w, x in csses.items():
        if x:
            try:
                text = x
                stylesheet = parser.parseString(text, href=cssname,
                                                validate=False)
                stylesheets.append(stylesheet)
            except Exception:
                self.logger.exception('Failed to parse %s, ignoring.', w)
                self.logger.debug('Bad css: %s', x)

    # Using oeb to store the rules, page rule and font face rules, and
    # generating them again if opts, profile or stylesheets are different
    if (not hasattr(self.oeb, 'stylizer_rules')) \
            or not self.oeb.stylizer_rules.same_rules(self.opts,
                                                      self.profile,
                                                      stylesheets):
        self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile,
                                                stylesheets)
    self.rules = self.oeb.stylizer_rules.rules
    self.page_rule = self.oeb.stylizer_rules.page_rule
    self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
    self.flatten_style = self.oeb.stylizer_rules.flatten_style

    self._styles = {}
    pseudo_pat = re.compile(
        ':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
    select = Select(tree, ignore_inappropriate_pseudo_classes=True)

    for _, _, cssdict, text, _ in self.rules:
        fl = pseudo_pat.search(text)
        try:
            matches = tuple(select(text))
        except SelectorError as err:
            self.logger.error('Ignoring CSS rule with invalid selector: '
                              '%r (%s)', text, err)
            continue

        if fl is not None:
            fl = fl.group(1)
            if fl == 'first-letter' and getattr(
                    self.oeb, 'plumber_output_format',
                    '').lower() in {'mobi', 'docx'}:
                # Fake first-letter
                for elem in matches:
                    for x in elem.iter('*'):
                        if x.text:
                            punctuation_chars = []
                            text = str(x.text)
                            while text:
                                category = unicodedata.category(text[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]

                            special_text = ''.join(punctuation_chars) + \
                                (text[0] if text else '')
                            span = x.makeelement(
                                '{%s}span' % const.XHTML_NS)
                            span.text = special_text
                            span.set('data-fake-first-letter', '1')
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                        break
            else:
                # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)
    for elem in base.xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)

    num_pat = re.compile(r'[0-9.]+$')
    for elem in base.xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
            style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                try:
                    del elem.attrib[prop]
                except:
                    pass
                if val:
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
