def _clean_opf(self, opf):
    """Return a new OPF 2 ``package`` element assembled from ``opf``.

    Elements that have no namespace or sit in ``const.OPF1_NS`` are
    retagged into the ``opf`` namespace.  Every descendant of the metadata
    element (except the ``dc-metadata`` / ``x-metadata`` wrappers) is
    appended to a single new ``metadata`` element; Dublin Core tags are
    lower-cased and put into ``const.DC11_NS``.  Finally the manifest,
    spine, tours and guide elements are appended to the new root.

    The elements of ``opf`` are retagged and appended to the new tree, so
    the input tree is modified by this call.
    """
    merged_nsmap = {}
    for node in opf.iter(tag=etree.Element):
        merged_nsmap.update(node.nsmap)

    legacy_namespaces = ('', const.OPF1_NS)
    for node in opf.iter(tag=etree.Element):
        if parse_utils.namespace(node.tag) not in legacy_namespaces:
            continue
        bare = parse_utils.barename(node.tag)
        if ':' not in bare:
            node.tag = base.tag('opf', bare)
    merged_nsmap.update(const.OPF2_NSMAP)

    new_root = etree.Element(base.tag('opf', 'package'),
                             nsmap={None: const.OPF2_NS},
                             attrib=dict(opf.attrib))
    new_metadata = etree.SubElement(new_root, base.tag('opf', 'metadata'),
                                    nsmap=merged_nsmap)

    wrapper_tags = (base.tag('opf', 'dc-metadata'),
                    base.tag('opf', 'x-metadata'))
    for node in base.xpath(opf, 'o2:metadata//*'):
        if node.tag in wrapper_tags:
            continue
        if parse_utils.namespace(node.tag) in const.DC_NSES:
            local = parse_utils.barename(node.tag).lower()
            node.tag = '{%s}%s' % (const.DC11_NS, local)
        elif node.tag.startswith('dc:'):
            # Tag written with a literal "dc:" prefix instead of a
            # namespace; treat it like a Dublin Core element as well.
            _, _, local = node.tag.partition(':')
            node.tag = '{%s}%s' % (const.DC11_NS, local.lower())
        new_metadata.append(node)

    for meta in base.xpath(opf, 'o2:metadata//o2:meta'):
        new_metadata.append(meta)

    for section in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
        for node in base.xpath(opf, section):
            new_root.append(node)
    return new_root
def _toc_from_spine(self, opf):
    """Generate a default TOC with one entry per linear spine item.

    For every linear spine item the document ``<title>`` and the first
    non-empty heading (``h1``..``h5``, then ``strong``) are collected.
    Titles are used as labels only when every linear item has one and
    they are all distinct; otherwise the headings are used, with
    ``'(unlabled)'`` standing in for documents without any heading.

    Labels are kept together with the item they were extracted from, so
    non-linear spine items or documents lacking a title can no longer
    shift labels onto the wrong item.

    :param opf: the parsed OPF tree (unused here).
    :return: always ``True``.
    """
    self.log.warn('Generating default TOC from spine...')
    toc = self.oeb.toc
    header_expr = '/h:html/h:body//h:%s[position()=1]/text()'

    # One (item, title, header) triple per linear item keeps every label
    # attached to the document it came from.
    entries = []
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = ''.join(base.xpath(html, '/h:html/h:head/h:title/text()'))
        title = base.COLLAPSE_RE.sub(' ', title.strip())
        header = '(unlabled)'
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            candidate = ''.join(base.xpath(html, header_expr % tag))
            candidate = base.COLLAPSE_RE.sub(' ', candidate.strip())
            if candidate:
                header = candidate
                break
        entries.append((item, title, header))

    titles = [title for _, title, _ in entries]
    # Titles are only useful when each document has its own one.
    use_titles = all(titles) and len(titles) == len(set(titles))
    for item, title, header in entries:
        toc.add(title if use_titles else header, item.href)
    return True
def _spine_from_opf(self, opf):
    """Fill ``self.oeb.spine`` from the ``itemref`` elements of ``opf``.

    An item is added when its media type is in ``base.OEB_DOCS`` and its
    data is an XML tree that is not an NCX.  Items with another media type
    whose root tag ends in ``}html`` are relabelled as
    ``base.XHTML_MIME`` and added too; anything else is dropped with a
    warning.  Afterwards ``_spine_add_extra`` is called and the
    ``page-progression-direction`` attribute, when it is ``ltr`` or
    ``rtl``, is copied to the spine.

    :raises base.OEBError: when no item ended up in the spine.
    """
    spine = self.oeb.spine
    manifest = self.oeb.manifest
    for itemref in base.xpath(opf, '/o2:package/o2:spine/o2:itemref'):
        idref = itemref.get('idref')
        if idref not in manifest.ids:
            self.logger.warn('Spine item %r not found' % idref)
            continue
        item = manifest.ids[idref]

        is_document = (
            item.media_type.lower() in base.OEB_DOCS
            and hasattr(item.data, 'xpath')
            and not getattr(item.data, 'tag', '').endswith('}ncx'))
        if not is_document:
            looks_like_html = (hasattr(item.data, 'tag')
                               and item.data.tag
                               and item.data.tag.endswith('}html'))
            if not looks_like_html:
                self.oeb.log.warn(
                    'The item %s is not a XML document. Removing it from '
                    'spine.' % item.href)
                continue
            # Parsed as HTML despite the declared type: fix the type.
            item.media_type = base.XHTML_MIME
        spine.add(item, itemref.get('linear'))

    if not len(spine):
        raise base.OEBError("Spine is empty")
    self._spine_add_extra()

    directions = base.xpath(
        opf, '/o2:package/o2:spine/@page-progression-direction')
    for direction in directions:
        if direction in {'ltr', 'rtl'}:
            spine.page_progression_direction = direction
def _toc_from_html(self, opf):
    """Build the TOC from the HTML document named by the ``toc`` guide entry.

    When the guide href carries a fragment, the search for links starts
    at the element with that ``id`` (or ``name``) and walks up through its
    ancestors until an element containing ``a[@href]`` links is found.
    Links pointing outside the manifest are skipped; the texts of links
    sharing the same target are joined with a space into one entry.

    :return: ``False`` when the guide has no ``toc`` entry, else ``True``.
    """
    if 'toc' not in self.oeb.guide:
        return False
    self.log.debug('Reading TOC from HTML...')

    toc_path, anchor_id = urllib.parse.urldefrag(self.oeb.guide['toc'].href)
    item = self.oeb.manifest.hrefs[toc_path]
    scope = item.data
    if anchor_id:
        hits = (base.xpath(scope, './/*[@id="%s"]' % anchor_id)
                or base.xpath(scope, './/*[@name="%s"]' % anchor_id))
        node = hits[0] if hits else scope
        while node != scope and not base.xpath(node, './/h:a[@href]'):
            node = node.getparent()
        scope = node

    # Plain dicts keep insertion order, which gives first-seen ordering
    # of the link targets.
    labels_by_href = {}
    for link in base.xpath(scope, './/h:a[@href]'):
        target = item.abshref(base.urlnormalize(link.attrib['href']))
        target_path = urllib.parse.urldefrag(target)[0]
        if target_path not in self.oeb.manifest.hrefs:
            continue
        label = base.COLLAPSE_RE.sub(' ', base.xml2text(link).strip())
        labels_by_href.setdefault(target, []).append(label)

    toc = self.oeb.toc
    for target, labels in labels_by_href.items():
        toc.add(' '.join(labels), target)
    return True
def _toc_from_ncx(self, item):
    """Populate the TOC from the NCX manifest item ``item``.

    The TOC title is the whitespace-collapsed ``docTitle`` text, falling
    back to the first title in the book metadata when that is empty.
    Every ``navMap`` is then handed to ``_toc_from_navpoint``.

    :return: ``False`` when ``item`` or its data is ``None``, else
        ``True``.
    """
    if item is None or item.data is None:
        return False
    self.log.debug('Reading TOC from NCX...')
    ncx = item.data

    doc_title = ''.join(base.xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
    doc_title = base.COLLAPSE_RE.sub(' ', doc_title.strip())
    if not doc_title:
        doc_title = str(self.oeb.metadata.title[0])

    toc = self.oeb.toc
    toc.title = doc_title
    for navmap in base.xpath(ncx, 'ncx:navMap'):
        self._toc_from_navpoint(item, toc, navmap)
    return True
def _manifest_from_opf(self, opf):
    """Fill ``self.oeb.manifest`` from the ``item`` elements of ``opf``.

    For each item the media type is read from ``media-type`` (or the
    misspelled ``mediatype``); when it is missing or the generic
    ``text/xml`` it is guessed from the href, falling back to the declared
    value and finally ``base.BINARY_MIME``.  Items without an href, with
    an href already in the manifest, or that the container reports as
    non-existent are skipped with a warning.  A duplicate id gets a
    freshly generated id/href pair.

    Afterwards invalid entries are pruned and the missing ones re-added
    via ``_manifest_prune_invalid`` / ``_manifest_add_missing``.
    """
    manifest = self.oeb.manifest
    for elem in base.xpath(opf, '/o2:package/o2:manifest/o2:item'):
        item_id = elem.get('id')
        href = elem.get('href')
        if not href:
            # Without an href there is nothing to load, and
            # mimetypes.guess_type(None) would raise TypeError below.
            self.logger.warn('Manifest item %r has no href' % item_id)
            continue

        media_type = elem.get('media-type', None)
        if media_type is None:
            media_type = elem.get('mediatype', None)
        if not media_type or media_type == 'text/xml':
            guessed = mimetypes.guess_type(href)[0]
            media_type = guessed or media_type or base.BINARY_MIME
        if hasattr(media_type, 'lower'):
            media_type = media_type.lower()
        fallback = elem.get('fallback')

        if href in manifest.hrefs:
            self.logger.warn('Duplicate manifest entry for %r' % href)
            continue
        if not self.oeb.container.exists(href):
            self.logger.warn('Manifest item %r not found' % href)
            continue
        if item_id in manifest.ids:
            self.logger.warn('Duplicate manifest id %r' % item_id)
            item_id, href = manifest.generate(item_id, href)
        manifest.add(item_id, href, media_type, fallback)

    invalid = self._manifest_prune_invalid()
    self._manifest_add_missing(invalid)
def rasterize_item(self, item):
    """Rasterize every SVG used by the HTML manifest item ``item``.

    ``img`` elements and SVG-typed ``object`` elements whose target is a
    manifest entry with the SVG media type are passed to
    ``rasterize_external``; inline ``svg`` elements are passed to
    ``rasterize_inline``.  The three kinds are processed in that order.
    """
    html = item.data
    hrefs = self.oeb.manifest.hrefs

    # (XPath selecting the elements, attribute holding the reference)
    external_refs = (
        ('//h:img[@src]', 'src'),
        ('//h:object[@type="%s" and @data]' % SVG_MIME, 'data'),
    )
    for expr, attr in external_refs:
        for elem in xpath(html, expr):
            target = urlnormalize(elem.attrib[attr])
            image = hrefs.get(item.abshref(target), None)
            if not image or image.media_type != SVG_MIME:
                continue
            style = self.stylizer(item).style(elem)
            self.rasterize_external(elem, style, item, image)

    for svg_elem in xpath(html, '//svg:svg'):
        style = self.stylizer(item).style(svg_elem)
        self.rasterize_inline(svg_elem, style, item)
def _toc_from_tour(self, opf):
    """Build the TOC from the first ``tour`` element of ``opf``.

    Each ``site`` with both a title and an href whose path is present in
    the manifest becomes a TOC entry; sites pointing elsewhere are logged
    and skipped.

    :return: ``False`` when the OPF has no tour, else ``True``.
    """
    tours = base.xpath(opf, 'o2:tours/o2:tour')
    if not tours:
        return False
    self.log.debug('Reading TOC from tour...')

    first_tour = tours[0]
    toc = self.oeb.toc
    toc.title = first_tour.get('title')
    for site in base.xpath(first_tour, 'o2:site'):
        label = site.get('title')
        target = site.get('href')
        if not (label and target):
            continue
        target_path = urllib.parse.urldefrag(base.urlnormalize(target))[0]
        if target_path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % target)
            continue
        toc.add(label, target, id=site.get('id'))
    return True
def _pages_from_ncx(self, opf, item):
    """Add the ``pageTarget`` entries of the NCX ``item`` to the page list.

    Targets without a ``content/@src`` are skipped.  The page type
    defaults to ``'normal'`` when the target has no ``type`` attribute.

    :param opf: the parsed OPF tree (unused here).
    :return: ``False`` when there is no NCX data or it has no page
        targets, else ``True``.
    """
    ncx = None if item is None else item.data
    if ncx is None:
        return False
    targets = base.xpath(ncx, 'ncx:pageList/ncx:pageTarget')
    if not targets:
        return False

    pages = self.oeb.pages
    for target in targets:
        label = ''.join(base.xpath(target, 'ncx:navLabel/ncx:text/text()'))
        label = base.COLLAPSE_RE.sub(' ', label.strip())
        sources = base.xpath(target, 'ncx:content/@src')
        if not sources:
            continue
        pages.add(label,
                  item.abshref(base.urlnormalize(sources[0])),
                  type=target.get('type', 'normal'),
                  id=target.get('id'),
                  klass=target.get('class'))
    return True
def _find_page_map(self, opf):
    """Locate the page-map item, remove it from the manifest and return it.

    The spine's ``page-map`` attribute is honoured first: if it names an
    id that is not in the manifest the result is ``None``.  Without that
    attribute, the first manifest item whose media type is
    ``base.PAGE_MAP_MIME`` is used.

    :return: the removed manifest item, or ``None`` when none was found.
    """
    declared = base.xpath(opf, '/o2:package/o2:spine/@page-map')
    manifest = self.oeb.manifest
    if declared:
        map_id = declared[0]
        if map_id not in manifest.ids:
            return None
        page_map = manifest.ids[map_id]
    else:
        page_map = next(
            (candidate for candidate in manifest.values()
             if candidate.media_type == base.PAGE_MAP_MIME),
            None)
        if page_map is None:
            return None
    manifest.remove(page_map)
    return page_map
def _guide_from_opf(self, opf):
    """Fill ``self.oeb.guide`` from the ``reference`` elements of ``opf``.

    A reference whose path is not in the manifest is matched
    case-insensitively against the manifest hrefs; if that fails too, it
    is logged and skipped.  Only the first reference of each ``type`` is
    kept.

    When the href had to be case-corrected, its fragment is carried over
    to the corrected href.  Previously the fragment was dropped, losing
    the anchor that ``_toc_from_html`` reads from the ``toc`` guide entry.
    """
    guide = self.oeb.guide
    manifest = self.oeb.manifest
    for elem in base.xpath(opf, '/o2:package/o2:guide/o2:reference'):
        ref_href = elem.get('href')
        raw_path, frag = urllib.parse.urldefrag(ref_href)
        path = base.urlnormalize(raw_path)
        if path not in manifest.hrefs:
            lowered = path.lower()
            corrected_href = next(
                (href for href in manifest.hrefs
                 if href.lower() == lowered),
                None)
            if corrected_href is None:
                self.logger.warn('Guide reference %r not found' % ref_href)
                continue
            # Keep pointing at the same anchor inside the document.
            ref_href = corrected_href
            if frag:
                ref_href = '%s#%s' % (corrected_href, frag)
        typ = elem.get('type')
        if typ not in guide:
            guide.add(typ, elem.get('title'), ref_href)
def dataize_svg(self, item, svg=None):
    """Point the ``xlink:href`` links of an SVG at temporary file copies.

    For every SVG element whose ``xlink:href`` resolves (relative to
    ``item``) to a manifest entry, that entry's bytes are written to a
    persistent temporary file and the attribute is rewritten to the
    file's name.  The file names are recorded in ``self.temp_files``.

    :param item: manifest item used to resolve relative links.
    :param svg: SVG tree to process; defaults to ``item.data``.
    :return: the (modified) SVG tree.
    """
    if svg is None:
        svg = item.data
    hrefs = self.oeb.manifest.hrefs
    xlink_href = base.tag('xlink', 'href')

    for node in xpath(svg, '//svg:*[@xl:href]'):
        link = urlnormalize(node.attrib[xlink_href])
        link_path, _ = urllib.parse.urldefrag(link)
        if not link_path:
            continue
        resolved = item.abshref(link_path)
        if resolved not in hrefs:
            continue
        payload = hrefs[resolved].bytes_representation
        # Fall back to "jpg" when the image type cannot be detected.
        suffix = '.' + (what(None, payload) or 'jpg')
        with PersistentTemporaryFile(suffix=suffix) as tmp:
            tmp.write(payload)
            self.temp_files.append(tmp.name)
        node.attrib[xlink_href] = tmp.name
    return svg
def _pages_from_page_map(self, opf):
    """Add the entries of the page-map document to the page list.

    Pages without an href are skipped.  The page type is ``'special'``
    for an empty name, ``'front'`` when the lower-cased name consists
    only of the letters ``ivxlcdm`` and ``'normal'`` otherwise.

    :return: ``False`` when no page map was found, else ``True``.
    """
    item = self._find_page_map(opf)
    if item is None:
        return False
    pmap = item.data
    pages = self.oeb.pages
    for page in base.xpath(pmap, 'o2:page'):
        raw_name = page.get('name', '')
        raw_href = page.get('href')
        if not raw_href:
            continue
        name = base.COLLAPSE_RE.sub(' ', raw_name.strip())
        href = item.abshref(base.urlnormalize(raw_href))
        if not name:
            page_type = 'special'
        elif not name.lower().strip('ivxlcdm'):
            page_type = 'front'
        else:
            page_type = 'normal'
        pages.add(name, href, type=page_type)
    return True
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
    """Create an OEB book from ``htmlpath`` and the files it links to.

    The steps are: create an unpopulated OEB book, fill in missing
    language / creator / title metadata and add a UUID identifier, add
    every non-binary file returned by ``get_filelist`` to the manifest
    and spine, rewrite links in the HTML files and in stylesheets through
    ``self.resource_adder``, and generate a TOC with one entry per linear
    spine item.

    TOC labels are the document titles when every linear document has
    one and they are all distinct, otherwise the first heading of each
    document (``'(unlabled)'`` when there is none).  Labels are kept
    together with their spine item, so a document without a title no
    longer shifts the labels onto the wrong items.

    :param htmlpath: path of the root HTML file.
    :param basedir: passed through to ``get_filelist``.
    :param opts: conversion options; ``input_encoding`` is read, and
        ``language`` / ``authors`` are used when present.
    :param log: callable logger, also stored as ``self.log``.
    :param mi: metadata object copied into the book's metadata.
    :return: the populated OEB book (also stored as ``self.oeb``).
    """
    import uuid
    from ebook_converter.ebooks.conversion.plumber import create_oebbook
    from ebook_converter.ebooks.oeb.base import (
        DirContainer, rewrite_links, urlnormalize, BINARY_MIME, OEB_STYLES,
        xpath, urlquote)
    from ebook_converter.ebooks.oeb.transforms.metadata import \
        meta_info_to_oeb_metadata
    from ebook_converter.ebooks.html.input import get_filelist
    from ebook_converter.ebooks.metadata import string_to_authors
    from ebook_converter.utils.localization import canonicalize_lang
    import css_parser
    import logging

    css_parser.log.setLevel(logging.WARN)
    self.OEB_STYLES = OEB_STYLES
    oeb = create_oebbook(log, None, opts, self,
                         encoding=opts.input_encoding, populate=False)
    self.oeb = oeb

    # --- metadata -------------------------------------------------------
    metadata = oeb.metadata
    meta_info_to_oeb_metadata(mi, metadata, log)
    if not metadata.language:
        language = canonicalize_lang(getattr(opts, 'language', None))
        if not language:
            oeb.logger.warn('Language not specified')
            language = get_lang().replace('_', '-')
        metadata.add('language', language)
    if not metadata.creator:
        authors = getattr(opts, 'authors', None)
        if authors:
            authors = string_to_authors(authors)
        if not authors:
            oeb.logger.warn('Creator not specified')
            authors = [self.oeb.translate('Unknown')]
        for author in authors:
            metadata.add('creator', author)
    if not metadata.title:
        oeb.logger.warn('Title not specified')
        metadata.add('title', self.oeb.translate('Unknown'))
    bookid = str(uuid.uuid4())
    metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
    for ident in metadata.identifier:
        if 'id' in ident.attrib:
            # NOTE(review): this selects the first identifier rather than
            # the one that matched (``ident``) - confirm this is intended.
            self.oeb.uid = metadata.identifier[0]
            break

    # --- manifest and spine ----------------------------------------------
    filelist = get_filelist(htmlpath, basedir, opts, log)
    filelist = [f for f in filelist if not f.is_binary]
    htmlfile_map = {}
    for f in filelist:
        path = f.path
        oeb.container = DirContainer(os.path.dirname(path), log,
                                     ignore_opf=True)
        bname = os.path.basename(path)
        item_id, href = oeb.manifest.generate(
            id='html', href=sanitize_file_name(bname))
        htmlfile_map[path] = href
        item = oeb.manifest.add(item_id, href, 'text/html')
        if path == htmlpath and '%' in path:
            bname = urlquote(bname)
        item.html_input_href = bname
        oeb.spine.add(item, True)

    self.added_resources = {}
    self.log = log
    self.log('Normalizing filename cases')
    self.added_resources.update(htmlfile_map)
    self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
    self.urldefrag = urllib.parse.urldefrag
    self.BINARY_MIME = BINARY_MIME

    # --- link rewriting ----------------------------------------------------
    self.log('Rewriting HTML links')
    for f in filelist:
        path = f.path
        dpath = os.path.dirname(path)
        oeb.container = DirContainer(dpath, log, ignore_opf=True)
        href = htmlfile_map[path]
        try:
            item = oeb.manifest.hrefs[href]
        except KeyError:
            item = oeb.manifest.hrefs[urlnormalize(href)]
        rewrite_links(item.data,
                      functools.partial(self.resource_adder, base=dpath))

    for item in oeb.manifest.values():
        if item.media_type in self.OEB_STYLES:
            # Resolve stylesheet URLs relative to the directory the
            # stylesheet was originally added from (None if unknown).
            dpath = None
            for path, href in self.added_resources.items():
                if href == item.href:
                    dpath = os.path.dirname(path)
                    break
            css_parser.replaceUrls(
                item.data,
                functools.partial(self.resource_adder, base=dpath))

    # --- auto-generated TOC ------------------------------------------------
    toc = self.oeb.toc
    self.oeb.auto_generated_toc = True
    header_expr = '/h:html/h:body//h:%s[position()=1]/text()'
    # One (item, title, header) triple per linear item keeps every label
    # attached to the document it came from.
    entries = []
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
        title = re.sub(r'\s+', ' ', title.strip())
        header = '(unlabled)'
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            candidate = ''.join(xpath(html, header_expr % tag))
            candidate = re.sub(r'\s+', ' ', candidate.strip())
            if candidate:
                header = candidate
                break
        entries.append((item, title, header))

    titles = [title for _, title, _ in entries]
    # Titles are only useful when each document has its own one.
    use_titles = all(titles) and len(titles) == len(set(titles))
    for item, title, header in entries:
        toc.add(title if use_titles else header, item.href)

    oeb.container = DirContainer(os.getcwd(), oeb.log, ignore_opf=True)
    return oeb
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='',
             user_css='', base_css=''):
    """Compute the styles of every element in ``tree``.

    Stylesheets are gathered in this order: the built-in HTML stylesheet,
    ``base_css``, the ``<style>`` and ``<link>`` elements of the document
    (including stylesheets pulled in by ``@import``), then ``extra_css``
    and ``user_css``.  The resulting rules are cached on ``oeb`` as
    ``stylizer_rules`` and applied to the elements matched by their
    selectors, followed by inline ``style`` attributes and the
    ``width`` / ``height`` attributes of images.

    :param tree: parsed XHTML tree to style.
    :param path: manifest href of the document ``tree`` belongs to.
    :param oeb: the OEB book; provides the manifest, the logger and the
        CSS preprocessor.
    :param opts: conversion options; ``output_profile`` is read.
    :param profile: output profile to use.  When ``None`` the profile
        with short name ``'default'`` is used, falling back to
        ``opts.output_profile``.
    :param extra_css: additional CSS text, parsed after the document's
        own stylesheets.
    :param user_css: additional CSS text, parsed after ``extra_css``.
    :param base_css: CSS text added right after the built-in stylesheet.
    """
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from ebook_converter.customize.ui import output_profiles
        for candidate in output_profiles():
            if candidate.short_name == 'default':
                self.profile = candidate
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    if base_css:
        stylesheets.append(parseString(base_css, validate=False))
    style_tags = base.xpath(
        tree, '//*[local-name()="style" or local-name()="link"]')

    # Add css_parser parsing profiles from output_profile
    for css_module in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(css_module['name'], css_module['props'],
                               css_module['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file,
                       log=logging.getLogger('calibre.css'))
    for elem in style_tags:
        if (elem.tag == base.tag('xhtml', 'style')
                and elem.get('type', base.CSS_MIME) in base.OEB_STYLES
                and media_ok(elem.get('media'))):
            text = elem.text if elem.text else ''
            for child in elem:
                t = getattr(child, 'text', None)
                if t:
                    text += '\n\n' + uenc.force_unicode(t, 'utf-8')
                t = getattr(child, 'tail', None)
                if t:
                    text += '\n\n' + uenc.force_unicode(t, 'utf-8')
            if text:
                text = oeb.css_preprocessor(text)
                # We handle @import rules separately
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname,
                                                validate=False)
                parser.setFetcher(self._fetch_css_file)
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        if not media_ok(rule.media.mediaText):
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warning('Ignoring missing '
                                                'stylesheet in @import '
                                                'rule: %s', rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in base.OEB_STYLES:
                            self.logger.warning('CSS @import of non-CSS '
                                                'file %r', rule.href)
                            continue
                        stylesheets.append(sitem.data)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif (elem.tag == base.tag('xhtml', 'link')
                and elem.get('href')
                and elem.get('rel', 'stylesheet').lower() == 'stylesheet'
                and elem.get('type',
                             base.CSS_MIME).lower() in base.OEB_STYLES
                and media_ok(elem.get('media'))):
            href = base.urlnormalize(elem.attrib['href'])
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warning('Stylesheet %r referenced by file %r '
                                    'not in manifest', path, item.href)
                continue
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warning('Stylesheet %r referenced by file %r '
                                    'is not CSS', path, item.href)
                continue
            stylesheets.append(sitem.data)

    csses = {'extra_css': extra_css, 'user_css': user_css}
    for which, css_text in csses.items():
        if css_text:
            try:
                stylesheet = parser.parseString(css_text, href=cssname,
                                                validate=False)
                stylesheets.append(stylesheet)
            except Exception:
                # Best effort: broken extra/user CSS must not abort the
                # conversion.
                self.logger.exception('Failed to parse %s, ignoring.',
                                      which)
                self.logger.debug('Bad css: %s', css_text)

    # using oeb to store the rules, page rule and font face rules
    # and generating them again if opts, profile or stylesheets are
    # different
    if (not hasattr(self.oeb, 'stylizer_rules')
            or not self.oeb.stylizer_rules.same_rules(
                self.opts, self.profile, stylesheets)):
        self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile,
                                                stylesheets)
    self.rules = self.oeb.stylizer_rules.rules
    self.page_rule = self.oeb.stylizer_rules.page_rule
    self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
    self.flatten_style = self.oeb.stylizer_rules.flatten_style

    self._styles = {}
    pseudo_pat = re.compile(
        ':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
    select = Select(tree, ignore_inappropriate_pseudo_classes=True)

    for _, _, cssdict, selector, _ in self.rules:
        fl = pseudo_pat.search(selector)
        try:
            matches = tuple(select(selector))
        except SelectorError as err:
            self.logger.error('Ignoring CSS rule with invalid selector: '
                              '%r (%s)', selector, err)
            continue

        if fl is not None:
            fl = fl.group(1)
            output_format = getattr(self.oeb, 'plumber_output_format', '')
            if (fl == 'first-letter'
                    and output_format.lower() in {'mobi', 'docx'}):
                # Fake first-letter
                for elem in matches:
                    for x in elem.iter('*'):
                        if x.text:
                            # Leading punctuation and separators are
                            # styled together with the first real
                            # character.
                            punctuation_chars = []
                            remaining = str(x.text)
                            while remaining:
                                category = unicodedata.category(
                                    remaining[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(remaining[0])
                                remaining = remaining[1:]

                            special_text = (
                                ''.join(punctuation_chars)
                                + (remaining[0] if remaining else ''))
                            span = x.makeelement(
                                '{%s}span' % const.XHTML_NS)
                            span.text = special_text
                            span.set('data-fake-first-letter', '1')
                            span.tail = remaining[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            break
            else:
                # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)

    for elem in base.xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)

    num_pat = re.compile(r'[0-9.]+$')
    for elem in base.xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = (style._style.get('width', 'auto') != 'auto'
                     or style._style.get('height', 'auto') != 'auto')
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                # Only one of the two attributes may be present; an
                # explicit check replaces the former bare ``except``.
                if prop in elem.attrib:
                    del elem.attrib[prop]
                if val:
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
def _toc_from_navpoint(self, item, toc, navpoint):
    """Recursively add the ``navPoint`` children of ``navpoint`` to ``toc``.

    A child without a label is not added itself; its own children are
    attached directly to ``toc``.  A child with neither a source nor
    nested nav points is dropped, as is a child whose source is not in
    the manifest and that has no nested nav points.  Author, description
    and thumbnail are read from ``calibre:meta`` descendants.

    :param item: the NCX manifest item, used to resolve relative hrefs.
    :param toc: TOC node that receives the new entries.
    :param navpoint: NCX element whose ``navPoint`` children are read.
    """
    for child in base.xpath(navpoint, 'ncx:navPoint'):
        label = ''.join(base.xpath(child, 'ncx:navLabel/ncx:text/text()'))
        label = base.COLLAPSE_RE.sub(' ', label.strip())
        sources = base.xpath(child, 'ncx:content/@src')
        if not label:
            self._toc_from_navpoint(item, toc, child)
            continue

        has_source = bool(sources and sources[0])
        if not has_source and not base.xpath(child, 'ncx:navPoint'):
            # This node is useless
            continue

        href = ''
        if has_source:
            href = item.abshref(base.urlnormalize(sources[0]))
        path = urllib.parse.urldefrag(href)[0]
        if path and path not in self.oeb.manifest.hrefs:
            path = base.urlnormalize(path)
        if href and path not in self.oeb.manifest.hrefs:
            self.logger.warn('TOC reference %r not found' % href)
            if not base.xpath(child, 'ncx:navPoint'):
                # This node is useless
                continue

        node_id = child.get('id')
        klass = child.get('class', 'chapter')
        try:
            play_order = int(child.get('playOrder',
                                       self.oeb.toc.next_play_order()))
        except Exception:
            play_order = self.oeb.toc.next_play_order()

        author_nodes = base.xpath(
            child, 'descendant::calibre:meta[@name = "author"]')
        author = author_nodes[0].text if author_nodes else None

        description_nodes = base.xpath(
            child, 'descendant::calibre:meta[@name = "description"]')
        description = None
        if description_nodes:
            description = etree.tostring(description_nodes[0],
                                         method='text',
                                         encoding='unicode').strip() or None

        thumbnail_nodes = base.xpath(
            child, 'descendant::calibre:meta[@name = "toc_thumbnail"]')
        toc_thumbnail = thumbnail_nodes[0].text if thumbnail_nodes else None
        if not toc_thumbnail or not toc_thumbnail.strip():
            toc_thumbnail = None

        node = toc.add(label, href, id=node_id, klass=klass,
                       play_order=play_order, description=description,
                       author=author, toc_thumbnail=toc_thumbnail)
        self._toc_from_navpoint(item, node, child)