def __call__(self, oeb, opts):
    """Rewrite links in manifest items, the guide and the ToC.

    Parsed XML trees are passed to ``rewrite_links`` and parsed stylesheets
    to ``css_parser.replaceUrls``, both with ``self.url_replacer``. Guide
    references whose fragment-less href has an entry in ``self.rename_map``
    are pointed at the mapped href, keeping their fragment.

    :param oeb: The book being transformed.
    :param opts: Conversion options, stored on ``self.opts``.
    """
    import css_parser

    self.log = oeb.logger
    self.opts = opts
    self.oeb = oeb

    for manifest_item in oeb.manifest.items:
        # Published on self before the replacer is invoked for this item.
        self.current_item = manifest_item
        if etree.iselement(manifest_item.data):
            rewrite_links(self.current_item.data, self.url_replacer)
        elif hasattr(manifest_item.data, 'cssText'):
            css_parser.replaceUrls(manifest_item.data, self.url_replacer)

    if self.oeb.guide:
        for ref in self.oeb.guide.values():
            base, fragment = urldefrag(urlnormalize(ref.href))
            renamed = self.rename_map.get(base, None)
            if renamed is None:
                continue
            # Re-attach the fragment that urldefrag() split off.
            ref.href = (renamed + '#' + fragment) if fragment else renamed

    if self.oeb.toc:
        self.fix_toc_entry(self.oeb.toc)
def test_replaceUrls(self):
    """css_parser.replaceUrls()

    Checks URL replacement in ``@import`` rules and ``url()`` values of a
    parsed style sheet, and in a standalone CSSStyleDeclaration.
    """
    # keepAllProperties is a process-wide serializer preference, so it is
    # restored in ``finally``: a failing assertion must not leak the setting
    # into tests that run afterwards.
    css_parser.ser.prefs.keepAllProperties = True
    try:
        css = r'''
        @import "im1";
        @import url(im2);
        a {
            background-image: url(c) !important;
            background-\image: url(b);
            background: url(a) no-repeat !important;
            }'''
        s = css_parser.parseString(css)
        css_parser.replaceUrls(s, lambda old: "NEW" + old)
        self.assertEqual('@import "NEWim1";', s.cssRules[0].cssText)
        self.assertEqual('NEWim2', s.cssRules[1].href)
        # NOTE(review): expected text assumes the serializer emits one
        # declaration per line; line breaks are spelled out explicitly so
        # they cannot be lost by reformatting.
        self.assertEqual(
            'background-image: url(NEWc) !important;\n'
            'background-\\image: url(NEWb);\n'
            'background: url(NEWa) no-repeat !important',
            s.cssRules[2].style.cssText)
    finally:
        css_parser.ser.prefs.keepAllProperties = False

    # CSSStyleDeclaration
    style = css_parser.parseStyle('''color: red;
        background-image:
            url(1.png),
            url('2.png')''')
    css_parser.replaceUrls(style, lambda url: 'prefix/' + url)
    self.assertEqual(
        style.cssText,
        'color: red;\n'
        'background-image: url(prefix/1.png), url(prefix/2.png)')
def transform_style_sheet(container, name, link_uid, virtualize_resources, virtualized_names):
    """Transform one stylesheet in place and make sure it declares a charset.

    :param container: Book container providing ``parsed``, ``serialize_item``,
        ``raw_data`` and ``open`` for *name*.
    :param name: Name of the stylesheet inside the container.
    :param link_uid: Passed to ``create_link_replacer`` when virtualizing.
    :param virtualize_resources: When true, URLs in the sheet are rewritten.
    :param virtualized_names: Set that receives *name* if any URL was rewritten.
    """
    sheet = container.parsed(name)
    modified = False

    if virtualize_resources:
        touched = set()
        replacer = create_link_replacer(container, link_uid, touched)
        replaceUrls(sheet, partial(replacer, name))
        if name in touched:
            virtualized_names.add(name)
            modified = True

    if transform_sheet(sheet):
        modified = True

    # Only re-serialize when the parsed sheet actually changed; otherwise
    # work from the bytes already stored in the container.
    raw = (
        container.serialize_item(name)
        if modified
        else container.raw_data(name, decode=False)
    ).lstrip()

    if not raw.startswith(b'@charset'):
        raw = b'@charset "UTF-8";\n' + raw
        modified = True

    if not modified:
        return
    with container.open(name, 'wb') as out:
        out.write(raw)
def transform_and_virtualize_sheet(sheet):
    """Transform *sheet* and, if enabled, rewrite its URLs.

    Relies on ``virtualize_resources``, ``link_replacer``, ``name``,
    ``changed_names`` and ``virtualized_names`` from the enclosing scope.
    Returns a truthy value when the sheet was modified.
    """
    modified = transform_sheet(sheet)
    if not virtualize_resources:
        return modified
    replaceUrls(sheet, partial(link_replacer, name))
    if name in changed_names:
        virtualized_names.add(name)
        modified = True
    return modified
def __call__(self, oeb, context):
    """Flatten the book's CSS and remap font sizes.

    :param oeb: The book being converted.
    :param context: Conversion options; stored as both ``self.context`` and
        ``self.opts``.
    """
    oeb.logger.info('Flattening CSS and remapping font sizes...')
    self.context = self.opts = context
    self.oeb = oeb

    # Items to process: the spine plus the title page and EPUB 3 nav
    # document when they are not already part of it.
    self.items = list(self.oeb.spine)
    titlepage = self.oeb.guide.get('titlepage')
    if titlepage is not None:
        titlepage = titlepage.item
        if titlepage is not None and titlepage not in self.items:
            self.items.append(titlepage)
    epub3_nav = None
    if getattr(self.opts, 'epub3_nav_href', None):
        epub3_nav = self.oeb.manifest.hrefs.get(self.opts.epub3_nav_href)
        if epub3_nav is not None and epub3_nav not in self.items:
            self.items.append(epub3_nav)

    self.filter_css = frozenset()
    if self.opts.filter_css:
        try:
            requested = {
                x.strip().lower() for x in self.opts.filter_css.split(',')
            }
        except Exception:
            # A malformed option is ignored, but KeyboardInterrupt and
            # SystemExit are no longer swallowed as a bare except would.
            self.oeb.log.warning('Failed to parse filter_css, ignoring')
        else:
            from calibre.ebooks.oeb.normalize_css import normalize_filter_css
            self.filter_css = frozenset(normalize_filter_css(requested))
            self.oeb.log.debug(
                'Filtering CSS properties: %s' % ', '.join(self.filter_css))

    for item in oeb.manifest.values():
        # Make all links to resources absolute, as these sheets will be
        # consolidated into a single stylesheet at the root of the document
        if item.media_type in OEB_STYLES:
            css_parser.replaceUrls(item.data, item.abshref,
                                   ignoreImportRules=True)

    self.body_font_family, self.embed_font_rules = self.get_embed_font_info(
        self.opts.embed_font_family)
    # Store for use in output plugins/transforms that generate content,
    # like the AZW3 output inline ToC.
    self.oeb.store_embed_font_rules = EmbedFontsCSSRules(
        self.body_font_family, self.embed_font_rules)
    self.stylize_spine()
    self.sbase = self.baseline_spine() if self.fbase else None
    self.fmap = FontMapper(self.sbase, self.fbase, self.fkey)
    self.flatten_spine()
    if epub3_nav is not None:
        self.opts.epub3_nav_parsed = epub3_nav.data

    self.store_page_margins()
def virtualize_resources(self):
    """Rewrite links in stylesheets, documents and SVG images.

    URLs are passed through the replacer built by ``create_link_replacer``.
    Anchors whose rewritten href starts with the link uid are recorded in
    ``book_render_data['link_to_map']`` and neutralised; all other anchors
    are made to open in a new window. Every name collected in the changed
    set is finally passed to ``self.dirty``.
    """
    render_data = self.book_render_data
    link_uid = render_data['link_uid']
    link_to_map = render_data['link_to_map']
    changed = set()
    link_replacer = create_link_replacer(self, link_uid, changed)
    find_xlinks = XPath('//*[@xl:href]')
    find_anchors = XPath('//h:a[@href]')

    def process_anchor(anchor, doc_name):
        href = anchor.get('href')
        if not href.startswith(link_uid):
            anchor.set('target', '_blank')
            anchor.set('rel', 'noopener noreferrer')
            return
        anchor.set('href', 'javascript:void(0)')
        # The encoded target sits between the first two '|' separators.
        parts = decode_url(href.split('|')[1])
        lname, lfrag = parts[0], parts[1]
        link_to_map.setdefault(lname, {}).setdefault(lfrag or '', set()).add(doc_name)
        payload = json.dumps({'name': lname, 'frag': lfrag}, ensure_ascii=False)
        anchor.set('data-' + link_uid, payload)

    for name, mt in iteritems(self.mime_map):
        mt = mt.lower()
        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
        elif mt in OEB_DOCS:
            self.virtualized_names.add(name)
            root = self.parsed(name)
            rewrite_links(root, partial(link_replacer, name))
            for anchor in find_anchors(root):
                process_anchor(anchor, name)
        elif mt == 'image/svg+xml':
            self.virtualized_names.add(name)
            xlink = XLINK('href')
            rewritten = False
            for elem in find_xlinks(self.parsed(name)):
                target = elem.get(xlink)
                # References starting with '#' are left untouched.
                if target.startswith('#'):
                    continue
                elem.set(xlink, link_replacer(name, target))
                rewritten = True
            if rewritten:
                changed.add(name)

    for dirty_name in changed:
        self.dirty(dirty_name)
def __call__(self, oeb, context):
    """Run the CSS flattening transform on *oeb*.

    :param oeb: The book being converted.
    :param context: Conversion options, kept on ``self.context`` and
        ``self.opts``.
    """
    oeb.logger.info('Flattening CSS and remapping font sizes...')
    self.context = self.opts = context
    self.oeb = oeb
    self.items = list(self.oeb.spine)

    # The title page and the EPUB 3 navigation document are processed too,
    # even when they are not in the spine.
    titlepage = self.oeb.guide.get('titlepage')
    if titlepage is not None:
        titlepage = titlepage.item
        if titlepage is not None and titlepage not in self.items:
            self.items.append(titlepage)
    epub3_nav = None
    if getattr(self.opts, 'epub3_nav_href', None):
        epub3_nav = self.oeb.manifest.hrefs.get(self.opts.epub3_nav_href)
        if epub3_nav is not None and epub3_nav not in self.items:
            self.items.append(epub3_nav)

    self.filter_css = frozenset()
    if self.opts.filter_css:
        try:
            names = {x.strip().lower() for x in self.opts.filter_css.split(',')}
        except Exception:
            # Best effort: a bad option value is logged and ignored. Catching
            # Exception (not a bare except) lets interpreter-exit and
            # interrupt signals propagate.
            self.oeb.log.warning('Failed to parse filter_css, ignoring')
        else:
            from calibre.ebooks.oeb.normalize_css import normalize_filter_css
            self.filter_css = frozenset(normalize_filter_css(names))
            self.oeb.log.debug('Filtering CSS properties: %s' % ', '.join(self.filter_css))

    for item in oeb.manifest.values():
        # Make all links to resources absolute, as these sheets will be
        # consolidated into a single stylesheet at the root of the document
        if item.media_type in OEB_STYLES:
            css_parser.replaceUrls(item.data, item.abshref, ignoreImportRules=True)

    self.body_font_family, self.embed_font_rules = self.get_embed_font_info(
        self.opts.embed_font_family)
    # Store for use in output plugins/transforms that generate content,
    # like the AZW3 output inline ToC.
    self.oeb.store_embed_font_rules = EmbedFontsCSSRules(
        self.body_font_family, self.embed_font_rules)
    self.stylize_spine()
    self.sbase = self.baseline_spine() if self.fbase else None
    self.fmap = FontMapper(self.sbase, self.fbase, self.fkey)
    self.flatten_spine()
    if epub3_nav is not None:
        self.opts.epub3_nav_parsed = epub3_nav.data

    self.store_page_margins()
def _apply_style_attr(self, url_replacer=None):
    """Merge the element's inline ``style`` attribute into ``self._style``.

    Declarations matching ``self.MS_PAT`` are dropped before parsing. If the
    remaining CSS cannot be parsed the attribute is ignored.

    :param url_replacer: Optional callable applied to every URL in the
        inline style (``@import`` rules are ignored).
    """
    attrib = self._element.attrib
    if 'style' not in attrib:
        return

    declarations = []
    for chunk in attrib['style'].split(';'):
        chunk = chunk.strip()
        if chunk and self.MS_PAT.match(chunk) is None:
            declarations.append(chunk)

    try:
        style = parseStyle('; '.join(declarations), validate=False)
    except CSSSyntaxError:
        return

    if url_replacer is not None:
        replaceUrls(style, url_replacer, ignoreImportRules=True)
    self._style.update(self._stylizer.flatten_style(style))
def replace_resource_links(self):
    '''
    Point links to embedded resources (raster images/fonts) at the MOBI
    record that holds the resource.

    Pointers look like ``kindle:embed:XXXX?mime=image/*``; the ``?mime=``
    part is apparently optional and is not emitted for fonts.
    '''

    def pointer(item, oref):
        ref = urlnormalize(item.abshref(oref))
        idx = self.resources.item_map.get(ref, None)
        if idx is None:
            # Not a known resource: leave the link as it was.
            return oref
        # Record indices are 1-based; font records start with b'FONT'.
        is_font = self.resources.records[idx - 1][:4] in {b'FONT'}
        idx = to_ref(idx)
        if is_font:
            return 'kindle:embed:%s' % idx
        self.used_images.add(ref)
        return 'kindle:embed:%s?mime=%s' % (idx, self.resources.mime_map[ref])

    for item in self.oeb.manifest:
        media_type = item.media_type
        if media_type in XML_DOCS:
            root = self.data(item)
            for tag in XPath('//h:img|//svg:image')(root):
                for attr, ref in tag.attrib.iteritems():
                    # Strip any namespace before comparing the attribute name.
                    local_name = attr.split('}')[-1].lower()
                    if local_name in {'src', 'href'}:
                        tag.attrib[attr] = pointer(item, ref)

            for tag in XPath('//h:style')(root):
                if not tag.text:
                    continue
                sheet = css_parser.parseString(tag.text, validate=False)
                css_parser.replaceUrls(
                    sheet, partial(pointer, item), ignoreImportRules=True)
                serialized = sheet.cssText
                if isbytestring(serialized):
                    serialized = serialized.decode('utf-8')
                tag.text = '\n' + serialized + '\n'
        elif media_type in OEB_STYLES:
            sheet = self.data(item)
            css_parser.replaceUrls(
                sheet, partial(pointer, item), ignoreImportRules=True)
def replace_resource_links(self):
    '''
    Rewrite references to raster images and fonts as pointers to the MOBI
    record containing them.

    The pointer format is ``kindle:embed:XXXX?mime=image/*``. The ``?mime=``
    suffix is apparently optional and is omitted for fonts.
    '''

    def pointer(item, oref):
        ref = urlnormalize(item.abshref(oref))
        idx = self.resources.item_map.get(ref, None)
        if idx is None:
            return oref
        is_image = self.resources.records[idx-1][:4] not in {b'FONT'}
        idx = to_ref(idx)
        if not is_image:
            return 'kindle:embed:%s'%idx
        self.used_images.add(ref)
        return 'kindle:embed:%s?mime=%s'%(idx, self.resources.mime_map[ref])

    def rewrite_sheet(sheet, item):
        # Shared by <style> elements and standalone stylesheets.
        replacer = partial(pointer, item)
        css_parser.replaceUrls(sheet, replacer, ignoreImportRules=True)

    link_attrs = {'src', 'href'}
    for item in self.oeb.manifest:
        if item.media_type in XML_DOCS:
            root = self.data(item)
            for tag in XPath('//h:img|//svg:image')(root):
                for attr, ref in iteritems(tag.attrib):
                    # Compare on the local name so namespaced attributes
                    # such as xlink:href are matched as well.
                    if attr.split('}')[-1].lower() in link_attrs:
                        tag.attrib[attr] = pointer(item, ref)

            for tag in XPath('//h:style')(root):
                if tag.text:
                    sheet = css_parser.parseString(tag.text, validate=False)
                    rewrite_sheet(sheet, item)
                    repl = sheet.cssText
                    if isbytestring(repl):
                        repl = repl.decode('utf-8')
                    tag.text = '\n'+ repl + '\n'
        elif item.media_type in OEB_STYLES:
            rewrite_sheet(self.data(item), item)
def __init__(self, tree, path, oeb, opts, profile=None,
             extra_css='', user_css='', base_css=''):
    """Compute the style of every element in *tree*.

    Collects the user-agent sheet, *base_css*, the document's ``<style>`` and
    ``<link>`` stylesheets, *extra_css* and *user_css*, flattens their rules,
    sorts them by specificity and applies them (then inline ``style``
    attributes and ``<img>`` width/height attributes) to the elements.

    :param tree: Parsed XHTML document.
    :param path: Manifest href of the document.
    :param oeb: The book the document belongs to.
    :param opts: Conversion options.
    :param profile: Output profile; defaults to the profile named 'default'.
    :param extra_css: Additional CSS text parsed after the document's sheets.
    :param user_css: Additional CSS text parsed after the document's sheets.
    :param base_css: CSS text inserted right after the user-agent sheet.
    """
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from calibre.customize.ui import output_profiles
        for x in output_profiles():
            if x.short_name == 'default':
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    if base_css:
        stylesheets.append(parseString(base_css, validate=False))
    style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

    # Add css_parser parsing profiles from output_profile
    for css_module in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(css_module['name'],
                               css_module['props'],
                               css_module['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file,
                       log=logging.getLogger('calibre.css'))
    self.font_face_rules = []
    for elem in style_tags:
        if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES
                and media_ok(elem.get('media'))):
            # Gather the text of the <style> element, including any text
            # held by child nodes.
            text = elem.text if elem.text else u''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
            if text:
                text = oeb.css_preprocessor(text)
                # We handle @import rules separately
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname,
                                                validate=False)
                parser.setFetcher(self._fetch_css_file)
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        if not media_ok(rule.media.mediaText):
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in OEB_STYLES:
                            self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                            continue
                        stylesheets.append(sitem.data)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif (elem.tag == XHTML('link') and elem.get('href')
                and elem.get('rel', 'stylesheet').lower() == 'stylesheet'
                and elem.get('type', CSS_MIME).lower() in OEB_STYLES
                and media_ok(elem.get('media'))):
            href = urlnormalize(elem.attrib['href'])
            sheet_path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(sheet_path, None)
            if sitem is None:
                self.logger.warn(
                    'Stylesheet %r referenced by file %r not in manifest' %
                    (sheet_path, item.href))
                continue
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(sheet_path, item.href))
                continue
            stylesheets.append(sitem.data)

    csses = {'extra_css':extra_css, 'user_css':user_css}
    for w, x in csses.items():
        if x:
            try:
                stylesheet = parser.parseString(x, href=cssname,
                                                validate=False)
                stylesheets.append(stylesheet)
            except Exception:
                # Unparseable extra CSS is logged and skipped; Exception
                # (rather than a bare except) lets KeyboardInterrupt and
                # SystemExit through.
                self.logger.exception('Failed to parse %s, ignoring.'%w)
                self.logger.debug('Bad css: ')
                self.logger.debug(x)

    rules = []
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for sheet_index, stylesheet in enumerate(stylesheets):
        href = stylesheet.href
        self.stylesheets.add(href)
        # Index 0 is the built-in HTML stylesheet added first above.
        is_ua_sheet = sheet_index == 0
        for rule in stylesheet.cssRules:
            if rule.type == rule.MEDIA_RULE:
                if media_ok(rule.media.mediaText):
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(
                            subrule, href, index, is_user_agent_sheet=is_ua_sheet))
                        index += 1
            else:
                rules.extend(self.flatten_rule(
                    rule, href, index, is_user_agent_sheet=is_ua_sheet))
                index = index + 1
    rules.sort(key=itemgetter(0))  # sort by specificity
    self.rules = rules
    self._styles = {}
    pseudo_pat = re.compile(u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
    select = Select(tree, ignore_inappropriate_pseudo_classes=True)

    for _, _, cssdict, text, _ in rules:
        fl = pseudo_pat.search(text)
        try:
            matches = tuple(select(text))
        except SelectorError as err:
            self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
            continue

        if fl is not None:
            fl = fl.group(1)
            if fl == 'first-letter' and getattr(
                    self.oeb, 'plumber_output_format', '').lower() in {u'mobi', u'docx'}:
                # Fake first-letter
                for elem in matches:
                    for x in elem.iter('*'):
                        if x.text:
                            # Leading punctuation/separator characters are
                            # kept together with the first real letter.
                            punctuation_chars = []
                            remaining = unicode_type(x.text)
                            while remaining:
                                category = unicodedata.category(remaining[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(remaining[0])
                                remaining = remaining[1:]

                            special_text = u''.join(punctuation_chars) + \
                                (remaining[0] if remaining else u'')
                            span = x.makeelement('{%s}span' % XHTML_NS)
                            span.text = special_text
                            span.set('data-fake-first-letter', '1')
                            span.tail = remaining[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            break
            else:  # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)

    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)

    num_pat = re.compile(r'[0-9.]+$')
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
            style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                # The attribute is moved into CSS; pop() tolerates a missing
                # attribute without a catch-all except.
                elem.attrib.pop(prop, None)
                if val:
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
    """Build an OEB book from an HTML file and the files it links to.

    :param htmlpath: Path of the root HTML file.
    :param basedir: Passed through to ``get_filelist``.
    :param opts: Conversion options.
    :param log: Logger; also stored on ``self.log``.
    :param mi: Metadata copied into the book.
    :return: The populated OEB book.
    """
    import uuid
    from calibre.ebooks.conversion.plumber import create_oebbook
    from calibre.ebooks.oeb.base import (DirContainer,
        rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
        xpath, urlquote)
    from calibre import guess_type
    from calibre.ebooks.oeb.transforms.metadata import \
        meta_info_to_oeb_metadata
    from calibre.ebooks.html.input import get_filelist
    from calibre.ebooks.metadata import string_to_authors
    from calibre.utils.localization import canonicalize_lang
    import css_parser, logging
    css_parser.log.setLevel(logging.WARN)
    self.OEB_STYLES = OEB_STYLES
    oeb = create_oebbook(log, None, opts, self,
                         encoding=opts.input_encoding, populate=False)
    self.oeb = oeb

    # Fill in required metadata that is missing from ``mi``.
    metadata = oeb.metadata
    meta_info_to_oeb_metadata(mi, metadata, log)
    if not metadata.language:
        lang = canonicalize_lang(getattr(opts, 'language', None))
        if not lang:
            oeb.logger.warn(u'Language not specified')
            lang = get_lang().replace('_', '-')
        metadata.add('language', lang)
    if not metadata.creator:
        a = getattr(opts, 'authors', None)
        if a:
            a = string_to_authors(a)
        if not a:
            oeb.logger.warn('Creator not specified')
            a = [self.oeb.translate(__('Unknown'))]
        for aut in a:
            metadata.add('creator', aut)
    if not metadata.title:
        oeb.logger.warn('Title not specified')
        metadata.add('title', self.oeb.translate(__('Unknown')))
    bookid = str(uuid.uuid4())
    metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
    for ident in metadata.identifier:
        if 'id' in ident.attrib:
            # NOTE(review): this selects the first identifier, not ``ident``
            # (the one that carries the id attribute) - confirm intended.
            self.oeb.uid = metadata.identifier[0]
            break

    filelist = get_filelist(htmlpath, basedir, opts, log)
    filelist = [f for f in filelist if not f.is_binary]
    htmlfile_map = {}
    for f in filelist:
        path = f.path
        oeb.container = DirContainer(os.path.dirname(path), log,
                                     ignore_opf=True)
        bname = os.path.basename(path)
        item_id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
        htmlfile_map[path] = href
        item = oeb.manifest.add(item_id, href, 'text/html')
        if path == htmlpath and '%' in path:
            bname = urlquote(bname)
        item.html_input_href = bname
        oeb.spine.add(item, True)

    self.added_resources = {}
    self.log = log
    self.log('Normalizing filename cases')
    for path, href in htmlfile_map.items():
        if not self.is_case_sensitive(path):
            path = path.lower()
        self.added_resources[path] = href
    self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
    self.urldefrag = urldefrag
    self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

    self.log('Rewriting HTML links')
    for f in filelist:
        path = f.path
        dpath = os.path.dirname(path)
        oeb.container = DirContainer(dpath, log, ignore_opf=True)
        href = htmlfile_map[path]
        try:
            item = oeb.manifest.hrefs[href]
        except KeyError:
            item = oeb.manifest.hrefs[urlnormalize(href)]
        rewrite_links(item.data, partial(self.resource_adder, base=dpath))

    for item in oeb.manifest.values():
        if item.media_type in self.OEB_STYLES:
            # Resolve URLs in the sheet relative to the directory the sheet
            # was added from, when that is known.
            dpath = None
            for path, href in self.added_resources.items():
                if href == item.href:
                    dpath = os.path.dirname(path)
                    break
            css_parser.replaceUrls(item.data,
                                   partial(self.resource_adder, base=dpath))

    toc = self.oeb.toc
    self.oeb.auto_generated_toc = True
    # Labels are stored together with the spine item they came from. Keeping
    # separate label lists and zipping them against the whole spine
    # mis-assigns labels as soon as one file has no <title>.
    titled = []   # (title, item) for linear items with a non-empty title
    headed = []   # (header, item) for every linear item
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
        title = re.sub(r'\s+', ' ', title.strip())
        if title:
            titled.append((title, item))
        header = '(unlabled)'
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            expr = '/h:html/h:body//h:%s[position()=1]/text()'
            candidate = ''.join(xpath(html, expr % tag))
            candidate = re.sub(r'\s+', ' ', candidate.strip())
            if candidate:
                header = candidate
                break
        headed.append((header, item))

    # Prefer document titles; fall back to headings when titles repeat.
    titles = [title for title, _ in titled]
    use = headed if len(titles) > len(set(titles)) else titled
    for label, item in use:
        toc.add(label, item.href)

    oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
    return oeb
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
    """Create an OEB book from *htmlpath* and every file reachable from it.

    :param htmlpath: Path of the root HTML file.
    :param basedir: Passed through to ``get_filelist``.
    :param opts: Conversion options.
    :param log: Logger; also stored on ``self.log``.
    :param mi: Metadata copied into the book.
    :return: The populated OEB book.
    """
    import uuid
    from calibre.ebooks.conversion.plumber import create_oebbook
    from calibre.ebooks.oeb.base import (DirContainer,
        rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
        xpath, urlquote)
    from calibre import guess_type
    from calibre.ebooks.oeb.transforms.metadata import \
        meta_info_to_oeb_metadata
    from calibre.ebooks.html.input import get_filelist
    from calibre.ebooks.metadata import string_to_authors
    from calibre.utils.localization import canonicalize_lang
    import css_parser, logging
    css_parser.log.setLevel(logging.WARN)
    self.OEB_STYLES = OEB_STYLES
    oeb = create_oebbook(log, None, opts, self,
                         encoding=opts.input_encoding, populate=False)
    self.oeb = oeb

    # Metadata: supply defaults for anything ``mi`` did not provide.
    metadata = oeb.metadata
    meta_info_to_oeb_metadata(mi, metadata, log)
    if not metadata.language:
        language = canonicalize_lang(getattr(opts, 'language', None))
        if not language:
            oeb.logger.warn('Language not specified')
            language = get_lang().replace('_', '-')
        metadata.add('language', language)
    if not metadata.creator:
        a = getattr(opts, 'authors', None)
        if a:
            a = string_to_authors(a)
        if not a:
            oeb.logger.warn('Creator not specified')
            a = [self.oeb.translate(__('Unknown'))]
        for aut in a:
            metadata.add('creator', aut)
    if not metadata.title:
        oeb.logger.warn('Title not specified')
        metadata.add('title', self.oeb.translate(__('Unknown')))
    bookid = unicode_type(uuid.uuid4())
    metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
    for ident in metadata.identifier:
        if 'id' in ident.attrib:
            # NOTE(review): the first identifier is used here rather than
            # ``ident`` itself - verify this is deliberate.
            self.oeb.uid = metadata.identifier[0]
            break

    # Manifest and spine: one entry per non-binary file.
    filelist = get_filelist(htmlpath, basedir, opts, log)
    filelist = [f for f in filelist if not f.is_binary]
    htmlfile_map = {}
    for f in filelist:
        path = f.path
        oeb.container = DirContainer(os.path.dirname(path), log,
                                     ignore_opf=True)
        bname = os.path.basename(path)
        manifest_id, href = oeb.manifest.generate(
            id='html', href=sanitize_file_name(bname))
        htmlfile_map[path] = href
        item = oeb.manifest.add(manifest_id, href, 'text/html')
        if path == htmlpath and '%' in path:
            bname = urlquote(bname)
        item.html_input_href = bname
        oeb.spine.add(item, True)

    self.added_resources = {}
    self.log = log
    self.log('Normalizing filename cases')
    for path, href in htmlfile_map.items():
        if not self.is_case_sensitive(path):
            path = path.lower()
        self.added_resources[path] = href
    self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
    self.urldefrag = urldefrag
    self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

    self.log('Rewriting HTML links')
    for f in filelist:
        path = f.path
        dpath = os.path.dirname(path)
        oeb.container = DirContainer(dpath, log, ignore_opf=True)
        href = htmlfile_map[path]
        try:
            item = oeb.manifest.hrefs[href]
        except KeyError:
            item = oeb.manifest.hrefs[urlnormalize(href)]
        rewrite_links(item.data, partial(self.resource_adder, base=dpath))

    for item in oeb.manifest.values():
        if item.media_type in self.OEB_STYLES:
            # Find the directory this sheet was added from (None if unknown)
            # so relative URLs inside it can be resolved.
            dpath = None
            for path, href in self.added_resources.items():
                if href == item.href:
                    dpath = os.path.dirname(path)
                    break
            css_parser.replaceUrls(
                item.data, partial(self.resource_adder, base=dpath))

    # Auto-generated ToC. Each label stays paired with its own spine item;
    # zipping a label list that omits untitled files against the full spine
    # would attach titles to the wrong files.
    toc = self.oeb.toc
    self.oeb.auto_generated_toc = True
    title_entries = []    # only linear items that have a <title>
    header_entries = []   # every linear item
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
        title = re.sub(r'\s+', ' ', title.strip())
        if title:
            title_entries.append((title, item))
        header = '(unlabled)'
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            expr = '/h:html/h:body//h:%s[position()=1]/text()'
            found = ''.join(xpath(html, expr % tag))
            found = re.sub(r'\s+', ' ', found.strip())
            if found:
                header = found
                break
        header_entries.append((header, item))

    titles = [entry[0] for entry in title_entries]
    # Duplicate titles are useless as ToC labels; use headings instead.
    has_duplicates = len(titles) > len(set(titles))
    for label, item in (header_entries if has_duplicates else title_entries):
        toc.add(label, item.href)

    oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
    return oeb
def virtualize_resources(self):
    """Replace links in stylesheets, documents and SVG images with virtual URLs.

    Links that resolve to a file in the book become
    ``<link_uid>|<encoded name and fragment>|``; links to missing files become
    ``missing:<quoted name>``. For documents, anchors with virtual targets
    are recorded in ``book_render_data['link_to_map']`` and neutralised,
    other anchors open in a new window. All modified files are passed to
    ``self.dirty``.
    """
    changed = set()
    link_uid = self.book_render_data['link_uid']
    resource_template = link_uid + '|{}|'
    xlink_xpath = XPath('//*[@xl:href]')
    link_xpath = XPath('//h:a[@href]')
    res_link_xpath = XPath('//h:link[@href]')

    def link_replacer(base, url):
        # Same-document fragment link.
        if url.startswith('#'):
            frag = urlunquote(url[1:])
            if not frag:
                return url
            changed.add(base)
            return resource_template.format(encode_url(base, frag))
        purl = urlparse(url)
        # Anything with a host, a query, a non-file scheme or an absolute
        # path is left untouched.
        if purl.netloc or purl.query:
            return url
        if purl.scheme and purl.scheme != 'file':
            return url
        if not purl.path or purl.path.startswith('/'):
            return url
        url, frag = purl.path, purl.fragment
        name = self.href_to_name(url, base)
        if name:
            if self.has_name_and_is_not_empty(name):
                frag = urlunquote(frag)
                url = resource_template.format(encode_url(name, frag))
            else:
                if isinstance(name, unicode_type):
                    name = name.encode('utf-8')
                url = 'missing:' + force_unicode(quote(name), 'utf-8')
            changed.add(base)
        return url

    ltm = self.book_render_data['link_to_map']

    for name, mt in iteritems(self.mime_map):
        mt = mt.lower()
        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
        elif mt in OEB_DOCS:
            self.virtualized_names.add(name)
            root = self.parsed(name)
            for link in res_link_xpath(root):
                ltype = (link.get('type') or 'text/css').lower()
                rel = (link.get('rel') or 'stylesheet').lower()
                if ltype != 'text/css' or rel != 'stylesheet':
                    # This link will not be loaded by the browser anyway
                    # and will causes the resource load check to hang
                    link.attrib.clear()
                    changed.add(name)
            rewrite_links(root, partial(link_replacer, name))
            for a in link_xpath(root):
                href = a.get('href')
                if href.startswith(link_uid):
                    a.set('href', 'javascript:void(0)')
                    parts = decode_url(href.split('|')[1])
                    lname, lfrag = parts[0], parts[1]
                    ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
                    a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False))
                else:
                    a.set('target', '_blank')
                    a.set('rel', 'noopener noreferrer')
                    changed.add(name)
        elif mt == 'image/svg+xml':
            self.virtualized_names.add(name)
            changed.add(name)
            xlink = XLINK('href')
            for elem in xlink_xpath(self.parsed(name)):
                href = elem.get(xlink)
                # Same-document references such as <use xlink:href="#id">
                # must keep pointing inside the SVG itself, so they are
                # not virtualized.
                if not href.startswith('#'):
                    elem.set(xlink, link_replacer(name, href))

    for name, amap in iteritems(ltm):
        for k, v in tuple(iteritems(amap)):
            amap[k] = tuple(v)  # needed for JSON serialization

    tuple(map(self.dirty, changed))
def virtualize_resources(self):
    """Rewrite links in stylesheets, documents and SVG images as virtual URLs.

    Resolvable links become ``<link_uid>|<encoded name and fragment>|`` and
    links to missing files become ``missing:<quoted name>``. In documents,
    images additionally get a ``data-calibre-src`` attribute holding the
    resolved file name, anchors with virtual targets are recorded in
    ``book_render_data['link_to_map']``, and other anchors are made to open
    in a new window. All modified files are passed to ``self.dirty``.
    """
    changed = set()
    link_uid = self.book_render_data['link_uid']
    resource_template = link_uid + '|{}|'
    xlink_xpath = XPath('//*[@xl:href]')
    link_xpath = XPath('//h:a[@href]')
    img_xpath = XPath('//h:img[@src]')
    res_link_xpath = XPath('//h:link[@href]')

    def link_replacer(base, url):
        # Fragment-only link within the same file.
        if url.startswith('#'):
            frag = urlunquote(url[1:])
            if not frag:
                return url
            changed.add(base)
            return resource_template.format(encode_url(base, frag))
        purl = urlparse(url)
        # Links with a host, query, non-file scheme or absolute path are
        # returned unchanged.
        if purl.netloc or purl.query:
            return url
        if purl.scheme and purl.scheme != 'file':
            return url
        if not purl.path or purl.path.startswith('/'):
            return url
        url, frag = purl.path, purl.fragment
        name = self.href_to_name(url, base)
        if name:
            if self.has_name_and_is_not_empty(name):
                frag = urlunquote(frag)
                url = resource_template.format(encode_url(name, frag))
            else:
                if isinstance(name, unicode_type):
                    name = name.encode('utf-8')
                url = 'missing:' + force_unicode(quote(name), 'utf-8')
            changed.add(base)
        return url

    ltm = self.book_render_data['link_to_map']

    for name, mt in iteritems(self.mime_map):
        mt = mt.lower()
        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
        elif mt in OEB_DOCS:
            self.virtualized_names.add(name)
            root = self.parsed(name)
            # Record the resolved name before src is rewritten below.
            for img in img_xpath(root):
                img_name = self.href_to_name(img.get('src'), name)
                if img_name:
                    img.set('data-calibre-src', img_name)
                    changed.add(name)
            for link in res_link_xpath(root):
                ltype = (link.get('type') or 'text/css').lower()
                rel = (link.get('rel') or 'stylesheet').lower()
                if ltype != 'text/css' or rel != 'stylesheet':
                    # This link will not be loaded by the browser anyway
                    # and will causes the resource load check to hang
                    link.attrib.clear()
                    changed.add(name)
            rewrite_links(root, partial(link_replacer, name))
            for a in link_xpath(root):
                href = a.get('href')
                if href.startswith(link_uid):
                    a.set('href', 'javascript:void(0)')
                    parts = decode_url(href.split('|')[1])
                    lname, lfrag = parts[0], parts[1]
                    ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name)
                    a.set(
                        'data-' + link_uid,
                        json.dumps({'name': lname, 'frag': lfrag}, ensure_ascii=False))
                else:
                    a.set('target', '_blank')
                    a.set('rel', 'noopener noreferrer')
                    changed.add(name)
        elif mt == 'image/svg+xml':
            self.virtualized_names.add(name)
            changed.add(name)
            xlink = XLINK('href')
            for elem in xlink_xpath(self.parsed(name)):
                href = elem.get(xlink)
                # Leave same-document references (for example
                # <use xlink:href="#id">) alone: virtualizing them would
                # break references inside the image.
                if href.startswith('#'):
                    continue
                elem.set(xlink, link_replacer(name, href))

    for name, amap in iteritems(ltm):
        for k, v in tuple(iteritems(amap)):
            amap[k] = tuple(v)  # needed for JSON serialization

    tuple(map(self.dirty, changed))
def __init__(self, tree, path, oeb, opts, profile=None,
             extra_css='', user_css='', base_css=''):
    """Compute the style of every element in *tree*.

    Collects the user-agent sheet, *base_css*, the document's ``<style>`` and
    ``<link>`` stylesheets, *extra_css* and *user_css*. The flattened rules
    are taken from ``oeb.stylizer_rules``, which is rebuilt only when the
    options, profile or stylesheets differ. The rules, inline ``style``
    attributes and ``<img>`` width/height attributes are then applied.

    :param tree: Parsed XHTML document.
    :param path: Manifest href of the document.
    :param oeb: The book the document belongs to.
    :param opts: Conversion options.
    :param profile: Output profile; defaults to the profile named 'default'.
    :param extra_css: Additional CSS text parsed after the document's sheets.
    :param user_css: Additional CSS text parsed after the document's sheets.
    :param base_css: CSS text inserted right after the user-agent sheet.
    """
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from calibre.customize.ui import output_profiles
        for x in output_profiles():
            if x.short_name == 'default':
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    if base_css:
        stylesheets.append(parseString(base_css, validate=False))
    style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

    # Add css_parser parsing profiles from output_profile
    for css_module in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(css_module['name'],
                               css_module['props'],
                               css_module['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file,
                       log=logging.getLogger('calibre.css'))
    for elem in style_tags:
        if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES
                and media_ok(elem.get('media'))):
            # Gather the text of the <style> element, including any text
            # held by child nodes.
            text = elem.text if elem.text else ''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += '\n\n' + force_unicode(t, 'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += '\n\n' + force_unicode(t, 'utf-8')
            if text:
                text = oeb.css_preprocessor(text)
                # We handle @import rules separately
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname,
                                                validate=False)
                parser.setFetcher(self._fetch_css_file)
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        if not media_ok(rule.media.mediaText):
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warn(
                                'Ignoring missing stylesheet in @import rule:', rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in OEB_STYLES:
                            self.logger.warn(
                                'CSS @import of non-CSS file %r' % rule.href)
                            continue
                        stylesheets.append(sitem.data)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif (elem.tag == XHTML('link') and elem.get('href')
                and elem.get('rel', 'stylesheet').lower() == 'stylesheet'
                and elem.get('type', CSS_MIME).lower() in OEB_STYLES
                and media_ok(elem.get('media'))):
            href = urlnormalize(elem.attrib['href'])
            sheet_path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(sheet_path, None)
            if sitem is None:
                self.logger.warn(
                    'Stylesheet %r referenced by file %r not in manifest' %
                    (sheet_path, item.href))
                continue
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS' % (sheet_path, item.href))
                continue
            stylesheets.append(sitem.data)

    csses = {'extra_css': extra_css, 'user_css': user_css}
    for w, x in csses.items():
        if x:
            try:
                stylesheet = parser.parseString(x, href=cssname,
                                                validate=False)
                stylesheets.append(stylesheet)
            except Exception:
                self.logger.exception('Failed to parse %s, ignoring.' % w)
                self.logger.debug('Bad css: ')
                self.logger.debug(x)

    # using oeb to store the rules, page rule and font face rules
    # and generating them again if opts, profile or stylesheets are different
    if (not hasattr(self.oeb, 'stylizer_rules')) \
            or not self.oeb.stylizer_rules.same_rules(self.opts, self.profile, stylesheets):
        self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile, stylesheets)
    self.rules = self.oeb.stylizer_rules.rules
    self.page_rule = self.oeb.stylizer_rules.page_rule
    self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
    self.flatten_style = self.oeb.stylizer_rules.flatten_style

    self._styles = {}
    pseudo_pat = re.compile(
        ':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
    select = Select(tree, ignore_inappropriate_pseudo_classes=True)

    for _, _, cssdict, text, _ in self.rules:
        fl = pseudo_pat.search(text)
        try:
            matches = tuple(select(text))
        except SelectorError as err:
            self.logger.error(
                f'Ignoring CSS rule with invalid selector: {text!r} ({as_unicode(err)})'
            )
            continue

        if fl is not None:
            fl = fl.group(1)
            if fl == 'first-letter' and getattr(
                    self.oeb, 'plumber_output_format', '').lower() in {'mobi', 'docx'}:
                # Fake first-letter
                for elem in matches:
                    for x in elem.iter('*'):
                        if x.text:
                            # Leading punctuation/separator characters are
                            # kept together with the first real letter.
                            punctuation_chars = []
                            remaining = str(x.text)
                            while remaining:
                                category = unicodedata.category(remaining[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(remaining[0])
                                remaining = remaining[1:]

                            special_text = ''.join(punctuation_chars) + \
                                (remaining[0] if remaining else '')
                            span = x.makeelement('{%s}span' % XHTML_NS)
                            span.text = special_text
                            span.set('data-fake-first-letter', '1')
                            span.tail = remaining[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            break
            else:  # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)

    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)

    num_pat = re.compile(r'[0-9.]+$')
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
            style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                # The attribute is moved into CSS; pop() tolerates a missing
                # attribute without a catch-all except.
                elem.attrib.pop(prop, None)
                if val:
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)