def __call__(self, oeb, opts):
    import cssutils
    self.log = oeb.logger
    self.opts = opts
    self.oeb = oeb
    for item in oeb.manifest.items:
        self.current_item = item
        if etree.iselement(item.data):
            rewrite_links(self.current_item.data, self.url_replacer)
        elif hasattr(item.data, 'cssText'):
            cssutils.replaceUrls(item.data, self.url_replacer)
    if self.oeb.guide:
        for ref in self.oeb.guide.values():
            href = urlnormalize(ref.href)
            href, frag = urldefrag(href)
            replacement = self.rename_map.get(href, None)
            if replacement is not None:
                nhref = replacement
                if frag:
                    nhref += '#' + frag
                ref.href = nhref
    if self.oeb.toc:
        self.fix_toc_entry(self.oeb.toc)
def test_replaceUrls(self):
    "cssutils.replaceUrls()"
    cssutils.ser.prefs.keepAllProperties = True
    css = '''
    @import "im1";
    @import url(im2);
    a {
        background-image: url(c) !important;
        background-\image: url(b);
        background: url(a) no-repeat !important;
    }'''
    s = cssutils.parseString(css)
    cssutils.replaceUrls(s, lambda old: "NEW" + old)
    self.assertEqual(u'@import "NEWim1";', s.cssRules[0].cssText)
    self.assertEqual(u'NEWim2', s.cssRules[1].href)
    self.assertEqual(u'''background-image: url(NEWc) !important;
background-\\image: url(NEWb);
background: url(NEWa) no-repeat !important''',
                     s.cssRules[2].style.cssText)
    cssutils.ser.prefs.keepAllProperties = False

    # CSSStyleDeclaration
    style = cssutils.parseStyle(u'''color: red;
        background-image: url(1.png), url('2.png')''')
    cssutils.replaceUrls(style, lambda url: 'prefix/' + url)
    self.assertEqual(style.cssText, u'''color: red;
background-image: url(prefix/1.png), url(prefix/2.png)''')
def virtualize_resources(self):
    changed = set()
    link_uid = self.book_render_data['link_uid']
    resource_template = link_uid + '|{}|'
    xlink_xpath = XPath('//*[@xl:href]')
    link_xpath = XPath('//h:a[@href]')

    def link_replacer(base, url):
        if url.startswith('#'):
            frag = urlunquote(url[1:])
            if not frag:
                return url
            changed.add(base)
            return resource_template.format(encode_url(base, frag))
        purl = urlparse(url)
        if purl.netloc or purl.query:
            return url
        if purl.scheme and purl.scheme != 'file':
            return url
        if not purl.path or purl.path.startswith('/'):
            return url
        url, frag = purl.path, purl.fragment
        name = self.href_to_name(url, base)
        if name:
            if self.has_name(name):
                frag = urlunquote(frag)
                url = resource_template.format(encode_url(name, frag))
            else:
                url = 'missing:' + quote(name)
            changed.add(base)
        return url

    for name, mt in self.mime_map.iteritems():
        mt = mt.lower()
        if mt in OEB_STYLES:
            replaceUrls(self.parsed(name), partial(link_replacer, name))
            self.virtualized_names.add(name)
        elif mt in OEB_DOCS:
            self.virtualized_names.add(name)
            root = self.parsed(name)
            rewrite_links(root, partial(link_replacer, name))
            for a in link_xpath(root):
                href = a.get('href')
                if href.startswith(link_uid):
                    a.set('href', 'javascript:void(0)')
                    parts = decode_url(href.split('|')[1])
                    a.set('data-' + link_uid, json.dumps(
                        {'name': parts[0], 'frag': parts[1]}, ensure_ascii=False))
                else:
                    a.set('target', '_blank')
                    a.set('rel', 'noopener noreferrer')
            changed.add(name)
        elif mt == 'image/svg+xml':
            self.virtualized_names.add(name)
            # Record the SVG as changed instead of rebinding `changed` to False,
            # which would break the dirty() pass below.
            changed.add(name)
            xlink = XLINK('href')
            for elem in xlink_xpath(self.parsed(name)):
                elem.set(xlink, link_replacer(name, elem.get(xlink)))

    tuple(map(self.dirty, changed))
def __call__(self, oeb, context):
    oeb.logger.info('Flattening CSS and remapping font sizes...')
    self.context = self.opts = context
    self.oeb = oeb
    self.filter_css = frozenset()
    if self.opts.filter_css:
        try:
            self.filter_css = {x.strip().lower() for x in
                               self.opts.filter_css.split(',')}
        except:
            self.oeb.log.warning('Failed to parse filter_css, ignoring')
        else:
            from calibre.ebooks.oeb.normalize_css import normalize_filter_css
            self.filter_css = frozenset(normalize_filter_css(self.filter_css))
            self.oeb.log.debug('Filtering CSS properties: %s' %
                               ', '.join(self.filter_css))

    for item in oeb.manifest.values():
        # Make all links to resources absolute, as these sheets will be
        # consolidated into a single stylesheet at the root of the document
        if item.media_type in OEB_STYLES:
            cssutils.replaceUrls(item.data, item.abshref, ignoreImportRules=True)

    self.body_font_family, self.embed_font_rules = self.get_embed_font_info(
        self.opts.embed_font_family)
    # Store for use in output plugins/transforms that generate content,
    # like the AZW3 output inline ToC.
    self.oeb.store_embed_font_rules = EmbedFontsCSSRules(self.body_font_family,
                                                         self.embed_font_rules)

    self.stylize_spine()
    self.sbase = self.baseline_spine() if self.fbase else None
    self.fmap = FontMapper(self.sbase, self.fbase, self.fkey)
    self.flatten_spine()
def update_internal_css(self, original_path, content):
    """Given the contents of the css file, parse out any urls so we have
    them locally.

    Note: if cssutils is not installed, the content is returned as is.
    """
    def replace_url(url):
        """Called by cssutils.replaceUrls for every url() in the sheet."""
        parsed = urlparse.urlparse(original_path)

        # In the case where the path contains .., make the url proper
        if url.startswith('..'):
            url = os.path.abspath(
                os.path.join(
                    os.path.dirname(parsed.path), url))

        # Build the full url with domain so we can download it
        url = '{}://{}{}'.format(parsed.scheme, parsed.netloc, url)

        # Generate the local file for the url
        return self.get_original_file(url)

    # We can only rewrite the css urls if cssutils is available
    if cssutils:
        sheet = cssutils.parseString(content)
        cssutils.replaceUrls(sheet, replace_url)
        return sheet.cssText

    return content
def __call__(self, oeb, context):
    oeb.logger.info('Flattening CSS and remapping font sizes...')
    self.context = self.opts = context
    self.oeb = oeb
    self.filter_css = frozenset()
    if self.opts.filter_css:
        try:
            self.filter_css = frozenset([x.strip().lower() for x in
                                         self.opts.filter_css.split(',')])
        except:
            self.oeb.log.warning('Failed to parse filter_css, ignoring')
        else:
            self.oeb.log.debug('Filtering CSS properties: %s' %
                               ', '.join(self.filter_css))

    for item in oeb.manifest.values():
        # Make all links to resources absolute, as these sheets will be
        # consolidated into a single stylesheet at the root of the document
        if item.media_type in OEB_STYLES:
            cssutils.replaceUrls(item.data, item.abshref, ignoreImportRules=True)

    self.stylize_spine()
    self.sbase = self.baseline_spine() if self.fbase else None
    self.fmap = FontMapper(self.sbase, self.fbase, self.fkey)
    self.flatten_spine()
def run(self):
    # Step 0: ensure that the document_root and base_path variables are
    # set. If the file that's being processed was inside a source that has
    # either one or both not set, then this processor can't run.
    if self.document_root is None or self.base_path is None:
        raise DocumentRootAndBasePathRequiredException

    # We don't rename the file, so we can use the default output file.
    parser = CSSParser(log=None, loglevel=logging.CRITICAL)
    sheet = parser.parseFile(self.input_file)

    # Step 1: ensure the file has URLs. If it doesn't, we can stop
    # processing.
    url_count = 0
    for url in getUrls(sheet):
        url_count += 1
        break
    if url_count == 0:
        return self.input_file

    # Step 2: resolve the relative URLs to absolute paths.
    replaceUrls(sheet, self.resolveToAbsolutePath)

    # Step 3: verify that each of these files has been synced.
    synced_files_db = urljoin(sys.path[0] + os.sep, SYNCED_FILES_DB)
    self.dbcon = sqlite3.connect(synced_files_db)
    # unicode is the default text_factory; we set it explicitly, just to be sure.
    self.dbcon.text_factory = unicode
    self.dbcur = self.dbcon.cursor()
    all_synced = True
    for urlstring in getUrls(sheet):
        # Skip absolute URLs.
        if urlstring.startswith("http://") or urlstring.startswith("https://"):
            continue

        # Skip broken references in the CSS file. This would otherwise
        # prevent this CSS file from ever passing through this processor.
        if not os.path.exists(urlstring):
            continue

        # Get the CDN URL for the given absolute path.
        self.dbcur.execute("SELECT url FROM synced_files WHERE input_file=?",
                           (urlstring,))
        result = self.dbcur.fetchone()
        if result is None:
            raise RequestToRequeueException(
                "The file '%s' has not yet been synced to the server '%s'"
                % (urlstring, self.process_for_server)
            )
        else:
            cdn_url = result[0]

    # Step 4: resolve the absolute paths to CDN URLs.
    replaceUrls(sheet, self.resolveToCDNURL)

    # Step 5: write the updated CSS to the output file.
    f = open(self.output_file, "w")
    f.write(sheet.cssText)
    f.close()

    return self.output_file
def replace_urls(routes, filepath):
    stylesheet = cssutils.parseFile(filepath)
    cssutils.replaceUrls(
        stylesheet,
        ft.partial(routed_url, filepath, routes)
    )
    return stylesheet.cssText
def copy_css(self):
    base_dir = os.path.abspath(os.path.dirname(self.css_file))
    self.font_list = []

    def replace_url(url):
        source_file = os.path.abspath(os.path.join(base_dir, url))
        if os.path.splitext(url)[1].lower() in ('.ttf', '.otf'):
            dest_file = os.path.abspath(os.path.join(
                self.temp_content_dir, 'fonts', os.path.basename(source_file)))
            new_url = 'fonts/' + os.path.basename(url)
            self.font_list.append(new_url)
        else:
            dest_file = os.path.abspath(os.path.join(
                self.temp_content_dir, 'images',
                'css_' + os.path.basename(source_file)))
            new_url = 'images/css_' + os.path.basename(url)
        try:
            copy_file(source_file, dest_file)
        except:
            self.log.error('File {0}, referred to by the css, not found.'.format(url))
        return new_url

    if self.parse_css:
        cssutils.profile.addProfile('CSS extensions',
                                    {'-webkit-hyphens': 'none',
                                     'adobe-hyphenate': 'none',
                                     '-moz-hyphens': 'none',
                                     '-ms-hyphens': 'none',
                                     'hyphens': 'none|manual|auto'})
        stylesheet = cssutils.parseFile(self.css_file)
        cssutils.replaceUrls(stylesheet, replace_url)
        write_file(str(stylesheet.cssText, 'utf-8'),
                   os.path.join(self.temp_content_dir, 'stylesheet.css'))
    else:
        copy_file(self.css_file,
                  os.path.join(self.temp_content_dir, 'stylesheet.css'))
def replace_urls_epub(epub_zip, routes, root_dir, filepath):
    style_string = epub_zip.read(os.path.join(root_dir, filepath))
    stylesheet = cssutils.parseString(style_string)
    cssutils.replaceUrls(
        stylesheet,
        ft.partial(routed_url, filepath, routes)
    )
    return stylesheet.cssText
def replaceurls(stylesheet, replacer):
    """
    Replace all URLs appearing in the :class:`CSSStyleSheet`
    :obj:`stylesheet`. For each URL the function :obj:`replacer` will be
    called and the URL will be replaced with the result.
    """
    def newreplacer(u):
        return str(replacer(url.URL(u)))

    cssutils.replaceUrls(stylesheet, newreplacer)
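A minimal usage sketch of the same call pattern, assuming only cssutils (the example stylesheet and CDN prefix are invented): replaceUrls hands each url() value, and each @import href, to the callback as a plain string and substitutes whatever the callback returns.

import cssutils

# Hypothetical input; the lambda plays the role of the replacer callback.
sheet = cssutils.parseString('body { background: url(img/bg.png) }')
cssutils.replaceUrls(sheet, lambda u: 'https://cdn.example.com/' + u)
rewritten = sheet.cssText  # serialized stylesheet, a byte string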
def virtualize_resources(self): changed = set() link_uid = self.book_render_data['link_uid'] resource_template = link_uid + '|{}|' xlink_xpath = XPath('//*[@xl:href]') link_xpath = XPath('//h:a[@href]') def link_replacer(base, url): if url.startswith('#'): frag = urlunquote(url[1:]) if not frag: return url changed.add(base) return resource_template.format(encode_url(base, frag)) purl = urlparse(url) if purl.netloc or purl.query: return url if purl.scheme and purl.scheme != 'file': return url if not purl.path or purl.path.startswith('/'): return url url, frag = purl.path, purl.fragment name = self.href_to_name(url, base) if name: frag = urlunquote(frag) url = resource_template.format(encode_url(name, frag)) changed.add(base) return url for name, mt in self.mime_map.iteritems(): mt = mt.lower() if mt in OEB_STYLES: replaceUrls(self.parsed(name), partial(link_replacer, name)) self.virtualized_names.add(name) elif mt in OEB_DOCS: self.virtualized_names.add(name) root = self.parsed(name) rewrite_links(root, partial(link_replacer, name)) for a in link_xpath(root): href = a.get('href') if href.startswith(link_uid): a.set('href', 'javascript:void(0)') parts = decode_url(href.split('|')[1]) a.set('data-' + link_uid, json.dumps({'name':parts[0], 'frag':parts[1]}, ensure_ascii=False)) else: a.set('target', '_blank') changed.add(name) elif mt == 'image/svg+xml': self.virtualized_names.add(name) changed = False xlink = XLINK('href') for elem in xlink_xpath(self.parsed(name)): elem.set(xlink, link_replacer(name, elem.get(xlink))) tuple(map(self.dirty, changed))
def finish(self):
    """
    Parse the buffered response body, rewrite its URLs, write the result
    to the wrapped request, and finish the wrapped request.
    """
    stylesheet = ''.join(self._buffer)
    parser = CSSParser()
    css = parser.parseString(stylesheet)
    replaceUrls(css, self._replace)
    self.request.write(css.cssText)
    return self.request.finish()
def download_css_imports(soup, url_parts, root_url, css_file, dst_folder, index_path):
    """ parse css file and download imported css to dst_folder """
    if os.path.exists(dst_folder + "/" + css_file):
        logging.debug("processing css " + css_file + "...")
        tmp_url_parts = deepcopy(url_parts)
        css_text_file = open(dst_folder + "/" + css_file, "r+")
        input_file = css_text_file.read()
        encoding = get_encoding(input_file)
        if "UTF-8" not in encoding:
            input_file = input_file.decode(encoding).encode("UTF-8")
        try:
            sheet = cssutils.parseString(input_file)
        except Exception as e:
            logging.error("Unable to parse " + css_file)
            return
        css_download_url_refs(root_url, url_parts, sheet, dst_folder)

        index_depth = css_file.count("/")
        i_path = ""
        for i in range(index_depth):
            i_path += "../"

        org_file_names = []
        new_file_names = []
        for rule in sheet:
            if rule.type == rule.IMPORT_RULE:
                file_name = rule.href.split("/")[-1]
                file_name = sanitize_file_name(file_name)
                tmp_url_parts = list(urlparse.urlparse(rule.href))
                new_src = create_directories(dst_folder,
                                             tmp_url_parts[1] + tmp_url_parts[2])
                full_path = os.path.join(dst_folder, new_src)
                outpath = os.path.join(full_path, file_name)
                tmp_url_parts[2] = rule.href
                if rule.href.lower().startswith("http"):
                    download_file(rule.href, outpath)
                else:
                    download_file(urlparse.urlunparse(tmp_url_parts), outpath)
                rule.href = rule.href.replace("http://", "")
                rule.href = rule.href.replace("https://", "")
                rule.href = i_path + rule.href.replace("../", "")

        cssutils.replaceUrls(sheet, lambda url: css_url_replacer(
            url, new_file_names, org_file_names, i_path))
        css_text_file.seek(0)
        css_text_file.write(sheet.cssText)
def css_fix(self, css, inline=False):
    # Disable cssutils warnings and errors for imperfect css source
    cssutils.log.setLevel(logging.CRITICAL)
    if inline:
        declaration = cssutils.parseStyle(css)
        cssutils.replaceUrls(declaration, self.url_fix)
        rv = declaration.cssText
    else:
        # style tag or external stylesheet
        sheet = cssutils.parseString(css)
        cssutils.replaceUrls(sheet, self.url_fix)
        rv = sheet.cssText
    return rv
def _pack_css(css_path, css, root_dir):
    def replacer(resource_url):
        if not _can_encode(resource_url):
            return resource_url
        fullpath = _determine_fullpath(css_path, resource_url, root_dir)
        tag_mime, tag_data = _get_resource(fullpath)
        encoded_resource = make_data_uri(tag_mime, tag_data)
        return encoded_resource

    stylesheet = cssutils.parseString(css)
    cssutils.replaceUrls(stylesheet, replacer)
    # Return the rewritten sheet, not the original css string.
    return stylesheet.cssText
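For comparison, a self-contained sketch of the same data-URI inlining idea using only cssutils and the standard library. The project-specific helpers above (_can_encode, _determine_fullpath, _get_resource, make_data_uri) are not reproduced here, and base_dir is an assumed argument.

import base64
import mimetypes
import os

import cssutils

def inline_urls(css_text, base_dir):
    """Replace every url() in css_text with a base64 data: URI built from
    the referenced file under base_dir, and return the serialized sheet."""
    sheet = cssutils.parseString(css_text)

    def to_data_uri(url):
        path = os.path.join(base_dir, url)
        mime = mimetypes.guess_type(path)[0] or 'application/octet-stream'
        with open(path, 'rb') as f:
            payload = base64.b64encode(f.read()).decode('ascii')
        return 'data:{};base64,{}'.format(mime, payload)

    cssutils.replaceUrls(sheet, to_data_uri)
    return sheet.cssText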
def replace(self, css):
    cssutils.log.setLevel(logging.CRITICAL)
    cssutils.cssproductions.MACROS[u'name'] = ur'[\*]?{nmchar}+'
    try:
        sheet = cssutils.parseString(css)
    except:
        sheet = cssutils.css.CSSStyleDeclaration(cssText=css)
    replacer = lambda url: reverse(u'viewer', args=(self.get_access_uri(url),))
    cssutils.replaceUrls(sheet, replacer)
    return sheet.cssText
def _apply_style_attr(self, url_replacer=None):
    attrib = self._element.attrib
    if 'style' not in attrib:
        return
    css = attrib['style'].split(';')
    css = filter(None, (x.strip() for x in css))
    css = [y.strip() for y in css]
    css = [y for y in css if self.MS_PAT.match(y) is None]
    css = '; '.join(css)
    try:
        style = parseStyle(css, validate=False)
    except CSSSyntaxError:
        return
    if url_replacer is not None:
        replaceUrls(style, url_replacer, ignoreImportRules=True)
    self._style.update(self._stylizer.flatten_style(style))
def replace_links(container, link_map, frag_map=lambda name, frag: frag):
    ncx_type = guess_type('toc.ncx')
    for name, media_type in container.mime_map.iteritems():
        repl = LinkReplacer(name, container, link_map, frag_map)
        if media_type.lower() in OEB_DOCS:
            rewrite_links(container.parsed(name), repl)
        elif media_type.lower() in OEB_STYLES:
            replaceUrls(container.parsed(name), repl)
        elif media_type.lower() == ncx_type:
            for elem in container.parsed(name).xpath('//*[@src]'):
                src = elem.get('src')
                nsrc = repl(src)
                if src != nsrc:
                    elem.set('src', nsrc)
        if repl.replaced:
            container.dirty(name)
def minify(source, output, **options):
    '''Minifies CSS from a file and outputs it to a different file.

    :type source: :class:`django.core.files.File`
    :type output: :class:`django.core.files.File`
    '''
    ser.prefs.useMinified()
    base_path = getattr(source, 'path', source.name)
    stylesheet = parseString(source.read(),
                             href='file://%s' % pathname2url(base_path))
    css_files = get_recursive_imports(stylesheet, base=base_path)
    # concatenate the stylesheets
    minified = resolveImports(stylesheet)
    # replace static asset references with the "live" URLs
    replaceUrls(minified, replace_static_refs, ignoreImportRules=True)
    # generate minified CSS
    css = minified.cssText
    output.write(css)
    output.seek(0)
def replace_resource_links(self):
    '''
    Replace links to resources (raster images/fonts) with pointers to the
    MOBI record containing the resource. The pointers are of the form:
    kindle:embed:XXXX?mime=image/*

    The ?mime= is apparently optional and not used for fonts.
    '''
    def pointer(item, oref):
        ref = urlnormalize(item.abshref(oref))
        idx = self.resources.item_map.get(ref, None)
        if idx is not None:
            is_image = self.resources.records[idx - 1][:4] not in {b'FONT'}
            idx = to_ref(idx)
            if is_image:
                self.used_images.add(ref)
                return 'kindle:embed:%s?mime=%s' % (
                    idx, self.resources.mime_map[ref])
            else:
                return 'kindle:embed:%s' % idx
        return oref

    for item in self.oeb.manifest:
        if item.media_type in XML_DOCS:
            root = self.data(item)
            for tag in XPath('//h:img|//svg:image')(root):
                for attr, ref in tag.attrib.iteritems():
                    if attr.split('}')[-1].lower() in {'src', 'href'}:
                        tag.attrib[attr] = pointer(item, ref)
            for tag in XPath('//h:style')(root):
                if tag.text:
                    sheet = cssutils.parseString(tag.text, validate=False)
                    replacer = partial(pointer, item)
                    cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
                    repl = sheet.cssText
                    if isbytestring(repl):
                        repl = repl.decode('utf-8')
                    tag.text = '\n' + repl + '\n'
        elif item.media_type in OEB_STYLES:
            sheet = self.data(item)
            replacer = partial(pointer, item)
            cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
def replace_links(self, name, replace_func):
    '''
    Replace all links in name using replace_func, which must be a callable
    that accepts a URL and returns the replaced URL. It must also have a
    'replaced' attribute that is set to True if any actual replacement is
    done. Convenient ways of creating such callables are using the
    :class:`LinkReplacer` and :class:`LinkRebaser` classes.
    '''
    media_type = self.mime_map.get(name, guess_type(name))
    if name == self.opf_name:
        for elem in self.opf_xpath('//*[@href]'):
            elem.set('href', replace_func(elem.get('href')))
    elif media_type.lower() in OEB_DOCS:
        rewrite_links(self.parsed(name), replace_func)
    elif media_type.lower() in OEB_STYLES:
        replaceUrls(self.parsed(name), replace_func)
    elif media_type.lower() == guess_type('toc.ncx'):
        for elem in self.parsed(name).xpath('//*[@src]'):
            elem.set('src', replace_func(elem.get('src')))
    if replace_func.replaced:
        self.dirty(name)
    return replace_func.replaced
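The docstring above only requires a callable with a `replaced` attribute; a minimal sketch of such a callable (hypothetical, not calibre's LinkReplacer or LinkRebaser) could look like this:

class PrefixReplacer(object):
    """Rewrites relative links by prefixing them; records whether anything changed."""

    def __init__(self, prefix):
        self.prefix = prefix
        self.replaced = False

    def __call__(self, url):
        # Leave fragments and absolute URLs untouched.
        if url.startswith('#') or '://' in url:
            return url
        self.replaced = True
        return self.prefix + url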
def cache_style_content(self, content, inline=False):
    """
    Caches all required URIs and imports.

    Returns the updated css content.
    """
    if inline:
        sheet = cssutils.parseStyle(content)
    else:
        sheet = cssutils.parseString(content, href=self.url)

    if not inline:
        for rule in sheet.cssRules:
            if rule.type == rule.IMPORT_RULE:
                f = self._recursive_cache_resource(rule.styleSheet.href)
                rule.href = f

    def replacer(url):
        if url.startswith('data'):
            return url
        # TODO: check for an absolute url before joining
        return self._recursive_cache_resource(urljoin(self.url, url))

    cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)
    return sheet.cssText
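A standalone sketch of the split handling used above, assuming only cssutils: @import hrefs are rewritten explicitly, and replaceUrls with ignoreImportRules=True then touches only the remaining url() values. The 'cached/' prefix is invented for illustration.

import cssutils

sheet = cssutils.parseString('@import "a.css";\nbody { background: url(b.png) }')

# Handle @import rules explicitly...
for rule in sheet.cssRules:
    if rule.type == rule.IMPORT_RULE:
        rule.href = 'cached/' + rule.href

# ...then rewrite only plain url() values, skipping the @import hrefs.
cssutils.replaceUrls(sheet, lambda u: 'cached/' + u, ignoreImportRules=True)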
def create_oebbook(self, htmlpath, basedir, opts, log, mi): import uuid from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import (DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, xpath) from calibre import guess_type from calibre.ebooks.oeb.transforms.metadata import \ meta_info_to_oeb_metadata from calibre.ebooks.html.input import get_filelist from calibre.ebooks.metadata import string_to_authors from calibre.utils.localization import canonicalize_lang import cssutils, logging cssutils.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata, log) if not metadata.language: l = canonicalize_lang(getattr(opts, 'language', None)) if not l: oeb.logger.warn(u'Language not specified') l = get_lang().replace('_', '-') metadata.add('language', l) if not metadata.creator: a = getattr(opts, 'authors', None) if a: a = string_to_authors(a) if not a: oeb.logger.warn('Creator not specified') a = [self.oeb.translate(__('Unknown'))] for aut in a: metadata.add('creator', aut) if not metadata.title: oeb.logger.warn('Title not specified') metadata.add('title', self.oeb.translate(__('Unknown'))) bookid = str(uuid.uuid4()) metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') for ident in metadata.identifier: if 'id' in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id='html', href=ascii_filename(bname)) htmlfile_map[path] = href item = oeb.manifest.add(id, href, 'text/html') item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} self.log = log self.log('Normalizing filename cases') for path, href in htmlfile_map.items(): if not self.is_case_sensitive(path): path = path.lower() self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urldefrag self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME self.log('Rewriting HTML links') for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, log, ignore_opf=True) item = oeb.manifest.hrefs[htmlfile_map[path]] rewrite_links(item.data, partial(self.resource_adder, base=dpath)) for item in oeb.manifest.values(): if item.media_type in self.OEB_STYLES: dpath = None for path, href in self.added_resources.items(): if href == item.href: dpath = os.path.dirname(path) break cssutils.replaceUrls(item.data, partial(self.resource_adder, base=dpath)) toc = self.oeb.toc self.oeb.auto_generated_toc = True titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = re.sub(r'\s+', ' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' header = ''.join(xpath(html, expr % tag)) header = re.sub(r'\s+', ' ', header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in izip(use, self.oeb.spine): if not 
item.linear: continue toc.add(title, item.href) oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True) return oeb
def virtualize_resources(self): changed = set() link_uid = self.book_render_data['link_uid'] resource_template = link_uid + '|{}|' xlink_xpath = XPath('//*[@xl:href]') link_xpath = XPath('//h:a[@href]') res_link_xpath = XPath('//h:link[@href]') def link_replacer(base, url): if url.startswith('#'): frag = urlunquote(url[1:]) if not frag: return url changed.add(base) return resource_template.format(encode_url(base, frag)) purl = urlparse(url) if purl.netloc or purl.query: return url if purl.scheme and purl.scheme != 'file': return url if not purl.path or purl.path.startswith('/'): return url url, frag = purl.path, purl.fragment name = self.href_to_name(url, base) if name: if self.has_name(name): frag = urlunquote(frag) url = resource_template.format(encode_url(name, frag)) else: if isinstance(name, unicode): name = name.encode('utf-8') url = 'missing:' + force_unicode(quote(name), 'utf-8') changed.add(base) return url ltm = self.book_render_data['link_to_map'] for name, mt in self.mime_map.iteritems(): mt = mt.lower() if mt in OEB_STYLES: replaceUrls(self.parsed(name), partial(link_replacer, name)) self.virtualized_names.add(name) elif mt in OEB_DOCS: self.virtualized_names.add(name) root = self.parsed(name) for link in res_link_xpath(root): ltype = (link.get('type') or 'text/css').lower() rel = (link.get('rel') or 'stylesheet').lower() if ltype != 'text/css' or rel != 'stylesheet': # This link will not be loaded by the browser anyway # and will causes the resource load check to hang link.attrib.clear() changed.add(name) rewrite_links(root, partial(link_replacer, name)) for a in link_xpath(root): href = a.get('href') if href.startswith(link_uid): a.set('href', 'javascript:void(0)') parts = decode_url(href.split('|')[1]) lname, lfrag = parts[0], parts[1] ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name) a.set('data-' + link_uid, json.dumps({'name':lname, 'frag':lfrag}, ensure_ascii=False)) else: a.set('target', '_blank') a.set('rel', 'noopener noreferrer') changed.add(name) elif mt == 'image/svg+xml': self.virtualized_names.add(name) changed.add(name) xlink = XLINK('href') for elem in xlink_xpath(self.parsed(name)): elem.set(xlink, link_replacer(name, elem.get(xlink))) for name, amap in ltm.iteritems(): for k, v in tuple(amap.iteritems()): amap[k] = tuple(v) # needed for JSON serialization tuple(map(self.dirty, changed))
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css='', base_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == 'default': self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.body_font_size = self.profile.fbase self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] if base_css: stylesheets.append(parseString(base_css, validate=False)) style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]') # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in style_tags: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = oeb.css_preprocessor(text) # We handle @import rules separately parser.setFetcher(lambda x: ('utf-8', b'')) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) if rule.media.mediaText == 'amzn-mobi': continue hrefs = self.oeb.manifest.hrefs if ihref not in hrefs: self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href) continue sitem = hrefs[ihref] if sitem.media_type not in OEB_STYLES: self.logger.warn('CSS @import of non-CSS file %r' % rule.href) continue stylesheets.append(sitem.data) for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)): stylesheet.cssRules.remove(rule) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS'%(path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css':extra_css, 'user_css':user_css} for w, x in csses.items(): if x: try: text = x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.'%w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for sheet_index, stylesheet in 
enumerate(stylesheets): href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: if rule.type == rule.MEDIA_RULE: media = {rule.media.item(i) for i in xrange(rule.media.length)} if not media.intersection({'all', 'screen', 'amzn-kf8'}): continue for subrule in rule.cssRules: rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0)) index += 1 else: rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0)) index = index + 1 rules.sort() self.rules = rules self._styles = {} pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I) select = Select(tree, ignore_inappropriate_pseudo_classes=True) for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) try: matches = tuple(select(text)) except SelectorError as err: self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err))) continue if fl is not None: fl = fl.group(1) if fl == 'first-letter' and getattr(self.oeb, 'plumber_output_format', '').lower() in {u'mobi', u'docx'}: # Fake first-letter from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter('*'): if x.text: punctuation_chars = [] text = unicode(x.text) while text: category = unicodedata.category(text[0]) if category[0] not in {'P', 'Z'}: break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = E.span(special_text) span.set('data-fake-first-letter', '1') span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: # Element pseudo-class for elem in matches: self.style(elem)._update_pseudo_class(fl, cssdict) else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'[0-9.]+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
def build(self): if os.path.exists(self.builddir): # make sure it doesn't exist shutil.rmtree(self.builddir) # favicons print("* Copying Favicons") shutil.copytree( f"{self.srcdir}/favicons", f"{self.builddir}/") # copy over favicons, create builddir # sass print("* Compiling SCSS") os.makedirs(os.path.join(self.builddir, "css")) ocsspath = os.path.join(self.builddir, "css", "main.css") with open(ocsspath, "w+") as css: css.write(sass.compile(filename="src/scss/styles.scss")) # copy over js print("* Moving JS") os.makedirs(os.path.join(self.builddir, "scripts")) shutil.copyfile(f"{self.srcdir}/scripts/main.js", f"{self.builddir}/scripts/main.js" ) # copy over favicons, create builddir # replace images cssFile = cssutils.parseFile(ocsspath) cssutils.replaceUrls( cssFile, lambda x: os.path.join( "..", self.file_hash( os.path.join("src", "scss", os.path.normpath(x)), "images" ), ).replace("\\", "/"), ignoreImportRules=True, ) with open(ocsspath, "wb") as css: css.write(cssFile.cssText) # html print("* Gathering HTML files to build") html_to_build = [] for dirpath, _, filenames in os.walk(self.srcdir): if dirpath.find("snippets") == -1 and dirpath.find( "projects") == -1: for filename in filenames: if filename == "index.html": html_to_build.append((dirpath, filename)) print("* Building HTML files") for path, filename in html_to_build: src_dir = path build_file = filename build_base = pathlib.Path(path) print(f" - src: {path} ({build_base.parts}) : {build_file}") build_base = pathlib.Path(*build_base.parts[1:]) build_base = os.path.join(self.builddir, build_base) main_page = os.path.samefile(self.srcdir, path) print(f" - build_base: {build_base}") print(f" - main_page: {main_page}\n") with PageBuilder( src_dir, build_base, build_file, self.file_hash, "" if main_page else "..", {}, ) as pb: with open(os.path.join(path, filename)) as f: data = f.read() pb.feed(data) # blog/projects print("* Building Blog") for page, subpage, prefix in [("blog", "posts", "../../..")]: pagedir = os.path.join(self.srcdir, page) if os.path.exists(pagedir): with open(os.path.join(pagedir, "data.json"), "r", encoding="utf-8") as f: posts = json.load(f) for i, post in enumerate(posts): # convert dates to [date] object for key in ["post_date", "update_date"]: if key in post: post[key] = date( *([int(x) for x in post[key].split("-")])) post[key] = post[key].strftime("%B %d, %Y") else: post[key] = None # create slug if "url" not in post: post["url"] = slugify(post["title"]) # author post["author"] = "Rahul Yesantharao" post["num"] = i + 1 # print("* POST DATA *") # print(posts) # print() postdest = os.path.join(os.path.join(self.builddir, page), subpage) if not os.path.exists(postdest): os.mkdir(postdest) for i in range(len(posts)): postdir = os.path.join(postdest, posts[i]["url"]) os.mkdir(postdir) print(f" - {posts[i]['url']} -> {pagedir}") with PageBuilder(pagedir, postdir, "index.html", self.file_hash, prefix, posts[i]) as pb: with open(os.path.join(pagedir, "post.html"), "r", encoding="utf-8") as f: data = f.read() pb.feed(data) # images and css (hashed files) cssfiles = [] jsfiles = [] print("* Moving hashed files") os.makedirs(os.path.join(self.builddir, "images")) for src in self.hashed_files: dst = self.hashed_files[src] dst = os.path.join(self.builddir, dst) if dst.endswith(".js"): jsfiles.append(dst) if dst.endswith(".css"): cssfiles.append(dst) print(f" - {src} -> {dst}") if not os.path.exists(dst): shutil.copyfile(src, dst) # delete unhashed css file os.remove(os.path.join(self.builddir, "css", "main.css")) 
os.remove(os.path.join(self.builddir, "scripts", "main.js")) # postprocess print("* Postprocessing!") print(cssfiles) print(jsfiles) os.system(f"bash postprocess.sh {cssfiles[0]} {jsfiles[0]}")
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css='', base_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == 'default': self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.body_font_size = self.profile.fbase self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] if base_css: stylesheets.append(parseString(base_css, validate=False)) style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]') # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in style_tags: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = oeb.css_preprocessor(text) # We handle @import rules separately parser.setFetcher(lambda x: ('utf-8', b'')) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) if not media_ok(rule.media.mediaText): continue hrefs = self.oeb.manifest.hrefs if ihref not in hrefs: self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href) continue sitem = hrefs[ihref] if sitem.media_type not in OEB_STYLES: self.logger.warn('CSS @import of non-CSS file %r' % rule.href) continue stylesheets.append(sitem.data) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) elif (elem.tag == XHTML('link') and elem.get('href') and elem.get( 'rel', 'stylesheet').lower() == 'stylesheet' and elem.get( 'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media')) ): href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS'%(path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css':extra_css, 'user_css':user_css} for w, x in csses.items(): if x: try: text = x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.'%w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for sheet_index, stylesheet in enumerate(stylesheets): href = 
stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: if rule.type == rule.MEDIA_RULE: if media_ok(rule.media.mediaText): for subrule in rule.cssRules: rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0)) index += 1 else: rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0)) index = index + 1 rules.sort() self.rules = rules self._styles = {} pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I) select = Select(tree, ignore_inappropriate_pseudo_classes=True) for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) try: matches = tuple(select(text)) except SelectorError as err: self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err))) continue if fl is not None: fl = fl.group(1) if fl == 'first-letter' and getattr(self.oeb, 'plumber_output_format', '').lower() in {u'mobi', u'docx'}: # Fake first-letter for elem in matches: for x in elem.iter('*'): if x.text: punctuation_chars = [] text = unicode(x.text) while text: category = unicodedata.category(text[0]) if category[0] not in {'P', 'Z'}: break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = x.makeelement('{%s}span' % XHTML_NS) span.text = special_text span.set('data-fake-first-letter', '1') span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: # Element pseudo-class for elem in matches: self.style(elem)._update_pseudo_class(fl, cssdict) else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'[0-9.]+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == 'default': self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] head = xpath(tree, '/h:html/h:head') if head: head = head[0] else: head = [] # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in head: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = XHTML_CSS_NAMESPACE + text text = oeb.css_preprocessor(text) stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS' % (path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css': extra_css, 'user_css': user_css} for w, x in csses.items(): if x: try: text = XHTML_CSS_NAMESPACE + x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.' 
% w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for stylesheet in stylesheets: href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: rules.extend(self.flatten_rule(rule, href, index)) index = index + 1 rules.sort() self.rules = rules self._styles = {} for _, _, cssdict, text, _ in rules: fl = ':first-letter' in text if fl: text = text.replace(':first-letter', '') selector = get_css_selector(text) matches = selector(tree, self.logger) if fl: from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter(): if x.text: punctuation_chars = [] text = unicode(x.text) while text: if not unicodedata.category( text[0]).startswith('P'): break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = E.span(special_text) span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'\d+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
def getSoupView(soup, css, url=''):
    """
    soup
        a BeautifulSoup 4 object
    css
        a CSS StyleSheet string

    returns a style view, a dict of tuples
    """
    sheet = cssutils.parseString(css, href=url)
    cssutils.replaceUrls(sheet, lambda u: urlparse.urljoin(url, u),
                         ignoreImportRules=True)

    view = {}
    specificities = {}  # needed temporarily

    # TODO: filter rules simpler?, add @media
    gens = []
    for i_rule in sheet:
        if i_rule.type == i_rule.IMPORT_RULE:
            cssutils.replaceUrls(i_rule.styleSheet,
                                 lambda u: urlparse.urljoin(i_rule.href, u),
                                 ignoreImportRules=True)
            rules = (rule for rule in i_rule.styleSheet
                     if rule.type == rule.STYLE_RULE)
            gens.append(rules)

    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    if gens:
        import itertools
        gens.append(rules)
        rules = itertools.chain(*gens)

    for rule in rules:
        for selector in rule.selectorList:
            # log(0, 'SELECTOR', selector.selectorText)
            # TODO: make this a callback to be able to use other stuff than lxml
            if ':' in selector.selectorText:
                # Ignore pseudo-classes: we can't use them, plus they match
                # when we don't want them to on bs4
                continue
            matching = soup.select(selector.selectorText)
            for element in matching:
                ID = id(element)
                if ID not in view:
                    # add initial empty style declaration
                    view[ID] = (element, cssutils.css.CSSStyleDeclaration())  # @UndefinedVariable
                    specificities[ID] = {}
                for p in rule.style:
                    # update style declaration
                    if p not in view[ID][1]:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        view[ID][1].setProperty(p.name, p.value, p.priority)
                        specificities[ID][p.name] = selector.specificity
                        # log(2, view[element].getProperty('color'))
                    else:
                        # log(2, view[element].getProperty('color'))
                        sameprio = (p.priority ==
                                    view[ID][1].getPropertyPriority(p.name))
                        if not sameprio and bool(p.priority) or (
                                sameprio and selector.specificity >=
                                specificities[ID][p.name]):
                            # later, more specific or higher prio
                            view[ID][1].setProperty(p.name, p.value, p.priority)

    return view
for x in t.tokenize(css):
    print(x)
#
# v = cssutils.css.URIValue(u'url(/**/1)')
# print v.cssText
# v.uri = 'uri'
# print v.cssText
# v.value = 'value'
# print v.cssText
sys.exit(1)

if 1:
    # request by Walter
    style = cssutils.parseStyle("background-image: url(1.png), url('2.png')")
    cssutils.replaceUrls(style, lambda url: 'prefix/' + url)
    print(style.cssText)
    sys.exit(1)

if 0:
    # ISSUE 35
    css = """div.one {color: expression((function(ele){ele.style.behavior="none";})(this));} """
    css = """div.one {color: expression(function(ele){ele.style.behavior="none";})(this);} """
    sheet = cssutils.parseString(css)
    print(sheet.cssText)
    sys.exit(1)

if 1:
    css = """
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: self.profile = opts.output_profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] head = xpath(tree, '/h:html/h:head') if head: head = head[0] else: head = [] # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in head: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = oeb.css_preprocessor(text, add_namespace=True) # We handle @import rules separately parser.setFetcher(lambda x: ('utf-8', b'')) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) stylesheet.namespaces['h'] = XHTML_NS for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) if rule.media.mediaText == 'amzn-mobi': continue hrefs = self.oeb.manifest.hrefs if ihref not in hrefs: self.logger.warn( 'Ignoring missing stylesheet in @import rule:', rule.href) continue sitem = hrefs[ihref] if sitem.media_type not in OEB_STYLES: self.logger.warn( 'CSS @import of non-CSS file %r' % rule.href) continue stylesheets.append(sitem.data) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS' % (path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css': extra_css, 'user_css': user_css} for w, x in csses.items(): if x: try: text = XHTML_CSS_NAMESPACE + x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.' 
% w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for stylesheet in stylesheets: href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: if rule.type == rule.MEDIA_RULE: media = { rule.media.item(i) for i in xrange(rule.media.length) } if not media.intersection({'all', 'screen', 'amzn-kf8'}): continue for subrule in rule.cssRules: rules.extend(self.flatten_rule(subrule, href, index)) index += 1 else: rules.extend(self.flatten_rule(rule, href, index)) index = index + 1 rules.sort() self.rules = rules self._styles = {} pseudo_pat = re.compile( ur':(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I) for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) if fl is not None: text = text.replace(fl.group(), '') selector = get_css_selector(text, self.oeb.log) matches = selector(tree, self.logger) if fl is not None: fl = fl.group(1) if fl == 'first-letter' and getattr(self.oeb, 'plumber_output_format', '').lower() == u'mobi': # Fake first-letter from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter(): if x.text: punctuation_chars = [] text = unicode(x.text) while text: category = unicodedata.category(text[0]) if category[0] not in {'P', 'Z'}: break punctuation_chars.append(text[0]) text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = E.span(special_text) span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: # Element pseudo-class for elem in matches: self.style(elem)._update_pseudo_class(fl, cssdict) else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'\d+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == 'default': self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + '.css' stylesheets = [html_css_stylesheet()] head = xpath(tree, '/h:html/h:head') if head: head = head[0] else: head = [] # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile['name'], profile['props'], profile['macros']) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) self.font_face_rules = [] for elem in head: if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES): text = elem.text if elem.text else u'' for x in elem: t = getattr(x, 'text', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') t = getattr(x, 'tail', None) if t: text += u'\n\n' + force_unicode(t, u'utf-8') if text: text = XHTML_CSS_NAMESPACE + text text = oeb.css_preprocessor(text) stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) elif elem.tag == XHTML('link') and elem.get('href') \ and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \ and elem.get('type', CSS_MIME).lower() in OEB_STYLES: href = urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn( 'Stylesheet %r referenced by file %r not in manifest' % (path, item.href)) continue if not hasattr(sitem.data, 'cssRules'): self.logger.warn( 'Stylesheet %r referenced by file %r is not CSS'%(path, item.href)) continue stylesheets.append(sitem.data) csses = {'extra_css':extra_css, 'user_css':user_css} for w, x in csses.items(): if x: try: text = XHTML_CSS_NAMESPACE + x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces['h'] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception('Failed to parse %s, ignoring.'%w) self.logger.debug('Bad css: ') self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for stylesheet in stylesheets: href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: rules.extend(self.flatten_rule(rule, href, index)) index = index + 1 rules.sort() self.rules = rules self._styles = {} for _, _, cssdict, text, _ in rules: fl = ':first-letter' in text if fl: text = text.replace(':first-letter', '') selector = get_css_selector(text) matches = selector(tree, self.logger) if fl: from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter(): if x.text: punctuation_chars = [] text = unicode(x.text) while text: if not unicodedata.category(text[0]).startswith('P'): break punctuation_chars.append(text[0]) 
text = text[1:] special_text = u''.join(punctuation_chars) + \ (text[0] if text else u'') span = E.span(special_text) span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'\d+$') for elem in xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ style._style.get('height', 'auto') != 'auto' if not is_styled: # Update img style dimension using width and height upd = {} for prop in ('width', 'height'): val = elem.get(prop, '').strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += 'px' upd[prop] = val if upd: style._update_cssdict(upd)
log = logging.getLogger('csscache')
handler = logging.StreamHandler(sys.stderr)
handler.setLevel(logging.ERROR)
log.addHandler(handler)

if options.minified:
    cssutils.ser.prefs.useMinified()

# Create the parser
parser = cssutils.CSSParser(log=log, raiseExceptions=True,
                            parseComments=not options.minified,
                            validate=False)

try:
    # Parse the original file
    sheet = parser.parseFile(args[0])
except Exception as e:
    sys.stderr.write('Error: %s %s\n' % (css_path, e.args[0]))
    sys.exit(1)

# Replace all the urls
replacer = partial(cache_bust_replacer, options, css_path, img_rel_path)
cssutils.replaceUrls(sheet, replacer, ignoreImportRules=True)

# Print the new css
sys.stdout.write(sheet.cssText)


if __name__ == "__main__":
    main()
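cache_bust_replacer itself is defined elsewhere in that script. The following is a hypothetical version of such a callback, assuming cache busting works by appending the referenced file's mtime as a query string; the directory and CSS used here are made up.

import os
from functools import partial
import cssutils

def cache_bust_replacer(css_dir, url):
    # Hypothetical cache-busting callback: leave absolute and data: URLs
    # alone, otherwise append the referenced file's mtime as a query string.
    if url.startswith(('http://', 'https://', '//', 'data:')):
        return url
    path = os.path.normpath(os.path.join(css_dir, url))
    if not os.path.exists(path):
        return url
    return '%s?%d' % (url, int(os.path.getmtime(path)))

sheet = cssutils.parseString('a { background: url(img/logo.png) }')
cssutils.replaceUrls(sheet, partial(cache_bust_replacer, '/var/www/css'),
                     ignoreImportRules=True)
print(sheet.cssText)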
def rewrite_links(self, f):
    """Rewrite all links using the function f."""
    cssutils.replaceUrls(self.sheet, f)
def virtualize_resources(self): changed = set() link_uid = self.book_render_data['link_uid'] resource_template = link_uid + '|{}|' xlink_xpath = XPath('//*[@xl:href]') link_xpath = XPath('//h:a[@href]') res_link_xpath = XPath('//h:link[@href]') def link_replacer(base, url): if url.startswith('#'): frag = urlunquote(url[1:]) if not frag: return url changed.add(base) return resource_template.format(encode_url(base, frag)) purl = urlparse(url) if purl.netloc or purl.query: return url if purl.scheme and purl.scheme != 'file': return url if not purl.path or purl.path.startswith('/'): return url url, frag = purl.path, purl.fragment name = self.href_to_name(url, base) if name: if self.has_name(name): frag = urlunquote(frag) url = resource_template.format(encode_url(name, frag)) else: if isinstance(name, unicode): name = name.encode('utf-8') url = 'missing:' + force_unicode(quote(name), 'utf-8') changed.add(base) return url ltm = self.book_render_data['link_to_map'] for name, mt in self.mime_map.iteritems(): mt = mt.lower() if mt in OEB_STYLES: replaceUrls(self.parsed(name), partial(link_replacer, name)) self.virtualized_names.add(name) elif mt in OEB_DOCS: self.virtualized_names.add(name) root = self.parsed(name) for link in res_link_xpath(root): ltype = (link.get('type') or 'text/css').lower() rel = (link.get('rel') or 'stylesheet').lower() if ltype != 'text/css' or rel != 'stylesheet': # This link will not be loaded by the browser anyway # and will causes the resource load check to hang link.attrib.clear() changed.add(name) rewrite_links(root, partial(link_replacer, name)) for a in link_xpath(root): href = a.get('href') if href.startswith(link_uid): a.set('href', 'javascript:void(0)') parts = decode_url(href.split('|')[1]) lname, lfrag = parts[0], parts[1] ltm.setdefault(lname, {}).setdefault(lfrag or '', set()).add(name) a.set( 'data-' + link_uid, json.dumps({ 'name': lname, 'frag': lfrag }, ensure_ascii=False)) else: a.set('target', '_blank') a.set('rel', 'noopener noreferrer') changed.add(name) elif mt == 'image/svg+xml': self.virtualized_names.add(name) changed.add(name) xlink = XLINK('href') for elem in xlink_xpath(self.parsed(name)): elem.set(xlink, link_replacer(name, elem.get(xlink))) for name, amap in ltm.iteritems(): for k, v in tuple(amap.iteritems()): amap[k] = tuple(v) # needed for JSON serialization tuple(map(self.dirty, changed))
def __init__(self, tree, path, oeb, opts, profile=None, extra_css="", user_css=""): self.oeb, self.opts = oeb, opts self.profile = profile if self.profile is None: # Use the default profile. This should really be using # opts.output_profile, but I don't want to risk changing it, as # doing so might well have hard to debug font size effects. from calibre.customize.ui import output_profiles for x in output_profiles(): if x.short_name == "default": self.profile = x break if self.profile is None: # Just in case the default profile is removed in the future :) self.profile = opts.output_profile self.body_font_size = self.profile.fbase self.logger = oeb.logger item = oeb.manifest.hrefs[path] basename = os.path.basename(path) cssname = os.path.splitext(basename)[0] + ".css" stylesheets = [html_css_stylesheet()] style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]') # Add cssutils parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: cssprofiles.addProfile(profile["name"], profile["props"], profile["macros"]) parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger("calibre.css")) self.font_face_rules = [] for elem in style_tags: if elem.tag == XHTML("style") and elem.get("type", CSS_MIME) in OEB_STYLES: text = elem.text if elem.text else u"" for x in elem: t = getattr(x, "text", None) if t: text += u"\n\n" + force_unicode(t, u"utf-8") t = getattr(x, "tail", None) if t: text += u"\n\n" + force_unicode(t, u"utf-8") if text: text = oeb.css_preprocessor(text, add_namespace=True) # We handle @import rules separately parser.setFetcher(lambda x: ("utf-8", b"")) stylesheet = parser.parseString(text, href=cssname, validate=False) parser.setFetcher(self._fetch_css_file) stylesheet.namespaces["h"] = XHTML_NS for rule in stylesheet.cssRules: if rule.type == rule.IMPORT_RULE: ihref = item.abshref(rule.href) if rule.media.mediaText == "amzn-mobi": continue hrefs = self.oeb.manifest.hrefs if ihref not in hrefs: self.logger.warn("Ignoring missing stylesheet in @import rule:", rule.href) continue sitem = hrefs[ihref] if sitem.media_type not in OEB_STYLES: self.logger.warn("CSS @import of non-CSS file %r" % rule.href) continue stylesheets.append(sitem.data) for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)): stylesheet.cssRules.remove(rule) # Make links to resources absolute, since these rules will # be folded into a stylesheet at the root replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) elif ( elem.tag == XHTML("link") and elem.get("href") and elem.get("rel", "stylesheet").lower() == "stylesheet" and elem.get("type", CSS_MIME).lower() in OEB_STYLES ): href = urlnormalize(elem.attrib["href"]) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: self.logger.warn("Stylesheet %r referenced by file %r not in manifest" % (path, item.href)) continue if not hasattr(sitem.data, "cssRules"): self.logger.warn("Stylesheet %r referenced by file %r is not CSS" % (path, item.href)) continue stylesheets.append(sitem.data) csses = {"extra_css": extra_css, "user_css": user_css} for w, x in csses.items(): if x: try: text = XHTML_CSS_NAMESPACE + x stylesheet = parser.parseString(text, href=cssname, validate=False) stylesheet.namespaces["h"] = XHTML_NS stylesheets.append(stylesheet) except: self.logger.exception("Failed to parse %s, ignoring." 
% w) self.logger.debug("Bad css: ") self.logger.debug(x) rules = [] index = 0 self.stylesheets = set() self.page_rule = {} for sheet_index, stylesheet in enumerate(stylesheets): href = stylesheet.href self.stylesheets.add(href) for rule in stylesheet.cssRules: if rule.type == rule.MEDIA_RULE: media = {rule.media.item(i) for i in xrange(rule.media.length)} if not media.intersection({"all", "screen", "amzn-kf8"}): continue for subrule in rule.cssRules: rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index == 0)) index += 1 else: rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index == 0)) index = index + 1 rules.sort() self.rules = rules self._styles = {} pseudo_pat = re.compile(ur":(first-letter|first-line|link|hover|visited|active|focus|before|after)", re.I) for _, _, cssdict, text, _ in rules: fl = pseudo_pat.search(text) if fl is not None: text = text.replace(fl.group(), "") selector = get_css_selector(text, self.oeb.log) matches = selector(tree, self.logger) if fl is not None: fl = fl.group(1) if fl == "first-letter" and getattr(self.oeb, "plumber_output_format", "").lower() == u"mobi": # Fake first-letter from lxml.builder import ElementMaker E = ElementMaker(namespace=XHTML_NS) for elem in matches: for x in elem.iter(): if x.text: punctuation_chars = [] text = unicode(x.text) while text: category = unicodedata.category(text[0]) if category[0] not in {"P", "Z"}: break punctuation_chars.append(text[0]) text = text[1:] special_text = u"".join(punctuation_chars) + (text[0] if text else u"") span = E.span(special_text) span.tail = text[1:] x.text = None x.insert(0, span) self.style(span)._update_cssdict(cssdict) break else: # Element pseudo-class for elem in matches: self.style(elem)._update_pseudo_class(fl, cssdict) else: for elem in matches: self.style(elem)._update_cssdict(cssdict) for elem in xpath(tree, "//h:*[@style]"): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r"[0-9.]+$") for elem in xpath(tree, "//h:img[@width or @height]"): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get("width", "auto") != "auto" or style._style.get("height", "auto") != "auto" if not is_styled: # Update img style dimension using width and height upd = {} for prop in ("width", "height"): val = elem.get(prop, "").strip() try: del elem.attrib[prop] except: pass if val: if num_pat.match(val) is not None: val += "px" upd[prop] = val if upd: style._update_cssdict(upd)
def create_oebbook(self, htmlpath, basedir, opts, log, mi): import uuid from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import ( DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, xpath, ) from calibre import guess_type from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata from calibre.ebooks.html.input import get_filelist import cssutils, logging cssutils.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata, log) if not metadata.language: oeb.logger.warn("Language not specified") metadata.add("language", get_lang().replace("_", "-")) if not metadata.creator: oeb.logger.warn("Creator not specified") metadata.add("creator", self.oeb.translate(__("Unknown"))) if not metadata.title: oeb.logger.warn("Title not specified") metadata.add("title", self.oeb.translate(__("Unknown"))) bookid = str(uuid.uuid4()) metadata.add("identifier", bookid, id="uuid_id", scheme="uuid") for ident in metadata.identifier: if "id" in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id="html", href=ascii_filename(bname)) htmlfile_map[path] = href item = oeb.manifest.add(id, href, "text/html") item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} self.log = log self.log("Normalizing filename cases") for path, href in htmlfile_map.items(): if not self.is_case_sensitive(path): path = path.lower() self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urldefrag self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME self.log("Rewriting HTML links") for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, log, ignore_opf=True) item = oeb.manifest.hrefs[htmlfile_map[path]] rewrite_links(item.data, partial(self.resource_adder, base=dpath)) for item in oeb.manifest.values(): if item.media_type in self.OEB_STYLES: dpath = None for path, href in self.added_resources.items(): if href == item.href: dpath = os.path.dirname(path) break cssutils.replaceUrls(item.data, partial(self.resource_adder, base=dpath)) toc = self.oeb.toc self.oeb.auto_generated_toc = True titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = "".join(xpath(html, "/h:html/h:head/h:title/text()")) title = re.sub(r"\s+", " ", title.strip()) if title: titles.append(title) headers.append("(unlabled)") for tag in ("h1", "h2", "h3", "h4", "h5", "strong"): expr = "/h:html/h:body//h:%s[position()=1]/text()" header = "".join(xpath(html, expr % tag)) header = re.sub(r"\s+", " ", header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in izip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) oeb.container = DirContainer(os.getcwdu(), oeb.log, ignore_opf=True) return oeb
def create_oebbook(self, htmlpath, mi, encoding='utf-8', pretty_print=False): cssutils.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES basedir = os.path.dirname(htmlpath) html_preprocessor = HTMLPreProcessor() assert encoding oeb = OEBBook(html_preprocessor, pretty_print=pretty_print, input_encoding=encoding) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata) if not metadata.language: logging.warn('Language not specified') metadata.add('language', 'en') if not metadata.creator: logging.warn('Creator not specified') metadata.add('creator', 'Unknown') if not metadata.title: logging.warn('Title not specified') metadata.add('title', 'Unknown') bookid = str(uuid.uuid4()) metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') for ident in metadata.identifier: if 'id' in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id='html', href=bname) htmlfile_map[path] = href item = oeb.manifest.add(id, href, 'text/html') item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} for path, href in list(htmlfile_map.items()): if not self.is_case_sensitive(path): path = path.lower() self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urldefrag self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, ignore_opf=True) item = oeb.manifest.hrefs[htmlfile_map[path]] rewrite_links(item.data, partial(self.resource_adder, base=dpath)) for item in list(oeb.manifest.values()): if item.media_type in self.OEB_STYLES: dpath = None for path, href in list(self.added_resources.items()): if href == item.href: dpath = os.path.dirname(path) break cssutils.replaceUrls(item.data, partial(self.resource_adder, base=dpath)) toc = self.oeb.toc headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data for header in find_headers(html): headers.append((item, header)) for i, (item, header) in enumerate(headers): if not item.linear: continue tocid = 'tocid{}'.format(i) header.attrib['id'] = tocid link = '{}#{}'.format(item.href, tocid) toc.add(header.text, link) oeb.container = DirContainer(os.getcwd(), ignore_opf=True) return oeb
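resource_adder is likewise defined elsewhere in these input plugins. Below is a hypothetical, heavily simplified stand-in that only shows the shape of such a callback: resolve the URL against the referencing file's directory, register the resource under a flat name the first time it is seen, and return that name. It is not the calibre implementation.

import os
from functools import partial
import cssutils

added_resources = {}  # absolute path -> href used inside the book

def resource_adder(url, base=None):
    # Hypothetical, simplified stand-in for the input plugin's callback.
    if url.startswith(('http://', 'https://', '#', 'data:')):
        return url
    path = os.path.normpath(os.path.join(base or '.', url))
    if path not in added_resources:
        # Register the resource under a flat file name.
        added_resources[path] = os.path.basename(path)
    return added_resources[path]

sheet = cssutils.parseString('a { background: url(../img/cover.jpg) }')
cssutils.replaceUrls(sheet, partial(resource_adder, base='book/css'))
print(sheet.cssText)   # the rewritten rule now references cover.jpg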
def getSoupView(soup, css, url=''):
    """
    soup
        a BeautifulSoup 4 object
    css
        a CSS StyleSheet string

    returns style view
        a dict of tuples
    """
    sheet = cssutils.parseString(css, href=url)
    cssutils.replaceUrls(sheet, lambda u: urlparse.urljoin(url, u),
                         ignoreImportRules=True)

    view = {}
    specificities = {}  # needed temporarily

    # TODO: filter rules simpler?, add @media
    gens = []
    for i_rule in sheet:
        if i_rule.type == i_rule.IMPORT_RULE:
            cssutils.replaceUrls(i_rule.styleSheet,
                                 lambda u: urlparse.urljoin(i_rule.href, u),
                                 ignoreImportRules=True)
            rules = (rule for rule in i_rule.styleSheet
                     if rule.type == rule.STYLE_RULE)
            gens.append(rules)

    rules = (rule for rule in sheet if rule.type == rule.STYLE_RULE)
    if gens:
        import itertools
        gens.append(rules)
        rules = itertools.chain(*gens)

    for rule in rules:
        for selector in rule.selectorList:
            # log(0, 'SELECTOR', selector.selectorText)
            # TODO: make this a callback to be able to use other stuff than lxml
            if ':' in selector.selectorText:
                # Ignore pseudo-classes: they cannot be applied here and would
                # otherwise match plain elements when passed to bs4's select().
                continue
            matching = soup.select(selector.selectorText)
            for element in matching:
                ID = id(element)
                if ID not in view:
                    # add initial empty style declaration
                    view[ID] = (element,
                                cssutils.css.CSSStyleDeclaration())  # @UndefinedVariable
                    specificities[ID] = {}

                for p in rule.style:
                    # update style declaration
                    if p not in view[ID][1]:
                        # setProperty needs a new Property object and
                        # MUST NOT reuse the existing Property
                        # which would be the same for all elements!
                        # see Issue #23
                        view[ID][1].setProperty(p.name, p.value, p.priority)
                        specificities[ID][p.name] = selector.specificity
                        # log(2, view[element].getProperty('color'))
                    else:
                        # log(2, view[element].getProperty('color'))
                        sameprio = (p.priority ==
                                    view[ID][1].getPropertyPriority(p.name))
                        if not sameprio and bool(p.priority) or (
                                sameprio and selector.specificity >=
                                specificities[ID][p.name]):
                            # later, more specific or higher prio
                            view[ID][1].setProperty(p.name, p.value, p.priority)

    return view
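A usage sketch for getSoupView, assuming BeautifulSoup 4 and cssutils are installed; the HTML and CSS below are made up for illustration and the call reuses the function defined above.

from bs4 import BeautifulSoup

html = '<html><body><p class="note">Hello</p><p>World</p></body></html>'
css = 'p { color: black } p.note { color: red !important }'

soup = BeautifulSoup(html, 'html.parser')
view = getSoupView(soup, css)          # reuses the function defined above
for element, style in view.values():
    # Each entry pairs a bs4 element with its accumulated style declaration.
    print(element.get('class'), '->', style.getPropertyValue('color'))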