def run(self):
    """Rewrite the URLs of the input CSS file so they point at the CDN.

    Returns ``self.input_file`` untouched when the stylesheet contains no
    URLs; otherwise writes the rewritten stylesheet to ``self.output_file``
    and returns that path.

    Raises:
        DocumentRootAndBasePathRequiredException: when ``document_root`` or
            ``base_path`` is not set.
        RequestToRequeueException: when a referenced file exists on disk
            but has no row in the synced files database yet.
    """
    # Step 0: ensure that the document_root and base_path variables are
    # set. If the file that's being processed was inside a source that has
    # either one or both not set, then this processor can't run.
    if self.document_root is None or self.base_path is None:
        raise DocumentRootAndBasePathRequiredException

    # We don't rename the file, so we can use the default output file.

    parser = CSSParser(log=None, loglevel=logging.CRITICAL)
    sheet = parser.parseFile(self.input_file)

    # Step 1: ensure the file has URLs. If it doesn't, we can stop the
    # processing. Only the first URL (if any) is pulled from getUrls().
    if not any(True for _ in getUrls(sheet)):
        return self.input_file

    # Step 2: resolve the relative URLs to absolute paths.
    replaceUrls(sheet, self.resolveToAbsolutePath)

    # Step 3: verify that each of these files has been synced.
    synced_files_db = urljoin(sys.path[0] + os.sep, SYNCED_FILES_DB)
    # NOTE(review): the connection and cursor stay on self and are not
    # closed here -- presumably resolveToCDNURL (step 4) reads them; confirm
    # who is responsible for closing the connection.
    self.dbcon = sqlite3.connect(synced_files_db)
    self.dbcon.text_factory = unicode  # This is the default, but we set it explicitly, just to be sure.
    self.dbcur = self.dbcon.cursor()
    for urlstring in getUrls(sheet):
        # Skip absolute URLs.
        if urlstring.startswith(("http://", "https://")):
            continue

        # Skip broken references in the CSS file. This would otherwise
        # prevent this CSS file from ever passing through this processor.
        if not os.path.exists(urlstring):
            continue

        # Look up the CDN URL for the given absolute path; only its
        # presence matters at this point.
        self.dbcur.execute("SELECT url FROM synced_files WHERE input_file=?", (urlstring,))
        if self.dbcur.fetchone() is None:
            raise RequestToRequeueException(
                "The file '%s' has not yet been synced to the server '%s'"
                % (urlstring, self.process_for_server)
            )

    # Step 4: resolve the absolute paths to CDN URLs.
    replaceUrls(sheet, self.resolveToCDNURL)

    # Step 5: write the updated CSS to the output file. The context manager
    # closes the handle even when the write fails.
    with open(self.output_file, "w") as f:
        f.write(sheet.cssText)

    return self.output_file
def validate_css(string, generate_https_urls):
    """Run a stylesheet through cssutils and collect validation errors.

    Returns a ``(css, report)`` tuple:

    * ``("", <empty report>)`` when ``string`` is empty or only whitespace;
    * ``("", report)`` when the input is larger than 100 KiB;
    * ``(None, report)`` when parsing raises a ``DOMException``.

    When parsing succeeds, the code falls off the end of the function and
    returns ``None``. ``generate_https_urls`` is accepted but not read here.
    """
    css_parser = CSSParser(raiseExceptions=True)

    if not string or only_whitespace.match(string):
        return ("", ValidationReport())

    report = ValidationReport(string)

    # avoid a very expensive parse
    max_size_kb = 100
    if len(string) > max_size_kb * 1024:
        too_big = msgs["too_big"] % {"max_size": max_size_kb}
        report.append(ValidationError(too_big))
        return ("", report)

    if "\\" in string:
        report.append(ValidationError(_("if you need backslashes, you're doing it wrong")))

    try:
        css_parser.parseString(string)
    except DOMException as err:
        # yuck; xml.dom.DOMException can't give us line-information
        # directly, so we have to parse its error message string to
        # get it
        match = error_message_extract_re.match(err.message)
        line_no = match.group(1) if match else None
        if line_no:
            line_no = int(line_no)

        syntax_error = msgs["syntax_error"] % {"syntaxerror": err.message}
        report.append(ValidationError(syntax_error, err, line_no))
        return (None, report)
def beautify_text(raw, syntax):
    """Return a pretty-printed version of ``raw`` for the given ``syntax``.

    ``'css'`` is parsed with cssutils and re-serialized as ``text/css``;
    ``'xml'`` is parsed with lxml and passed through ``pretty_xml_tree``;
    any other value is parsed with ``parse`` and passed through
    ``pretty_html_tree``. The XML and HTML results are serialized with
    ``etree.tostring(..., encoding=unicode)``.
    """
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations

    if syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_cssutils_serialization
        from cssutils import CSSParser, log

        setup_cssutils_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False

        def ignore_imports(x):
            # We dont care about @import rules
            return (None, None)

        css_parser = CSSParser(loglevel=logging.WARNING,
                               fetcher=ignore_imports, log=_css_logger)
        sheet = css_parser.parseString(raw, href='<string>', validate=False)
        return serialize(sheet, 'text/css')

    if syntax == 'xml':
        root = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(root)
    else:
        root = parse(raw, line_numbers=False)
        pretty_html_tree(None, root)
    return etree.tostring(root, encoding=unicode)
def create_importer(page):
    """Create and store the Importer for ``page`` and queue its import run.

    Fetches ``page.url``; on a 200 response, every ``<link>`` stylesheet
    whose host differs from the current request's host is appended to
    ``importer.urls``, and every inline ``<style>`` element is parsed and
    appended to ``importer.style``. The importer is stored and
    ``queue_import(page)`` is called whatever the fetch status was.
    """
    importer = Importer(page=page, style='')
    resp = urlfetch.fetch(page.url, deadline=10)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content)
        parser = CSSParser()
        for tag in soup.findAll(re.compile(r'^(link|style)$')):
            if tag.name == 'link':
                if tag.get('href', None) and tag.get('rel', 'stylesheet').lower() == 'stylesheet':
                    url = urljoin(page.url, tag['href'])
                    if urlparse(url).netloc != urlparse(request.url).netloc:
                        importer.urls.append(url)
            elif tag.name == 'style':
                media = tag.get('media', None)
                # An inline sheet lives in the page itself, so the page URL
                # is its base href. The previous code passed ``url``, which
                # is only bound by the <link> branch: it was either unbound
                # (NameError) or left over from an unrelated stylesheet.
                sheet = parser.parseString(''.join(tag.contents).strip('\n'), href=page.url)
                style = sheet.cssText
                if media:
                    style = '@media %s {\n%s\n}' % (media, style)
                style = '/* Imported directly from %s */\n%s\n' % (page.url, style)
                importer.style += style
        # Patch around AppEngine's frame inspection
        del parser

    importer.put()
    queue_import(page)
def do_import(): page = Page.get(request.form.get('page_key', '')) if not page or page.import_state != IMPORTING: return 'NO_IMPORTER' # We're done importer = Importer.gql('WHERE page=:1', page.key()).get() if not importer: # This requires a request to fetch the page and parse the URLs. # It also enqueues the next run. create_importer(page) return 'CREATED' if importer.urls: url = importer.urls.pop(0) parser = None try: resp = urlfetch.fetch(url, deadline=10) if resp.status_code == 200: parser = CSSParser() sheet = parser.parseString(resp.content, href=url) style = sheet.cssText importer.style += '\n\n/* Imported from %s */\n%s' % (url, style) else: raise Exception('Error fetching %s' % url) except Exception, e: import traceback importer.errors.append('Error importing %s' % url) logging.error('Error importing for Page %s from %s:\n%s\n%s', page.key().id(), url, e, traceback.format_exc()) finally:
def validate_css(string):
    """Run a stylesheet through cssutils and collect validation errors.

    Returns a ``(css, report)`` tuple:

    * ``('', <empty report>)`` when ``string`` is empty or only whitespace;
    * ``(string, report)`` when the input is larger than 100 KiB;
    * ``(None, report)`` when parsing raises a ``DOMException``.

    When parsing succeeds, the code falls off the end of the function and
    returns ``None``.
    """
    css_parser = CSSParser(raiseExceptions=True)

    if not string or only_whitespace.match(string):
        return ('', ValidationReport())

    report = ValidationReport(string)

    # avoid a very expensive parse
    max_size_kb = 100
    if len(string) > max_size_kb * 1024:
        too_big = msgs['too_big'] % {'max_size': max_size_kb}
        report.append(ValidationError(too_big))
        return (string, report)

    try:
        css_parser.parseString(string)
    except DOMException as err:
        # yuck; xml.dom.DOMException can't give us line-information
        # directly, so we have to parse its error message string to
        # get it
        match = error_message_extract_re.match(err.message)
        line_no = match.group(1) if match else None
        if line_no:
            line_no = int(line_no)

        syntax_error = msgs['syntax_error'] % {'syntaxerror': err.message}
        report.append(ValidationError(syntax_error, err, line_no))
        return (None, report)
def test_finish(self):
    """
    L{StylesheetRewritingRequestWrapper.finish} causes all written bytes to
    be translated with C{_replace} and written to the wrapped request.
    """
    template = """ .foo { background-image: url(%s) } """
    original = template % ("/Foo/bar",)
    expected = template % ("/bar/Foo/bar",)

    request = FakeRequest()
    rootsByRequest = {request: URL.fromString('/bar/')}
    wrapper = website.StylesheetRewritingRequestWrapper(
        request, [], rootsByRequest.get)
    wrapper.write(original)
    wrapper.finish()

    # Round-trip both stylesheets through the same parser so that
    # whitespace differences cannot affect the comparison.
    normalizer = CSSParser()
    actualText = normalizer.parseString(request.accumulator).cssText
    expectedText = normalizer.parseString(expected).cssText
    self.assertEqual(actualText, expectedText)
def main():
    """Print one parsed stylesheet's ``cssText`` under several encodings."""
    css = u''' /* some umlauts äöü and EURO sign € */ a:before { content: "ä"; }'''
    sheet = CSSParser().parseString(css)

    print("""cssText in different encodings, depending on the console some chars may look broken but are actually not""")

    # The final ``None`` results in default UTF-8 encoding without
    # @charset rule.
    for encoding in ('ascii', 'iso-8859-1', 'iso-8859-15', 'utf-8', None):
        print('')
        sheet.encoding = encoding
        print(sheet.cssText)
def parse_css(self, data, fname):
    """Decode, preprocess and parse ``data`` as a stylesheet.

    ``fname`` is passed to cssutils as the sheet's ``href``. Returns
    whatever ``CSSParser.parseString`` returns; validation is turned off.
    """
    from cssutils import CSSParser, log
    log.setLevel(logging.WARN)
    log.raiseExceptions = False

    text = self.css_preprocessor(self.decode(data))

    def ignore_imports(x):
        # We dont care about @import rules
        return (None, None)

    css_parser = CSSParser(loglevel=logging.WARNING, fetcher=ignore_imports,
                           log=_css_logger)
    return css_parser.parseString(text, href=fname, validate=False)
def finish(self):
    """
    Join the buffered response body, parse it as CSS, pass every URL in it
    through C{self._replace}, write the serialized result to the wrapped
    request and return the result of finishing that request.
    """
    body = ''.join(self._buffer)
    sheet = CSSParser().parseString(body)
    sheet.replaceUrls(self._replace)
    self.request.write(sheet.cssText)
    return self.request.finish()
def normalize_filter_css(props):
    """Return ``props`` as a set, expanded through shorthand normalizers.

    Every name in ``props`` is kept. For a name that has an entry in both
    ``normalizers`` and ``SHORTHAND_DEFAULTS``, the default declaration is
    parsed and the items yielded by the normalizer's result are added too.
    """
    import logging
    expanded = set()
    style_parser = CSSParser(loglevel=logging.CRITICAL, validate=False)
    for prop in props:
        normalizer = normalizers.get(prop, None)
        expanded.add(prop)
        if normalizer is None or prop not in SHORTHAND_DEFAULTS:
            continue
        declaration = style_parser.parseStyle('%s: %s' % (prop, SHORTHAND_DEFAULTS[prop]))
        cssvalue = declaration.getPropertyCSSValue(declaration.item(0))
        expanded.update(normalizer(prop, cssvalue))
    return expanded
def _apply_to_style_uri(style_text, func):
    """Pass every URI value in a style declaration through ``func``.

    ``func`` is called as ``func(old_uri, element=value)``. If at least one
    URI changed, the re-serialized declaration is returned (converted with
    ``to_unicode(..., 'utf-8')``); otherwise ``style_text`` is returned
    unchanged.
    """
    declaration = CSSParser().parseStyle(style_text)
    changed = False
    for prop in declaration.getProperties(all=True):
        for value in prop.propertyValue:
            if value.type != 'URI':
                continue
            before = value.uri
            after = func(before, element=value)
            if after != before:
                changed = True
                value.uri = after

    if not changed:
        return style_text
    return to_unicode(declaration.cssText, 'utf-8')
def __init__(self, container, do_embed=False):
    """Set up the collector and run the event loop until it exits.

    The constructor blocks in ``self.loop.exec_()``. ``render_book`` is
    scheduled with a zero-delay single-shot timer, and ``collect`` is
    connected to the view's ``loadFinished`` signal.

    Raises:
        Exception: when the event loop exits with code 1.
    """
    self.container = container
    self.do_embed = do_embed
    self.log = self.logger = container.log
    must_use_qt()

    self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css'))
    self.first_letter_pat = regex.compile(r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)

    # Result containers, filled in while the book is rendered.
    self.font_stats = {}
    self.font_usage_map = {}
    self.font_spec_map = {}
    self.font_rule_map = {}
    self.all_font_rules = {}
    self.render_queue = list(container.spine_items)

    # Qt objects used for rendering.
    self.loop = QEventLoop()
    self.view = QWebView()
    self.page = Page(self.log)
    self.view.setPage(self.page)
    self.page.setViewportSize(QSize(1200, 1600))
    self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection)

    QTimer.singleShot(0, self.render_book)

    exit_code = self.loop.exec_()
    if exit_code == 1:
        raise Exception('Failed to gather statistics from book, see log for details')
def parse_css(self, data, fname='<string>', is_declaration=False):
    """Parse ``data`` as a stylesheet or as a single style declaration.

    Bytes input is decoded with ``self.decode``. The CSS preprocessor is
    applied unless ``self.tweak_mode`` is true. With ``is_declaration`` the
    text is parsed with ``parseStyle``; otherwise with ``parseString``,
    using ``fname`` as the sheet's ``href``. Validation is off in both
    cases.
    """
    from cssutils import CSSParser, log
    log.setLevel(logging.WARN)
    log.raiseExceptions = False

    if isinstance(data, bytes):
        data = self.decode(data)
    if not self.tweak_mode:
        data = self.css_preprocessor(data)

    def ignore_imports(x):
        # We dont care about @import rules
        return (None, None)

    css_parser = CSSParser(loglevel=logging.WARNING, fetcher=ignore_imports,
                           log=_css_logger)
    if is_declaration:
        return css_parser.parseStyle(data, validate=False)
    return css_parser.parseString(data, href=fname, validate=False)
class Parser:
    """Count the values of ``color`` declarations in CSS files."""

    def __init__(self):
        self.css_parser = CSSParser()

    def get_colors_from_file(self, f):
        """Return ``{color value: occurrences}`` for the style rules of ``f``.

        ``f`` is handed to ``parseFile`` with the ``'utf-8'`` encoding; only
        rules whose type equals ``STYLE_RULE`` are inspected.
        """
        counts = {}
        for rule in self.css_parser.parseFile(f, 'utf-8'):
            if rule.type != rule.STYLE_RULE:
                continue
            for declaration in rule.style:
                if declaration.name != 'color':
                    continue
                color = declaration.value
                counts[color] = counts.get(color, 0) + 1
        return counts

    def read_all_css_files_in_dir(self):
        """Return one colour-count dict per ``*.css`` file in the current directory."""
        return [self.get_colors_from_file(name) for name in glob.glob('*.css')]
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
    """Parse ``data`` as a stylesheet or as a single style declaration.

    Args:
        data: CSS source. Bytes are decoded with ``decode`` when given,
            otherwise as UTF-8.
        fname: ``href`` passed to ``parseString`` (unused for declarations).
        is_declaration: parse with ``parseStyle`` instead of ``parseString``.
        decode: optional callable used to decode bytes input.
        log_level: cssutils log level; defaults to ``logging.WARNING``.
        css_preprocessor: optional callable applied to the decoded text.

    Returns:
        The object returned by ``parseStyle`` or ``parseString``; validation
        is off in both cases.
    """
    if log_level is None:
        import logging
        log_level = logging.WARNING
    from cssutils import CSSParser, log
    from calibre.ebooks.oeb.base import _css_logger
    log.setLevel(log_level)
    log.raiseExceptions = False

    if isinstance(data, bytes):
        if decode is None:
            data = data.decode('utf-8')
        else:
            data = decode(data)
    if css_preprocessor is not None:
        data = css_preprocessor(data)

    def ignore_imports(x):
        # We dont care about @import rules
        return (None, None)

    css_parser = CSSParser(loglevel=log_level, fetcher=ignore_imports,
                           log=_css_logger)
    if is_declaration:
        return css_parser.parseStyle(data, validate=False)
    return css_parser.parseString(data, href=fname, validate=False)
def extract_css(self, root, log):
    """Move inline ``<style type="text/css">`` content into ``odfpy.css``.

    Every matching ``<style>`` element is removed from ``root``. When a
    ``<head>`` element exists, a ``<link>`` to ``odfpy.css`` is added to it.
    The collected CSS is parsed into ``self.css`` (validation off) and
    written, UTF-8 encoded, to ``odfpy.css`` in the current directory.

    ``log`` is accepted but not used.
    """
    ans = []
    for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
        # An empty <style/> element has ``text`` set to None, which would
        # make the join below raise TypeError.
        ans.append(s.text or u'')
        s.getparent().remove(s)

    head = root.xpath('//*[local-name() = "head"]')
    if head:
        head = head[0]
        ns = head.nsmap.get(None, '')
        if ns:
            ns = '{%s}'%ns
        etree.SubElement(head, ns+'link', {'type':'text/css',
            'rel':'stylesheet', 'href':'odfpy.css'})

    css = u'\n\n'.join(ans)
    parser = CSSParser(loglevel=logging.WARNING,
                        log=_css_logger)
    self.css = parser.parseString(css, validate=False)

    with open('odfpy.css', 'wb') as f:
        f.write(css.encode('utf-8'))
class StyledTagWrapper:
    """Pair an element with the parsed form of its ``style`` attribute."""

    def __init__(self, el):
        self.el = el
        self.style = CSSParser().parseStyle(el.get('style'))

    def update(self):
        """Write the current declaration text back to the ``style`` attribute."""
        text = self.style.cssText
        if isinstance(text, str):
            text = to_unicode(text, 'utf-8')
        self.el.set('style', text)

    def uri_properties(self):
        """Yield every value of type ``'URI'`` found in the declaration."""
        for prop in self.style.getProperties(all=True):
            for value in prop.propertyValue:
                if value.type != 'URI':
                    continue
                yield value
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css='', base_css=''):
    """Collect the stylesheets that apply to ``tree`` and apply their rules.

    Steps, in order:

    1. Choose the profile: the ``profile`` argument, else the output
       profile whose ``short_name`` is ``'default'``, else
       ``opts.output_profile``.
    2. Build the stylesheet list: the sheet from ``html_css_stylesheet()``,
       ``base_css`` (if given), every accepted ``<style>`` / ``<link>``
       element in ``tree`` (including ``@import`` targets found in the
       manifest), then ``extra_css`` and ``user_css``.
    3. Flatten all rules with ``self.flatten_rule``, sort them, and merge
       each rule's declarations into the styles of the elements its
       selector matches.
    4. Apply inline ``style`` attributes, then fold ``<img>`` ``width`` /
       ``height`` attributes into the style when the style sets neither.

    ``path`` is the manifest href of the document; it is used to look up
    the manifest item and to derive the ``href`` given to parsed sheets.
    """
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from calibre.customize.ui import output_profiles
        for x in output_profiles():
            if x.short_name == 'default':
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    if base_css:
        stylesheets.append(parseString(base_css, validate=False))
    style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

    # Add cssutils parsing profiles from output_profile
    for profile in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(profile['name'], profile['props'], profile['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css'))
    self.font_face_rules = []
    for elem in style_tags:
        if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))):
            # Gather the element's own text plus the text/tail of its children.
            text = elem.text if elem.text else u''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
            if text:
                text = oeb.css_preprocessor(text)
                # We handle @import rules separately
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                parser.setFetcher(self._fetch_css_file)
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        if not media_ok(rule.media.mediaText):
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in OEB_STYLES:
                            self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                            continue
                        # The imported sheet goes in before the importing one.
                        stylesheets.append(sitem.data)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref, ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif (elem.tag == XHTML('link') and elem.get('href') and elem.get(
                'rel', 'stylesheet').lower() == 'stylesheet' and elem.get(
                'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media'))
            ):
            href = urlnormalize(elem.attrib['href'])
            # NOTE: this rebinds the ``path`` parameter.
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warn(
                    'Stylesheet %r referenced by file %r not in manifest' % (path, item.href))
                continue
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path, item.href))
                continue
            stylesheets.append(sitem.data)
    csses = {'extra_css':extra_css, 'user_css':user_css}
    for w, x in csses.items():
        if x:
            try:
                text = x
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                stylesheets.append(stylesheet)
            except:
                self.logger.exception('Failed to parse %s, ignoring.'%w)
                self.logger.debug('Bad css: ')
                self.logger.debug(x)
    rules = []
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for sheet_index, stylesheet in enumerate(stylesheets):
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            if rule.type == rule.MEDIA_RULE:
                # Only rules inside an accepted @media block are used.
                if media_ok(rule.media.mediaText):
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
                        index += 1
            else:
                rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
                index = index + 1
    rules.sort()
    self.rules = rules
    self._styles = {}
    pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
    select = Select(tree, ignore_inappropriate_pseudo_classes=True)

    for _, _, cssdict, text, _ in rules:
        fl = pseudo_pat.search(text)
        try:
            matches = tuple(select(text))
        except SelectorError as err:
            self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
            continue

        if fl is not None:
            fl = fl.group(1)
            if fl == 'first-letter' and getattr(self.oeb, 'plumber_output_format', '').lower() in {u'mobi', u'docx'}:
                # Fake first-letter
                for elem in matches:
                    for x in elem.iter('*'):
                        if x.text:
                            punctuation_chars = []
                            # NOTE: ``text`` (the selector) is rebound to the
                            # element text here; the loop header rebinds it
                            # on the next rule.
                            text = unicode(x.text)
                            while text:
                                category = unicodedata.category(text[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]

                            special_text = u''.join(punctuation_chars) + \
                                    (text[0] if text else u'')
                            span = x.makeelement('{%s}span' % XHTML_NS)
                            span.text = special_text
                            span.set('data-fake-first-letter', '1')
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            break
            else:  # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)
    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)
    num_pat = re.compile(r'[0-9.]+$')
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
                style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                try:
                    del elem.attrib[prop]
                except:
                    pass
                if val:
                    # Values that start with digits/dots only get a px unit.
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css="", user_css=""):
    """Collect the stylesheets that apply to ``tree`` and apply their rules.

    Steps, in order:

    1. Choose the profile: the ``profile`` argument, else the output
       profile whose ``short_name`` is ``"default"``, else
       ``opts.output_profile``.
    2. Build the stylesheet list: the sheet from ``html_css_stylesheet()``,
       every accepted ``<style>`` / ``<link>`` element in ``tree``
       (including ``@import`` targets found in the manifest), then
       ``extra_css`` and ``user_css``.
    3. Flatten all rules with ``self.flatten_rule``, sort them, and merge
       each rule's declarations into the styles of the elements its
       selector matches.
    4. Apply inline ``style`` attributes, then fold ``<img>`` ``width`` /
       ``height`` attributes into the style when the style sets neither.

    ``path`` is the manifest href of the document; it is used to look up
    the manifest item and to derive the ``href`` given to parsed sheets.
    """
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from calibre.customize.ui import output_profiles

        for x in output_profiles():
            if x.short_name == "default":
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + ".css"
    stylesheets = [html_css_stylesheet()]
    style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

    # Add cssutils parsing profiles from output_profile
    for profile in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(profile["name"], profile["props"], profile["macros"])

    parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger("calibre.css"))
    self.font_face_rules = []
    for elem in style_tags:
        if elem.tag == XHTML("style") and elem.get("type", CSS_MIME) in OEB_STYLES:
            # Gather the element's own text plus the text/tail of its children.
            text = elem.text if elem.text else u""
            for x in elem:
                t = getattr(x, "text", None)
                if t:
                    text += u"\n\n" + force_unicode(t, u"utf-8")
                t = getattr(x, "tail", None)
                if t:
                    text += u"\n\n" + force_unicode(t, u"utf-8")
            if text:
                text = oeb.css_preprocessor(text, add_namespace=True)
                # We handle @import rules separately
                parser.setFetcher(lambda x: ("utf-8", b""))
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                parser.setFetcher(self._fetch_css_file)
                stylesheet.namespaces["h"] = XHTML_NS
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        if rule.media.mediaText == "amzn-mobi":
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warn("Ignoring missing stylesheet in @import rule:", rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in OEB_STYLES:
                            self.logger.warn("CSS @import of non-CSS file %r" % rule.href)
                            continue
                        # The imported sheet goes in before the importing one.
                        stylesheets.append(sitem.data)
                # Iterate over a tuple snapshot because the rule list is
                # mutated while @page rules are removed.
                for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                    stylesheet.cssRules.remove(rule)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref, ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif (
            elem.tag == XHTML("link")
            and elem.get("href")
            and elem.get("rel", "stylesheet").lower() == "stylesheet"
            and elem.get("type", CSS_MIME).lower() in OEB_STYLES
        ):
            href = urlnormalize(elem.attrib["href"])
            # NOTE: this rebinds the ``path`` parameter.
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warn("Stylesheet %r referenced by file %r not in manifest" % (path, item.href))
                continue
            if not hasattr(sitem.data, "cssRules"):
                self.logger.warn("Stylesheet %r referenced by file %r is not CSS" % (path, item.href))
                continue
            stylesheets.append(sitem.data)
    csses = {"extra_css": extra_css, "user_css": user_css}
    for w, x in csses.items():
        if x:
            try:
                text = XHTML_CSS_NAMESPACE + x
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                stylesheet.namespaces["h"] = XHTML_NS
                stylesheets.append(stylesheet)
            except:
                self.logger.exception("Failed to parse %s, ignoring." % w)
                self.logger.debug("Bad css: ")
                self.logger.debug(x)
    rules = []
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for sheet_index, stylesheet in enumerate(stylesheets):
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            if rule.type == rule.MEDIA_RULE:
                # Skip @media blocks that name none of the accepted media.
                media = {rule.media.item(i) for i in xrange(rule.media.length)}
                if not media.intersection({"all", "screen", "amzn-kf8"}):
                    continue
                for subrule in rule.cssRules:
                    rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index == 0))
                    index += 1
            else:
                rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index == 0))
                index = index + 1
    rules.sort()
    self.rules = rules
    self._styles = {}
    pseudo_pat = re.compile(ur":(first-letter|first-line|link|hover|visited|active|focus|before|after)", re.I)
    for _, _, cssdict, text, _ in rules:
        fl = pseudo_pat.search(text)
        if fl is not None:
            # Strip the matched pseudo part before compiling the selector.
            text = text.replace(fl.group(), "")
        selector = get_css_selector(text, self.oeb.log)
        matches = selector(tree, self.logger)
        if fl is not None:
            fl = fl.group(1)
            if fl == "first-letter" and getattr(self.oeb, "plumber_output_format", "").lower() == u"mobi":
                # Fake first-letter
                from lxml.builder import ElementMaker

                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter():
                        if x.text:
                            punctuation_chars = []
                            # NOTE: ``text`` (the selector) is rebound to the
                            # element text here; the loop header rebinds it
                            # on the next rule.
                            text = unicode(x.text)
                            while text:
                                category = unicodedata.category(text[0])
                                if category[0] not in {"P", "Z"}:
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]

                            special_text = u"".join(punctuation_chars) + (text[0] if text else u"")
                            span = E.span(special_text)
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            break
            else:  # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)
    for elem in xpath(tree, "//h:*[@style]"):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)
    num_pat = re.compile(r"[0-9.]+$")
    for elem in xpath(tree, "//h:img[@width or @height]"):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get("width", "auto") != "auto" or style._style.get("height", "auto") != "auto"
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ("width", "height"):
                val = elem.get(prop, "").strip()
                try:
                    del elem.attrib[prop]
                except:
                    pass
                if val:
                    # Values that start with digits/dots only get a px unit.
                    if num_pat.match(val) is not None:
                        val += "px"
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''):
    """Collect the stylesheets that apply to ``tree`` and apply their rules.

    Steps, in order:

    1. Choose the profile: the ``profile`` argument, else the output
       profile whose ``short_name`` is ``'default'``, else
       ``opts.output_profile``.
    2. Build the stylesheet list: the sheet from ``html_css_stylesheet()``,
       every accepted ``<style>`` / ``<link>`` child of ``/html/head``,
       then ``extra_css`` and ``user_css``.
    3. Flatten all rules with ``self.flatten_rule``, sort them, and merge
       each rule's declarations into the styles of the elements its
       selector matches (``:first-letter`` is emulated with a ``<span>``).
    4. Apply inline ``style`` attributes, then fold ``<img>`` ``width`` /
       ``height`` attributes into the style when the style sets neither.

    ``path`` is the manifest href of the document; it is used to look up
    the manifest item and to derive the ``href`` given to parsed sheets.
    """
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from calibre.customize.ui import output_profiles
        for x in output_profiles():
            if x.short_name == 'default':
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    head = xpath(tree, '/h:html/h:head')
    if head:
        head = head[0]
    else:
        # No <head>: iterate over nothing below.
        head = []

    # Add cssutils parsing profiles from output_profile
    for profile in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(profile['name'], profile['props'], profile['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css'))
    self.font_face_rules = []
    for elem in head:
        if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES):
            # Gather the element's own text plus the text/tail of its children.
            text = elem.text if elem.text else u''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
            if text:
                text = XHTML_CSS_NAMESPACE + text
                text = oeb.css_preprocessor(text)
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                stylesheet.namespaces['h'] = XHTML_NS
                stylesheets.append(stylesheet)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref, ignoreImportRules=True)
        elif elem.tag == XHTML('link') and elem.get('href') \
                and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
            href = urlnormalize(elem.attrib['href'])
            # NOTE: this rebinds the ``path`` parameter.
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warn(
                    'Stylesheet %r referenced by file %r not in manifest' % (path, item.href))
                continue
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path, item.href))
                continue
            stylesheets.append(sitem.data)
    csses = {'extra_css':extra_css, 'user_css':user_css}
    for w, x in csses.items():
        if x:
            try:
                text = XHTML_CSS_NAMESPACE + x
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                stylesheet.namespaces['h'] = XHTML_NS
                stylesheets.append(stylesheet)
            except:
                self.logger.exception('Failed to parse %s, ignoring.'%w)
                self.logger.debug('Bad css: ')
                self.logger.debug(x)
    rules = []
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for stylesheet in stylesheets:
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            rules.extend(self.flatten_rule(rule, href, index))
            index = index + 1
    rules.sort()
    self.rules = rules
    self._styles = {}
    for _, _, cssdict, text, _ in rules:
        fl = ':first-letter' in text
        if fl:
            # Strip the pseudo-element before compiling the selector.
            text = text.replace(':first-letter', '')
        selector = get_css_selector(text)
        matches = selector(tree, self.logger)
        if fl:
            from lxml.builder import ElementMaker
            E = ElementMaker(namespace=XHTML_NS)
            for elem in matches:
                for x in elem.iter():
                    if x.text:
                        punctuation_chars = []
                        # NOTE: ``text`` (the selector) is rebound to the
                        # element text here; the loop header rebinds it on
                        # the next rule.
                        text = unicode(x.text)
                        while text:
                            if not unicodedata.category(text[0]).startswith('P'):
                                break
                            punctuation_chars.append(text[0])
                            text = text[1:]

                        special_text = u''.join(punctuation_chars) + \
                                (text[0] if text else u'')
                        span = E.span(special_text)
                        span.tail = text[1:]
                        x.text = None
                        x.insert(0, span)
                        self.style(span)._update_cssdict(cssdict)
                        break
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)
    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)
    num_pat = re.compile(r'\d+$')
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
                style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                try:
                    del elem.attrib[prop]
                except:
                    pass
                if val:
                    # Values made of digits only get a px unit.
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
class StatsCollector(object):
    """Render each spine item of a container and record its font usage.

    All work happens inside the constructor, which runs a Qt event loop
    until every queued item has been rendered or an error occurs. Results
    are left in ``font_stats``, ``font_usage_map``, ``font_spec_map``,
    ``font_rule_map`` and ``all_font_rules``.
    """

    def __init__(self, container, do_embed=False):
        """Set up the renderer and block until the event loop exits.

        Raises ``Exception`` when the loop exits with code 1.
        """
        self.container = container
        self.log = self.logger = container.log
        self.do_embed = do_embed
        must_use_qt()
        self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css'))
        self.first_letter_pat = regex.compile(r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)

        self.loop = QEventLoop()
        self.view = QWebView()
        self.page = Page(self.log)
        self.view.setPage(self.page)
        self.page.setViewportSize(QSize(1200, 1600))

        self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection)

        self.render_queue = list(container.spine_items)
        self.font_stats = {}
        self.font_usage_map = {}
        self.font_spec_map = {}
        self.font_rule_map = {}
        self.all_font_rules = {}

        # Start rendering as soon as the event loop below is running.
        QTimer.singleShot(0, self.render_book)

        if self.loop.exec_() == 1:
            raise Exception('Failed to gather statistics from book, see log for details')

    def log_exception(self, *args):
        """Log an exception with the log's filter level temporarily set to DEBUG."""
        orig = self.log.filter_level
        try:
            self.log.filter_level = self.log.DEBUG
            self.log.exception(*args)
        finally:
            self.log.filter_level = orig

    def render_book(self):
        """Render the next queued item, or exit the loop when the queue is empty.

        Any exception is logged and makes the loop exit with code 1.
        """
        try:
            if not self.render_queue:
                self.loop.exit()
            else:
                self.render_next()
        except:
            self.log_exception('Rendering failed')
            self.loop.exit(1)

    def render_next(self):
        """Pop the first queued item, remember it as current and load it into the view."""
        item = unicode(self.render_queue.pop(0))
        self.current_item = item
        load_html(item, self.view)

    def collect(self, ok):
        """Slot for ``loadFinished``: collect stats for the current item, then continue.

        A failed load (``ok`` false) or any exception while collecting makes
        the loop exit with code 1.
        """
        if not ok:
            self.log.error('Failed to render document: %s'%self.container.relpath(self.current_item))
            self.loop.exit(1)
            return
        try:
            self.page.load_js()
            self.collect_font_stats()
        except:
            self.log_exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item))
            self.loop.exit(1)
            return

        self.render_book()

    def href_to_name(self, href, warn_name):
        """Map a ``file://`` href to a container name.

        Returns ``None`` (after logging a warning that mentions
        ``warn_name``) when the href is not a ``file://`` URI or the
        resulting name is not in the container.
        """
        if not href.startswith('file://'):
            self.log.warn('Non-local URI in', warn_name, ':', href, 'ignoring')
            return None
        src = href[len('file://'):]
        # On Windows, drop the leading slash of paths shaped like "/C:...".
        if iswindows and len(src) > 2 and (src[0], src[2]) == ('/', ':'):
            src = src[1:]
        src = src.replace('/', os.sep)
        src = unquote(src)
        name = self.container.abspath_to_name(src)
        if not self.container.has_name(name):
            self.log.warn('Missing resource', href, 'in', warn_name, 'ignoring')
            return None
        return name

    def collect_font_stats(self):
        """Record @font-face rules and font usage for the current item.

        Reads the rules and usage data from the page through
        ``evaljs`` / ``bridge_value`` and raises ``Exception`` when a value
        does not have the expected type. Updates ``font_rule_map``,
        ``all_font_rules``, ``font_stats`` and ``font_usage_map``, plus
        ``font_spec_map`` when ``do_embed`` is set.
        """
        self.page.evaljs('window.font_stats.get_font_face_rules()')
        font_face_rules = self.page.bridge_value
        if not isinstance(font_face_rules, list):
            raise Exception('Unknown error occurred while reading font-face rules')

        # Weed out invalid font-face rules
        rules = []
        import tinycss
        parser = tinycss.make_full_parser()
        for rule in font_face_rules:
            ff = rule.get('font-family', None)
            if not ff:
                continue
            style = self.parser.parseStyle('font-family:%s'%ff, validate=False)
            ff = [x.value for x in style.getProperty('font-family').propertyValue]
            if not ff or ff[0] == 'inherit':
                continue
            rule['font-family'] = frozenset(icu_lower(f) for f in ff)
            src = rule.get('src', None)
            if not src:
                continue
            try:
                tokens = parser.parse_stylesheet('@font-face { src: %s }' % src).rules[0].declarations[0].value
            except Exception:
                self.log.warn('Failed to parse @font-family src: %s' % src)
                continue
            # Use the first URI token that resolves to a name in the
            # container; the for/else branch runs when none does.
            for token in tokens:
                if token.type == 'URI':
                    uv = token.value
                    if uv:
                        sn = self.href_to_name(uv, '@font-face rule')
                        if sn is not None:
                            rule['src'] = sn
                            break
            else:
                self.log.warn('The @font-face rule refers to a font file that does not exist in the book: %s' % src)
                continue
            normalize_font_properties(rule)
            rule['width'] = widths[rule['font-stretch']]
            rule['weight'] = int(rule['font-weight'])
            rules.append(rule)

        if not rules and not self.do_embed:
            return

        self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules
        for rule in rules:
            self.all_font_rules[rule['src']] = rule

        for rule in rules:
            if rule['src'] not in self.font_stats:
                self.font_stats[rule['src']] = set()

        self.page.evaljs('window.font_stats.get_font_usage()')
        font_usage = self.page.bridge_value
        if not isinstance(font_usage, list):
            raise Exception('Unknown error occurred while reading font usage')
        self.page.evaljs('window.font_stats.get_pseudo_element_font_usage()')
        pseudo_element_font_usage = self.page.bridge_value
        if not isinstance(pseudo_element_font_usage, list):
            raise Exception('Unknown error occurred while reading pseudo element font usage')
        font_usage += get_pseudo_element_font_usage(pseudo_element_font_usage, self.first_letter_pat, self.parser)
        exclude = {'\n', '\r', '\t'}
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict)
        bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
        for font in font_usage:
            # Set of distinct characters used with this font.
            text = set()
            for t in font['text']:
                text |= frozenset(t)
            text.difference_update(exclude)
            if not text:
                continue
            normalize_font_properties(font)
            for rule in get_matching_rules(rules, font):
                self.font_stats[rule['src']] |= text
            if self.do_embed:
                ff = [icu_lower(x) for x in font.get('font-family', [])]
                if ff and ff[0] not in bad_fonts:
                    keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}
                    key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in font.iteritems() if k in keys))
                    val = fu[key]
                    if not val:
                        val.update({k:(font[k][0] if k == 'font-family' else font[k]) for k in keys})
                        val['text'] = set()
                    val['text'] |= text
        # Store a plain dict in place of the defaultdict used while filling.
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu)

        if self.do_embed:
            self.page.evaljs('window.font_stats.get_font_families()')
            font_families = self.page.bridge_value
            if not isinstance(font_families, dict):
                raise Exception('Unknown error occurred while reading font families')
            self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set()
            for font_dict, text, pseudo in pseudo_element_font_usage:
                font_families[font_dict['font-family']] = True
            for raw in font_families.iterkeys():
                for x in parse_font_families(self.parser, raw):
                    if x.lower() not in bad_fonts:
                        fs.add(x)
def __init__(self, tree, path, oeb, profile, extra_css='', user_css='', change_justification='left'):
    """Collect the stylesheets that apply to ``tree`` and apply their rules.

    Stylesheets are gathered in this order: the sheet returned by
    ``html_css_stylesheet()``, every ``<style>``/``<link>`` child of
    ``<head>`` in document order, then ``extra_css`` and ``user_css``.
    Their rules are flattened with ``self.flatten_rule``, sorted, and the
    declarations of each rule are applied to the elements its selector
    matches.  Inline ``style`` attributes are applied afterwards, and
    ``width``/``height`` attributes of unstyled ``<img>`` elements are
    converted into CSS.

    :param tree: parsed XHTML document; it is modified in place
        (``:first-letter`` spans are inserted, img attributes removed).
    :param path: key into ``oeb.manifest.hrefs``; may be ``bytes`` or ``str``.
    :param oeb: book object providing ``manifest`` and ``css_preprocessor``.
    :param profile: output profile; must not be ``None``.
    :param extra_css: additional CSS text, ignored when empty.
    :param user_css: user supplied CSS text, ignored when empty.
    :param change_justification: stored on the instance unchanged.
    """
    assert profile is not None
    # XXX str/bytes hackfix
    if isinstance(path, bytes):
        decoded_path = path.decode('utf-8')
    else:
        decoded_path = path
    self.oeb = oeb
    self.profile = profile
    self.change_justification = change_justification
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(decoded_path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    head = xpath(tree, '/h:html/h:head')
    if head:
        head = head[0]
    else:
        head = []
    parser = CSSParser(fetcher=self._fetch_css_file,
                       log=logging.getLogger('calibre.css'))
    self.font_face_rules = []
    for elem in head:
        if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
            # Gather the element's own text plus the text/tail of any
            # child nodes.
            text = elem.text if elem.text else ''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += '\n\n' + force_unicode(t, 'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += '\n\n' + force_unicode(t, 'utf-8')
            if text:
                # Prefix the *accumulated* text.  Using elem.text here
                # would drop everything collected from the children and
                # fail with a TypeError when elem.text is None.
                text = XHTML_CSS_NAMESPACE + text
                text = oeb.css_preprocessor(text)
                stylesheet = parser.parseString(text, href=cssname)
                stylesheet.namespaces['h'] = XHTML_NS
                stylesheets.append(stylesheet)
        elif elem.tag == XHTML('link') and elem.get('href') \
                and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
            href = urlnormalize(elem.attrib['href'])
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                logging.warning(
                    'Stylesheet %r referenced by file %r not in manifest' %
                    (path, item.href))
                continue
            if not hasattr(sitem.data, 'cssRules'):
                logging.warning(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                continue
            stylesheets.append(sitem.data)

    # extra_css is added before user_css; both are best-effort.
    for w, x in (('extra_css', extra_css), ('user_css', user_css)):
        if x:
            try:
                text = XHTML_CSS_NAMESPACE + x
                stylesheet = parser.parseString(text, href=cssname)
                stylesheet.namespaces['h'] = XHTML_NS
                stylesheets.append(stylesheet)
            except Exception:
                logging.exception('Failed to parse %s, ignoring.'%w)
                logging.debug('Bad css: ')
                logging.debug(x)

    rules = []
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for stylesheet in stylesheets:
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            rules.extend(self.flatten_rule(rule, href, index))
            index = index + 1
    # XXX had to fix crash about unsortable type, so that's why we only sort by first item of tuple
    rules.sort(key=lambda tup: tup[:1])
    self.rules = rules
    self._styles = {}
    class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE)
    capital_sel_pat = re.compile(r'h|[A-Z]+')
    for _, _, cssdict, text, _ in rules:
        fl = ':first-letter' in text
        if fl:
            text = text.replace(':first-letter', '')
        try:
            selector = CSSSelector(text)
        except (AssertionError, ExpressionError, etree.XPathSyntaxError,
                NameError,  # thrown on OS X instead of SelectorSyntaxError
                SelectorSyntaxError):
            continue
        try:
            matches = selector(tree)
        except etree.XPathEvalError:
            continue

        if not matches:
            # Retry with the upper-case runs of the selector lower-cased.
            ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text)
            if ntext != text:
                logging.warning('Transformed CSS selector %s to %s', text, ntext)
                selector = CSSSelector(ntext)
                matches = selector(tree)

        if not matches and class_sel_pat.match(text) and text.lower() != text:
            # Fall back to a case-insensitive comparison of class names.
            found = False
            ltext = text.lower()
            for x in tree.xpath('//*[@class]'):
                if ltext.endswith('.'+x.get('class').lower()):
                    matches.append(x)
                    found = True
            if found:
                logging.warning('Ignoring case mismatches for CSS selector: %s in %s'
                    %(text, item.href))

        if fl:
            # Emulate :first-letter by wrapping any leading punctuation
            # plus the first following character in a <span> and styling
            # that span.
            from lxml.builder import ElementMaker
            E = ElementMaker(namespace=XHTML_NS)
            for elem in matches:
                for x in elem.iter():
                    if x.text:
                        punctuation_chars = []
                        remaining = str(x.text)
                        while remaining:
                            if not unicodedata.category(remaining[0]).startswith('P'):
                                break
                            punctuation_chars.append(remaining[0])
                            remaining = remaining[1:]

                        special_text = ''.join(punctuation_chars) + \
                                (remaining[0] if remaining else '')
                        span = E.span(special_text)
                        span.tail = remaining[1:]
                        x.text = None
                        x.insert(0, span)
                        self.style(span)._update_cssdict(cssdict)
                        # Only the first text-bearing node is wrapped.
                        break
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)

    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr()

    num_pat = re.compile(r'\d+$')
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
                style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                try:
                    del elem.attrib[prop]
                except KeyError:
                    # The attribute was simply not present.
                    pass
                if val:
                    # Bare numbers are given a px unit.
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
class StatsCollector(object):
    """Render every spine item of a container and collect font statistics.

    The constructor drives a ``QEventLoop``: each item in
    ``container.spine_items`` is loaded into a ``QWebView`` and, once the
    load has finished, ``collect_font_stats`` reads the data exposed by
    the page's JavaScript bridge.  Results are stored in:

    * ``font_stats``: font resource name -> set of characters using it
    * ``font_rule_map``: item name -> list of valid @font-face rules
    * ``all_font_rules``: font resource name -> its @font-face rule
    * ``font_usage_map``: item name -> font usage (filled when embedding)
    * ``font_spec_map``: item name -> font family names (when embedding)
    """

    def __init__(self, container, do_embed=False):
        """Collect statistics for ``container``; blocks until rendering ends.

        :param container: book container providing ``log``, ``spine_items``,
            ``abspath_to_name``, ``has_name`` and ``relpath``.
        :param do_embed: also gather the data stored in ``font_usage_map``
            and ``font_spec_map``.
        :raises Exception: if the event loop exits with status 1.
        """
        self.container = container
        self.log = self.logger = container.log
        self.do_embed = do_embed
        must_use_qt()
        self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger("calibre.css"))

        self.loop = QEventLoop()
        self.view = QWebView()
        self.page = Page(self.log)
        self.view.setPage(self.page)
        self.page.setViewportSize(QSize(1200, 1600))

        self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection)

        self.render_queue = list(container.spine_items)
        self.font_stats = {}
        self.font_usage_map = {}
        self.font_spec_map = {}
        self.font_rule_map = {}
        self.all_font_rules = {}

        # Start rendering as soon as the event loop is running.
        QTimer.singleShot(0, self.render_book)

        if self.loop.exec_() == 1:
            raise Exception("Failed to gather statistics from book, see log for details")

    def render_book(self):
        """Render the next queued item, or exit the loop when none remain."""
        try:
            if not self.render_queue:
                self.loop.exit()
            else:
                self.render_next()
        except:
            # Deliberately catch everything: the event loop must be left
            # with status 1 on any failure so __init__ can report it.
            self.logger.exception("Rendering failed")
            self.loop.exit(1)

    def render_next(self):
        """Pop the first queued item and load it into the view."""
        item = unicode(self.render_queue.pop(0))
        self.current_item = item
        load_html(item, self.view)

    def collect(self, ok):
        """``loadFinished`` handler: gather stats, then continue rendering.

        :param ok: load status reported by the view.
        """
        if not ok:
            self.log.error("Failed to render document: %s" % self.container.relpath(self.current_item))
            self.loop.exit(1)
            return
        try:
            self.page.load_js()
            self.collect_font_stats()
        except:
            # See render_book: any failure must end the loop with status 1.
            self.log.exception("Failed to collect font stats from: %s" % self.container.relpath(self.current_item))
            self.loop.exit(1)
            return

        self.render_book()

    def href_to_name(self, href, warn_name):
        """Map a ``file://`` URL to a container name.

        :param href: URL to resolve.
        :param warn_name: description of the referrer, used in warnings.
        :returns: the container name, or ``None`` (after logging a warning)
            when ``href`` is not a ``file://`` URL or the container has no
            such name.
        """
        if not href.startswith("file://"):
            self.log.warn("Non-local URI in", warn_name, ":", href, "ignoring")
            return None
        src = href[len("file://") :]
        # Strip the slash in front of a drive letter ("/C:/...").
        if iswindows and len(src) > 2 and (src[0], src[2]) == ("/", ":"):
            src = src[1:]
        src = src.replace("/", os.sep)
        src = unquote(src)
        name = self.container.abspath_to_name(src)
        if not self.container.has_name(name):
            self.log.warn("Missing resource", href, "in", warn_name, "ignoring")
            return None
        return name

    @staticmethod
    def _quote_css_url(src):
        """Return ``src`` with an unquoted ``url(...)`` wrapped in single quotes.

        cssutils fails to parse an unquoted URL that contains ``'`` or
        ``"``, so the URL is quoted and embedded single quotes are
        backslash-escaped.  Values that are not a single unquoted
        ``url(...)`` are returned unchanged.
        """
        if src.startswith("url(") and src.endswith(")") and src[4] not in {'"', "'"}:
            return "url('" + src[4:-1].replace("'", "\\'") + "')"
        return src

    def collect_font_stats(self):
        """Record @font-face rules and font usage for the current item.

        :raises Exception: when a value read from the JavaScript bridge
            does not have the expected type.
        """
        self.page.evaljs("window.font_stats.get_font_face_rules()")
        font_face_rules = self.page.bridge_value
        if not isinstance(font_face_rules, list):
            raise Exception("Unknown error occurred while reading font-face rules")

        # Weed out invalid font-face rules
        rules = []
        for rule in font_face_rules:
            ff = rule.get("font-family", None)
            if not ff:
                continue
            style = self.parser.parseStyle("font-family:%s" % ff, validate=False)
            ff = [x.value for x in style.getProperty("font-family").propertyValue]
            if not ff or ff[0] == "inherit":
                continue
            rule["font-family"] = frozenset(icu_lower(f) for f in ff)
            src = rule.get("src", None)
            if not src:
                continue
            # Quote the url, otherwise cssutils fails to parse it if it
            # has ' or " in it.
            src = self._quote_css_url(src)
            # The src value is parsed as a background-image declaration
            # to extract the URL from it.
            style = self.parser.parseStyle("background-image:%s" % src, validate=False)
            src = style.getProperty("background-image").propertyValue[0].uri
            name = self.href_to_name(src, "@font-face rule")
            if name is None:
                continue
            rule["src"] = name
            normalize_font_properties(rule)
            rule["width"] = widths[rule["font-stretch"]]
            rule["weight"] = int(rule["font-weight"])
            rules.append(rule)

        if not rules and not self.do_embed:
            return

        self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules
        for rule in rules:
            self.all_font_rules[rule["src"]] = rule

        for rule in rules:
            if rule["src"] not in self.font_stats:
                self.font_stats[rule["src"]] = set()

        self.page.evaljs("window.font_stats.get_font_usage()")
        font_usage = self.page.bridge_value
        if not isinstance(font_usage, list):
            raise Exception("Unknown error occurred while reading font usage")
        exclude = {"\n", "\r", "\t"}
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict)
        bad_fonts = {"serif", "sans-serif", "monospace", "cursive", "fantasy", "sansserif", "inherit"}
        for font in font_usage:
            # Characters rendered with this font, minus the excluded
            # whitespace characters.
            text = set()
            for t in font["text"]:
                text |= frozenset(t)
            text.difference_update(exclude)
            if not text:
                continue
            normalize_font_properties(font)
            for rule in get_matching_rules(rules, font):
                self.font_stats[rule["src"]] |= text
            if self.do_embed:
                ff = [icu_lower(x) for x in font.get("font-family", [])]
                if ff and ff[0] not in bad_fonts:
                    keys = {"font-weight", "font-style", "font-stretch", "font-family"}
                    # Usage is grouped by the first family name plus the
                    # weight/style/stretch values.
                    key = frozenset(((k, ff[0] if k == "font-family" else v) for k, v in font.iteritems() if k in keys))
                    val = fu[key]
                    if not val:
                        val.update({k: (font[k][0] if k == "font-family" else font[k]) for k in keys})
                        val["text"] = set()
                    val["text"] |= text
        # Store a plain dict so later lookups do not create entries.
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu)

        if self.do_embed:
            self.page.evaljs("window.font_stats.get_font_families()")
            font_families = self.page.bridge_value
            if not isinstance(font_families, dict):
                raise Exception("Unknown error occurred while reading font families")
            self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set()
            for raw in font_families.iterkeys():
                style = self.parser.parseStyle("font-family:" + raw, validate=False).getProperty("font-family")
                for x in style.propertyValue:
                    x = x.value
                    if x and x.lower() not in bad_fonts:
                        fs.add(x)
def __init__(self, el):
    """Keep a reference to ``el`` and parse its ``style`` attribute.

    The value of ``el.get('style')`` is handed to a fresh ``CSSParser``
    and the parsed declaration is stored as ``self.style``.
    """
    self.el = el
    css_parser = CSSParser()
    self.style = css_parser.parseStyle(el.get('style'))
def __init__(self):
    """Create the ``CSSParser`` instance stored as ``self.css_parser``."""
    parser = CSSParser()
    self.css_parser = parser
class StatsCollector(object):
    """Render each spine item of a container and gather font statistics.

    Items from ``container.spine_items`` are loaded one after another
    into a ``QWebView`` while a ``QEventLoop`` runs; after every load,
    ``collect_font_stats`` reads the values exposed by the page's
    JavaScript bridge and fills the ``font_*`` dictionaries on the
    instance.
    """

    def __init__(self, container, do_embed=False):
        """Run the collection for ``container``.

        Raises ``Exception`` if the event loop exits with status 1.
        """
        self.container = container
        self.log = self.logger = container.log
        self.do_embed = do_embed
        must_use_qt()

        # Results and work queue.
        self.render_queue = list(container.spine_items)
        self.font_stats = {}
        self.font_usage_map = {}
        self.font_spec_map = {}
        self.font_rule_map = {}
        self.all_font_rules = {}

        self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css'))

        # Rendering machinery.
        self.loop = QEventLoop()
        self.view = QWebView()
        self.page = Page(self.log)
        self.view.setPage(self.page)
        self.page.setViewportSize(QSize(1200, 1600))
        self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection)

        QTimer.singleShot(0, self.render_book)
        exit_status = self.loop.exec_()
        if exit_status == 1:
            raise Exception('Failed to gather statistics from book, see log for details')

    def render_book(self):
        """Render the next queued item; exit the loop when the queue is empty."""
        try:
            if self.render_queue:
                self.render_next()
            else:
                self.loop.exit()
        except:
            self.logger.exception('Rendering failed')
            self.loop.exit(1)

    def render_next(self):
        """Take the first queued item and load it into the view."""
        self.current_item = unicode(self.render_queue.pop(0))
        load_html(self.current_item, self.view)

    def collect(self, ok):
        """Handle ``loadFinished``: collect stats, then move to the next item."""
        if ok:
            try:
                self.page.load_js()
                self.collect_font_stats()
            except:
                self.log.exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item))
                self.loop.exit(1)
            else:
                self.render_book()
        else:
            self.log.error('Failed to render document: %s'%self.container.relpath(self.current_item))
            self.loop.exit(1)

    def href_to_name(self, href, warn_name):
        """Return the container name for a ``file://`` URL, else ``None``.

        A warning naming ``warn_name`` is logged when ``href`` is not a
        ``file://`` URL or the container has no matching name.
        """
        prefix = 'file://'
        if not href.startswith(prefix):
            self.log.warn('Non-local URI in', warn_name, ':', href, 'ignoring')
            return None
        local_path = href[len(prefix):]
        if iswindows and len(local_path) > 2 and (local_path[0], local_path[2]) == ('/', ':'):
            # "/C:/..." -> "C:/..."
            local_path = local_path[1:]
        local_path = unquote(local_path.replace('/', os.sep))
        name = self.container.abspath_to_name(local_path)
        if self.container.has_name(name):
            return name
        self.log.warn('Missing resource', href, 'in', warn_name, 'ignoring')
        return None

    def collect_font_stats(self):
        """Record @font-face rules and font usage for the current item.

        Raises ``Exception`` when a value read from the JavaScript bridge
        does not have the expected type.
        """
        page = self.page
        page.evaljs('window.font_stats.get_font_face_rules()')
        font_face_rules = page.bridge_value
        if not isinstance(font_face_rules, list):
            raise Exception('Unknown error occurred while reading font-face rules')

        # Weed out invalid font-face rules
        rules = []
        for face in font_face_rules:
            family_spec = face.get('font-family', None)
            if not family_spec:
                continue
            decl = self.parser.parseStyle('font-family:%s'%family_spec, validate=False)
            families = [v.value for v in decl.getProperty('font-family').propertyValue]
            if not families or families[0] == 'inherit':
                continue
            face['font-family'] = frozenset(icu_lower(f) for f in families)

            src = face.get('src', None)
            if not src:
                continue
            is_unquoted_url = src.startswith('url(') and src.endswith(')') and src[4] not in {'"', "'"}
            if is_unquoted_url:
                # Quote the url otherwise cssutils fails to parse it if it has
                # ' or " in it
                src = "url('" + src[4:-1].replace("'", "\\'") + "')"
            decl = self.parser.parseStyle('background-image:%s'%src, validate=False)
            uri = decl.getProperty('background-image').propertyValue[0].uri
            name = self.href_to_name(uri, '@font-face rule')
            if name is None:
                continue
            face['src'] = name
            normalize_font_properties(face)
            face['width'] = widths[face['font-stretch']]
            face['weight'] = int(face['font-weight'])
            rules.append(face)

        if not (rules or self.do_embed):
            return

        self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules
        for face in rules:
            self.all_font_rules[face['src']] = face
            self.font_stats.setdefault(face['src'], set())

        page.evaljs('window.font_stats.get_font_usage()')
        font_usage = page.bridge_value
        if not isinstance(font_usage, list):
            raise Exception('Unknown error occurred while reading font usage')

        exclude = {'\n', '\r', '\t'}
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = usage_by_key = defaultdict(dict)
        bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
        keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}
        for usage in font_usage:
            chars = set()
            for chunk in usage['text']:
                chars.update(chunk)
            chars -= exclude
            if not chars:
                continue
            normalize_font_properties(usage)
            for matched in get_matching_rules(rules, usage):
                self.font_stats[matched['src']].update(chars)
            if not self.do_embed:
                continue
            families = [icu_lower(f) for f in usage.get('font-family', [])]
            if not families or families[0] in bad_fonts:
                continue
            key = frozenset(
                (k, families[0] if k == 'font-family' else v)
                for k, v in usage.iteritems() if k in keys)
            entry = usage_by_key[key]
            if not entry:
                entry.update({k: (usage[k][0] if k == 'font-family' else usage[k]) for k in keys})
                entry['text'] = set()
            entry['text'] |= chars
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(usage_by_key)

        if self.do_embed:
            page.evaljs('window.font_stats.get_font_families()')
            font_families = page.bridge_value
            if not isinstance(font_families, dict):
                raise Exception('Unknown error occurred while reading font families')
            self.font_spec_map[self.container.abspath_to_name(self.current_item)] = specs = set()
            for raw in font_families.iterkeys():
                prop = self.parser.parseStyle('font-family:' + raw, validate=False).getProperty('font-family')
                for value in prop.propertyValue:
                    family = value.value
                    if family and family.lower() not in bad_fonts:
                        specs.add(family)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''):
    """Collect the stylesheets that apply to ``tree`` and apply their rules.

    Stylesheets are gathered starting with the sheet returned by
    ``html_css_stylesheet()``, followed by the ``<style>`` and ``<link>``
    children of ``<head>`` in document order (including sheets pulled in
    by ``@import`` rules inside ``<style>``), and finally ``extra_css``
    and ``user_css``.  All rules are flattened via ``self.flatten_rule``,
    sorted, and each rule's declarations are applied to the elements
    matched by its selector.  Inline ``style`` attributes are applied
    afterwards and ``width``/``height`` attributes of unstyled ``<img>``
    elements are turned into CSS.

    :param tree: parsed XHTML document; modified in place.
    :param path: key into ``oeb.manifest.hrefs`` for the document.
    :param oeb: book object providing ``manifest``, ``logger``, ``log``
        and ``css_preprocessor``.
    :param opts: options object; ``opts.output_profile`` is read.
    :param profile: output profile; defaults to ``opts.output_profile``.
    :param extra_css: additional CSS text, skipped when empty.
    :param user_css: user supplied CSS text, skipped when empty.
    """
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        self.profile = opts.output_profile
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    # Inline and extra CSS is parsed with an href derived from the
    # document's file name.
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    head = xpath(tree, '/h:html/h:head')
    if head:
        head = head[0]
    else:
        head = []

    # Add cssutils parsing profiles from output_profile
    # NOTE: this loop rebinds the ``profile`` parameter; self.profile has
    # already been set above and is not affected.
    for profile in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(profile['name'], profile['props'], profile['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css'))
    self.font_face_rules = []
    for elem in head:
        if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES):
            # Gather the element's own text plus the text/tail of any
            # child nodes.
            text = elem.text if elem.text else u''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
            if text:
                text = oeb.css_preprocessor(text, add_namespace=True)
                # We handle @import rules separately
                # (the fetcher is temporarily replaced by one that returns
                # empty content, then restored after parsing)
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                parser.setFetcher(self._fetch_css_file)
                stylesheet.namespaces['h'] = XHTML_NS
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        # Imports whose media text is exactly 'amzn-mobi'
                        # are skipped.
                        if rule.media.mediaText == 'amzn-mobi':
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in OEB_STYLES:
                            self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                            continue
                        # The imported sheet is added before the sheet
                        # that imports it.
                        stylesheets.append(sitem.data)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref, ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif elem.tag == XHTML('link') and elem.get('href') \
                and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
            href = urlnormalize(elem.attrib['href'])
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warn(
                    'Stylesheet %r referenced by file %r not in manifest' % (path, item.href))
                continue
            # Only manifest items whose data has ``cssRules`` are used.
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path, item.href))
                continue
            stylesheets.append(sitem.data)
    csses = {'extra_css':extra_css, 'user_css':user_css}
    # Best-effort: a parse failure is logged and the CSS is ignored.
    for w, x in csses.items():
        if x:
            try:
                text = XHTML_CSS_NAMESPACE + x
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                stylesheet.namespaces['h'] = XHTML_NS
                stylesheets.append(stylesheet)
            except:
                self.logger.exception('Failed to parse %s, ignoring.'%w)
                self.logger.debug('Bad css: ')
                self.logger.debug(x)
    rules = []
    # Running counter handed to flatten_rule; incremented once per rule
    # (and once per sub-rule of an accepted @media rule).
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for stylesheet in stylesheets:
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            if rule.type == rule.MEDIA_RULE:
                media = {rule.media.item(i) for i in xrange(rule.media.length)}
                # @media rules are used only when they list one of these
                # media names.
                if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                    continue
                for subrule in rule.cssRules:
                    rules.extend(self.flatten_rule(subrule, href, index))
                    index += 1
            else:
                rules.extend(self.flatten_rule(rule, href, index))
                index = index + 1
    rules.sort()
    self.rules = rules
    self._styles = {}
    pseudo_pat = re.compile(ur':(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I)
    # Each flattened rule unpacks into five fields; only the declaration
    # dict and the selector text are used here.
    for _, _, cssdict, text, _ in rules:
        fl = pseudo_pat.search(text)
        if fl is not None:
            # Match against the selector with the pseudo part removed.
            text = text.replace(fl.group(), '')
        selector = get_css_selector(text, self.oeb.log)
        matches = selector(tree, self.logger)
        if fl is not None:
            fl = fl.group(1)
            if fl == 'first-letter' and getattr(self.oeb,
                    'plumber_output_format', '').lower() == u'mobi':
                # Fake first-letter
                # by wrapping leading punctuation/separator characters
                # plus the first following character in a styled <span>.
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter():
                        if x.text:
                            punctuation_chars = []
                            text = unicode(x.text)
                            while text:
                                category = unicodedata.category(text[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]

                            special_text = u''.join(punctuation_chars) + \
                                    (text[0] if text else u'')
                            span = E.span(special_text)
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            # Only the first text-bearing node is wrapped.
                            break
            else:  # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)
    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)
    num_pat = re.compile(r'\d+$')
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
                style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                try:
                    del elem.attrib[prop]
                except:
                    pass
                if val:
                    # Values consisting of digits only get a px unit.
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
def __init__(self, tree, path, oeb, opts, profile=None, extra_css='', user_css=''):
    """Collect the stylesheets that apply to ``tree`` and apply their rules.

    Stylesheets are gathered starting with the sheet returned by
    ``html_css_stylesheet()``, followed by every ``style``/``link``
    element found anywhere in the document (including sheets pulled in
    by ``@import`` rules inside ``<style>``), and finally ``extra_css``
    and ``user_css``.  All rules are flattened via ``self.flatten_rule``,
    sorted, and each rule's declarations are applied to the elements
    matched by its selector.  Inline ``style`` attributes are applied
    afterwards and ``width``/``height`` attributes of unstyled ``<img>``
    elements are turned into CSS.

    :param tree: parsed XHTML document; modified in place.
    :param path: key into ``oeb.manifest.hrefs`` for the document.
    :param oeb: book object providing ``manifest``, ``logger``, ``log``
        and ``css_preprocessor``.
    :param opts: options object; ``opts.output_profile`` is read.
    :param profile: output profile; when ``None`` the profile whose
        ``short_name`` is ``'default'`` is used, falling back to
        ``opts.output_profile``.
    :param extra_css: additional CSS text, skipped when empty.
    :param user_css: user supplied CSS text, skipped when empty.
    """
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from calibre.customize.ui import output_profiles
        for x in output_profiles():
            if x.short_name == 'default':
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    # Inline and extra CSS is parsed with an href derived from the
    # document's file name.
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    # Matches style/link elements anywhere in the tree, in any namespace;
    # the tag checks in the loop below then filter on the XHTML names.
    style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

    # Add cssutils parsing profiles from output_profile
    # NOTE: this loop rebinds the ``profile`` parameter; self.profile has
    # already been set above and is not affected.
    for profile in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(profile['name'], profile['props'], profile['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css'))
    self.font_face_rules = []
    for elem in style_tags:
        if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES):
            # Gather the element's own text plus the text/tail of any
            # child nodes.
            text = elem.text if elem.text else u''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
            if text:
                text = oeb.css_preprocessor(text, add_namespace=True)
                # We handle @import rules separately
                # (the fetcher is temporarily replaced by one that returns
                # empty content, then restored after parsing)
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                parser.setFetcher(self._fetch_css_file)
                stylesheet.namespaces['h'] = XHTML_NS
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        # Imports whose media text is exactly 'amzn-mobi'
                        # are skipped.
                        if rule.media.mediaText == 'amzn-mobi':
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warn(
                                'Ignoring missing stylesheet in @import rule:', rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in OEB_STYLES:
                            self.logger.warn(
                                'CSS @import of non-CSS file %r' % rule.href)
                            continue
                        # The imported sheet is added before the sheet
                        # that imports it.
                        stylesheets.append(sitem.data)
                # @page rules are removed from inline stylesheets; a
                # tuple copy is iterated because the rule list is
                # modified inside the loop.
                for rule in tuple(
                        stylesheet.cssRules.rulesOfType(
                            CSSRule.PAGE_RULE)):
                    stylesheet.cssRules.remove(rule)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref, ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif elem.tag == XHTML('link') and elem.get('href') \
                and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
            href = urlnormalize(elem.attrib['href'])
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warn(
                    'Stylesheet %r referenced by file %r not in manifest' % (path, item.href))
                continue
            # Only manifest items whose data has ``cssRules`` are used.
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS' % (path, item.href))
                continue
            stylesheets.append(sitem.data)
    csses = {'extra_css': extra_css, 'user_css': user_css}
    # Best-effort: a parse failure is logged and the CSS is ignored.
    for w, x in csses.items():
        if x:
            try:
                text = XHTML_CSS_NAMESPACE + x
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                stylesheet.namespaces['h'] = XHTML_NS
                stylesheets.append(stylesheet)
            except:
                self.logger.exception('Failed to parse %s, ignoring.'
                                      % w)
                self.logger.debug('Bad css: ')
                self.logger.debug(x)
    rules = []
    # Running counter handed to flatten_rule; incremented once per rule
    # (and once per sub-rule of an accepted @media rule).
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for sheet_index, stylesheet in enumerate(stylesheets):
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            if rule.type == rule.MEDIA_RULE:
                media = {
                    rule.media.item(i) for i in xrange(rule.media.length)
                }
                # @media rules are used only when they list one of these
                # media names.
                if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                    continue
                for subrule in rule.cssRules:
                    # Sheet 0 is the one from html_css_stylesheet(); it is
                    # flagged as the user agent sheet.
                    rules.extend(
                        self.flatten_rule(
                            subrule, href, index,
                            is_user_agent_sheet=sheet_index == 0))
                    index += 1
            else:
                rules.extend(
                    self.flatten_rule(
                        rule, href, index,
                        is_user_agent_sheet=sheet_index == 0))
                index = index + 1
    rules.sort()
    self.rules = rules
    self._styles = {}
    pseudo_pat = re.compile(
        ur':(first-letter|first-line|link|hover|visited|active|focus|before|after)',
        re.I)
    # Each flattened rule unpacks into five fields; only the declaration
    # dict and the selector text are used here.
    for _, _, cssdict, text, _ in rules:
        fl = pseudo_pat.search(text)
        if fl is not None:
            # Match against the selector with the pseudo part removed.
            text = text.replace(fl.group(), '')
        selector = get_css_selector(text, self.oeb.log)
        matches = selector(tree, self.logger)
        if fl is not None:
            fl = fl.group(1)
            if fl == 'first-letter' and getattr(self.oeb,
                    'plumber_output_format', '').lower() == u'mobi':
                # Fake first-letter
                # by wrapping leading punctuation/separator characters
                # plus the first following character in a styled <span>.
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter():
                        if x.text:
                            punctuation_chars = []
                            text = unicode(x.text)
                            while text:
                                category = unicodedata.category(text[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]

                            special_text = u''.join(punctuation_chars) + \
                                    (text[0] if text else u'')
                            span = E.span(special_text)
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            # Only the first text-bearing node is wrapped.
                            break
            else:  # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)
    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)
    num_pat = re.compile(r'[0-9.]+$')
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
                style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                try:
                    del elem.attrib[prop]
                except:
                    pass
                if val:
                    # Values made up only of digits and dots get a px unit.
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)