def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None): if log_level is None: import logging log_level = logging.WARNING from cssutils import CSSParser, log from calibre.ebooks.oeb.base import _css_logger log.setLevel(log_level) log.raiseExceptions = False if isinstance(data, bytes): data = data.decode('utf-8') if decode is None else decode(data) if css_preprocessor is not None: data = css_preprocessor(data) parser = CSSParser( loglevel=log_level, # We dont care about @import rules fetcher=lambda x: (None, None), log=_css_logger) if is_declaration: data = parser.parseStyle(data, validate=False) else: data = parser.parseString(data, href=fname, validate=False) return data
def normalize_filter_css(props): import logging ans = set() p = CSSParser(loglevel=logging.CRITICAL, validate=False) for prop in props: n = normalizers.get(prop, None) ans.add(prop) if n is not None and prop in SHORTHAND_DEFAULTS: dec = p.parseStyle('%s: %s' % (prop, SHORTHAND_DEFAULTS[prop])) cssvalue = dec.getPropertyCSSValue(dec.item(0)) ans |= set(n(prop, cssvalue)) return ans
def parse_css(self, data, fname='<string>', is_declaration=False): from cssutils import CSSParser, log log.setLevel(logging.WARN) log.raiseExceptions = False if isinstance(data, bytes): data = self.decode(data) if not self.tweak_mode: data = self.css_preprocessor(data) parser = CSSParser(loglevel=logging.WARNING, # We dont care about @import rules fetcher=lambda x: (None, None), log=_css_logger) if is_declaration: data = parser.parseStyle(data, validate=False) else: data = parser.parseString(data, href=fname, validate=False) return data
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None): if log_level is None: import logging log_level = logging.WARNING from cssutils import CSSParser, log from calibre.ebooks.oeb.base import _css_logger log.setLevel(log_level) log.raiseExceptions = False if isinstance(data, bytes): data = data.decode('utf-8') if decode is None else decode(data) if css_preprocessor is not None: data = css_preprocessor(data) parser = CSSParser(loglevel=log_level, # We dont care about @import rules fetcher=lambda x: (None, None), log=_css_logger) if is_declaration: data = parser.parseStyle(data, validate=False) else: data = parser.parseString(data, href=fname, validate=False) return data
class StatsCollector(object): def __init__(self, container, do_embed=False): self.container = container self.log = self.logger = container.log self.do_embed = do_embed must_use_qt() self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css')) self.first_letter_pat = regex.compile(r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE) self.loop = QEventLoop() self.view = QWebView() self.page = Page(self.log) self.view.setPage(self.page) self.page.setViewportSize(QSize(1200, 1600)) self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection) self.render_queue = list(container.spine_items) self.font_stats = {} self.font_usage_map = {} self.font_spec_map = {} self.font_rule_map = {} self.all_font_rules = {} QTimer.singleShot(0, self.render_book) if self.loop.exec_() == 1: raise Exception('Failed to gather statistics from book, see log for details') def log_exception(self, *args): orig = self.log.filter_level try: self.log.filter_level = self.log.DEBUG self.log.exception(*args) finally: self.log.filter_level = orig def render_book(self): try: if not self.render_queue: self.loop.exit() else: self.render_next() except: self.log_exception('Rendering failed') self.loop.exit(1) def render_next(self): item = unicode(self.render_queue.pop(0)) self.current_item = item load_html(item, self.view) def collect(self, ok): if not ok: self.log.error('Failed to render document: %s'%self.container.relpath(self.current_item)) self.loop.exit(1) return try: self.page.load_js() self.collect_font_stats() except: self.log_exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item)) self.loop.exit(1) return self.render_book() def href_to_name(self, href, warn_name): if not href.startswith('file://'): self.log.warn('Non-local URI in', warn_name, ':', href, 'ignoring') return None src = href[len('file://'):] if iswindows and len(src) > 2 and (src[0], src[2]) == ('/', ':'): src = src[1:] src = src.replace('/', os.sep) src = unquote(src) name = self.container.abspath_to_name(src) if not self.container.has_name(name): self.log.warn('Missing resource', href, 'in', warn_name, 'ignoring') return None return name def collect_font_stats(self): self.page.evaljs('window.font_stats.get_font_face_rules()') font_face_rules = self.page.bridge_value if not isinstance(font_face_rules, list): raise Exception('Unknown error occurred while reading font-face rules') # Weed out invalid font-face rules rules = [] import tinycss parser = tinycss.make_full_parser() for rule in font_face_rules: ff = rule.get('font-family', None) if not ff: continue style = self.parser.parseStyle('font-family:%s'%ff, validate=False) ff = [x.value for x in style.getProperty('font-family').propertyValue] if not ff or ff[0] == 'inherit': continue rule['font-family'] = frozenset(icu_lower(f) for f in ff) src = rule.get('src', None) if not src: continue try: tokens = parser.parse_stylesheet('@font-face { src: %s }' % src).rules[0].declarations[0].value except Exception: self.log.warn('Failed to parse @font-family src: %s' % src) continue for token in tokens: if token.type == 'URI': uv = token.value if uv: sn = self.href_to_name(uv, '@font-face rule') if sn is not None: rule['src'] = sn break else: self.log.warn('The @font-face rule refers to a font file that does not exist in the book: %s' % src) continue normalize_font_properties(rule) rule['width'] = widths[rule['font-stretch']] rule['weight'] = int(rule['font-weight']) rules.append(rule) if not rules and not self.do_embed: return self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules for rule in rules: self.all_font_rules[rule['src']] = rule for rule in rules: if rule['src'] not in self.font_stats: self.font_stats[rule['src']] = set() self.page.evaljs('window.font_stats.get_font_usage()') font_usage = self.page.bridge_value if not isinstance(font_usage, list): raise Exception('Unknown error occurred while reading font usage') self.page.evaljs('window.font_stats.get_pseudo_element_font_usage()') pseudo_element_font_usage = self.page.bridge_value if not isinstance(pseudo_element_font_usage, list): raise Exception('Unknown error occurred while reading pseudo element font usage') font_usage += get_pseudo_element_font_usage(pseudo_element_font_usage, self.first_letter_pat, self.parser) exclude = {'\n', '\r', '\t'} self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict) bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'} for font in font_usage: text = set() for t in font['text']: text |= frozenset(t) text.difference_update(exclude) if not text: continue normalize_font_properties(font) for rule in get_matching_rules(rules, font): self.font_stats[rule['src']] |= text if self.do_embed: ff = [icu_lower(x) for x in font.get('font-family', [])] if ff and ff[0] not in bad_fonts: keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'} key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in font.iteritems() if k in keys)) val = fu[key] if not val: val.update({k:(font[k][0] if k == 'font-family' else font[k]) for k in keys}) val['text'] = set() val['text'] |= text self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu) if self.do_embed: self.page.evaljs('window.font_stats.get_font_families()') font_families = self.page.bridge_value if not isinstance(font_families, dict): raise Exception('Unknown error occurred while reading font families') self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set() for font_dict, text, pseudo in pseudo_element_font_usage: font_families[font_dict['font-family']] = True for raw in font_families.iterkeys(): for x in parse_font_families(self.parser, raw): if x.lower() not in bad_fonts: fs.add(x)
class StatsCollector(object): def __init__(self, container, do_embed=False): self.container = container self.log = self.logger = container.log self.do_embed = do_embed must_use_qt() self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css')) self.first_letter_pat = regex.compile(r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE) self.capitalize_pat = regex.compile(r'[\p{L}\p{N}]', regex.VERSION1 | regex.UNICODE) self.loop = QEventLoop() self.view = QWebView() self.page = Page(self.log) self.view.setPage(self.page) self.page.setViewportSize(QSize(1200, 1600)) self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection) self.render_queue = list(container.spine_items) self.font_stats = {} self.font_usage_map = {} self.font_spec_map = {} self.font_rule_map = {} self.all_font_rules = {} QTimer.singleShot(0, self.render_book) if self.loop.exec_() == 1: raise Exception('Failed to gather statistics from book, see log for details') def log_exception(self, *args): orig = self.log.filter_level try: self.log.filter_level = self.log.DEBUG self.log.exception(*args) finally: self.log.filter_level = orig def render_book(self): try: if not self.render_queue: self.loop.exit() else: self.render_next() except: self.log_exception('Rendering failed') self.loop.exit(1) def render_next(self): item = unicode(self.render_queue.pop(0)) self.current_item = item load_html(item, self.view) def collect(self, ok): if not ok: self.log.error('Failed to render document: %s'%self.container.relpath(self.current_item)) self.loop.exit(1) return try: self.page.load_js() self.collect_font_stats() except: self.log_exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item)) self.loop.exit(1) return self.render_book() def href_to_name(self, href, warn_name): if not href.startswith('file://'): self.log.warn('Non-local URI in', warn_name, ':', href, 'ignoring') return None src = href[len('file://'):] if iswindows and len(src) > 2 and (src[0], src[2]) == ('/', ':'): src = src[1:] src = src.replace('/', os.sep) src = unquote(src) name = self.container.abspath_to_name(src) if not self.container.has_name(name): self.log.warn('Missing resource', href, 'in', warn_name, 'ignoring') return None return name def collect_font_stats(self): self.page.evaljs('window.font_stats.get_font_face_rules()') font_face_rules = self.page.bridge_value if not isinstance(font_face_rules, list): raise Exception('Unknown error occurred while reading font-face rules') # Weed out invalid font-face rules rules = [] import tinycss parser = tinycss.make_full_parser() for rule in font_face_rules: ff = rule.get('font-family', None) if not ff: continue style = self.parser.parseStyle('font-family:%s'%ff, validate=False) ff = [x.value for x in style.getProperty('font-family').propertyValue] if not ff or ff[0] == 'inherit': continue rule['font-family'] = frozenset(icu_lower(f) for f in ff) src = rule.get('src', None) if not src: continue try: tokens = parser.parse_stylesheet('@font-face { src: %s }' % src).rules[0].declarations[0].value except Exception: self.log.warn('Failed to parse @font-family src: %s' % src) continue for token in tokens: if token.type == 'URI': uv = token.value if uv: sn = self.href_to_name(uv, '@font-face rule') if sn is not None: rule['src'] = sn break else: self.log.warn('The @font-face rule refers to a font file that does not exist in the book: %s' % src) continue normalize_font_properties(rule) rule['width'] = widths[rule['font-stretch']] rule['weight'] = int(rule['font-weight']) rules.append(rule) if not rules and not self.do_embed: return self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules for rule in rules: self.all_font_rules[rule['src']] = rule for rule in rules: if rule['src'] not in self.font_stats: self.font_stats[rule['src']] = set() self.page.evaljs('window.font_stats.get_font_usage()') font_usage = self.page.bridge_value if not isinstance(font_usage, list): raise Exception('Unknown error occurred while reading font usage') self.page.evaljs('window.font_stats.get_pseudo_element_font_usage()') pseudo_element_font_usage = self.page.bridge_value if not isinstance(pseudo_element_font_usage, list): raise Exception('Unknown error occurred while reading pseudo element font usage') font_usage += get_pseudo_element_font_usage(pseudo_element_font_usage, self.first_letter_pat, self.parser) exclude = {'\n', '\r', '\t'} self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict) bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'} for font in font_usage: text = set() for t in font['text']: tt = (font['text-transform'] or '').lower() if tt != 'none': if tt == 'uppercase': t = icu_upper(t) elif tt == 'lowercase': t = icu_lower(t) elif tt == 'capitalize': m = self.capitalize_pat.search(t) if m is not None: t += icu_upper(m.group()) fv = (font['font-variant'] or '').lower() if fv in {'smallcaps', 'small-caps', 'all-small-caps', 'petite-caps', 'all-petite-caps', 'unicase'}: t += icu_upper(t) # for renderers that try to fake small-caps by using small normal caps text |= frozenset(t) text.difference_update(exclude) if not text: continue normalize_font_properties(font) for rule in get_matching_rules(rules, font): self.font_stats[rule['src']] |= text if self.do_embed: ff = [icu_lower(x) for x in font.get('font-family', [])] if ff and ff[0] not in bad_fonts: keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'} key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in font.iteritems() if k in keys)) val = fu[key] if not val: val.update({k:(font[k][0] if k == 'font-family' else font[k]) for k in keys}) val['text'] = set() val['text'] |= text self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu) if self.do_embed: self.page.evaljs('window.font_stats.get_font_families()') font_families = self.page.bridge_value if not isinstance(font_families, dict): raise Exception('Unknown error occurred while reading font families') self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set() for font_dict, text, pseudo in pseudo_element_font_usage: font_families[font_dict['font-family']] = True for raw in font_families.iterkeys(): for x in parse_font_families(self.parser, raw): if x.lower() not in bad_fonts: fs.add(x)
class StatsCollector(object): def __init__(self, container, do_embed=False): self.container = container self.log = self.logger = container.log self.do_embed = do_embed must_use_qt() self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css')) self.loop = QEventLoop() self.view = QWebView() self.page = Page(self.log) self.view.setPage(self.page) self.page.setViewportSize(QSize(1200, 1600)) self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection) self.render_queue = list(container.spine_items) self.font_stats = {} self.font_usage_map = {} self.font_spec_map = {} self.font_rule_map = {} self.all_font_rules = {} QTimer.singleShot(0, self.render_book) if self.loop.exec_() == 1: raise Exception('Failed to gather statistics from book, see log for details') def render_book(self): try: if not self.render_queue: self.loop.exit() else: self.render_next() except: self.logger.exception('Rendering failed') self.loop.exit(1) def render_next(self): item = unicode(self.render_queue.pop(0)) self.current_item = item load_html(item, self.view) def collect(self, ok): if not ok: self.log.error('Failed to render document: %s'%self.container.relpath(self.current_item)) self.loop.exit(1) return try: self.page.load_js() self.collect_font_stats() except: self.log.exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item)) self.loop.exit(1) return self.render_book() def href_to_name(self, href, warn_name): if not href.startswith('file://'): self.log.warn('Non-local URI in', warn_name, ':', href, 'ignoring') return None src = href[len('file://'):] if iswindows and len(src) > 2 and (src[0], src[2]) == ('/', ':'): src = src[1:] src = src.replace('/', os.sep) src = unquote(src) name = self.container.abspath_to_name(src) if not self.container.has_name(name): self.log.warn('Missing resource', href, 'in', warn_name, 'ignoring') return None return name def collect_font_stats(self): self.page.evaljs('window.font_stats.get_font_face_rules()') font_face_rules = self.page.bridge_value if not isinstance(font_face_rules, list): raise Exception('Unknown error occurred while reading font-face rules') # Weed out invalid font-face rules rules = [] for rule in font_face_rules: ff = rule.get('font-family', None) if not ff: continue style = self.parser.parseStyle('font-family:%s'%ff, validate=False) ff = [x.value for x in style.getProperty('font-family').propertyValue] if not ff or ff[0] == 'inherit': continue rule['font-family'] = frozenset(icu_lower(f) for f in ff) src = rule.get('src', None) if not src: continue if src.startswith('url(') and src.endswith(')') and src[4] not in {'"', "'"}: # Quote the url otherwise cssutils fails to parse it if it has # ' or " in it src = "url('" + src[4:-1].replace("'", "\\'") + "')" style = self.parser.parseStyle('background-image:%s'%src, validate=False) src = style.getProperty('background-image').propertyValue[0].uri name = self.href_to_name(src, '@font-face rule') if name is None: continue rule['src'] = name normalize_font_properties(rule) rule['width'] = widths[rule['font-stretch']] rule['weight'] = int(rule['font-weight']) rules.append(rule) if not rules and not self.do_embed: return self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules for rule in rules: self.all_font_rules[rule['src']] = rule for rule in rules: if rule['src'] not in self.font_stats: self.font_stats[rule['src']] = set() self.page.evaljs('window.font_stats.get_font_usage()') font_usage = self.page.bridge_value if not isinstance(font_usage, list): raise Exception('Unknown error occurred while reading font usage') exclude = {'\n', '\r', '\t'} self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict) bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'} for font in font_usage: text = set() for t in font['text']: text |= frozenset(t) text.difference_update(exclude) if not text: continue normalize_font_properties(font) for rule in get_matching_rules(rules, font): self.font_stats[rule['src']] |= text if self.do_embed: ff = [icu_lower(x) for x in font.get('font-family', [])] if ff and ff[0] not in bad_fonts: keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'} key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in font.iteritems() if k in keys)) val = fu[key] if not val: val.update({k:(font[k][0] if k == 'font-family' else font[k]) for k in keys}) val['text'] = set() val['text'] |= text self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu) if self.do_embed: self.page.evaljs('window.font_stats.get_font_families()') font_families = self.page.bridge_value if not isinstance(font_families, dict): raise Exception('Unknown error occurred while reading font families') self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set() for raw in font_families.iterkeys(): style = self.parser.parseStyle('font-family:' + raw, validate=False).getProperty('font-family') for x in style.propertyValue: x = x.value if x and x.lower() not in bad_fonts: fs.add(x)
class StatsCollector(object): def __init__(self, container, do_embed=False): self.container = container self.log = self.logger = container.log self.do_embed = do_embed must_use_qt() self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css')) self.first_letter_pat = regex.compile( r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE) self.loop = QEventLoop() self.view = QWebView() self.page = Page(self.log) self.view.setPage(self.page) self.page.setViewportSize(QSize(1200, 1600)) self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection) self.render_queue = list(container.spine_items) self.font_stats = {} self.font_usage_map = {} self.font_spec_map = {} self.font_rule_map = {} self.all_font_rules = {} QTimer.singleShot(0, self.render_book) if self.loop.exec_() == 1: raise Exception( 'Failed to gather statistics from book, see log for details') def log_exception(self, *args): orig = self.log.filter_level try: self.log.filter_level = self.log.DEBUG self.log.exception(*args) finally: self.log.filter_level = orig def render_book(self): try: if not self.render_queue: self.loop.exit() else: self.render_next() except: self.log_exception('Rendering failed') self.loop.exit(1) def render_next(self): item = unicode(self.render_queue.pop(0)) self.current_item = item load_html(item, self.view) def collect(self, ok): if not ok: self.log.error('Failed to render document: %s' % self.container.relpath(self.current_item)) self.loop.exit(1) return try: self.page.load_js() self.collect_font_stats() except: self.log_exception('Failed to collect font stats from: %s' % self.container.relpath(self.current_item)) self.loop.exit(1) return self.render_book() def href_to_name(self, href, warn_name): if not href.startswith('file://'): self.log.warn('Non-local URI in', warn_name, ':', href, 'ignoring') return None src = href[len('file://'):] if iswindows and len(src) > 2 and (src[0], src[2]) == ('/', ':'): src = src[1:] src = src.replace('/', os.sep) src = unquote(src) name = self.container.abspath_to_name(src) if not self.container.has_name(name): self.log.warn('Missing resource', href, 'in', warn_name, 'ignoring') return None return name def collect_font_stats(self): self.page.evaljs('window.font_stats.get_font_face_rules()') font_face_rules = self.page.bridge_value if not isinstance(font_face_rules, list): raise Exception( 'Unknown error occurred while reading font-face rules') # Weed out invalid font-face rules rules = [] for rule in font_face_rules: ff = rule.get('font-family', None) if not ff: continue style = self.parser.parseStyle('font-family:%s' % ff, validate=False) ff = [ x.value for x in style.getProperty('font-family').propertyValue ] if not ff or ff[0] == 'inherit': continue rule['font-family'] = frozenset(icu_lower(f) for f in ff) src = rule.get('src', None) if not src: continue if src.startswith('url(') and src.endswith(')') and src[4] not in { '"', "'" }: # Quote the url otherwise cssutils fails to parse it if it has # ' or " in it src = "url('" + src[4:-1].replace("'", "\\'") + "')" style = self.parser.parseStyle('background-image:%s' % src, validate=False) src = style.getProperty('background-image').propertyValue[0].uri name = self.href_to_name(src, '@font-face rule') if name is None: continue rule['src'] = name normalize_font_properties(rule) rule['width'] = widths[rule['font-stretch']] rule['weight'] = int(rule['font-weight']) rules.append(rule) if not rules and not self.do_embed: return self.font_rule_map[self.container.abspath_to_name( self.current_item)] = rules for rule in rules: self.all_font_rules[rule['src']] = rule for rule in rules: if rule['src'] not in self.font_stats: self.font_stats[rule['src']] = set() self.page.evaljs('window.font_stats.get_font_usage()') font_usage = self.page.bridge_value if not isinstance(font_usage, list): raise Exception('Unknown error occurred while reading font usage') self.page.evaljs('window.font_stats.get_pseudo_element_font_usage()') pseudo_element_font_usage = self.page.bridge_value if not isinstance(pseudo_element_font_usage, list): raise Exception( 'Unknown error occurred while reading pseudo element font usage' ) font_usage += get_pseudo_element_font_usage(pseudo_element_font_usage, self.first_letter_pat, self.parser) exclude = {'\n', '\r', '\t'} self.font_usage_map[self.container.abspath_to_name( self.current_item)] = fu = defaultdict(dict) bad_fonts = { 'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit' } for font in font_usage: text = set() for t in font['text']: text |= frozenset(t) text.difference_update(exclude) if not text: continue normalize_font_properties(font) for rule in get_matching_rules(rules, font): self.font_stats[rule['src']] |= text if self.do_embed: ff = [icu_lower(x) for x in font.get('font-family', [])] if ff and ff[0] not in bad_fonts: keys = { 'font-weight', 'font-style', 'font-stretch', 'font-family' } key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in font.iteritems() if k in keys)) val = fu[key] if not val: val.update({ k: (font[k][0] if k == 'font-family' else font[k]) for k in keys }) val['text'] = set() val['text'] |= text self.font_usage_map[self.container.abspath_to_name( self.current_item)] = dict(fu) if self.do_embed: self.page.evaljs('window.font_stats.get_font_families()') font_families = self.page.bridge_value if not isinstance(font_families, dict): raise Exception( 'Unknown error occurred while reading font families') self.font_spec_map[self.container.abspath_to_name( self.current_item)] = fs = set() for font_dict, text, pseudo in pseudo_element_font_usage: font_families[font_dict['font-family']] = True for raw in font_families.iterkeys(): for x in parse_font_families(self.parser, raw): if x.lower() not in bad_fonts: fs.add(x)
class StatsCollector(object): def __init__(self, container, do_embed=False): self.container = container self.log = self.logger = container.log self.do_embed = do_embed must_use_qt() self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger("calibre.css")) self.loop = QEventLoop() self.view = QWebView() self.page = Page(self.log) self.view.setPage(self.page) self.page.setViewportSize(QSize(1200, 1600)) self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection) self.render_queue = list(container.spine_items) self.font_stats = {} self.font_usage_map = {} self.font_spec_map = {} self.font_rule_map = {} self.all_font_rules = {} QTimer.singleShot(0, self.render_book) if self.loop.exec_() == 1: raise Exception("Failed to gather statistics from book, see log for details") def render_book(self): try: if not self.render_queue: self.loop.exit() else: self.render_next() except: self.logger.exception("Rendering failed") self.loop.exit(1) def render_next(self): item = unicode(self.render_queue.pop(0)) self.current_item = item load_html(item, self.view) def collect(self, ok): if not ok: self.log.error("Failed to render document: %s" % self.container.relpath(self.current_item)) self.loop.exit(1) return try: self.page.load_js() self.collect_font_stats() except: self.log.exception("Failed to collect font stats from: %s" % self.container.relpath(self.current_item)) self.loop.exit(1) return self.render_book() def href_to_name(self, href, warn_name): if not href.startswith("file://"): self.log.warn("Non-local URI in", warn_name, ":", href, "ignoring") return None src = href[len("file://") :] if iswindows and len(src) > 2 and (src[0], src[2]) == ("/", ":"): src = src[1:] src = src.replace("/", os.sep) src = unquote(src) name = self.container.abspath_to_name(src) if not self.container.has_name(name): self.log.warn("Missing resource", href, "in", warn_name, "ignoring") return None return name def collect_font_stats(self): self.page.evaljs("window.font_stats.get_font_face_rules()") font_face_rules = self.page.bridge_value if not isinstance(font_face_rules, list): raise Exception("Unknown error occurred while reading font-face rules") # Weed out invalid font-face rules rules = [] for rule in font_face_rules: ff = rule.get("font-family", None) if not ff: continue style = self.parser.parseStyle("font-family:%s" % ff, validate=False) ff = [x.value for x in style.getProperty("font-family").propertyValue] if not ff or ff[0] == "inherit": continue rule["font-family"] = frozenset(icu_lower(f) for f in ff) src = rule.get("src", None) if not src: continue style = self.parser.parseStyle("background-image:%s" % src, validate=False) src = style.getProperty("background-image").propertyValue[0].uri name = self.href_to_name(src, "@font-face rule") if name is None: continue rule["src"] = name normalize_font_properties(rule) rule["width"] = widths[rule["font-stretch"]] rule["weight"] = int(rule["font-weight"]) rules.append(rule) if not rules and not self.do_embed: return self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules for rule in rules: self.all_font_rules[rule["src"]] = rule for rule in rules: if rule["src"] not in self.font_stats: self.font_stats[rule["src"]] = set() self.page.evaljs("window.font_stats.get_font_usage()") font_usage = self.page.bridge_value if not isinstance(font_usage, list): raise Exception("Unknown error occurred while reading font usage") exclude = {"\n", "\r", "\t"} self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict) bad_fonts = {"serif", "sans-serif", "monospace", "cursive", "fantasy", "sansserif", "inherit"} for font in font_usage: text = set() for t in font["text"]: text |= frozenset(t) text.difference_update(exclude) if not text: continue normalize_font_properties(font) for rule in get_matching_rules(rules, font): self.font_stats[rule["src"]] |= text if self.do_embed: ff = [icu_lower(x) for x in font.get("font-family", [])] if ff and ff[0] not in bad_fonts: keys = {"font-weight", "font-style", "font-stretch", "font-family"} key = frozenset(((k, ff[0] if k == "font-family" else v) for k, v in font.iteritems() if k in keys)) val = fu[key] if not val: val.update({k: (font[k][0] if k == "font-family" else font[k]) for k in keys}) val["text"] = set() val["text"] |= text self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu) if self.do_embed: self.page.evaljs("window.font_stats.get_font_families()") font_families = self.page.bridge_value if not isinstance(font_families, dict): raise Exception("Unknown error occurred while reading font families") self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set() for raw in font_families.iterkeys(): style = self.parser.parseStyle("font-family:" + raw, validate=False).getProperty("font-family") for x in style.propertyValue: x = x.value if x and x.lower() not in bad_fonts: fs.add(x)