Esempio n. 1
0
    def run(self):
        """Rewrite the URLs in the input CSS file to their CDN equivalents.

        The stylesheet is parsed, its URLs are passed through
        ``self.resolveToAbsolutePath``, every resulting local path that
        exists on disk is looked up in the synced files database, and the
        URLs are finally passed through ``self.resolveToCDNURL`` before the
        stylesheet is written to ``self.output_file``.

        Returns:
            ``self.input_file`` when the stylesheet contains no URLs at all
            (nothing is written in that case), otherwise
            ``self.output_file``.

        Raises:
            DocumentRootAndBasePathRequiredException: if ``document_root``
                or ``base_path`` is not set.
            RequestToRequeueException: if a referenced local file has no
                row in the ``synced_files`` table yet.
        """
        # Step 0: ensure that the document_root and base_path variables are
        # set. If the file that's being processed was inside a source that has
        # either one or both not set, then this processor can't run.
        if self.document_root is None or self.base_path is None:
            raise DocumentRootAndBasePathRequiredException

        # We don't rename the file, so we can use the default output file.

        parser = CSSParser(log=None, loglevel=logging.CRITICAL)
        sheet = parser.parseFile(self.input_file)

        # Step 1: ensure the file has URLs. If it doesn't, we can stop the
        # processing. any() stops at the first URL, so the sheet is not
        # walked in full just to answer this question.
        if not any(True for _ in getUrls(sheet)):
            return self.input_file

        # Step 2: resolve the relative URLs to absolute paths.
        replaceUrls(sheet, self.resolveToAbsolutePath)

        # Step 3: verify that each of these files has been synced.
        # The connection and cursor are kept on self and deliberately left
        # open here (presumably resolveToCDNURL queries them -- confirm).
        synced_files_db = urljoin(sys.path[0] + os.sep, SYNCED_FILES_DB)
        self.dbcon = sqlite3.connect(synced_files_db)
        self.dbcon.text_factory = unicode  # This is the default, but we set it explicitly, just to be sure.
        self.dbcur = self.dbcon.cursor()
        for urlstring in getUrls(sheet):
            # Skip absolute URLs.
            if urlstring.startswith(("http://", "https://")):
                continue

            # Skip broken references in the CSS file. This would otherwise
            # prevent this CSS file from ever passing through this processor.
            if not os.path.exists(urlstring):
                continue

            # A missing row means the referenced file has not been synced
            # yet, so this CSS file has to be processed again later.
            self.dbcur.execute("SELECT url FROM synced_files WHERE input_file=?", (urlstring,))
            if self.dbcur.fetchone() is None:
                raise RequestToRequeueException(
                    "The file '%s' has not yet been synced to the server '%s'" % (urlstring, self.process_for_server)
                )

        # Step 4: resolve the absolute paths to CDN URLs.
        replaceUrls(sheet, self.resolveToCDNURL)

        # Step 5: write the updated CSS to the output file. The context
        # manager closes the file even when the write fails.
        with open(self.output_file, "w") as f:
            f.write(sheet.cssText)

        return self.output_file
Esempio n. 2
0
def validate_css(string, generate_https_urls):
    """Validate a user-supplied stylesheet and collect the problems found.

    Every return visible here is a ``(parsed, report)`` pair:

    * ``("", ValidationReport())`` when ``string`` is empty or matches
      ``only_whitespace``;
    * ``("", report)`` with a ``too_big`` error when ``string`` is longer
      than 100 * 1024;
    * ``(None, report)`` with a ``syntax_error`` entry when the parser
      raises ``DOMException``.

    NOTE(review): the code that handles a successful parse is not part of
    this excerpt, and ``generate_https_urls`` is not used in the portion
    shown.
    """
    # raiseExceptions=True makes parse errors surface as exceptions, which
    # the try/except below turns into report entries.
    p = CSSParser(raiseExceptions=True)

    if not string or only_whitespace.match(string):
        return ("", ValidationReport())

    report = ValidationReport(string)

    # avoid a very expensive parse
    max_size_kb = 100
    if len(string) > max_size_kb * 1024:
        report.append(ValidationError((msgs["too_big"] % dict(max_size=max_size_kb))))
        return ("", report)

    # A backslash is reported as an error, but validation still continues
    # with the parse below.
    if "\\" in string:
        report.append(ValidationError(_("if you need backslashes, you're doing it wrong")))

    try:
        parsed = p.parseString(string)
    except DOMException, e:
        # yuck; xml.dom.DOMException can't give us line-information
        # directly, so we have to parse its error message string to
        # get it
        line = None
        line_match = error_message_extract_re.match(e.message)
        if line_match:
            line = line_match.group(1)
            if line:
                line = int(line)
        error_message = msgs["syntax_error"] % dict(syntaxerror=e.message)
        report.append(ValidationError(error_message, e, line))
        return (None, report)
Esempio n. 3
0
def beautify_text(raw, syntax):
    """Return a pretty-printed rendering of ``raw``.

    ``syntax`` picks the formatter: ``'css'`` parses the text as a
    stylesheet and serializes it again, ``'xml'`` pretty-prints the parsed
    XML tree, and every other value goes through the HTML parser and the
    HTML pretty-printer.
    """
    from lxml import etree
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree, pretty_html_tree
    from calibre.ebooks.chardet import strip_encoding_declarations

    if syntax == 'css':
        import logging
        from calibre.ebooks.oeb.base import serialize, _css_logger
        from calibre.ebooks.oeb.polish.utils import setup_cssutils_serialization
        from cssutils import CSSParser, log
        setup_cssutils_serialization(tprefs['editor_tab_stop_width'])
        log.setLevel(logging.WARN)
        log.raiseExceptions = False
        css_parser = CSSParser(
            loglevel=logging.WARNING,
            # @import rules are of no interest here, so fetch nothing
            fetcher=lambda x: (None, None),
            log=_css_logger)
        sheet = css_parser.parseString(raw, href='<string>', validate=False)
        return serialize(sheet, 'text/css')

    # Both remaining syntaxes produce a tree that is serialized the same way.
    if syntax == 'xml':
        tree = etree.fromstring(strip_encoding_declarations(raw))
        pretty_xml_tree(tree)
    else:
        tree = parse(raw, line_numbers=False)
        pretty_html_tree(None, tree)
    return etree.tostring(tree, encoding=unicode)
Esempio n. 4
0
def create_importer(page):
    """Fetch ``page`` and create the Importer that collects its styles.

    When the page is fetched with status 200, every ``<link>`` and
    ``<style>`` tag is inspected:

    * a stylesheet ``<link>`` whose resolved URL is on a different host
      than the current request is queued in ``importer.urls``;
    * the content of a ``<style>`` tag is parsed, wrapped in an ``@media``
      block when the tag has a ``media`` attribute, and appended to
      ``importer.style``.

    The importer is then stored and the next import step is enqueued. For
    any other status code nothing is stored or enqueued.
    """
    importer = Importer(page=page, style='')
    resp = urlfetch.fetch(page.url, deadline=10)
    if resp.status_code == 200:
        soup = BeautifulSoup(resp.content)
        parser = CSSParser()
        for tag in soup.findAll(re.compile(r'^(link|style)$')):
            if tag.name == 'link':
                if tag.get('href', None) and tag.get('rel', 'stylesheet').lower() == 'stylesheet':
                    url = urljoin(page.url, tag['href'])
                    if urlparse(url).netloc != urlparse(request.url).netloc:
                        importer.urls.append(url)
            elif tag.name == 'style':
                media = tag.get('media', None)
                # An inline style belongs to the page itself, so the page
                # URL is its base. `url` is only bound by the <link> branch
                # above: it is undefined when a <style> tag comes first and
                # otherwise names some unrelated external stylesheet.
                sheet = parser.parseString(''.join(tag.contents).strip('\n'), href=page.url)
                style = sheet.cssText
                if media:
                    style = '@media %s {\n%s\n}' % (media, style)
                style = '/* Imported directly from %s */\n%s\n' % (page.url, style)
                importer.style += style
        # Patch around AppEngine's frame inspection
        del parser

        importer.put()
        queue_import(page)
Esempio n. 5
0
def do_import():
    """Run one step of the stylesheet import for the page named in the request.

    The page is looked up from the ``page_key`` form field. Visible
    outcomes:

    * ``'NO_IMPORTER'`` when the page does not exist or its
      ``import_state`` is not ``IMPORTING``;
    * ``'CREATED'`` when the page had no Importer yet and
      ``create_importer`` was called for it;
    * otherwise the first queued URL is removed from ``importer.urls``,
      fetched and parsed, and its serialized CSS is appended to
      ``importer.style``. A failed fetch or parse is recorded in
      ``importer.errors`` and logged instead of being raised.

    NOTE(review): this excerpt ends at the ``finally:`` line; its body and
    the rest of the function are not visible here.
    """
    page = Page.get(request.form.get('page_key', ''))
    if not page or page.import_state != IMPORTING:
        return 'NO_IMPORTER' # We're done
    importer = Importer.gql('WHERE page=:1', page.key()).get()
    if not importer:
        # This requires a request to fetch the page and parse the URLs.
        # It also enqueues the next run.
        create_importer(page)
        return 'CREATED'
    if importer.urls:
        # Handle exactly one queued stylesheet URL per call.
        url = importer.urls.pop(0)
        parser = None
        try:
            resp = urlfetch.fetch(url, deadline=10)
            if resp.status_code == 200:
                parser = CSSParser()
                sheet = parser.parseString(resp.content, href=url)
                style = sheet.cssText
                importer.style += '\n\n/* Imported from %s */\n%s' % (url, style)
            else:
                # Raised so that the handler below records a non-200
                # response like any other failure.
                raise Exception('Error fetching %s' % url)
        except Exception, e:
            import traceback
            importer.errors.append('Error importing %s' % url)
            logging.error('Error importing for Page %s from %s:\n%s\n%s', page.key().id(), url, e, traceback.format_exc())
        finally:
Esempio n. 6
0
def validate_css(string):
    """Validate a user-supplied stylesheet and collect the problems found.

    Every return visible here is a ``(parsed, report)`` pair:

    * ``('', ValidationReport())`` when ``string`` is empty or matches
      ``only_whitespace``;
    * ``(string, report)`` with a ``too_big`` error when ``string`` is
      longer than 100 * 1024 -- the input is handed back unparsed;
    * ``(None, report)`` with a ``syntax_error`` entry when the parser
      raises ``DOMException``.

    NOTE(review): the code that handles a successful parse is not part of
    this excerpt.
    """
    # raiseExceptions = True makes parse errors surface as exceptions,
    # which the try/except below turns into report entries.
    p = CSSParser(raiseExceptions = True)

    if not string or only_whitespace.match(string):
        return ('',ValidationReport())

    report = ValidationReport(string)
    
    # avoid a very expensive parse
    max_size_kb = 100;
    if len(string) > max_size_kb * 1024:
        report.append(ValidationError((msgs['too_big']
                                       % dict (max_size = max_size_kb))))
        return (string, report)

    try:
        parsed = p.parseString(string)
    except DOMException,e:
        # yuck; xml.dom.DOMException can't give us line-information
        # directly, so we have to parse its error message string to
        # get it
        line = None
        line_match = error_message_extract_re.match(e.message)
        if line_match:
            line = line_match.group(1)
            if line:
                line = int(line)
        error_message=  (msgs['syntax_error']
                         % dict(syntaxerror = e.message))
        report.append(ValidationError(error_message,e,line))
        return (None,report)
Esempio n. 7
0
    def test_finish(self):
        """
        L{StylesheetRewritingRequestWrapper.finish} translates every written
        byte with C{_replace} and writes the result to the wrapped request.
        """
        template = """
            .foo {
                background-image: url(%s)
            }
        """
        source_css = template % ("/Foo/bar",)
        rewritten_css = template % ("/bar/Foo/bar",)

        wrapped = FakeRequest()
        root_by_request = {wrapped: URL.fromString('/bar/')}
        rewriter = website.StylesheetRewritingRequestWrapper(
            wrapped, [], root_by_request.get)
        rewriter.write(source_css)
        rewriter.finish()

        # Whitespace is not significant, so both stylesheets go through a
        # parse/serialize round trip before they are compared.
        css_parser = CSSParser()
        actual = css_parser.parseString(wrapped.accumulator).cssText
        expected = css_parser.parseString(rewritten_css).cssText
        self.assertEqual(actual, expected)
def main():
    css = u'''
    /* some umlauts äöü and EURO sign € */
    a:before {
       content: "ä";
        }'''

    p = CSSParser()
    sheet = p.parseString(css)
    
    print """cssText in different encodings, depending on the console some
     chars may look broken but are actually not"""
    print 
    
    sheet.encoding = 'ascii'
    print sheet.cssText
    print
    
    sheet.encoding = 'iso-8859-1'
    print sheet.cssText
    print
    
    sheet.encoding = 'iso-8859-15'
    print sheet.cssText
    print
    
    sheet.encoding = 'utf-8'
    print sheet.cssText
    print
    
    # results in default UTF-8 encoding without @charset rule
    sheet.encoding = None
    print sheet.cssText
Esempio n. 9
0
 def parse_css(self, data, fname):
     """Decode, preprocess and parse ``data`` as a stylesheet.

     ``fname`` is passed to the parser as the stylesheet's href. The
     parsed stylesheet object is returned.
     """
     from cssutils import CSSParser, log
     log.setLevel(logging.WARN)
     log.raiseExceptions = False
     css_text = self.css_preprocessor(self.decode(data))
     css_parser = CSSParser(
         loglevel=logging.WARNING,
         # @import rules are of no interest, so the fetcher yields nothing
         fetcher=lambda x: (None, None),
         log=_css_logger)
     return css_parser.parseString(css_text, href=fname, validate=False)
Esempio n. 10
0
 def finish(self):
     """
     Join the buffered response body, parse it as CSS, run every URL
     through C{self._replace}, write the serialized stylesheet to the
     wrapped request, and return the result of finishing that request.
     """
     body = ''.join(self._buffer)
     sheet = CSSParser().parseString(body)
     sheet.replaceUrls(self._replace)
     self.request.write(sheet.cssText)
     return self.request.finish()
Esempio n. 11
0
def normalize_filter_css(props):
    """Return ``props`` as a set, extended with normalizer output.

    Every name in ``props`` is kept. For a name that has both a normalizer
    and an entry in ``SHORTHAND_DEFAULTS``, the default value is parsed as
    a declaration and whatever the normalizer yields for it is added too.
    """
    import logging
    expanded = set()
    style_parser = CSSParser(loglevel=logging.CRITICAL, validate=False)
    for name in props:
        expanded.add(name)
        normalizer = normalizers.get(name)
        if normalizer is None or name not in SHORTHAND_DEFAULTS:
            continue
        declaration = style_parser.parseStyle(
            '%s: %s' % (name, SHORTHAND_DEFAULTS[name]))
        value = declaration.getPropertyCSSValue(declaration.item(0))
        expanded.update(normalizer(name, value))
    return expanded
Esempio n. 12
0
 def _apply_to_style_uri(style_text, func):
     """Pass every URI value in ``style_text`` through ``func``.

     ``func`` is called as ``func(uri, element=value)`` and returns the
     URI to use. If at least one URI changed, the re-serialized
     declaration is returned as unicode; otherwise ``style_text`` is
     returned as it was given.
     """
     declaration = CSSParser().parseStyle(style_text)
     changed = False
     for prop in declaration.getProperties(all=True):
         for value in prop.propertyValue:
             if value.type != 'URI':
                 continue
             current = value.uri
             replacement = func(current, element=value)
             if replacement != current:
                 value.uri = replacement
                 changed = True
     if not changed:
         return style_text
     return to_unicode(declaration.cssText, 'utf-8')
Esempio n. 13
0
    def __init__(self, container, do_embed=False):
        """Set up the rendering machinery and run it to completion.

        Stores ``container`` and ``do_embed``, creates the CSS parser, the
        Qt event loop, the web view and its page, initializes the empty
        result maps, and then blocks in the event loop, so the constructor
        only returns once that loop has exited.

        Raises:
            Exception: if the event loop exits with code 1.
        """
        self.container = container
        self.log = self.logger = container.log
        self.do_embed = do_embed
        must_use_qt()
        self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css'))
        # Matches a run of punctuation at the start of a string (Unicode
        # categories Ps, Pe, Pi, Pf and Po).
        self.first_letter_pat = regex.compile(r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)

        self.loop = QEventLoop()
        self.view = QWebView()
        self.page = Page(self.log)
        self.view.setPage(self.page)
        self.page.setViewportSize(QSize(1200, 1600))

        # Queued connection: self.collect is invoked from the event loop
        # rather than directly from inside the signal emission.
        self.view.loadFinished.connect(self.collect,
                type=Qt.QueuedConnection)

        # Work list, copied from the container's spine items.
        self.render_queue = list(container.spine_items)
        self.font_stats = {}
        self.font_usage_map = {}
        self.font_spec_map = {}
        self.font_rule_map = {}
        self.all_font_rules = {}

        # Schedule render_book with a zero timeout, so it is called once
        # the event loop below is running.
        QTimer.singleShot(0, self.render_book)

        if self.loop.exec_() == 1:
            raise Exception('Failed to gather statistics from book, see log for details')
Esempio n. 14
0
 def parse_css(self, data, fname='<string>', is_declaration=False):
     """Parse ``data`` as a stylesheet or as a style declaration.

     Bytes input is decoded with ``self.decode`` first, and the
     preprocessor is applied unless ``self.tweak_mode`` is set. With
     ``is_declaration`` the text is parsed as a bare declaration block;
     otherwise it is parsed as a full stylesheet with ``fname`` as href.
     """
     from cssutils import CSSParser, log
     log.setLevel(logging.WARN)
     log.raiseExceptions = False
     css_text = self.decode(data) if isinstance(data, bytes) else data
     if not self.tweak_mode:
         css_text = self.css_preprocessor(css_text)
     css_parser = CSSParser(
         loglevel=logging.WARNING,
         # @import rules are of no interest, so the fetcher yields nothing
         fetcher=lambda x: (None, None),
         log=_css_logger)
     if is_declaration:
         return css_parser.parseStyle(css_text, validate=False)
     return css_parser.parseString(css_text, href=fname, validate=False)
Esempio n. 15
0
class Parser:
    """Counts the values of ``color`` declarations in CSS files."""

    def __init__(self):
        self.css_parser = CSSParser()

    def get_colors_from_file(self, f):
        """Return a dict mapping each ``color`` value in ``f`` to its count.

        Only declarations named ``color`` that sit directly in style rules
        are counted.
        """
        counts = {}
        for rule in self.css_parser.parseFile(f, 'utf-8'):
            if rule.type != rule.STYLE_RULE:
                continue
            for declaration in rule.style:
                if declaration.name != 'color':
                    continue
                color = declaration.value
                counts[color] = counts.get(color, 0) + 1

        return counts

    def read_all_css_files_in_dir(self):
        """Return one color-count dict per ``*.css`` file in the working directory."""
        return [self.get_colors_from_file(name)
                for name in glob.glob('*.css')]
Esempio n. 16
0
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
    """Parse ``data`` as a stylesheet or as a style declaration.

    Bytes input is decoded with ``decode`` when one is given and as UTF-8
    otherwise; ``css_preprocessor``, when given, is applied to the text
    before parsing. ``log_level`` defaults to ``logging.WARNING`` and is
    used both for the cssutils log and for the parser. With
    ``is_declaration`` the text is parsed as a bare declaration block;
    otherwise it is parsed as a full stylesheet with ``fname`` as href.
    """
    if log_level is None:
        import logging
        log_level = logging.WARNING
    from cssutils import CSSParser, log
    from calibre.ebooks.oeb.base import _css_logger
    log.setLevel(log_level)
    log.raiseExceptions = False

    css_text = data
    if isinstance(css_text, bytes):
        if decode is None:
            css_text = css_text.decode('utf-8')
        else:
            css_text = decode(css_text)
    if css_preprocessor is not None:
        css_text = css_preprocessor(css_text)

    css_parser = CSSParser(
        loglevel=log_level,
        # @import rules are of no interest, so the fetcher yields nothing
        fetcher=lambda x: (None, None),
        log=_css_logger)
    if is_declaration:
        return css_parser.parseStyle(css_text, validate=False)
    return css_parser.parseString(css_text, href=fname, validate=False)
Esempio n. 17
0
    def extract_css(self, root, log):
        """Move the embedded CSS of ``root`` into an external ``odfpy.css``.

        Every ``style`` element with ``type="text/css"`` is removed from
        the tree and its text collected. If the document has a ``head``
        element, a ``link`` to ``odfpy.css`` is appended to the first one.
        The collected CSS is parsed into ``self.css`` and written, UTF-8
        encoded, to ``odfpy.css`` in the current working directory.

        ``log`` is accepted but not used by this method.
        """
        ans = []
        for s in root.xpath('//*[local-name() = "style" and @type="text/css"]'):
            # An empty <style/> element has no text (None); collecting it
            # would make the join below raise TypeError.
            if s.text:
                ans.append(s.text)
            s.getparent().remove(s)

        head = root.xpath('//*[local-name() = "head"]')
        if head:
            head = head[0]
            # Create the link in the head's default namespace, if it has one.
            ns = head.nsmap.get(None, '')
            if ns:
                ns = '{%s}'%ns
            etree.SubElement(head, ns+'link', {'type':'text/css',
                'rel':'stylesheet', 'href':'odfpy.css'})

        css = u'\n\n'.join(ans)
        parser = CSSParser(loglevel=logging.WARNING,
                            log=_css_logger)
        self.css = parser.parseString(css, validate=False)

        with open('odfpy.css', 'wb') as f:
            f.write(css.encode('utf-8'))
Esempio n. 18
0
class StyledTagWrapper:
    """Pairs an element with the parsed form of its ``style`` attribute."""

    def __init__(self, el):
        self.el = el
        self.style = CSSParser().parseStyle(el.get('style'))

    def update(self):
        """Write the serialized declaration back to the ``style`` attribute."""
        serialized = self.style.cssText
        if isinstance(serialized, str):
            serialized = to_unicode(serialized, 'utf-8')
        self.el.set('style', serialized)

    def uri_properties(self):
        """Yield every value of type ``'URI'`` found in the declaration."""
        for prop in self.style.getProperties(all=True):
            for value in prop.propertyValue:
                if value.type != 'URI':
                    continue
                yield value
Esempio n. 19
0
    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css='', base_css=''):
        """Collect every stylesheet that applies to ``tree`` and apply it.

        The constructor works in four phases:

        1. gather stylesheets: the built-in HTML sheet, ``base_css``, the
           ``<style>`` and ``<link>`` elements of the document, then
           ``extra_css`` and ``user_css``;
        2. flatten all their rules into one sorted list (``self.rules``);
        3. run each rule's selector against ``tree`` and update the style
           of every matching element, with special handling for
           pseudo-classes and a faked ``first-letter``;
        4. apply inline ``style`` attributes and turn ``width``/``height``
           attributes of images into style properties.

        Parameters:
            tree: the parsed document the styles are applied to.
            path: key of the document in ``oeb.manifest.hrefs``.
            oeb: the book object; its logger, manifest and CSS
                preprocessor are used.
            opts: conversion options; ``opts.output_profile`` is read.
            profile: profile providing ``fbase``. When None, the output
                profile whose ``short_name`` is ``'default'`` is used,
                falling back to ``opts.output_profile``.
            extra_css, user_css, base_css: optional CSS source text.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # href given to stylesheets parsed from text embedded in this document
        cssname = os.path.splitext(basename)[0] + '.css'
        # The built-in sheet comes first; index 0 is what the flattening
        # loop below treats as the user agent sheet.
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'],
                                        profile['props'],
                                        profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in style_tags:
            if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))):
                # Gather the element's own text plus the text and tail of
                # each of its children.
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    # Resolve each @import against the manifest. The
                    # imported sheets are appended before the sheet that
                    # imports them.
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if not media_ok(rule.media.mediaText):
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (elem.tag == XHTML('link') and elem.get('href') and elem.get(
                    'rel', 'stylesheet').lower() == 'stylesheet' and elem.get(
                    'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media'))
                ):
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                # Manifest data without cssRules is not a parsed stylesheet.
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        # The extra and user CSS are parsed last; a sheet that fails to
        # parse is logged and skipped instead of aborting.
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        # Flatten all sheets into one list of rules. `index` counts the
        # rules across all sheets in the order they are visited.
        rules = []
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    # Rules nested in @media are only used when the media
                    # query is accepted by media_ok.
                    if media_ok(rule.media.mediaText):
                        for subrule in rule.cssRules:
                            rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index==0))
                            index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index==0))
                    index = index + 1
        # NOTE(review): presumably the entries produced by flatten_rule
        # sort into cascade order -- confirm against flatten_rule.
        rules.sort()
        self.rules = rules
        self._styles = {}
        pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

        # Each entry is a 5-tuple; only the property dict and the selector
        # text are needed here.
        for _, _, cssdict, text, _ in rules:
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
            except SelectorError as err:
                self.logger.error('Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
                continue

            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                        'plumber_output_format', '').lower() in {u'mobi', u'docx'}:
                    # Fake first-letter
                    for elem in matches:
                        # Only the first descendant that has text gets the
                        # fake span (see the break at the end of the loop).
                        for x in elem.iter('*'):
                            if x.text:
                                # Split off leading punctuation and
                                # separators (categories P* and Z*); they
                                # go into the span with the first letter.
                                punctuation_chars = []
                                text = unicode(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = u''.join(punctuation_chars) + \
                                        (text[0] if text else u'')
                                span = x.makeelement('{%s}span' % XHTML_NS)
                                span.text = special_text
                                span.set('data-fake-first-letter', '1')
                                # The rest of the original text follows the
                                # span as its tail.
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        # Inline style attributes are applied after all stylesheet rules.
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    # The attribute is removed from the element whether or
                    # not its value ends up being used.
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        # A bare number is taken to be a length in pixels.
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Esempio n. 20
0
    def __init__(self, tree, path, oeb, opts, profile=None, extra_css="", user_css=""):
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles

            for x in output_profiles():
                if x.short_name == "default":
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        cssname = os.path.splitext(basename)[0] + ".css"
        stylesheets = [html_css_stylesheet()]
        style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile["name"], profile["props"], profile["macros"])

        parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger("calibre.css"))
        self.font_face_rules = []
        for elem in style_tags:
            if elem.tag == XHTML("style") and elem.get("type", CSS_MIME) in OEB_STYLES:
                text = elem.text if elem.text else u""
                for x in elem:
                    t = getattr(x, "text", None)
                    if t:
                        text += u"\n\n" + force_unicode(t, u"utf-8")
                    t = getattr(x, "tail", None)
                    if t:
                        text += u"\n\n" + force_unicode(t, u"utf-8")
                if text:
                    text = oeb.css_preprocessor(text, add_namespace=True)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ("utf-8", b""))
                    stylesheet = parser.parseString(text, href=cssname, validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    stylesheet.namespaces["h"] = XHTML_NS
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if rule.media.mediaText == "amzn-mobi":
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn("Ignoring missing stylesheet in @import rule:", rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn("CSS @import of non-CSS file %r" % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                        stylesheet.cssRules.remove(rule)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref, ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (
                elem.tag == XHTML("link")
                and elem.get("href")
                and elem.get("rel", "stylesheet").lower() == "stylesheet"
                and elem.get("type", CSS_MIME).lower() in OEB_STYLES
            ):
                href = urlnormalize(elem.attrib["href"])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn("Stylesheet %r referenced by file %r not in manifest" % (path, item.href))
                    continue
                if not hasattr(sitem.data, "cssRules"):
                    self.logger.warn("Stylesheet %r referenced by file %r is not CSS" % (path, item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {"extra_css": extra_css, "user_css": user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text, href=cssname, validate=False)
                    stylesheet.namespaces["h"] = XHTML_NS
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception("Failed to parse %s, ignoring." % w)
                    self.logger.debug("Bad css: ")
                    self.logger.debug(x)
        rules = []
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    media = {rule.media.item(i) for i in xrange(rule.media.length)}
                    if not media.intersection({"all", "screen", "amzn-kf8"}):
                        continue
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(subrule, href, index, is_user_agent_sheet=sheet_index == 0))
                        index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index, is_user_agent_sheet=sheet_index == 0))
                    index = index + 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        pseudo_pat = re.compile(ur":(first-letter|first-line|link|hover|visited|active|focus|before|after)", re.I)
        for _, _, cssdict, text, _ in rules:
            fl = pseudo_pat.search(text)
            if fl is not None:
                text = text.replace(fl.group(), "")
            selector = get_css_selector(text, self.oeb.log)
            matches = selector(tree, self.logger)
            if fl is not None:
                fl = fl.group(1)
                if fl == "first-letter" and getattr(self.oeb, "plumber_output_format", "").lower() == u"mobi":
                    # Fake first-letter
                    from lxml.builder import ElementMaker

                    E = ElementMaker(namespace=XHTML_NS)
                    for elem in matches:
                        for x in elem.iter():
                            if x.text:
                                punctuation_chars = []
                                text = unicode(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {"P", "Z"}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = u"".join(punctuation_chars) + (text[0] if text else u"")
                                span = E.span(special_text)
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, "//h:*[@style]"):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r"[0-9.]+$")
        for elem in xpath(tree, "//h:img[@width or @height]"):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get("width", "auto") != "auto" or style._style.get("height", "auto") != "auto"
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ("width", "height"):
                    val = elem.get(prop, "").strip()
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += "px"
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Esempio n. 21
0
    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css=''):
        """Gather the CSS that applies to one document and resolve it onto
        the elements of ``tree``.

        Stylesheets are collected in this order: the built-in HTML sheet,
        the ``<style>`` and ``<link>`` children of ``<head>`` in document
        order, then ``extra_css`` and finally ``user_css``. Their rules are
        flattened, sorted and applied to every matching element, followed by
        inline ``style`` attributes and ``<img>`` ``width``/``height``
        attributes.

        :param tree: Parsed XHTML tree of the document.
        :param path: Manifest href of the document; must be a key of
            ``oeb.manifest.hrefs``.
        :param oeb: Book object supplying ``logger``, ``manifest`` and
            ``css_preprocessor``.
        :param opts: Conversion options; ``opts.output_profile`` is read.
        :param profile: Output profile to use. When ``None`` the profile
            whose ``short_name`` is ``'default'`` is used, falling back to
            ``opts.output_profile``.
        :param extra_css: Additional CSS text to apply, may be empty.
        :param user_css: User supplied CSS text to apply, may be empty.
        :raises KeyError: If ``path`` is not in ``oeb.manifest.hrefs``.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        # Inline sheets have no file of their own; name them after the document.
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        head = xpath(tree, '/h:html/h:head')
        if head:
            head = head[0]
        else:
            head = []

        # Add cssutils parsing profiles from output_profile
        for css_profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(css_profile['name'],
                                        css_profile['props'],
                                        css_profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in head:
            if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
                # The CSS may be split across child nodes (their text and
                # tails), so stitch all of the pieces back together.
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = XHTML_CSS_NAMESPACE + text
                    text = oeb.css_preprocessor(text)
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        # An ordered sequence rather than a dict: the order in which these
        # sheets are appended decides their rule indices, so it must not
        # depend on dict iteration order. user_css goes last.
        csses = (('extra_css', extra_css), ('user_css', user_css))
        for w, x in csses:
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except Exception:
                    # Bad extra/user CSS is reported and skipped, it must not
                    # abort processing of the document.
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for stylesheet in stylesheets:
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                rules.extend(self.flatten_rule(rule, href, index))
                index = index + 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        for _, _, cssdict, text, _ in rules:
            fl = ':first-letter' in text
            if fl:
                # Match on the plain selector; the pseudo-element is emulated
                # below.
                text = text.replace(':first-letter', '')
            selector = get_css_selector(text)
            matches = selector(tree, self.logger)
            if fl:
                # Emulate :first-letter by wrapping the leading punctuation
                # plus the first following character in a styled <span>.
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter():
                        if x.text:
                            punctuation_chars = []
                            remaining = unicode(x.text)
                            while remaining:
                                if not unicodedata.category(remaining[0]).startswith('P'):
                                    break
                                punctuation_chars.append(remaining[0])
                                remaining = remaining[1:]

                            special_text = u''.join(punctuation_chars) + \
                                    (remaining[0] if remaining else u'')
                            span = E.span(special_text)
                            span.tail = remaining[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            # Only the first text-bearing node is wrapped.
                            break
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'\d+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except KeyError:
                        # Only one of the two attributes may be present.
                        pass
                    if val:
                        # A bare integer is a pixel count.
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Esempio n. 22
0
class StatsCollector(object):
    """Render every spine item of a book in a ``QWebView`` and record which
    fonts, and which characters in each font, the rendered pages use.

    All of the work is driven from ``__init__``, which runs a local
    ``QEventLoop`` until the render queue is empty or a step fails. Results
    are left in the ``font_stats``, ``font_usage_map``, ``font_spec_map``,
    ``font_rule_map`` and ``all_font_rules`` attributes.
    """

    def __init__(self, container, do_embed=False):
        """Process ``container.spine_items`` and populate the result maps.

        :param container: Book container; supplies ``log``, ``spine_items``,
            ``relpath``, ``abspath_to_name`` and ``has_name``.
        :param do_embed: When true, per-family usage and the set of font
            families referenced by each item are also recorded.
        :raises Exception: If rendering or collection failed for any item.
        """
        self.container = container
        self.log = self.logger = container.log
        self.do_embed = do_embed
        must_use_qt()
        self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css'))
        # Matches a leading run of punctuation; handed to
        # get_pseudo_element_font_usage() in collect_font_stats().
        self.first_letter_pat = regex.compile(r'^[\p{Ps}\p{Ps}\p{Pe}\p{Pi}\p{Pf}\p{Po}]+', regex.VERSION1 | regex.UNICODE)

        self.loop = QEventLoop()
        self.view = QWebView()
        self.page = Page(self.log)
        self.view.setPage(self.page)
        self.page.setViewportSize(QSize(1200, 1600))

        # collect() runs each time a page finishes loading.
        self.view.loadFinished.connect(self.collect,
                type=Qt.QueuedConnection)

        self.render_queue = list(container.spine_items)
        # font file name -> set of characters rendered with that font
        self.font_stats = {}
        # item name -> {frozenset of font properties: usage dict}
        self.font_usage_map = {}
        # item name -> set of font family names (only filled when do_embed)
        self.font_spec_map = {}
        # item name -> list of valid @font-face rules for that item
        self.font_rule_map = {}
        # font file name -> the @font-face rule that references it
        self.all_font_rules = {}

        # Start rendering once the event loop below is running.
        QTimer.singleShot(0, self.render_book)

        # The failure paths all leave the loop via self.loop.exit(1).
        if self.loop.exec_() == 1:
            raise Exception('Failed to gather statistics from book, see log for details')

    def log_exception(self, *args):
        """Log the current exception with the log's filter level temporarily
        set to DEBUG, restoring the previous level afterwards."""
        orig = self.log.filter_level
        try:
            self.log.filter_level = self.log.DEBUG
            self.log.exception(*args)
        finally:
            self.log.filter_level = orig

    def render_book(self):
        """Render the next queued item, or leave the event loop when the
        queue is empty. Any failure leaves the loop with exit code 1."""
        try:
            if not self.render_queue:
                self.loop.exit()
            else:
                self.render_next()
        except:
            # Deliberately catches everything: the loop has to be left, and
            # __init__ turns exit code 1 into an exception.
            self.log_exception('Rendering failed')
            self.loop.exit(1)

    def render_next(self):
        """Pop the first queued item, remember it as ``current_item`` and
        load it into the view."""
        item = unicode(self.render_queue.pop(0))
        self.current_item = item
        load_html(item, self.view)

    def collect(self, ok):
        """Slot for ``loadFinished``: gather font statistics for the item
        that was just loaded, then move on to the next one.

        :param ok: False when the view failed to load the document.
        """
        if not ok:
            self.log.error('Failed to render document: %s'%self.container.relpath(self.current_item))
            self.loop.exit(1)
            return
        try:
            self.page.load_js()
            self.collect_font_stats()
        except:
            # Same reasoning as in render_book(): always leave the loop.
            self.log_exception('Failed to collect font stats from: %s'%self.container.relpath(self.current_item))
            self.loop.exit(1)
            return

        self.render_book()

    def href_to_name(self, href, warn_name):
        """Map a ``file://`` URL to the container name of that file.

        :param href: URL to resolve.
        :param warn_name: Description of where ``href`` came from, used only
            in the warning messages.
        :returns: The container name, or ``None`` (after logging a warning)
            when ``href`` is not a ``file://`` URL or the container has no
            such name.
        """
        if not href.startswith('file://'):
            self.log.warn('Non-local URI in', warn_name, ':', href, 'ignoring')
            return None
        src = href[len('file://'):]
        # On Windows the path looks like /C:/...; drop the leading slash.
        if iswindows and len(src) > 2 and (src[0], src[2]) == ('/', ':'):
            src = src[1:]
        src = src.replace('/', os.sep)
        src = unquote(src)
        name = self.container.abspath_to_name(src)
        if not self.container.has_name(name):
            self.log.warn('Missing resource', href, 'in', warn_name,
                          'ignoring')
            return None
        return name

    def collect_font_stats(self):
        """Read @font-face rules and font usage for ``current_item`` from the
        rendered page and merge them into the result maps.

        :raises Exception: If a value read back from the page's JavaScript
            bridge does not have the expected type.
        """
        self.page.evaljs('window.font_stats.get_font_face_rules()')
        font_face_rules = self.page.bridge_value
        if not isinstance(font_face_rules, list):
            raise Exception('Unknown error occurred while reading font-face rules')

        # Weed out invalid font-face rules
        rules = []
        import tinycss
        parser = tinycss.make_full_parser()
        for rule in font_face_rules:
            ff = rule.get('font-family', None)
            if not ff:
                continue
            style = self.parser.parseStyle('font-family:%s'%ff, validate=False)
            ff = [x.value for x in
                  style.getProperty('font-family').propertyValue]
            if not ff or ff[0] == 'inherit':
                continue
            rule['font-family'] = frozenset(icu_lower(f) for f in ff)
            src = rule.get('src', None)
            if not src:
                continue
            try:
                # Wrap src in a throwaway @font-face rule so tinycss can
                # tokenize its value.
                tokens = parser.parse_stylesheet('@font-face { src: %s }' % src).rules[0].declarations[0].value
            except Exception:
                self.log.warn('Failed to parse @font-family src: %s' % src)
                continue
            # Use the first URI token that resolves to a file in the book.
            for token in tokens:
                if token.type == 'URI':
                    uv = token.value
                    if uv:
                        sn = self.href_to_name(uv, '@font-face rule')
                        if sn is not None:
                            rule['src'] = sn
                            break
            else:
                # Loop ended without a break: no usable URI was found.
                self.log.warn('The @font-face rule refers to a font file that does not exist in the book: %s' % src)
                continue
            normalize_font_properties(rule)
            rule['width'] = widths[rule['font-stretch']]
            rule['weight'] = int(rule['font-weight'])
            rules.append(rule)

        # Without embedding there is nothing to record for a page that has
        # no valid font-face rules.
        if not rules and not self.do_embed:
            return

        self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules
        for rule in rules:
            self.all_font_rules[rule['src']] = rule

        for rule in rules:
            if rule['src'] not in self.font_stats:
                self.font_stats[rule['src']] = set()

        self.page.evaljs('window.font_stats.get_font_usage()')
        font_usage = self.page.bridge_value
        if not isinstance(font_usage, list):
            raise Exception('Unknown error occurred while reading font usage')
        self.page.evaljs('window.font_stats.get_pseudo_element_font_usage()')
        pseudo_element_font_usage = self.page.bridge_value
        if not isinstance(pseudo_element_font_usage, list):
            raise Exception('Unknown error occurred while reading pseudo element font usage')
        font_usage += get_pseudo_element_font_usage(pseudo_element_font_usage, self.first_letter_pat, self.parser)
        # Characters that are never counted as used text.
        exclude = {'\n', '\r', '\t'}
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict)
        # Family names that are not recorded as fonts when embedding.
        bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
        for font in font_usage:
            # Reduce the text fragments to the set of distinct characters.
            text = set()
            for t in font['text']:
                text |= frozenset(t)
            text.difference_update(exclude)
            if not text:
                continue
            normalize_font_properties(font)
            for rule in get_matching_rules(rules, font):
                self.font_stats[rule['src']] |= text
            if self.do_embed:
                ff = [icu_lower(x) for x in font.get('font-family', [])]
                if ff and ff[0] not in bad_fonts:
                    keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}
                    # Usage is grouped by first family plus weight/style/stretch.
                    key = frozenset(((k, ff[0] if k == 'font-family' else v) for k, v in font.iteritems() if k in keys))
                    val = fu[key]
                    if not val:
                        val.update({k:(font[k][0] if k == 'font-family' else font[k]) for k in keys})
                        val['text'] = set()
                    val['text'] |= text
        # Store a plain dict rather than the defaultdict used while building.
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu)

        if self.do_embed:
            self.page.evaljs('window.font_stats.get_font_families()')
            font_families = self.page.bridge_value
            if not isinstance(font_families, dict):
                raise Exception('Unknown error occurred while reading font families')
            self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set()
            # Families used only by pseudo elements are included as well.
            for font_dict, text, pseudo in pseudo_element_font_usage:
                font_families[font_dict['font-family']] = True
            for raw in font_families.iterkeys():
                for x in parse_font_families(self.parser, raw):
                    if x.lower() not in bad_fonts:
                        fs.add(x)
Esempio n. 23
0
    def __init__(self, tree, path, oeb, profile, extra_css='', user_css='',
            change_justification='left'):
        """Gather the CSS that applies to one document and resolve it onto
        the elements of ``tree``.

        Stylesheets are collected in this order: the built-in HTML sheet,
        the ``<style>`` and ``<link>`` children of ``<head>`` in document
        order, then ``extra_css`` and finally ``user_css``. Their rules are
        flattened, sorted and applied to every matching element, followed by
        inline ``style`` attributes and ``<img>`` ``width``/``height``
        attributes. Selectors that cannot be compiled or evaluated are
        skipped.

        :param tree: Parsed XHTML tree of the document.
        :param path: Manifest href of the document (``str`` or UTF-8
            ``bytes``); must be a key of ``oeb.manifest.hrefs``.
        :param oeb: Book object supplying ``manifest`` and
            ``css_preprocessor``.
        :param profile: Output profile; must not be ``None``.
        :param extra_css: Additional CSS text to apply, may be empty.
        :param user_css: User supplied CSS text to apply, may be empty.
        :param change_justification: Stored on the instance unchanged.
        :raises AssertionError: If ``profile`` is ``None``.
        :raises KeyError: If ``path`` is not in ``oeb.manifest.hrefs``.
        """
        assert profile is not None
        # XXX str/bytes hackfix
        if isinstance(path, bytes):
            decoded_path = path.decode('utf-8')
        else:
            decoded_path = path
        self.oeb = oeb
        self.profile = profile
        self.change_justification = change_justification
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(decoded_path)
        # Inline sheets have no file of their own; name them after the document.
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        head = xpath(tree, '/h:html/h:head')
        if head:
            head = head[0]
        else:
            head = []

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in head:
            if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
                # The CSS may be split across child nodes (their text and
                # tails), so stitch all of the pieces back together.
                text = elem.text if elem.text else ''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += '\n\n' + force_unicode(t, 'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += '\n\n' + force_unicode(t, 'utf-8')
                if text:
                    # Use the accumulated text, not elem.text: the latter
                    # drops the child pieces and may be None.
                    text = XHTML_CSS_NAMESPACE + text
                    text = oeb.css_preprocessor(text)
                    stylesheet = parser.parseString(text, href=cssname)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    logging.warning(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    logging.warning(
                    'Stylesheet %r referenced by file %r is not CSS'%(path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        # Ordered on purpose: user_css is appended after extra_css.
        csses = (('extra_css', extra_css), ('user_css', user_css))
        for w, x in csses:
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text, href=cssname)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except Exception:
                    # Bad extra/user CSS is reported and skipped, it must not
                    # abort processing of the document.
                    logging.exception('Failed to parse %s, ignoring.'%w)
                    logging.debug('Bad css: ')
                    logging.debug(x)
        rules = []
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for stylesheet in stylesheets:
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                rules.extend(self.flatten_rule(rule, href, index))
                index = index + 1
        # XXX had to fix crash about unsortable type, so that's why we only sort by first item of tuple
        rules.sort(key=lambda tup: tup[:1])
        self.rules = rules
        self._styles = {}
        class_sel_pat = re.compile(r'\.[a-z]+', re.IGNORECASE)
        capital_sel_pat = re.compile(r'h|[A-Z]+')
        for _, _, cssdict, text, _ in rules:
            fl = ':first-letter' in text
            if fl:
                # Match on the plain selector; the pseudo-element is emulated
                # below.
                text = text.replace(':first-letter', '')
            try:
                selector = CSSSelector(text)
            except (AssertionError, ExpressionError, etree.XPathSyntaxError,
                    NameError, # thrown on OS X instead of SelectorSyntaxError
                    SelectorSyntaxError):
                continue
            try:
                matches = selector(tree)
            except etree.XPathEvalError:
                continue

            if not matches:
                # Retry with the upper-case parts of the selector lowered.
                ntext = capital_sel_pat.sub(lambda m: m.group().lower(), text)
                if ntext != text:
                    logging.warning('Transformed CSS selector %s to %s' % (text, ntext))
                    selector = CSSSelector(ntext)
                    matches = selector(tree)

            if not matches and class_sel_pat.match(text) and text.lower() != text:
                # Last resort for class selectors: compare class names
                # case-insensitively.
                found = False
                ltext = text.lower()
                for x in tree.xpath('//*[@class]'):
                    if ltext.endswith('.'+x.get('class').lower()):
                        matches.append(x)
                        found = True
                if found:
                    logging.warning('Ignoring case mismatches for CSS selector: %s in %s'%(text, item.href))
            if fl:
                # Emulate :first-letter by wrapping the leading punctuation
                # plus the first following character in a styled <span>.
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter():
                        if x.text:
                            punctuation_chars = []
                            remaining = str(x.text)
                            while remaining:
                                if not unicodedata.category(remaining[0]).startswith('P'):
                                    break
                                punctuation_chars.append(remaining[0])
                                remaining = remaining[1:]

                            special_text = ''.join(punctuation_chars) + \
                                    (remaining[0] if remaining else '')
                            span = E.span(special_text)
                            span.tail = remaining[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            # Only the first text-bearing node is wrapped.
                            break
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr()
        num_pat = re.compile(r'\d+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except KeyError:
                        # Only one of the two attributes may be present.
                        pass
                    if val:
                        # A bare integer is a pixel count.
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Esempio n. 24
0
class StatsCollector(object):
    """Render every spine item of a book in a ``QWebView`` and record which
    fonts, and which characters in each font, the rendered pages use.

    All of the work is driven from ``__init__``, which runs a local
    ``QEventLoop`` until the render queue is empty or a step fails. Results
    are left in the ``font_stats``, ``font_usage_map``, ``font_spec_map``,
    ``font_rule_map`` and ``all_font_rules`` attributes.
    """

    def __init__(self, container, do_embed=False):
        """Process ``container.spine_items`` and populate the result maps.

        :param container: Book container; supplies ``log``, ``spine_items``,
            ``relpath``, ``abspath_to_name`` and ``has_name``.
        :param do_embed: When true, per-family usage and the set of font
            families referenced by each item are also recorded.
        :raises Exception: If rendering or collection failed for any item.
        """
        self.container = container
        self.log = self.logger = container.log
        self.do_embed = do_embed
        must_use_qt()
        self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger("calibre.css"))

        self.loop = QEventLoop()
        self.view = QWebView()
        self.page = Page(self.log)
        self.view.setPage(self.page)
        self.page.setViewportSize(QSize(1200, 1600))

        # collect() runs each time a page finishes loading.
        self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection)

        self.render_queue = list(container.spine_items)
        # font file name -> set of characters rendered with that font
        self.font_stats = {}
        # item name -> {frozenset of font properties: usage dict}
        self.font_usage_map = {}
        # item name -> set of font family names (only filled when do_embed)
        self.font_spec_map = {}
        # item name -> list of valid @font-face rules for that item
        self.font_rule_map = {}
        # font file name -> the @font-face rule that references it
        self.all_font_rules = {}

        # Start rendering once the event loop below is running.
        QTimer.singleShot(0, self.render_book)

        # The failure paths all leave the loop via self.loop.exit(1).
        if self.loop.exec_() == 1:
            raise Exception("Failed to gather statistics from book, see log for details")

    def render_book(self):
        """Render the next queued item, or leave the event loop when the
        queue is empty. Any failure leaves the loop with exit code 1."""
        try:
            if not self.render_queue:
                self.loop.exit()
            else:
                self.render_next()
        except:
            # Deliberately catches everything: the loop has to be left, and
            # __init__ turns exit code 1 into an exception.
            self.logger.exception("Rendering failed")
            self.loop.exit(1)

    def render_next(self):
        """Pop the first queued item, remember it as ``current_item`` and
        load it into the view."""
        item = unicode(self.render_queue.pop(0))
        self.current_item = item
        load_html(item, self.view)

    def collect(self, ok):
        """Slot for ``loadFinished``: gather font statistics for the item
        that was just loaded, then move on to the next one.

        :param ok: False when the view failed to load the document.
        """
        if not ok:
            self.log.error("Failed to render document: %s" % self.container.relpath(self.current_item))
            self.loop.exit(1)
            return
        try:
            self.page.load_js()
            self.collect_font_stats()
        except:
            # Same reasoning as in render_book(): always leave the loop.
            self.log.exception("Failed to collect font stats from: %s" % self.container.relpath(self.current_item))
            self.loop.exit(1)
            return

        self.render_book()

    def href_to_name(self, href, warn_name):
        """Map a ``file://`` URL to the container name of that file.

        :param href: URL to resolve.
        :param warn_name: Description of where ``href`` came from, used only
            in the warning messages.
        :returns: The container name, or ``None`` (after logging a warning)
            when ``href`` is not a ``file://`` URL or the container has no
            such name.
        """
        if not href.startswith("file://"):
            self.log.warn("Non-local URI in", warn_name, ":", href, "ignoring")
            return None
        src = href[len("file://") :]
        # On Windows the path looks like /C:/...; drop the leading slash.
        if iswindows and len(src) > 2 and (src[0], src[2]) == ("/", ":"):
            src = src[1:]
        src = src.replace("/", os.sep)
        src = unquote(src)
        name = self.container.abspath_to_name(src)
        if not self.container.has_name(name):
            self.log.warn("Missing resource", href, "in", warn_name, "ignoring")
            return None
        return name

    def collect_font_stats(self):
        """Read @font-face rules and font usage for ``current_item`` from the
        rendered page and merge them into the result maps.

        :raises Exception: If a value read back from the page's JavaScript
            bridge does not have the expected type.
        """
        self.page.evaljs("window.font_stats.get_font_face_rules()")
        font_face_rules = self.page.bridge_value
        if not isinstance(font_face_rules, list):
            raise Exception("Unknown error occurred while reading font-face rules")

        # Weed out invalid font-face rules
        rules = []
        for rule in font_face_rules:
            ff = rule.get("font-family", None)
            if not ff:
                continue
            style = self.parser.parseStyle("font-family:%s" % ff, validate=False)
            ff = [x.value for x in style.getProperty("font-family").propertyValue]
            if not ff or ff[0] == "inherit":
                continue
            rule["font-family"] = frozenset(icu_lower(f) for f in ff)
            src = rule.get("src", None)
            if not src:
                continue
            # Parse src as if it were a background-image value so that the
            # CSS parser extracts the URL(s) for us.
            style = self.parser.parseStyle("background-image:%s" % src, validate=False)
            prop = style.getProperty("background-image")
            name = None
            # src may list several sources (for example local(...) before
            # url(...)), or may not have parsed at all. Use the first URI
            # that maps to a file in the book rather than assuming the first
            # value is a URI; a bad rule is skipped, not fatal.
            for value in prop.propertyValue if prop is not None else ():
                uri = getattr(value, "uri", None)
                if uri:
                    name = self.href_to_name(uri, "@font-face rule")
                    if name is not None:
                        break
            if name is None:
                self.log.warn("The @font-face rule refers to a font file that does not exist in the book: %s" % src)
                continue
            rule["src"] = name
            normalize_font_properties(rule)
            rule["width"] = widths[rule["font-stretch"]]
            rule["weight"] = int(rule["font-weight"])
            rules.append(rule)

        # Without embedding there is nothing to record for a page that has
        # no valid font-face rules.
        if not rules and not self.do_embed:
            return

        self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules
        for rule in rules:
            self.all_font_rules[rule["src"]] = rule

        for rule in rules:
            if rule["src"] not in self.font_stats:
                self.font_stats[rule["src"]] = set()

        self.page.evaljs("window.font_stats.get_font_usage()")
        font_usage = self.page.bridge_value
        if not isinstance(font_usage, list):
            raise Exception("Unknown error occurred while reading font usage")
        # Characters that are never counted as used text.
        exclude = {"\n", "\r", "\t"}
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = fu = defaultdict(dict)
        # Family names that are not recorded as fonts when embedding.
        bad_fonts = {"serif", "sans-serif", "monospace", "cursive", "fantasy", "sansserif", "inherit"}
        for font in font_usage:
            # Reduce the text fragments to the set of distinct characters.
            text = set()
            for t in font["text"]:
                text |= frozenset(t)
            text.difference_update(exclude)
            if not text:
                continue
            normalize_font_properties(font)
            for rule in get_matching_rules(rules, font):
                self.font_stats[rule["src"]] |= text
            if self.do_embed:
                ff = [icu_lower(x) for x in font.get("font-family", [])]
                if ff and ff[0] not in bad_fonts:
                    keys = {"font-weight", "font-style", "font-stretch", "font-family"}
                    # Usage is grouped by first family plus weight/style/stretch.
                    key = frozenset(((k, ff[0] if k == "font-family" else v) for k, v in font.iteritems() if k in keys))
                    val = fu[key]
                    if not val:
                        val.update({k: (font[k][0] if k == "font-family" else font[k]) for k in keys})
                        val["text"] = set()
                    val["text"] |= text
        # Store a plain dict rather than the defaultdict used while building.
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(fu)

        if self.do_embed:
            self.page.evaljs("window.font_stats.get_font_families()")
            font_families = self.page.bridge_value
            if not isinstance(font_families, dict):
                raise Exception("Unknown error occurred while reading font families")
            self.font_spec_map[self.container.abspath_to_name(self.current_item)] = fs = set()
            for raw in font_families.iterkeys():
                style = self.parser.parseStyle("font-family:" + raw, validate=False).getProperty("font-family")
                for x in style.propertyValue:
                    x = x.value
                    if x and x.lower() not in bad_fonts:
                        fs.add(x)
Esempio n. 25
0
 def __init__(self, el):
     """Keep a reference to ``el`` and parse its ``style`` attribute.

     The parsed declaration is stored in ``self.style``.
     """
     self.el = el
     css_parser = CSSParser()
     inline_css = el.get('style')
     self.style = css_parser.parseStyle(inline_css)
Esempio n. 26
0
 def __init__(self):
     """Create the CSS parser this instance keeps in ``css_parser``."""
     parser = CSSParser()
     self.css_parser = parser
Esempio n. 27
0
class StatsCollector(object):
    """Load every spine item of a container in a web view and record fonts.

    Creating an instance runs a local Qt event loop until every entry of
    ``container.spine_items`` has been loaded and inspected, or until a
    failure occurs (in which case the constructor raises). Results are left
    in the following dictionaries:

    * ``font_stats``: font resource name -> set of characters taken from the
      text of usage entries that matched the font's @font-face rule
    * ``font_rule_map``: document name -> list of accepted @font-face rules
    * ``all_font_rules``: font resource name -> its @font-face rule
    * ``font_usage_map``: document name -> usage records (records are only
      added when ``do_embed`` is true)
    * ``font_spec_map``: document name -> set of font family names (only
      filled in when ``do_embed`` is true)
    """

    def __init__(self, container, do_embed=False):
        """Collect statistics for ``container``; blocks until finished.

        :param container: Book container; its ``log``, ``spine_items``,
            ``abspath_to_name``, ``relpath`` and ``has_name`` are used.
        :param do_embed: When true, also record per-family usage data and
            the font families referenced by each document.
        :raises Exception: If the event loop exits with code 1.
        """
        log = container.log
        self.container = container
        self.log = log
        self.logger = log
        self.do_embed = do_embed

        # Work queue and result containers
        self.render_queue = list(container.spine_items)
        self.font_stats = {}
        self.font_usage_map = {}
        self.font_spec_map = {}
        self.font_rule_map = {}
        self.all_font_rules = {}

        must_use_qt()
        self.parser = CSSParser(loglevel=logging.CRITICAL, log=logging.getLogger('calibre.css'))

        self.loop = QEventLoop()
        self.view = QWebView()
        self.page = Page(self.log)
        self.view.setPage(self.page)
        self.page.setViewportSize(QSize(1200, 1600))
        self.view.loadFinished.connect(self.collect, type=Qt.QueuedConnection)

        # Start rendering once the event loop below is running
        QTimer.singleShot(0, self.render_book)

        exit_code = self.loop.exec_()
        if exit_code == 1:
            raise Exception('Failed to gather statistics from book, see log for details')

    def render_book(self):
        """Render the next queued item, or stop the loop if none are left."""
        # The bare except is intentional: whatever goes wrong here, the event
        # loop has to be told to exit with the failure code.
        try:
            if self.render_queue:
                self.render_next()
            else:
                self.loop.exit()
        except:
            self.logger.exception('Rendering failed')
            self.loop.exit(1)

    def render_next(self):
        """Pop the first queued item and start loading it into the view."""
        self.current_item = unicode(self.render_queue.pop(0))
        load_html(self.current_item, self.view)

    def collect(self, ok):
        """Slot for ``loadFinished``: gather stats, then continue rendering.

        :param ok: False when the view failed to load the current item.
        """
        if not ok:
            failed = self.container.relpath(self.current_item)
            self.log.error('Failed to render document: %s' % failed)
            self.loop.exit(1)
            return
        # As in render_book, any failure at all must terminate the loop.
        try:
            self.page.load_js()
            self.collect_font_stats()
        except:
            failed = self.container.relpath(self.current_item)
            self.log.exception('Failed to collect font stats from: %s' % failed)
            self.loop.exit(1)
        else:
            self.render_book()

    def href_to_name(self, href, warn_name):
        """Map a ``file://`` URL to a container name.

        :param href: URL to convert.
        :param warn_name: Description of where ``href`` was found; only used
            in warning messages.
        :return: The container name, or None (after logging a warning) when
            ``href`` is not a ``file://`` URL or the container has no such
            name.
        """
        scheme = 'file://'
        if not href.startswith(scheme):
            self.log.warn('Non-local URI in', warn_name, ':', href, 'ignoring')
            return None
        path = href[len(scheme):]
        # On windows, turn a leading "/C:" into "C:"
        if iswindows and len(path) > 2 and path[0] == '/' and path[2] == ':':
            path = path[1:]
        path = unquote(path.replace('/', os.sep))
        name = self.container.abspath_to_name(path)
        if self.container.has_name(name):
            return name
        self.log.warn('Missing resource', href, 'in', warn_name,
                      'ignoring')
        return None

    def collect_font_stats(self):
        """Query the loaded page for font data and update the result maps.

        :raises Exception: If a value read from the page's bridge does not
            have the expected type.
        """
        self.page.evaljs('window.font_stats.get_font_face_rules()')
        font_face_rules = self.page.bridge_value
        if not isinstance(font_face_rules, list):
            raise Exception('Unknown error occurred while reading font-face rules')

        # Keep only the @font-face rules that have a usable family and src
        rules = []
        for rule in font_face_rules:
            family_spec = rule.get('font-family', None)
            if not family_spec:
                continue
            declaration = self.parser.parseStyle('font-family:%s' % family_spec, validate=False)
            families = [v.value for v in
                        declaration.getProperty('font-family').propertyValue]
            if not families or families[0] == 'inherit':
                continue
            rule['font-family'] = frozenset(icu_lower(f) for f in families)

            src = rule.get('src', None)
            if not src:
                continue
            if src.startswith('url(') and src.endswith(')') and src[4] not in {'"', "'"}:
                # An unquoted url containing ' or " is not parsed by
                # cssutils, so quote it first
                src = "url('" + src[4:-1].replace("'", "\\'") + "')"
            declaration = self.parser.parseStyle('background-image:%s' % src, validate=False)
            uri = declaration.getProperty('background-image').propertyValue[0].uri
            name = self.href_to_name(uri, '@font-face rule')
            if name is None:
                continue
            rule['src'] = name
            normalize_font_properties(rule)
            rule['width'] = widths[rule['font-stretch']]
            rule['weight'] = int(rule['font-weight'])
            rules.append(rule)

        if not (rules or self.do_embed):
            return

        self.font_rule_map[self.container.abspath_to_name(self.current_item)] = rules
        for rule in rules:
            font_name = rule['src']
            self.all_font_rules[font_name] = rule
            if font_name not in self.font_stats:
                self.font_stats[font_name] = set()

        self.page.evaljs('window.font_stats.get_font_usage()')
        font_usage = self.page.bridge_value
        if not isinstance(font_usage, list):
            raise Exception('Unknown error occurred while reading font usage')

        ignored_chars = {'\n', '\r', '\t'}
        bad_fonts = {'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif', 'inherit'}
        key_props = {'font-weight', 'font-style', 'font-stretch', 'font-family'}
        usage = defaultdict(dict)
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = usage
        for font in font_usage:
            chars = set()
            for chunk in font['text']:
                chars.update(frozenset(chunk))
            chars -= ignored_chars
            if not chars:
                continue
            normalize_font_properties(font)
            for rule in get_matching_rules(rules, font):
                self.font_stats[rule['src']] |= chars
            if not self.do_embed:
                continue
            families = [icu_lower(f) for f in font.get('font-family', [])]
            if not families or families[0] in bad_fonts:
                continue
            primary = families[0]
            key = frozenset((k, primary if k == 'font-family' else v)
                            for k, v in font.iteritems() if k in key_props)
            record = usage[key]
            if not record:
                # First time this font specification is seen
                record.update({k: (font[k][0] if k == 'font-family' else font[k]) for k in key_props})
                record['text'] = set()
            record['text'] |= chars
        # Store a plain dict rather than the defaultdict
        self.font_usage_map[self.container.abspath_to_name(self.current_item)] = dict(usage)

        if not self.do_embed:
            return

        self.page.evaljs('window.font_stats.get_font_families()')
        font_families = self.page.bridge_value
        if not isinstance(font_families, dict):
            raise Exception('Unknown error occurred while reading font families')
        family_names = set()
        self.font_spec_map[self.container.abspath_to_name(self.current_item)] = family_names
        for raw in font_families:
            prop = self.parser.parseStyle('font-family:' + raw, validate=False).getProperty('font-family')
            for value in prop.propertyValue:
                family = value.value
                if family and family.lower() not in bad_fonts:
                    family_names.add(family)
Esempio n. 28
0
    def __init__(self, tree, path, oeb, opts, profile=None,
            extra_css='', user_css=''):
        """Compute the cascaded CSS style of the elements of ``tree``.

        Stylesheets are collected from the result of
        ``html_css_stylesheet()``, from the ``<style>`` and ``<link>``
        children of the document's ``<head>``, and from ``extra_css`` and
        ``user_css``. Their rules are flattened, sorted and applied to the
        elements matched by each selector. Inline ``style`` attributes and
        the ``width``/``height`` attributes of ``<img>`` elements are folded
        in afterwards.

        ``tree`` is modified in place: ``width``/``height`` attributes of
        unstyled images are removed, and for MOBI output a ``<span>`` is
        inserted to emulate ``:first-letter``.

        :param tree: Parsed XHTML document.
        :param path: href of the document; must be a key of
            ``oeb.manifest.hrefs``.
        :param oeb: Book object; its ``manifest``, ``logger``, ``log`` and
            ``css_preprocessor`` are used.
        :param opts: Conversion options; ``opts.output_profile`` is read.
        :param profile: Output profile, defaults to ``opts.output_profile``.
        :param extra_css: Extra CSS text to apply to the document.
        :param user_css: User supplied CSS text to apply to the document.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            self.profile = opts.output_profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        head = xpath(tree, '/h:html/h:head')
        # Without a <head> there are no <style>/<link> elements to process
        head = head[0] if head else []

        # Add cssutils parsing profiles from output_profile
        for css_module in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(css_module['name'],
                                        css_module['props'],
                                        css_module['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in head:
            if (elem.tag == XHTML('style') and
                elem.get('type', CSS_MIME) in OEB_STYLES):
                # Gather the text of the <style> element, including text
                # that ended up inside or after child nodes
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text, add_namespace=True)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    stylesheet.namespaces['h'] = XHTML_NS
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if rule.media.mediaText == 'amzn-mobi':
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet, item.abshref,
                            ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                sheet_path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(sheet_path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (sheet_path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS'%(sheet_path,
                        item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css':extra_css, 'user_css':user_css}
        for w, x in csses.items():
            if x:
                # Bad extra/user CSS is logged and skipped. Only Exception is
                # caught so that KeyboardInterrupt/SystemExit still propagate.
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text, href=cssname,
                            validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except Exception:
                    self.logger.exception('Failed to parse %s, ignoring.'%w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        # Running counter passed to flatten_rule for every rule, in the
        # order the rules are encountered
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for stylesheet in stylesheets:
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    media = {rule.media.item(i) for i in
                             xrange(rule.media.length)}
                    # Skip @media blocks that target none of these media
                    if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                        continue
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(subrule, href, index))
                        index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index))
                    index += 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        pseudo_pat = re.compile(u':(first-letter|first-line|link|hover|visited|active|focus|before|after)', re.I)
        for _, _, cssdict, text, _ in rules:
            # Strip the pseudo-class/element before matching; it is handled
            # separately below
            fl = pseudo_pat.search(text)
            if fl is not None:
                text = text.replace(fl.group(), '')
            selector = get_css_selector(text, self.oeb.log)
            matches = selector(tree, self.logger)
            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                        'plumber_output_format', '').lower() == u'mobi':
                    # Fake first-letter
                    from lxml.builder import ElementMaker
                    E = ElementMaker(namespace=XHTML_NS)
                    for elem in matches:
                        for x in elem.iter():
                            if x.text:
                                # Leading punctuation/separators are styled
                                # together with the first real character
                                punctuation_chars = []
                                remaining = unicode(x.text)
                                while remaining:
                                    category = unicodedata.category(remaining[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(remaining[0])
                                    remaining = remaining[1:]

                                special_text = u''.join(punctuation_chars) + \
                                        (remaining[0] if remaining else u'')
                                span = E.span(special_text)
                                span.tail = remaining[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'\d+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    # Only one of the two attributes may be present
                    try:
                        del elem.attrib[prop]
                    except KeyError:
                        pass
                    if val:
                        # A bare number is a pixel count
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Esempio n. 29
0
    def __init__(self,
                 tree,
                 path,
                 oeb,
                 opts,
                 profile=None,
                 extra_css='',
                 user_css=''):
        """Compute the cascaded CSS style of the elements of ``tree``.

        Stylesheets are collected from the result of
        ``html_css_stylesheet()`` (treated as the user agent sheet), from
        every ``<style>`` and ``<link>`` element found in the tree, and from
        ``extra_css`` and ``user_css``. Their rules are flattened, sorted and
        applied to the elements matched by each selector. Inline ``style``
        attributes and the ``width``/``height`` attributes of ``<img>``
        elements are folded in afterwards.

        ``tree`` is modified in place: ``width``/``height`` attributes of
        unstyled images are removed, and for MOBI output a ``<span>`` is
        inserted to emulate ``:first-letter``.

        :param tree: Parsed XHTML document.
        :param path: href of the document; must be a key of
            ``oeb.manifest.hrefs``.
        :param oeb: Book object; its ``manifest``, ``logger``, ``log`` and
            ``css_preprocessor`` are used.
        :param opts: Conversion options; ``opts.output_profile`` is read.
        :param profile: Output profile. When None, the output profile whose
            ``short_name`` is ``'default'`` is used, falling back to
            ``opts.output_profile``.
        :param extra_css: Extra CSS text to apply to the document.
        :param user_css: User supplied CSS text to apply to the document.
        """
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        # Every style/link element in the tree, regardless of namespace; the
        # tag checks below only accept the XHTML ones
        style_tags = xpath(tree,
                           '//*[local-name()="style" or local-name()="link"]')

        # Add cssutils parsing profiles from output_profile
        for css_module in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(css_module['name'], css_module['props'],
                                   css_module['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                           log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in style_tags:
            if (elem.tag == XHTML('style')
                    and elem.get('type', CSS_MIME) in OEB_STYLES):
                # Gather the text of the <style> element, including text
                # that ended up inside or after child nodes
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text, add_namespace=True)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    stylesheet.namespaces['h'] = XHTML_NS
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if rule.media.mediaText == 'amzn-mobi':
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn(
                                    'Ignoring missing stylesheet in @import rule:',
                                    rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn(
                                    'CSS @import of non-CSS file %r' %
                                    rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Drop @page rules from the embedded stylesheet. Iterate
                    # over a copy since the rule list is modified.
                    for rule in tuple(
                            stylesheet.cssRules.rulesOfType(
                                CSSRule.PAGE_RULE)):
                        stylesheet.cssRules.remove(rule)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet,
                                item.abshref,
                                ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                sheet_path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(sheet_path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (sheet_path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r is not CSS' %
                        (sheet_path, item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css': extra_css, 'user_css': user_css}
        for w, x in csses.items():
            if x:
                # Bad extra/user CSS is logged and skipped. Only Exception is
                # caught so that KeyboardInterrupt/SystemExit still propagate.
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except Exception:
                    self.logger.exception('Failed to parse %s, ignoring.' % w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        # Running counter passed to flatten_rule for every rule, in the
        # order the rules are encountered
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for sheet_index, stylesheet in enumerate(stylesheets):
            href = stylesheet.href
            self.stylesheets.add(href)
            # The first stylesheet is the one from html_css_stylesheet()
            is_ua_sheet = sheet_index == 0
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    media = {
                        rule.media.item(i)
                        for i in xrange(rule.media.length)
                    }
                    # Skip @media blocks that target none of these media
                    if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                        continue
                    for subrule in rule.cssRules:
                        rules.extend(
                            self.flatten_rule(
                                subrule,
                                href,
                                index,
                                is_user_agent_sheet=is_ua_sheet))
                        index += 1
                else:
                    rules.extend(
                        self.flatten_rule(
                            rule,
                            href,
                            index,
                            is_user_agent_sheet=is_ua_sheet))
                    index += 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        pseudo_pat = re.compile(
            u':(first-letter|first-line|link|hover|visited|active|focus|before|after)',
            re.I)
        for _, _, cssdict, text, _ in rules:
            # Strip the pseudo-class/element before matching; it is handled
            # separately below
            fl = pseudo_pat.search(text)
            if fl is not None:
                text = text.replace(fl.group(), '')
            selector = get_css_selector(text, self.oeb.log)
            matches = selector(tree, self.logger)
            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                                                    'plumber_output_format',
                                                    '').lower() == u'mobi':
                    # Fake first-letter
                    from lxml.builder import ElementMaker
                    E = ElementMaker(namespace=XHTML_NS)
                    for elem in matches:
                        for x in elem.iter():
                            if x.text:
                                # Leading punctuation/separators are styled
                                # together with the first real character
                                punctuation_chars = []
                                remaining = unicode(x.text)
                                while remaining:
                                    category = unicodedata.category(remaining[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(remaining[0])
                                    remaining = remaining[1:]

                                special_text = u''.join(punctuation_chars) + \
                                        (remaining[0] if remaining else u'')
                                span = E.span(special_text)
                                span.tail = remaining[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    # Only one of the two attributes may be present
                    try:
                        del elem.attrib[prop]
                    except KeyError:
                        pass
                    if val:
                        # A bare number is a pixel count
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)