Ejemplo n.º 1
0
    def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize,
                     item_id):
        if not isinstance(node.tag, string_or_bytes) \
           or namespace(node.tag) != XHTML_NS:
            return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style['font-size']
        except:
            font_size = self.sbase if self.sbase is not None else \
                self.context.source.fbase
        if tag == 'body' and isinstance(font_size, numbers.Number):
            stylizer.body_font_size = font_size
        if 'align' in node.attrib:
            if tag != 'img':
                cssdict['text-align'] = node.attrib['align']
                if cssdict['text-align'] == 'center':
                    # align=center causes tables to be center aligned,
                    # which text-align does not. And the ever trustworthy Word
                    # uses this construct in its HTML output. See
                    # https://bugs.launchpad.net/bugs/1569583
                    if tag == 'table':
                        if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                            cssdict['margin-left'] = cssdict[
                                'margin-right'] = 'auto'
                    else:
                        for table in node.iterchildren(XHTML("table")):
                            ts = stylizer.style(table)
                            if ts.get('margin-left') is None and ts.get(
                                    'margin-right') is None:
                                ts.set('margin-left', 'auto')
                                ts.set('margin-right', 'auto')
            else:
                val = node.attrib['align']
                if val in ('middle', 'bottom', 'top'):
                    cssdict['vertical-align'] = val
                elif val in ('left', 'right'):
                    cssdict['float'] = val
            del node.attrib['align']
        if 'valign' in node.attrib and tag == 'td':
            if cssdict.get('vertical-align') == 'inherit':
                cssdict['vertical-align'] = node.attrib['valign']
            del node.attrib['valign']
        if node.tag == XHTML('font'):
            tags = [
                'descendant::h:%s' % x
                for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                          'h6', 'ol', 'ul', 'dl', 'blockquote')
            ]
            tag = 'div' if XPath('|'.join(tags))(node) else 'span'
            node.tag = XHTML(tag)
            if 'size' in node.attrib:

                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))

                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict['font-size'] = '%.1fpt' % font_size
                del node.attrib['size']
            if 'face' in node.attrib:
                cssdict['font-family'] = node.attrib['face']
                del node.attrib['face']
        if 'color' in node.attrib:
            try:
                cssdict['color'] = Property('color',
                                            node.attrib['color']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            try:
                cssdict['background-color'] = Property(
                    'background-color', node.attrib['bgcolor']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['bgcolor']
        if tag == 'ol' and 'type' in node.attrib:
            del node.attrib['type']
        if cssdict.get('font-weight', '').lower() == 'medium':
            cssdict[
                'font-weight'] = 'normal'  # ADE chokes on font-weight medium

        fsize = font_size
        is_drop_cap = (
            cssdict.get('float', None) == 'left' and 'font-size' in cssdict
            and len(node) == 0 and node.text and
            (len(node.text) == 1 or
             (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
        # Detect drop caps generated by the docx input plugin
        if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
                not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
            dp = node.getparent()
            if dp.tag and dp.tag.endswith('}div') and len(
                    dp) == 1 and not dp.text:
                if stylizer.style(dp).cssdict().get('float', None) == 'left':
                    is_drop_cap = True
        if not self.context.disable_font_rescaling and not is_drop_cap:
            _sbase = self.sbase if self.sbase is not None else \
                self.context.source.fbase
            dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
            if dyn_rescale is not None:
                try:
                    dyn_rescale = float(dyn_rescale) / 100.0
                except Exception:
                    dyn_rescale = 1
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict['font-size'] = '%0.5fem' % (fsize / psize)
                psize = fsize
            elif 'font-size' in cssdict or tag == 'body':
                fsize = self.fmap[font_size]
                try:
                    cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    cssdict['font-size'] = '%.1fpt' % fsize
                psize = fsize

        try:
            minlh = self.context.minimum_line_height / 100.
            if not is_drop_cap and style['line-height'] < minlh * fsize:
                cssdict['line-height'] = str(minlh)
        except:
            self.oeb.logger.exception('Failed to set minimum line-height')

        if cssdict:
            for x in self.filter_css:
                popval = cssdict.pop(x, None)
                if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                    cssdict[x] = popval

        if cssdict:
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

        if (self.context.remove_paragraph_spacing
                or self.context.insert_blank_line) and tag in ('p', 'div'):
            if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
                for prop in ('margin', 'padding', 'border'):
                    for edge in ('top', 'bottom'):
                        cssdict['%s-%s' % (prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents
                    and cssdict.get('text-align',
                                    None) not in ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size

        pseudo_classes = style.pseudo_classes(self.filter_css)
        if cssdict or pseudo_classes:
            keep_classes = set()

            if cssdict:
                items = sorted(iteritems(cssdict))
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                classes = node.get('class', '').strip() or 'calibre'
                classes_list = classes.split()
                # lower() because otherwise if the document uses the same class
                # name with different case, both cases will apply, leading
                # to incorrect results.
                klass = ascii_text(STRIPNUM.sub(
                    '', classes_list[0])).lower().strip().replace(' ', '_')
                if css in styles:
                    match = styles[css]
                else:
                    match = klass + str(names[klass] or '')
                    styles[css] = match
                    names[klass] += 1
                node.attrib['class'] = match
                keep_classes.add(match)

            for psel, cssdict in iteritems(pseudo_classes):
                items = sorted(iteritems(cssdict))
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                pstyles = pseudo_styles[psel]
                if css in pstyles:
                    match = pstyles[css]
                else:
                    # We have to use a different class for each psel as
                    # otherwise you can have incorrect styles for a situation
                    # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                    # If the pcalibre class for a:hover and a:link is the same,
                    # then the class attribute for a.x tags will contain both
                    # that class and the class for a.x:hover, which is wrong.
                    klass = 'pcalibre'
                    match = klass + str(names[klass] or '')
                    pstyles[css] = match
                    names[klass] += 1
                keep_classes.add(match)
                node.attrib['class'] = ' '.join(keep_classes)

        elif 'class' in node.attrib:
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles,
                              psize, item_id)
Ejemplo n.º 2
0
def process_nav_node(container, node, toc_parent, nav_name):
    for li in node.iterchildren(XHTML('li')):
        child = add_from_li(container, li, toc_parent, nav_name)
        ol = first_child(li, XHTML('ol'))
        if child is not None and ol is not None:
            process_nav_node(container, ol, child, nav_name)
Ejemplo n.º 3
0
    def __init__(self,
                 tree,
                 path,
                 oeb,
                 opts,
                 profile=None,
                 extra_css='',
                 user_css='',
                 base_css=''):
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.body_font_size = self.profile.fbase
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        style_tags = xpath(tree,
                           '//*[local-name()="style" or local-name()="link"]')

        # Add css_parser parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'], profile['props'],
                                   profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                           log=logging.getLogger('calibre.css'))
        for elem in style_tags:
            if (elem.tag == XHTML('style')
                    and elem.get('type', CSS_MIME) in OEB_STYLES
                    and media_ok(elem.get('media'))):
                text = elem.text if elem.text else ''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += '\n\n' + force_unicode(t, 'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += '\n\n' + force_unicode(t, 'utf-8')
                if text:
                    text = oeb.css_preprocessor(text)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if not media_ok(rule.media.mediaText):
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn(
                                    'Ignoring missing stylesheet in @import rule:',
                                    rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn(
                                    'CSS @import of non-CSS file %r' %
                                    rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet,
                                item.abshref,
                                ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (elem.tag == XHTML('link') and elem.get('href')
                  and elem.get('rel', 'stylesheet').lower() == 'stylesheet'
                  and elem.get('type', CSS_MIME).lower() in OEB_STYLES
                  and media_ok(elem.get('media'))):
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r is not CSS' %
                        (path, item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css': extra_css, 'user_css': user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = x
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    stylesheets.append(stylesheet)
                except Exception:
                    self.logger.exception('Failed to parse %s, ignoring.' % w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)

        # using oeb to store the rules, page rule and font face rules
        # and generating them again if opts, profile or stylesheets are different
        if (not hasattr(self.oeb, 'stylizer_rules')) \
            or not self.oeb.stylizer_rules.same_rules(self.opts, self.profile, stylesheets):
            self.oeb.stylizer_rules = StylizerRules(self.opts, self.profile,
                                                    stylesheets)
        self.rules = self.oeb.stylizer_rules.rules
        self.page_rule = self.oeb.stylizer_rules.page_rule
        self.font_face_rules = self.oeb.stylizer_rules.font_face_rules
        self.flatten_style = self.oeb.stylizer_rules.flatten_style

        self._styles = {}
        pseudo_pat = re.compile(
            ':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
        select = Select(tree, ignore_inappropriate_pseudo_classes=True)

        for _, _, cssdict, text, _ in self.rules:
            fl = pseudo_pat.search(text)
            try:
                matches = tuple(select(text))
            except SelectorError as err:
                self.logger.error(
                    'Ignoring CSS rule with invalid selector: %r (%s)' %
                    (text, as_unicode(err)))
                continue

            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(
                        self.oeb, 'plumber_output_format',
                        '').lower() in {'mobi', 'docx'}:
                    # Fake first-letter
                    for elem in matches:
                        for x in elem.iter('*'):
                            if x.text:
                                punctuation_chars = []
                                text = unicode_type(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = ''.join(punctuation_chars) + \
                                        (text[0] if text else '')
                                span = x.makeelement('{%s}span' % XHTML_NS)
                                span.text = special_text
                                span.set('data-fake-first-letter', '1')
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Ejemplo n.º 4
0
    def epubify_markup(self, root, log):
        from calibre.ebooks.oeb.base import XPath, XHTML
        # Fix empty title tags
        for t in XPath('//h:title')(root):
            if not t.text:
                t.text = u' '
        # Fix <p><div> constructs as the asinine epubchecker complains
        # about them
        pdiv = XPath('//h:p/h:div')
        for div in pdiv(root):
            div.getparent().tag = XHTML('div')

        # Remove the position:relative as it causes problems with some epub
        # renderers. Remove display: block on an image inside a div as it is
        # redundant and prevents text-align:center from working in ADE
        # Also ensure that the img is contained in its containing div
        imgpath = XPath('//h:div/h:img[@style]')
        for img in imgpath(root):
            div = img.getparent()
            if len(div) == 1:
                style = div.attrib.get('style', '')
                if style and not style.endswith(';'):
                    style = style + ';'
                style += 'position:static'  # Ensures position of containing div is static
                # Ensure that the img is always contained in its frame
                div.attrib['style'] = style
                img.attrib['style'] = 'max-width: 100%; max-height: 100%'

        # Handle anchored images. The default markup + CSS produced by
        # odf2xhtml works with WebKit but not with ADE. So we convert the
        # common cases of left/right/center aligned block images to work on
        # both webkit and ADE. We detect the case of setting the side margins
        # to auto and map it to an appropriate text-align directive, which
        # works in both WebKit and ADE.
        # https://bugs.launchpad.net/bugs/1063207
        # https://bugs.launchpad.net/calibre/+bug/859343
        imgpath = XPath('descendant::h:div/h:div/h:img')
        for img in imgpath(root):
            div2 = img.getparent()
            div1 = div2.getparent()
            if (len(div1), len(div2)) != (1, 1):
                continue
            cls = div1.get('class', '')
            first_rules = filter(
                None, [self.get_css_for_class(x) for x in cls.split()])
            has_align = False
            for r in first_rules:
                if r.style.getProperty(u'text-align') is not None:
                    has_align = True
            ml = mr = None
            if not has_align:
                aval = None
                cls = div2.get(u'class', u'')
                rules = filter(
                    None, [self.get_css_for_class(x) for x in cls.split()])
                for r in rules:
                    ml = r.style.getPropertyCSSValue(u'margin-left') or ml
                    mr = r.style.getPropertyCSSValue(u'margin-right') or mr
                    ml = getattr(ml, 'value', None)
                    mr = getattr(mr, 'value', None)
                if ml == mr == u'auto':
                    aval = u'center'
                elif ml == u'auto' and mr != u'auto':
                    aval = 'right'
                elif ml != u'auto' and mr == u'auto':
                    aval = 'left'
                if aval is not None:
                    style = div1.attrib.get('style', '').strip()
                    if style and not style.endswith(';'):
                        style = style + ';'
                    style += 'text-align:%s' % aval
                    has_align = True
                    div1.attrib['style'] = style

            if has_align:
                # This is needed for ADE, without it the text-align has no
                # effect
                style = div2.attrib['style']
                div2.attrib['style'] = 'display:inline;' + style
Ejemplo n.º 5
0
    def mobimlize_content(self, tag, text, bstate, istates):
        'Convert text content'
        if text or tag != 'br':
            bstate.content = True
        istate = istates[-1]
        para = bstate.para
        if tag in SPECIAL_TAGS and not text:
            para = para if para is not None else bstate.body
        elif para is None or tag in ('td', 'th'):
            body = bstate.body
            if bstate.pbreak:
                etree.SubElement(body, MBP('pagebreak'))
                bstate.pbreak = False
            bstate.istate = None
            bstate.anchor = None
            parent = bstate.nested[-1] if bstate.nested else bstate.body
            indent = istate.indent
            left = istate.left
            if isinstance(indent, basestring):
                indent = 0
            if indent < 0 and abs(indent) < left:
                left += indent
                indent = 0
            elif indent != 0 and abs(indent) < self.profile.fbase:
                indent = (indent / abs(indent)) * self.profile.fbase
            if tag in NESTABLE_TAGS and not istate.rendered:
                para = wrapper = etree.SubElement(parent,
                                                  XHTML(tag),
                                                  attrib=istate.attrib)
                bstate.nested.append(para)
                if tag == 'li' and len(istates) > 1:
                    istates[-2].list_num += 1
                    para.attrib['value'] = str(istates[-2].list_num)
            elif tag in NESTABLE_TAGS and istate.rendered:
                para = wrapper = bstate.nested[-1]
            elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0:
                ems = self.profile.mobi_ems_per_blockquote
                para = wrapper = etree.SubElement(parent, XHTML('blockquote'))
                para = wrapper
                emleft = int(round(left / self.profile.fbase)) - ems
                emleft = min((emleft, 10))
                while emleft > ems / 2.0:
                    para = etree.SubElement(para, XHTML('blockquote'))
                    emleft -= ems
            else:
                para = wrapper = etree.SubElement(parent, XHTML('p'))
            bstate.inline = bstate.para = para
            vspace = bstate.vpadding + bstate.vmargin
            bstate.vpadding = bstate.vmargin = 0
            if tag not in TABLE_TAGS:
                if tag in ('ul', 'ol') and vspace > 0:
                    wrapper.addprevious(
                        etree.Element(XHTML('div'),
                                      height=self.mobimlize_measure(vspace)))
                else:
                    wrapper.attrib['height'] = self.mobimlize_measure(vspace)
                para.attrib['width'] = self.mobimlize_measure(indent)
            elif tag == 'table' and vspace > 0:
                vspace = int(round(vspace / self.profile.fbase))
                while vspace > 0:
                    wrapper.addprevious(etree.Element(XHTML('br')))
                    vspace -= 1
            if istate.halign != 'auto' and isinstance(istate.halign,
                                                      (str, unicode)):
                para.attrib['align'] = istate.halign
        istate.rendered = True
        pstate = bstate.istate
        if tag in CONTENT_TAGS:
            bstate.inline = para
            pstate = bstate.istate = None
            try:
                etree.SubElement(para, XHTML(tag), attrib=istate.attrib)
            except:
                print 'Invalid subelement:', para, tag, istate.attrib
                raise
        elif tag in TABLE_TAGS:
            para.attrib['valign'] = 'top'
        if istate.ids:
            for id_ in istate.ids:
                anchor = etree.Element(XHTML('a'), attrib={'id': id_})
                if tag == 'li':
                    try:
                        last = bstate.body[-1][-1]
                    except:
                        break
                    last.insert(0, anchor)
                    anchor.tail = last.text
                    last.text = None
                else:
                    last = bstate.body[-1]
                    # We use append instead of addprevious so that inline
                    # anchors in large blocks point to the correct place. See
                    # https://bugs.launchpad.net/calibre/+bug/899831
                    # This could potentially break if inserting an anchor at
                    # this point in the markup is illegal, but I cannot think
                    # of such a case offhand.
                    if barename(last.tag) in LEAF_TAGS:
                        last.addprevious(anchor)
                    else:
                        last.append(anchor)

            istate.ids.clear()
        if not text:
            return
        if not pstate or istate != pstate:
            inline = para
            fsize = istate.fsize
            href = istate.href
            if not href:
                bstate.anchor = None
            elif pstate and pstate.href == href:
                inline = bstate.anchor
            else:
                inline = etree.SubElement(inline, XHTML('a'), href=href)
                bstate.anchor = inline

            if fsize != 3:
                inline = etree.SubElement(inline,
                                          XHTML('font'),
                                          size=str(fsize))
            if istate.family == 'monospace':
                inline = etree.SubElement(inline, XHTML('tt'))
            if istate.italic:
                inline = etree.SubElement(inline, XHTML('i'))
            if istate.bold:
                inline = etree.SubElement(inline, XHTML('b'))
            if istate.bgcolor is not None and istate.bgcolor != 'transparent':
                inline = etree.SubElement(inline,
                                          XHTML('span'),
                                          bgcolor=istate.bgcolor)
            if istate.fgcolor != 'black':
                inline = etree.SubElement(inline,
                                          XHTML('font'),
                                          color=unicode(istate.fgcolor))
            if istate.strikethrough:
                inline = etree.SubElement(inline, XHTML('s'))
            if istate.underline:
                inline = etree.SubElement(inline, XHTML('u'))
            bstate.inline = inline
        bstate.istate = istate
        inline = bstate.inline
        content = self.preize_text(text) if istate.preserve else [text]
        for item in content:
            if isinstance(item, basestring):
                if len(inline) == 0:
                    inline.text = (inline.text or '') + item
                else:
                    last = inline[-1]
                    last.tail = (last.tail or '') + item
            else:
                inline.append(item)
Ejemplo n.º 6
0
    def workaround_ade_quirks(self):  # {{{
        '''
        Perform various markup transforms to get the output to render correctly
        in the quirky ADE.
        '''
        from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

        stylesheet = self.oeb.manifest.main_stylesheet

        # ADE cries big wet tears when it encounters an invalid fragment
        # identifier in the NCX toc.
        frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
        for node in self.oeb.toc.iter():
            href = getattr(node, 'href', None)
            if hasattr(href, 'partition'):
                base, _, frag = href.partition('#')
                frag = urlunquote(frag)
                if frag and frag_pat.match(frag) is None:
                    self.log.warn(
                        'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'
                        % frag)
                    node.href = base

        for x in self.oeb.spine:
            root = x.data
            body = XPath('//h:body')(root)
            if body:
                body = body[0]

            if hasattr(body, 'xpath'):
                # remove <img> tags with empty src elements
                bad = []
                for x in XPath('//h:img')(body):
                    src = x.get('src', '').strip()
                    if src in ('', '#') or src.startswith('http:'):
                        bad.append(x)
                for img in bad:
                    img.getparent().remove(img)

                # Add id attribute to <a> tags that have name
                for x in XPath('//h:a[@name]')(body):
                    if not x.get('id', False):
                        x.set('id', x.get('name'))
                    # The delightful epubcheck has started complaining about <a> tags that
                    # have name attributes.
                    x.attrib.pop('name')

                # Replace <br> that are children of <body> as ADE doesn't handle them
                for br in XPath('./h:br')(body):
                    if br.getparent() is None:
                        continue
                    try:
                        prior = next(br.itersiblings(preceding=True))
                        priortag = barename(prior.tag)
                        priortext = prior.tail
                    except:
                        priortag = 'body'
                        priortext = body.text
                    if priortext:
                        priortext = priortext.strip()
                    br.tag = XHTML('p')
                    br.text = '\u00a0'
                    style = br.get('style', '').split(';')
                    style = list(filter(None, map(lambda x: x.strip(), style)))
                    style.append('margin:0pt; border:0pt')
                    # If the prior tag is a block (including a <br> we replaced)
                    # then this <br> replacement should have a 1-line height.
                    # Otherwise it should have no height.
                    if not priortext and priortag in block_level_tags:
                        style.append('height:1em')
                    else:
                        style.append('height:0pt')
                    br.set('style', '; '.join(style))

            for tag in XPath('//h:embed')(root):
                tag.getparent().remove(tag)
            for tag in XPath('//h:object')(root):
                if tag.get('type', '').lower().strip() in {
                        'image/svg+xml', 'application/svg+xml'
                }:
                    continue
                tag.getparent().remove(tag)

            for tag in XPath('//h:title|//h:style')(root):
                if not tag.text:
                    tag.getparent().remove(tag)
            for tag in XPath('//h:script')(root):
                if (not tag.text and not tag.get('src', False)
                        and tag.get('type', None) != 'text/x-mathjax-config'):
                    tag.getparent().remove(tag)
            for tag in XPath('//h:body/descendant::h:script')(root):
                tag.getparent().remove(tag)

            formchildren = XPath('./h:input|./h:button|./h:textarea|'
                                 './h:label|./h:fieldset|./h:legend')
            for tag in XPath('//h:form')(root):
                if formchildren(tag):
                    tag.getparent().remove(tag)
                else:
                    # Not a real form
                    tag.tag = XHTML('div')

            for tag in XPath('//h:center')(root):
                tag.tag = XHTML('div')
                tag.set('style', 'text-align:center')
            # ADE can't handle &amp; in an img url
            for tag in XPath('//h:img[@src]')(root):
                tag.set('src', tag.get('src', '').replace('&', ''))

            # ADE whimpers in fright when it encounters a <td> outside a
            # <table>
            in_table = XPath('ancestor::h:table')
            for tag in XPath('//h:td|//h:tr|//h:th')(root):
                if not in_table(tag):
                    tag.tag = XHTML('div')

            # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
            special_chars = re.compile('[\u200b\u00ad]')
            for elem in root.iterdescendants('*'):
                if elem.text:
                    elem.text = special_chars.sub('', elem.text)
                    elem.text = elem.text.replace('\u2011', '-')
                if elem.tail:
                    elem.tail = special_chars.sub('', elem.tail)
                    elem.tail = elem.tail.replace('\u2011', '-')

            if stylesheet is not None:
                # ADE doesn't render lists correctly if they have left margins
                from css_parser.css import CSSRule
                for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                    sel = '.' + lb.get('class')
                    for rule in stylesheet.data.cssRules.rulesOfType(
                            CSSRule.STYLE_RULE):
                        if sel == rule.selectorList.selectorText:
                            rule.style.removeProperty('margin-left')
                            # padding-left breaks rendering in webkit and gecko
                            rule.style.removeProperty('padding-left')
                # Change whitespace:pre to pre-wrap to accommodate readers that
                # cannot scroll horizontally
                for rule in stylesheet.data.cssRules.rulesOfType(
                        CSSRule.STYLE_RULE):
                    style = rule.style
                    ws = style.getPropertyValue('white-space')
                    if ws == 'pre':
                        style.setProperty('white-space', 'pre-wrap')
Ejemplo n.º 7
0
 def process_nav_node(node, toc_parent):
     for li in node.iterchildren(XHTML('li')):
         child = add_from_li(li, toc_parent)
         ol = first_child(li, XHTML('ol'))
         if child is not None and ol is not None:
             process_nav_node(ol, child)
Ejemplo n.º 8
0
def render_jacket(mi,
                  output_profile,
                  alt_title=_('Unknown'),
                  alt_tags=[],
                  alt_comments='',
                  alt_publisher=(''),
                  rescale_fonts=False):
    css = P('jacket/stylesheet.css', data=True).decode('utf-8')
    template = P('jacket/template.xhtml', data=True).decode('utf-8')

    template = re.sub(r'<!--.*?-->', '', template, flags=re.DOTALL)
    css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)

    try:
        title_str = mi.title if mi.title else alt_title
    except:
        title_str = _('Unknown')
    title_str = escape(title_str)
    title = '<span class="title">%s</span>' % title_str

    series = Series(mi.series, mi.series_index)
    try:
        publisher = mi.publisher if mi.publisher else alt_publisher
    except:
        publisher = ''
    publisher = escape(publisher)

    try:
        if is_date_undefined(mi.pubdate):
            pubdate = ''
        else:
            pubdate = strftime('%Y', mi.pubdate.timetuple())
    except:
        pubdate = ''

    rating = get_rating(mi.rating, output_profile.ratings_char,
                        output_profile.empty_ratings_char)

    tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)

    comments = mi.comments if mi.comments else alt_comments
    comments = comments.strip()
    orig_comments = comments
    if comments:
        comments = comments_to_html(comments)

    try:
        author = mi.format_authors()
    except:
        author = ''
    author = escape(author)

    def generate_html(comments):
        args = dict(
            xmlns=XHTML_NS,
            title_str=title_str,
            css=css,
            title=title,
            author=author,
            publisher=publisher,
            pubdate_label=_('Published'),
            pubdate=pubdate,
            series_label=_('Series'),
            series=series,
            rating_label=_('Rating'),
            rating=rating,
            tags_label=_('Tags'),
            tags=tags,
            comments=comments,
            footer='',
            searchable_tags=' '.join(
                escape(t) + 'ttt' for t in tags.tags_list),
        )
        for key in mi.custom_field_keys():
            m = mi.get_user_metadata(key, False) or {}
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                dkey = key.replace('#', '_')
                dt = m.get('datatype')
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(
                        mi.get(key),
                        m.get('display', {}).get('allow_half_stars', False))
                else:
                    args[dkey] = escape(val)
                args[dkey + '_label'] = escape(display_name)
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in list(args.keys()):
                if key.startswith('_') and not key.endswith('_label'):
                    print((" %s: %s" % ('#' + key[1:], args[key])))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)

        # Post-process the generated html to strip out empty header items

        soup = BeautifulSoup(generated_html)
        if not series:
            series_tag = soup.find(attrs={'class': 'cbj_series'})
            if series_tag is not None:
                series_tag.extract()
        if not rating:
            rating_tag = soup.find(attrs={'class': 'cbj_rating'})
            if rating_tag is not None:
                rating_tag.extract()
        if not tags:
            tags_tag = soup.find(attrs={'class': 'cbj_tags'})
            if tags_tag is not None:
                tags_tag.extract()
        if not pubdate:
            pubdate_tag = soup.find(attrs={'class': 'cbj_pubdata'})
            if pubdate_tag is not None:
                pubdate_tag.extract()
        if output_profile.short_name != 'kindle':
            hr_tag = soup.find('hr', attrs={'class': 'cbj_kindle_banner_hr'})
            if hr_tag is not None:
                hr_tag.extract()

        return strip_encoding_declarations(
            soup.renderContents('utf-8').decode('utf-8'))

    from calibre.ebooks.oeb.base import RECOVER_PARSER

    try:
        root = etree.fromstring(generate_html(comments), parser=RECOVER_PARSER)
    except:
        try:
            root = etree.fromstring(generate_html(escape(orig_comments)),
                                    parser=RECOVER_PARSER)
        except:
            root = etree.fromstring(generate_html(''), parser=RECOVER_PARSER)
    if rescale_fonts:
        # We ensure that the conversion pipeline will set the font sizes for
        # text in the jacket to the same size as the font sizes for the rest of
        # the text in the book. That means that as long as the jacket uses
        # relative font sizes (em or %), the post conversion font size will be
        # the same as for text in the main book. So text with size x em will
        # be rescaled to the same value in both the jacket and the main content.
        #
        # We cannot use calibre_rescale_100 on the body tag as that will just
        # give the body tag a font size of 1em, which is useless.
        for body in root.xpath('//*[local-name()="body"]'):
            fw = body.makeelement(XHTML('div'))
            fw.set('class', 'calibre_rescale_100')
            for child in body:
                fw.append(child)
            body.append(fw)
    from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
    pretty_html_tree(None, root)
    return root
Ejemplo n.º 9
0
def resolve_styles(container, name, select=None, sheet_callback=None):
    root = container.parsed(name)
    select = select or Select(root, ignore_inappropriate_pseudo_classes=True)
    style_map = defaultdict(list)
    pseudo_style_map = defaultdict(list)
    rule_index_counter = count()
    pseudo_pat = re.compile(
        u':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)

    def process_sheet(sheet, sheet_name):
        if sheet_callback is not None:
            sheet_callback(sheet, sheet_name)
        for rule, sheet_name, rule_index in iterrules(
                container,
                sheet_name,
                rules=sheet,
                rule_index_counter=rule_index_counter,
                rule_type='STYLE_RULE'):
            for selector in rule.selectorList:
                text = selector.selectorText
                try:
                    matches = tuple(select(text))
                except SelectorError as err:
                    container.log.error(
                        'Ignoring CSS rule with invalid selector: %r (%s)' %
                        (text, as_unicode(err)))
                    continue
                m = pseudo_pat.search(text)
                style = normalize_style_declaration(rule.style, sheet_name)
                if m is None:
                    for elem in matches:
                        style_map[elem].append(
                            StyleDeclaration(specificity(rule_index, selector),
                                             style, None))
                else:
                    for elem in matches:
                        pseudo_style_map[elem].append(
                            StyleDeclaration(specificity(rule_index, selector),
                                             style, m.group(1)))

    process_sheet(html_css_stylesheet(container), 'user-agent.css')

    for elem in root.iterdescendants(XHTML('style'), XHTML('link')):
        if elem.tag.lower().endswith('style'):
            if not elem.text:
                continue
            sheet = container.parse_css(elem.text)
            sheet_name = name
        else:
            if (elem.get('type') or 'text/css').lower() not in OEB_STYLES or \
                    (elem.get('rel') or 'stylesheet').lower() != 'stylesheet' or \
                    not media_ok(elem.get('media')):
                continue
            href = elem.get('href')
            if not href:
                continue
            sheet_name = container.href_to_name(href, name)
            if not container.has_name(sheet_name):
                continue
            sheet = container.parsed(sheet_name)
            if not isinstance(sheet, CSSStyleSheet):
                continue
        process_sheet(sheet, sheet_name)

    for elem in root.xpath('//*[@style]'):
        text = elem.get('style')
        if text:
            style = container.parse_css(text, is_declaration=True)
            style_map[elem].append(
                StyleDeclaration(Specificity(1, 0, 0, 0, 0),
                                 normalize_style_declaration(style, name),
                                 None))

    for l in (style_map, pseudo_style_map):
        for x in l.itervalues():
            x.sort(key=itemgetter(0), reverse=True)

    style_map = {
        elem: resolve_declarations(x)
        for elem, x in style_map.iteritems()
    }
    pseudo_style_map = {
        elem: resolve_pseudo_declarations(x)
        for elem, x in pseudo_style_map.iteritems()
    }

    return partial(resolve_property,
                   style_map), partial(resolve_pseudo_property, style_map,
                                       pseudo_style_map), select
Ejemplo n.º 10
0
def embed_all_fonts(container, stats, report):
    all_font_rules = tuple(itervalues(stats.all_font_rules))
    warned = set()
    rules, nrules = [], {}
    modified = set()

    for path in container.spine_items:
        name = container.abspath_to_name(path)
        fu = stats.font_usage_map.get(name, None)
        fs = stats.font_spec_map.get(name, None)
        fr = stats.font_rule_map.get(name, None)
        if None in (fs, fu, fr):
            continue
        fs = {icu_lower(x) for x in fs}
        for font in itervalues(fu):
            if icu_lower(font['font-family']) not in fs:
                continue
            rule = matching_rule(font, fr)
            if rule is None:
                # This font was not already embedded in this HTML file, before
                # processing started
                key = font_key(font)
                rule = nrules.get(key)
                if rule is None:
                    rule = embed_font(container, font, all_font_rules, report,
                                      warned)
                    if rule is not None:
                        rules.append(rule)
                        nrules[key] = rule
                        modified.add(name)
                        stats.font_stats[rule['name']] = font['text']
                else:
                    # This font was previously embedded by this code, update its stats
                    stats.font_stats[rule['name']] |= font['text']
                    modified.add(name)

    if not rules:
        report(_('No embeddable fonts found'))
        return False

    # Write out CSS
    rules = [
        ';\n\t'.join(
            '{}: {}'.format(k, '"%s"' % v if k == 'font-family' else v)
            for k, v in iteritems(rulel)
            if (k in props and props[k] != v and v != '400') or k == 'src')
        for rulel in rules
    ]
    css = '\n\n'.join(['@font-face {\n\t%s\n}' % r for r in rules])
    item = container.generate_item('fonts.css', id_prefix='font_embed')
    name = container.href_to_name(item.get('href'), container.opf_name)
    with container.open(name, 'wb') as out:
        out.write(css.encode('utf-8'))

    # Add link to CSS in all files that need it
    for spine_name in modified:
        root = container.parsed(spine_name)
        head = root.xpath('//*[local-name()="head"][1]')[0]
        href = container.name_to_href(name, spine_name)
        etree.SubElement(head,
                         XHTML('link'),
                         rel='stylesheet',
                         type='text/css',
                         href=href).tail = '\n'
        container.dirty(spine_name)
    return True
Ejemplo n.º 11
0
    def __init__(self,
                 tree,
                 path,
                 oeb,
                 opts,
                 profile=None,
                 extra_css='',
                 user_css=''):
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            # Use the default profile. This should really be using
            # opts.output_profile, but I don't want to risk changing it, as
            # doing so might well have hard to debug font size effects.
            from calibre.customize.ui import output_profiles
            for x in output_profiles():
                if x.short_name == 'default':
                    self.profile = x
                    break
        if self.profile is None:
            # Just in case the default profile is removed in the future :)
            self.profile = opts.output_profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        head = xpath(tree, '/h:html/h:head')
        if head:
            head = head[0]
        else:
            head = []

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'], profile['props'],
                                   profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                           log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in head:
            if (elem.tag == XHTML('style')
                    and elem.get('type', CSS_MIME) in OEB_STYLES):
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = XHTML_CSS_NAMESPACE + text
                    text = oeb.css_preprocessor(text)
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet,
                                item.abshref,
                                ignoreImportRules=True)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r is not CSS' %
                        (path, item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css': extra_css, 'user_css': user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception('Failed to parse %s, ignoring.' % w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for stylesheet in stylesheets:
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                rules.extend(self.flatten_rule(rule, href, index))
                index = index + 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        for _, _, cssdict, text, _ in rules:
            fl = ':first-letter' in text
            if fl:
                text = text.replace(':first-letter', '')
            selector = get_css_selector(text)
            matches = selector(tree, self.logger)
            if fl:
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter():
                        if x.text:
                            punctuation_chars = []
                            text = unicode(x.text)
                            while text:
                                if not unicodedata.category(
                                        text[0]).startswith('P'):
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]

                            special_text = u''.join(punctuation_chars) + \
                                    (text[0] if text else u'')
                            span = E.span(special_text)
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            break
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'\d+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)
Ejemplo n.º 12
0
    def transform_html(self, name, virtualize_resources):
        style_xpath = XPath('//h:style')
        link_xpath = XPath('//h:a[@href]')
        img_xpath = XPath('//h:img[@src]')
        res_link_xpath = XPath('//h:link[@href]')
        root = self.parsed(name)
        head = ensure_head(root)
        changed = False
        for style in style_xpath(root):
            # Firefox flakes out sometimes when dynamically creating <style> tags,
            # so convert them to external stylesheets to ensure they never fail
            if style.text and (style.get('type')
                               or 'text/css').lower() == 'text/css':
                in_head = has_ancestor(style, head)
                if not in_head:
                    extract(style)
                    head.append(style)
                css = style.text
                style.clear()
                style.tag = XHTML('link')
                style.set('type', 'text/css')
                style.set('rel', 'stylesheet')
                sname = self.add_file(name + '.css',
                                      css.encode('utf-8'),
                                      modify_name_if_needed=True)
                style.set('href', self.name_to_href(sname, name))
                changed = True

        # Used for viewing images
        for img in img_xpath(root):
            img_name = self.href_to_name(img.get('src'), name)
            if img_name:
                img.set('data-calibre-src', img_name)
                changed = True

        # Disable non stylsheet link tags. This link will not be loaded by the
        # browser anyway and will causes the resource load check to hang
        for link in res_link_xpath(root):
            ltype = (link.get('type') or 'text/css').lower()
            rel = (link.get('rel') or 'stylesheet').lower()
            if ltype != 'text/css' or rel != 'stylesheet':
                link.attrib.clear()
                changed = True

        # Transform <style> and style=""
        if transform_inline_styles(self,
                                   name,
                                   transform_sheet=transform_sheet,
                                   transform_style=transform_declaration):
            changed = True

        if not virtualize_resources:
            link_uid = self.book_render_data['link_uid']
            link_replacer = create_link_replacer(self, link_uid, set())
            ltm = self.book_render_data['link_to_map']
            for a in link_xpath(root):
                href = link_replacer(name, a.get('href'))
                if href and href.startswith(link_uid):
                    a.set('href', 'javascript:void(0)')
                    parts = decode_url(href.split('|')[1])
                    lname, lfrag = parts[0], parts[1]
                    ltm.setdefault(lname, {}).setdefault(lfrag or '',
                                                         set()).add(name)
                    a.set(
                        'data-' + link_uid,
                        json.dumps({
                            'name': lname,
                            'frag': lfrag
                        },
                                   ensure_ascii=False))
                    changed = True

        if changed:
            self.dirty(name)
Ejemplo n.º 13
0
class RecodeCallbackBase:
    class __metaclass__(type):
        def __init__(cls, name, bases, attrs):
            type.__init__(cls, name, bases, attrs)
            RecodeCallbackRegistry.register(cls)

    # NOTE: body is our first document structure tag, so it is safe to use
    #       it as a callback base example
    tag = XHTML('body')

    def __init__(self, converter, logger):
        self.converter = converter
        self.log = logger

        # reference counter used for tracking tags nesting
        self.refcount = 0

        # internal stack for data stashing
        self.datastack = []

    def start(self, element):
        """Element recoding entry method."""
        return self.get_begin(element) + self.get_text(element)

    def stop(self, element):
        """Element recoding exit method."""
        return self.get_end(element) + self.get_tail(element)

    def push(self, data):
        """Push data onto the internal stack storage."""
        self.datastack.append(data)

    def pop(self):
        """Pop data from the internal stack storage."""
        return self.datastack.pop()

    @classmethod
    def get_begin(cls, element):
        return ""

    @classmethod
    def get_end(cls, element):
        return ""

    @classmethod
    def get_text(cls, element):
        return cls.sanitize(element.text or "")

    @classmethod
    def get_tail(cls, element):
        return cls.sanitize(element.tail or "")

    @staticmethod
    def sanitize(text):
        """
        Return the given text with sanitized TeX special characters.

        There is ten characters which have special meaning in the TeX
        document, plus a newline which controls document structure.
        The list of all of them:
        &, %, $, #, _, {, }, ~, ^, backslash and newline
        """
        text = re.sub(r'\\', r'\\textbackslash{}', text)
        text = re.sub(r'\^', r'\\textasciicircum{}', text)
        text = re.sub(r'~', r'\\textasciitilde{}', text)
        text = re.sub(r'[&%$#_{}]', r'\\\g<0>', text)
        return re.sub(r'[\r\n]+', r' ', text)

    @staticmethod
    def get_classes(element):
        """Get classes set attached to the given element."""
        return set(element.attrib.get('class', "").split())

    @staticmethod
    def get_class_style(classes):
        """
        Get style functions. In the OEB file format, style is encoded in the
        class attribute.

        That method modifies input argument - used classes are removed.
        """
        functions = []

        for cls in classes:
            if cls == 'bold':
                functions.append("\\textbf{")
            elif cls == 'italic':
                functions.append("\\emph{")
            elif cls == 'underline':
                functions.append("\\uline{")

        # remove used (recognized) classes
        classes.difference_update((
            'bold',
            'italic',
            'underline',
        ))
        return functions

    @staticmethod
    def get_class_layout(classes):
        """
        Get layout functions. In the OEB file format layout may be encoded in
        the class attribute, e.g. page-break.

        That method modifies input argument - used classes are removed.
        """
        functions = []

        for cls in classes:
            if cls == 'mbppagebreak':
                functions.append("\\chapter*{")

        # remove used (recognized) classes
        classes.difference_update(('mbppagebreak', ))
        return functions
Ejemplo n.º 14
0
    def extract_css_into_flows(self):
        inlines = defaultdict(list)  # Ensure identical <style>s not repeated
        sheets = {}

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if not self.opts.expand_css and hasattr(item.data, 'cssText'):
                    condense_sheet(sheet)
                sheets[item.href] = len(self.flows)
                self.flows.append(sheet)

        def fix_import_rules(sheet):
            changed = False
            for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
                if rule.href:
                    href = item.abshref(rule.href)
                    idx = sheets.get(href, None)
                    if idx is not None:
                        idx = to_ref(idx)
                        rule.href = 'kindle:flow:%s?mime=text/css' % idx
                        changed = True
            return changed

        for item in self.oeb.spine:
            root = self.data(item)

            for link in XPath('//h:link[@href]')(root):
                href = item.abshref(link.get('href'))
                idx = sheets.get(href, None)
                if idx is not None:
                    idx = to_ref(idx)
                    link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

            for tag in XPath('//h:style')(root):
                p = tag.getparent()
                idx = p.index(tag)
                raw = tag.text
                if not raw or not raw.strip():
                    extract(tag)
                    continue
                sheet = cssutils.parseString(raw, validate=False)
                if fix_import_rules(sheet):
                    raw = force_unicode(sheet.cssText, 'utf-8')

                repl = etree.Element(XHTML('link'),
                                     type='text/css',
                                     rel='stylesheet')
                repl.tail = '\n'
                p.insert(idx, repl)
                extract(tag)
                inlines[raw].append(repl)

        for raw, elems in inlines.iteritems():
            idx = to_ref(len(self.flows))
            self.flows.append(raw)
            for link in elems:
                link.set('href', 'kindle:flow:%s?mime=text/css' % idx)

        for item in self.oeb.manifest:
            if item.media_type in OEB_STYLES:
                sheet = self.data(item)
                if hasattr(sheet, 'cssRules'):
                    fix_import_rules(sheet)

        for i, sheet in enumerate(tuple(self.flows)):
            if hasattr(sheet, 'cssText'):
                self.flows[i] = force_unicode(sheet.cssText, 'utf-8')
Ejemplo n.º 15
0
    def read_inline_toc(self, href, frag):
        ans = TOC()
        base_href = '/'.join(href.split('/')[:-1])
        with open(href.replace('/', os.sep), 'rb') as f:
            raw = f.read().decode(self.header.codec)
        root = parse_html(raw, log=self.log)
        body = XPath('//h:body')(root)
        reached = False
        if body:
            start = body[0]
        else:
            start = None
            reached = True
        if frag:
            elems = XPath('//*[@id="%s"]'%frag)(root)
            if elems:
                start = elems[0]

        def node_depth(elem):
            ans = 0
            parent = elem.getparent()
            while parent is not None:
                parent = parent.getparent()
                ans += 1
            return ans

        # Layer the ToC based on nesting order in the source HTML
        current_depth = None
        parent = ans
        seen = set()
        links = []
        for elem in root.iterdescendants(etree.Element):
            if reached and elem.tag == XHTML('a') and elem.get('href',
                    False):
                href = elem.get('href')
                href, frag = urldefrag(href)
                href = base_href + '/' + href
                text = xml2text(elem).strip()
                if (text, href, frag) in seen:
                    continue
                seen.add((text, href, frag))
                links.append((text, href, frag, node_depth(elem)))
            elif elem is start:
                reached = True

        depths = sorted(set(x[-1] for x in links))
        depth_map = {x:i for i, x in enumerate(depths)}
        for text, href, frag, depth in links:
            depth = depth_map[depth]
            if current_depth is None:
                current_depth = 0
                parent.add_item(href, frag, text)
            elif current_depth == depth:
                parent.add_item(href, frag, text)
            elif current_depth < depth:
                parent = parent[-1] if len(parent) > 0 else parent
                parent.add_item(href, frag, text)
                current_depth += 1
            else:
                delta = current_depth - depth
                while delta > 0 and parent.parent is not None:
                    parent = parent.parent
                    delta -= 1
                parent.add_item(href, frag, text)
                current_depth = depth
        return ans
Ejemplo n.º 16
0
from lxml import etree
from PyQt4.QtCore import Qt
from PyQt4.QtCore import QByteArray
from PyQt4.QtCore import QBuffer
from PyQt4.QtCore import QIODevice
from PyQt4.QtGui import QColor
from PyQt4.QtGui import QImage
from PyQt4.QtGui import QPainter
from PyQt4.QtSvg import QSvgRenderer
from calibre.ebooks.oeb.base import XHTML, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer

IMAGE_TAGS = set([XHTML('img'), XHTML('object')])
KEEP_ATTRS = set(['class', 'style', 'width', 'height', 'align'])


class Unavailable(Exception):
    pass


class SVGRasterizer(object):
    def __init__(self):
        from calibre.gui2 import is_ok_to_use_qt
        if not is_ok_to_use_qt():
            raise Unavailable('Not OK to use Qt')

    @classmethod
    def config(cls, cfg):
Ejemplo n.º 17
0
 def a(anchor):
     div.append(div.makeelement(XHTML('a'), href='#' + anchor))
     div[-1].text = '\xa0'
     div[-1].tail = ' '
Ejemplo n.º 18
0
import os, re

from PyQt5.Qt import (
    Qt, QByteArray, QBuffer, QIODevice, QColor, QImage, QPainter, QSvgRenderer)
from calibre.ebooks.oeb.base import XHTML, XLINK
from calibre.ebooks.oeb.base import SVG_MIME, PNG_MIME
from calibre.ebooks.oeb.base import xml2str, xpath
from calibre.ebooks.oeb.base import urlnormalize
from calibre.ebooks.oeb.stylizer import Stylizer
from calibre.ptempfile import PersistentTemporaryFile
from calibre.utils.imghdr import what
from polyglot.builtins import unicode_type
from polyglot.urllib import urldefrag

IMAGE_TAGS = {XHTML('img'), XHTML('object')}
KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'}


class Unavailable(Exception):
    pass


class SVGRasterizer(object):

    def __init__(self, base_css=''):
        self.base_css = base_css
        from calibre.gui2 import must_use_qt
        must_use_qt()

    @classmethod
Ejemplo n.º 19
0
    def convert_epub3_nav(self, nav_path, opf, log):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX
        from calibre.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as f:
            raw = f.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                             assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring(
            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>'
        )
        navmap = ncx[0]
        et = '{%s}type' % EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            href = text = None
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                text = etree.tostring(
                    x, method='text', encoding=unicode,
                    with_tail=False).strip() or ' '.join(
                        x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href:
                    if href.startswith('#'):
                        href = bn + href
                break
            np = parent.makeelement(NCX('navPoint'))
            parent.append(np)
            np.append(np.makeelement(NCX('navLabel')))
            np[0].append(np.makeelement(NCX('text')))
            np[0][0].text = text
            if href:
                np.append(np.makeelement(NCX('content'), attrib={'src': href}))
            return np

        def process_nav_node(node, toc_parent):
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(et) == 'toc':
                ol = first_child(nav, XHTML('ol'))
                if ol is not None:
                    process_nav_node(ol, navmap)
                    break
        else:
            return

        with NamedTemporaryFile(suffix='.ncx',
                                dir=os.path.dirname(nav_path),
                                delete=False) as f:
            f.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_id = opf.add_path_to_manifest(f.name, NCX_MIME)
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
Ejemplo n.º 20
0
    def convert_epub3_nav(self, nav_path, opf, log, opts):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.oeb.polish.parsing import parse
        from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
        from calibre.ebooks.oeb.polish.toc import first_child
        from tempfile import NamedTemporaryFile
        with lopen(nav_path, 'rb') as f:
            raw = f.read()
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                             assume_utf8=True)[0]
        root = parse(raw, log=log)
        ncx = etree.fromstring(
            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>'
        )
        navmap = ncx[0]
        et = '{%s}type' % EPUB_NS
        bn = os.path.basename(nav_path)

        def add_from_li(li, parent):
            href = text = None
            for x in li.iterchildren(XHTML('a'), XHTML('span')):
                text = etree.tostring(
                    x, method='text', encoding=unicode_type,
                    with_tail=False).strip() or ' '.join(
                        x.xpath('descendant-or-self::*/@title')).strip()
                href = x.get('href')
                if href:
                    if href.startswith('#'):
                        href = bn + href
                break
            np = parent.makeelement(NCX('navPoint'))
            parent.append(np)
            np.append(np.makeelement(NCX('navLabel')))
            np[0].append(np.makeelement(NCX('text')))
            np[0][0].text = text
            if href:
                np.append(np.makeelement(NCX('content'), attrib={'src': href}))
            return np

        def process_nav_node(node, toc_parent):
            for li in node.iterchildren(XHTML('li')):
                child = add_from_li(li, toc_parent)
                ol = first_child(li, XHTML('ol'))
                if child is not None and ol is not None:
                    process_nav_node(ol, child)

        for nav in root.iterdescendants(XHTML('nav')):
            if nav.get(et) == 'toc':
                ol = first_child(nav, XHTML('ol'))
                if ol is not None:
                    process_nav_node(ol, navmap)
                    break
        else:
            return

        with NamedTemporaryFile(suffix='.ncx',
                                dir=os.path.dirname(nav_path),
                                delete=False) as f:
            f.write(etree.tostring(ncx, encoding='utf-8'))
        ncx_href = os.path.relpath(f.name, os.getcwdu()).replace(os.sep, '/')
        ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME,
                                          append=True).get('id')
        for spine in opf.root.xpath('//*[local-name()="spine"]'):
            spine.set('toc', ncx_id)
        opts.epub3_nav_href = urlnormalize(
            os.path.relpath(nav_path).replace(os.sep, '/'))
        opts.epub3_nav_parsed = root
        if getattr(self, 'removed_cover', None):
            changed = False
            base_path = os.path.dirname(nav_path)
            for elem in root.xpath('//*[@href]'):
                href, frag = elem.get('href').partition('#')[::2]
                link_path = os.path.relpath(
                    os.path.join(base_path, urlunquote(href)), base_path)
                abs_href = urlnormalize(link_path)
                if abs_href == self.removed_cover:
                    changed = True
                    elem.set('data-calibre-removed-titlepage', '1')
            if changed:
                with open(nav_path, 'wb') as f:
                    f.write(serialize(root, 'application/xhtml+xml'))
Ejemplo n.º 21
0
 def collapse_li(parent):
     for li in parent.iterdescendants(XHTML('li')):
         if len(li) == 1:
             li.text = None
             li[0].tail = None
Ejemplo n.º 22
0
def merge_html(container, names, master):
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    anchor_map = {n: {} for n in names if n != master}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'),
                                        rel='stylesheet',
                                        type='text/css',
                                        href=container.name_to_href(
                                            sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(
                body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        first_child = ''
        for first_child in children:
            if not isinstance(first_child, basestring):
                break
        if isinstance(first_child, basestring):
            # body contained only text, no tags
            first_child = body.makeelement(XHTML('p'))
            first_child.text, children[0] = children[0], first_child

        amap = anchor_map[name]
        remove_name_attributes(root)

        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
            else:
                seen_anchors.add(val)

        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))

        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        for child in children:
            if isinstance(child, basestring):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in container.mime_map.iteritems():
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)
Ejemplo n.º 23
0
    ans['font-family'] = tuple(x.value for x in ff)
    for p in 'weight', 'style', 'stretch':
        p = 'font-' + p
        rp = resolve_property(elem, p) if pseudo is None else resolve_property(
            elem, pseudo, p)
        ans[p] = unicode_type(rp[0].value)
    normalize_font_properties(ans)
    return ans


bad_fonts = {
    'serif', 'sans-serif', 'monospace', 'cursive', 'fantasy', 'sansserif',
    'inherit'
}
exclude_chars = frozenset(ord_string('\n\r\t'))
skip_tags = {XHTML(x) for x in 'script style title meta link'.split()}
font_keys = {'font-weight', 'font-style', 'font-stretch', 'font-family'}


def prepare_font_rule(cssdict):
    cssdict['font-family'] = frozenset(cssdict['font-family'][:1])
    cssdict['width'] = widths[cssdict['font-stretch']]
    cssdict['weight'] = int(cssdict['font-weight'])


class StatsCollector:

    first_letter_pat = capitalize_pat = None

    def __init__(self, container, do_embed=False):
        if self.first_letter_pat is None:
Ejemplo n.º 24
0
def commit_nav_toc(container, toc, lang=None):
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree
    tocname = find_existing_nav_toc(container)
    if tocname is None:
        item = container.generate_item('nav.xhtml', id_prefix='nav')
        item.set('properties', 'nav')
        tocname = container.href_to_name(item.get('href'),
                                         base=container.opf_name)
    try:
        root = container.parsed(tocname)
    except KeyError:
        root = container.parse_xhtml(
            P('templates/new_nav.html', data=True).decode('utf-8'))
    et = '{%s}type' % EPUB_NS
    navs = [
        n for n in root.iterdescendants(XHTML('nav')) if n.get(et) == 'toc'
    ]
    for x in navs[1:]:
        extract(x)
    if navs:
        nav = navs[0]
        tail = nav.tail
        attrib = dict(nav.attrib)
        nav.clear()
        nav.attrib.update(attrib)
        nav.tail = tail
    else:
        nav = root.makeelement(XHTML('nav'))
        first_child(root, XHTML('body')).append(nav)
    nav.set('{%s}type' % EPUB_NS, 'toc')
    if toc.toc_title:
        nav.append(nav.makeelement(XHTML('h1')))
        nav[-1].text = toc.toc_title

    rnode = nav.makeelement(XHTML('ol'))
    nav.append(rnode)
    to_href = partial(container.name_to_href, base=tocname)
    spat = re.compile(r'\s+')

    def process_node(xml_parent, toc_parent):
        for child in toc_parent:
            li = xml_parent.makeelement(XHTML('li'))
            xml_parent.append(li)
            title = child.title or ''
            title = spat.sub(' ', title).strip()
            a = li.makeelement(XHTML('a' if child.dest else 'span'))
            a.text = title
            li.append(a)
            if child.dest:
                href = to_href(child.dest)
                if child.frag:
                    href += '#' + child.frag
                a.set('href', href)
            if len(child):
                ol = li.makeelement(XHTML('ol'))
                li.append(ol)
                process_node(ol, child)

    process_node(rnode, toc)
    pretty_xml_tree(rnode)
    for li in rnode.iterdescendants(XHTML('li')):
        if len(li) == 1:
            li.text = None
            li[0].tail = None
    container.replace(tocname, root)
Ejemplo n.º 25
0
    def mobimlize_elem(self,
                       elem,
                       stylizer,
                       bstate,
                       istates,
                       ignore_valign=False):
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            id_ = elem.get('id', None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set('id', id_)
                elem.tail = tail
                elem.tag = XHTML('a')
            else:
                return
        tag = barename(elem.tag)
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == 'ol' and 'start' in elem.attrib:
            try:
                istate.list_num = int(elem.attrib['start']) - 1
            except:
                pass
        istates.append(istate)
        left = 0
        display = style['display']
        if display == 'table-cell':
            display = 'inline'
        elif display.startswith('table'):
            display = 'block'
        isblock = (not display.startswith('inline')
                   and style['display'] != 'none')
        isblock = isblock and style['float'] == 'none'
        isblock = isblock and tag != 'br'
        if isblock:
            bstate.para = None
            istate.halign = style['text-align']
            istate.indent = style['text-indent']
            if style['margin-left'] == 'auto' \
               and style['margin-right'] == 'auto':
                istate.halign = 'center'
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            if tag != 'body':
                left = margin + padding
            istate.left += left
            vmargin = asfloat(style['margin-top'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-top'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style['font-size']))
                elem.text = (u'\xa0' * spaces) + (elem.text or '')
            margin = asfloat(style['margin-right'])
            padding = asfloat(style['padding-right'])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style['font-size']))
                if len(elem) == 0:
                    elem.text = (elem.text or '') + (u'\xa0' * spaces)
                else:
                    last = elem[-1]
                    last.text = (last.text or '') + (u'\xa0' * spaces)
        if bstate.content and style['page-break-before'] in PAGE_BREAKS:
            bstate.pbreak = True
        istate.fsize = self.mobimlize_font(style['font-size'])
        istate.italic = True if style['font-style'] == 'italic' else False
        weight = style['font-weight']
        istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
        istate.preserve = (style['white-space'] in ('pre', 'pre-wrap'))
        istate.bgcolor = style['background-color']
        istate.fgcolor = style['color']
        istate.strikethrough = style.effective_text_decoration == 'line-through'
        istate.underline = style.effective_text_decoration == 'underline'
        ff = style['font-family'].lower() if style['font-family'] else ''
        if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
            istate.family = 'monospace'
        elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff
              or 'arial' in ff or 'helvetica' in ff):
            istate.family = 'sans-serif'
        else:
            istate.family = 'serif'
        if 'id' in elem.attrib:
            istate.ids.add(elem.attrib['id'])
        if 'name' in elem.attrib:
            istate.ids.add(elem.attrib['name'])
        if tag == 'a' and 'href' in elem.attrib:
            istate.href = elem.attrib['href']
        istate.attrib.clear()
        if tag == 'img' and 'src' in elem.attrib:
            istate.attrib['src'] = elem.attrib['src']
            istate.attrib['align'] = 'baseline'
            cssdict = style.cssdict()
            valign = cssdict.get('vertical-align', None)
            if valign in ('top', 'bottom', 'middle'):
                istate.attrib['align'] = valign
            for prop in ('width', 'height'):
                if cssdict[prop] != 'auto':
                    value = style[prop]
                    if value == getattr(self.profile, prop):
                        result = '100%'
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(
                                round(float(value) / (72. / self.profile.dpi)))
                        except:
                            continue
                        result = str(pixs)
                    istate.attrib[prop] = result
            if 'width' not in istate.attrib or 'height' not in istate.attrib:
                href = self.current_spine_item.abshref(elem.attrib['src'])
                try:
                    item = self.oeb.manifest.hrefs[urlnormalize(href)]
                except:
                    self.oeb.logger.warn('Failed to find image:', href)
                else:
                    try:
                        width, height = identify_data(item.data)[:2]
                    except:
                        self.oeb.logger.warn('Invalid image:', href)
                    else:
                        if 'width' not in istate.attrib and 'height' not in \
                                    istate.attrib:
                            istate.attrib['width'] = str(width)
                            istate.attrib['height'] = str(height)
                        else:
                            ar = float(width) / float(height)
                            if 'width' not in istate.attrib:
                                try:
                                    width = int(istate.attrib['height']) * ar
                                except:
                                    pass
                                istate.attrib['width'] = str(int(width))
                            else:
                                try:
                                    height = int(istate.attrib['width']) / ar
                                except:
                                    pass
                                istate.attrib['height'] = str(int(height))
                        item.unload_data_from_memory()
        elif tag == 'hr' and asfloat(style['width']) > 0:
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
        elif display == 'table':
            tag = 'table'
        elif display == 'table-row':
            tag = 'tr'
        elif display == 'table-cell':
            tag = 'td'
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = 'span' if tag == 'td' else 'div'

        if tag in ('table', 'td', 'tr'):
            col = style.backgroundColor
            if col:
                elem.set('bgcolor', col)
            css = style.cssdict()
            if 'border' in css or 'border-width' in css:
                elem.set('border', '1')
        if tag in TABLE_TAGS:
            for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                         'bgcolor'):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == 'q':
            t = elem.text
            if not t:
                t = ''
            elem.text = u'\u201c' + t
            t = elem.tail
            if not t:
                t = ''
            elem.tail = u'\u201d' + t
        text = None
        if elem.text:
            if istate.preserve:
                text = elem.text
            else:
                text = COLLAPSE.sub(' ', elem.text)
        valign = style['vertical-align']
        not_baseline = valign in ('super', 'sub', 'text-top', 'text-bottom',
                                  'top', 'bottom') or (isinstance(
                                      valign,
                                      (float, int)) and abs(valign) != 0)
        issup = valign in ('super', 'text-top',
                           'top') or (isinstance(valign,
                                                 (float, int)) and valign > 0)
        vtag = 'sup' if issup else 'sub'
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
            vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
            self.mobimlize_elem(elem,
                                stylizer,
                                vbstate,
                                istates,
                                ignore_valign=True)
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                self.mobimlize_content('span', '', bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, XHTML(vtag))
                vtag = etree.SubElement(vtag, XHTML('small'))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    for child in vbstate.para:
                        vtag.append(child)
                return

        if tag == 'blockquote':
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False

        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
                # We have an id but no text and no children, the id should still
                # be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u')
                and len(elem) == 0)):
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            tail = None
            if child.tail:
                if istate.preserve:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)

        if tag == 'blockquote':
            self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style['page-break-after'] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            if para is not None and para.text == u'\xa0' and len(para) < 1:
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(XHTML('br')))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            vmargin = asfloat(style['margin-bottom'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-bottom'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        istates.pop()
Ejemplo n.º 26
0
def render_jacket(mi,
                  output_profile,
                  alt_title=_('Unknown'),
                  alt_tags=[],
                  alt_comments='',
                  alt_publisher='',
                  rescale_fonts=False,
                  alt_authors=None):
    css = P('jacket/stylesheet.css', data=True).decode('utf-8')
    template = P('jacket/template.xhtml', data=True).decode('utf-8')

    template = re.sub(r'<!--.*?-->', '', template, flags=re.DOTALL)
    css = re.sub(r'/\*.*?\*/', '', css, flags=re.DOTALL)

    try:
        title_str = alt_title if mi.is_null('title') else mi.title
    except:
        title_str = _('Unknown')
    title_str = escape(title_str)
    title = '<span class="title">%s</span>' % title_str

    series = Series(mi.series, mi.series_index)
    try:
        publisher = mi.publisher if not mi.is_null(
            'publisher') else alt_publisher
    except:
        publisher = ''
    publisher = escape(publisher)

    try:
        if is_date_undefined(mi.pubdate):
            pubdate = ''
        else:
            dt = as_local_time(mi.pubdate)
            pubdate = strftime('%Y', dt.timetuple())
    except:
        pubdate = ''

    rating = get_rating(mi.rating, output_profile.ratings_char,
                        output_profile.empty_ratings_char)

    tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)

    comments = mi.comments if mi.comments else alt_comments
    comments = comments.strip()
    if comments:
        comments = comments_to_html(comments)

    orig = mi.authors
    if mi.is_null('authors'):
        mi.authors = list(alt_authors or (_('Unknown'), ))
    try:
        author = mi.format_authors()
    except:
        author = ''
    mi.authors = orig
    author = escape(author)
    has_data = {}

    def generate_html(comments):
        args = dict(
            xmlns=XHTML_NS,
            title_str=title_str,
            css=css,
            title=title,
            author=author,
            publisher=publisher,
            pubdate_label=_('Published'),
            pubdate=pubdate,
            series_label=ngettext('Series', 'Series', 1),
            series=series,
            rating_label=_('Rating'),
            rating=rating,
            tags_label=_('Tags'),
            tags=tags,
            comments=comments,
            footer='',
            searchable_tags=' '.join(
                escape(t) + 'ttt' for t in tags.tags_list),
        )
        for key in mi.custom_field_keys():
            m = mi.get_user_metadata(key, False) or {}
            try:
                display_name, val = mi.format_field_extended(key)[:2]
                dkey = key.replace('#', '_')
                dt = m.get('datatype')
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(
                        mi.get(key),
                        m.get('display', {}).get('allow_half_stars', False))
                elif dt == 'comments':
                    val = val or ''
                    display = m.get('display', {})
                    ctype = display.get('interpret_as') or 'html'
                    if ctype == 'long-text':
                        val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(
                            val)
                    elif ctype == 'short-text':
                        val = '<span>%s</span>' % escape(val)
                    elif ctype == 'markdown':
                        val = markdown(val)
                    else:
                        val = comments_to_html(val)
                    args[dkey] = val
                else:
                    args[dkey] = escape(val)
                args[dkey + '_label'] = escape(display_name)
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                pass

        if False:
            print("Custom column values available in jacket template:")
            for key in args.keys():
                if key.startswith('_') and not key.endswith('_label'):
                    print(" %s: %s" % ('#' + key[1:], args[key]))

        # Used in the comment describing use of custom columns in templates
        # Don't change this unless you also change it in template.xhtml
        args['_genre_label'] = args.get('_genre_label', '{_genre_label}')
        args['_genre'] = args.get('_genre', '{_genre}')

        formatter = SafeFormatter()
        generated_html = formatter.format(template, **args)
        has_data['series'] = bool(series)
        has_data['tags'] = bool(tags)
        has_data['rating'] = bool(rating)
        has_data['pubdate'] = bool(pubdate)

        return strip_encoding_declarations(generated_html)

    from calibre.ebooks.oeb.polish.parsing import parse
    raw = generate_html(comments)
    root = parse(raw, line_numbers=False, force_html5_parse=True)

    if rescale_fonts:
        # We ensure that the conversion pipeline will set the font sizes for
        # text in the jacket to the same size as the font sizes for the rest of
        # the text in the book. That means that as long as the jacket uses
        # relative font sizes (em or %), the post conversion font size will be
        # the same as for text in the main book. So text with size x em will
        # be rescaled to the same value in both the jacket and the main content.
        #
        # We cannot use data-calibre-rescale 100 on the body tag as that will just
        # give the body tag a font size of 1em, which is useless.
        for body in root.xpath('//*[local-name()="body"]'):
            fw = body.makeelement(XHTML('div'))
            fw.set('data-calibre-rescale', '100')
            for child in body:
                fw.append(child)
            body.append(fw)
    postprocess_jacket(root, output_profile, has_data)
    from calibre.ebooks.oeb.polish.pretty import pretty_html_tree
    pretty_html_tree(None, root)
    return root
Ejemplo n.º 27
0
def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None):
    from calibre.ebooks.oeb.polish.pretty import pretty_xml_tree
    tocname = find_existing_nav_toc(container)
    if previous_nav is not None:
        nav_name = container.href_to_name(previous_nav[0])
        if nav_name and container.exists(nav_name):
            tocname = nav_name
            container.apply_unique_properties(tocname, 'nav')
    if tocname is None:
        item = container.generate_item('nav.xhtml', id_prefix='nav')
        item.set('properties', 'nav')
        tocname = container.href_to_name(item.get('href'), base=container.opf_name)
        if previous_nav is not None:
            root = previous_nav[1]
        else:
            root = container.parse_xhtml(P('templates/new_nav.html', data=True).decode('utf-8'))
        container.replace(tocname, root)
    else:
        root = container.parsed(tocname)
    if lang:
        lang = lang_as_iso639_1(lang) or lang
        root.set('lang', lang)
        root.set('{%s}lang' % XML_NS, lang)
    nav = ensure_single_nav_of_type(root, 'toc')
    if toc.toc_title:
        nav.append(nav.makeelement(XHTML('h1')))
        nav[-1].text = toc.toc_title

    rnode = nav.makeelement(XHTML('ol'))
    nav.append(rnode)
    to_href = partial(container.name_to_href, base=tocname)
    spat = re.compile(r'\s+')

    def process_node(xml_parent, toc_parent):
        for child in toc_parent:
            li = xml_parent.makeelement(XHTML('li'))
            xml_parent.append(li)
            title = child.title or ''
            title = spat.sub(' ', title).strip()
            a = li.makeelement(XHTML('a' if child.dest else 'span'))
            a.text = title
            li.append(a)
            if child.dest:
                href = to_href(child.dest)
                if child.frag:
                    href += '#'+child.frag
                a.set('href', href)
            if len(child):
                ol = li.makeelement(XHTML('ol'))
                li.append(ol)
                process_node(ol, child)
    process_node(rnode, toc)
    pretty_xml_tree(nav)

    def collapse_li(parent):
        for li in parent.iterdescendants(XHTML('li')):
            if len(li) == 1:
                li.text = None
                li[0].tail = None
    collapse_li(nav)
    nav.tail = '\n'

    def create_li(ol, entry):
        li = ol.makeelement(XHTML('li'))
        ol.append(li)
        a = li.makeelement(XHTML('a'))
        li.append(a)
        href = container.name_to_href(entry['dest'], tocname)
        if entry['frag']:
            href += '#' + entry['frag']
        a.set('href', href)
        return a

    if landmarks is not None:
        nav = ensure_single_nav_of_type(root, 'landmarks')
        nav.set('hidden', '')
        ol = nav.makeelement(XHTML('ol'))
        nav.append(ol)
        for entry in landmarks:
            if entry['type'] and container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
                a = create_li(ol, entry)
                a.set('{%s}type' % EPUB_NS, entry['type'])
                a.text = entry['title'] or None
        pretty_xml_tree(nav)
        collapse_li(nav)

    if toc.page_list:
        nav = ensure_single_nav_of_type(root, 'page-list')
        nav.set('hidden', '')
        ol = nav.makeelement(XHTML('ol'))
        nav.append(ol)
        for entry in toc.page_list:
            if container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
                a = create_li(ol, entry)
                a.text = unicode_type(entry['pagenum'])
        pretty_xml_tree(nav)
        collapse_li(nav)
    container.replace(tocname, root)
Ejemplo n.º 28
0
    def __init__(self,
                 tree,
                 path,
                 oeb,
                 opts,
                 profile=None,
                 extra_css='',
                 user_css=''):
        self.oeb, self.opts = oeb, opts
        self.profile = profile
        if self.profile is None:
            self.profile = opts.output_profile
        self.logger = oeb.logger
        item = oeb.manifest.hrefs[path]
        basename = os.path.basename(path)
        cssname = os.path.splitext(basename)[0] + '.css'
        stylesheets = [html_css_stylesheet()]
        head = xpath(tree, '/h:html/h:head')
        if head:
            head = head[0]
        else:
            head = []

        # Add cssutils parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
            cssprofiles.addProfile(profile['name'], profile['props'],
                                   profile['macros'])

        parser = CSSParser(fetcher=self._fetch_css_file,
                           log=logging.getLogger('calibre.css'))
        self.font_face_rules = []
        for elem in head:
            if (elem.tag == XHTML('style')
                    and elem.get('type', CSS_MIME) in OEB_STYLES):
                text = elem.text if elem.text else u''
                for x in elem:
                    t = getattr(x, 'text', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                    t = getattr(x, 'tail', None)
                    if t:
                        text += u'\n\n' + force_unicode(t, u'utf-8')
                if text:
                    text = oeb.css_preprocessor(text, add_namespace=False)
                    # We handle @import rules separately
                    parser.setFetcher(lambda x: ('utf-8', b''))
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    parser.setFetcher(self._fetch_css_file)
                    #stylesheet.namespaces['h'] = XHTML_NS
                    for rule in stylesheet.cssRules:
                        if rule.type == rule.IMPORT_RULE:
                            ihref = item.abshref(rule.href)
                            if rule.media.mediaText == 'amzn-mobi':
                                continue
                            hrefs = self.oeb.manifest.hrefs
                            if ihref not in hrefs:
                                self.logger.warn(
                                    'Ignoring missing stylesheet in @import rule:',
                                    rule.href)
                                continue
                            sitem = hrefs[ihref]
                            if sitem.media_type not in OEB_STYLES:
                                self.logger.warn(
                                    'CSS @import of non-CSS file %r' %
                                    rule.href)
                                continue
                            stylesheets.append(sitem.data)
                    # Make links to resources absolute, since these rules will
                    # be folded into a stylesheet at the root
                    replaceUrls(stylesheet,
                                item.abshref,
                                ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif elem.tag == XHTML('link') and elem.get('href') \
                 and elem.get('rel', 'stylesheet').lower() == 'stylesheet' \
                 and elem.get('type', CSS_MIME).lower() in OEB_STYLES:
                href = urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r not in manifest' %
                        (path, item.href))
                    continue
                if not hasattr(sitem.data, 'cssRules'):
                    self.logger.warn(
                        'Stylesheet %r referenced by file %r is not CSS' %
                        (path, item.href))
                    continue
                stylesheets.append(sitem.data)
        csses = {'extra_css': extra_css, 'user_css': user_css}
        for w, x in csses.items():
            if x:
                try:
                    text = XHTML_CSS_NAMESPACE + x
                    stylesheet = parser.parseString(text,
                                                    href=cssname,
                                                    validate=False)
                    #stylesheet.namespaces['h'] = XHTML_NS
                    stylesheets.append(stylesheet)
                except:
                    self.logger.exception('Failed to parse %s, ignoring.' % w)
                    self.logger.debug('Bad css: ')
                    self.logger.debug(x)
        rules = []
        index = 0
        self.stylesheets = set()
        self.page_rule = {}
        for stylesheet in stylesheets:
            href = stylesheet.href
            self.stylesheets.add(href)
            for rule in stylesheet.cssRules:
                if rule.type == rule.MEDIA_RULE:
                    media = {
                        rule.media.item(i)
                        for i in xrange(rule.media.length)
                    }
                    if not media.intersection({'all', 'screen', 'amzn-kf8'}):
                        continue
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(subrule, href, index))
                        index += 1
                else:
                    rules.extend(self.flatten_rule(rule, href, index))
                    index = index + 1
        rules.sort()
        self.rules = rules
        self._styles = {}
        pseudo_pat = re.compile(
            ur':(first-letter|first-line|link|hover|visited|active|focus|before|after)',
            re.I)
        for _, _, cssdict, text, _ in rules:
            fl = pseudo_pat.search(text)
            if fl is not None:
                text = text.replace(fl.group(), '')
            selector = get_css_selector(text, self.oeb.log)
            matches = selector(tree, self.logger)
            if fl is not None:
                fl = fl.group(1)
                if fl == 'first-letter' and getattr(self.oeb,
                                                    'plumber_output_format',
                                                    '').lower() == u'mobi':
                    # Fake first-letter
                    from lxml.builder import ElementMaker
                    E = ElementMaker(namespace=XHTML_NS)
                    for elem in matches:
                        for x in elem.iter():
                            if x.text:
                                punctuation_chars = []
                                text = unicode(x.text)
                                while text:
                                    category = unicodedata.category(text[0])
                                    if category[0] not in {'P', 'Z'}:
                                        break
                                    punctuation_chars.append(text[0])
                                    text = text[1:]

                                special_text = u''.join(punctuation_chars) + \
                                        (text[0] if text else u'')
                                span = E.span(special_text)
                                span.tail = text[1:]
                                x.text = None
                                x.insert(0, span)
                                self.style(span)._update_cssdict(cssdict)
                                break
                else:  # Element pseudo-class
                    for elem in matches:
                        self.style(elem)._update_pseudo_class(fl, cssdict)
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'\d+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
                    style._style.get('height', 'auto') != 'auto'
            if not is_styled:
                # Update img style dimension using width and height
                upd = {}
                for prop in ('width', 'height'):
                    val = elem.get(prop, '').strip()
                    try:
                        del elem.attrib[prop]
                    except:
                        pass
                    if val:
                        if num_pat.match(val) is not None:
                            val += 'px'
                        upd[prop] = val
                if upd:
                    style._update_cssdict(upd)