Exemple #1
0
 def _clean_opf(self, opf):
     nsmap = {}
     for elem in opf.iter(tag=etree.Element):
         nsmap.update(elem.nsmap)
     for elem in opf.iter(tag=etree.Element):
         if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag):
             elem.tag = OPF(barename(elem.tag))
     nsmap.update(OPF2_NSMAP)
     attrib = dict(opf.attrib)
     nroot = etree.Element(OPF('package'),
         nsmap={None: OPF2_NS}, attrib=attrib)
     metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
     ignored = (OPF('dc-metadata'), OPF('x-metadata'))
     for elem in xpath(opf, 'o2:metadata//*'):
         if elem.tag in ignored:
             continue
         if namespace(elem.tag) in DC_NSES:
             tag = barename(elem.tag).lower()
             elem.tag = '{%s}%s' % (DC11_NS, tag)
         if elem.tag.startswith('dc:'):
             tag = elem.tag.partition(':')[-1].lower()
             elem.tag = '{%s}%s' % (DC11_NS, tag)
         metadata.append(elem)
     for element in xpath(opf, 'o2:metadata//o2:meta'):
         metadata.append(element)
     for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
         for element in xpath(opf, tag):
             nroot.append(element)
     return nroot
Exemple #2
0
    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
        tags.append(tag)

        # Remove attributes we won't want.
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Write the tag.
        text.append('<%s%s' % (tag, at))
        if tag in SELF_CLOSING_TAGS:
            text.append(' />')
        else:
            text.append('>')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
Exemple #3
0
 def serialize_elem(self, elem, item, nsrmap=NSRMAP):
     buf = self.buf
     if not isinstance(elem.tag, basestring) \
         or namespace(elem.tag) not in nsrmap:
             return
     tag = prefixname(elem.tag, nsrmap)
     # Previous layers take care of @name
     id_ = elem.attrib.pop('id', None)
     if id_:
         href = '#'.join((item.href, id_))
         offset = self.anchor_offset or buf.tell()
         key = urlnormalize(href)
         # Only set this id_offset if it wasn't previously seen
         self.id_offsets[key] = self.id_offsets.get(key, offset)
     if self.anchor_offset is not None and \
         tag == 'a' and not elem.attrib and \
         not len(elem) and not elem.text:
             return
     self.anchor_offset = buf.tell()
     buf.write(b'<')
     buf.write(tag.encode('utf-8'))
     if elem.attrib:
         for attr, val in elem.attrib.items():
             if namespace(attr) not in nsrmap:
                 continue
             attr = prefixname(attr, nsrmap)
             buf.write(b' ')
             if attr == 'href':
                 if self.serialize_href(val, item):
                     continue
             elif attr == 'src':
                 href = urlnormalize(item.abshref(val))
                 if href in self.images:
                     index = self.images[href]
                     self.used_images.add(href)
                     buf.write(b'recindex="%05d"' % index)
                     continue
             buf.write(attr.encode('utf-8'))
             buf.write(b'="')
             self.serialize_text(val, quot=True)
             buf.write(b'"')
     buf.write(b'>')
     if elem.text or len(elem) > 0:
         if elem.text:
             self.anchor_offset = None
             self.serialize_text(elem.text)
         for child in elem:
             self.serialize_elem(child, item)
             if child.tail:
                 self.anchor_offset = None
                 self.serialize_text(child.tail)
     buf.write(b'</%s>' % tag.encode('utf-8'))
Exemple #4
0
    def _read_opf(self):
        data = self.oeb.container.read(None)
        data = self.oeb.decode(data)
        data = XMLDECL_RE.sub('', data)
        data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                OPF1_NS, data)
        try:
            opf = etree.fromstring(data)
        except etree.XMLSyntaxError:
            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
            try:
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid HTML named entities')
            except etree.XMLSyntaxError:
                data = re.sub(r'(?is)<tours>.+</tours>', '', data)
                data = data.replace('<dc-metadata>',
                    '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
                try:
                    opf = etree.fromstring(data)
                    self.logger.warn('OPF contains invalid tours section')
                except etree.XMLSyntaxError:
                    from calibre.ebooks.oeb.parse_utils import RECOVER_PARSER
                    opf = etree.fromstring(data, parser=RECOVER_PARSER)
                    self.logger.warn('OPF contains invalid markup, trying to parse it anyway')

        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
            raise OEBError('Invalid namespace %r for OPF document' % ns)
        opf = self._clean_opf(opf)
        return opf
Exemple #5
0
 def mangle_elem(self, elem, stylizer):
     if not isinstance(elem.tag, basestring) or \
        namespace(elem.tag) != XHTML_NS:
         return
     children = list(elem)
     style = stylizer.style(elem)
     transform = style['text-transform']
     variant = style['font-variant']
     if elem.text:
         if transform in TEXT_TRANSFORMS:
             elem.text = self.text_transform(transform, elem.text)
         if variant == 'small-caps':
             self.smallcaps_elem(elem, 'text')
     for child in children:
         self.mangle_elem(child, stylizer)
         if child.tail:
             if transform in TEXT_TRANSFORMS:
                 child.tail = self.text_transform(transform, child.tail)
             if variant == 'small-caps':
                 self.smallcaps_elem(child, 'tail')
Exemple #6
0
    def dump_text(self, subitems, elem, stylizer, end='', pre=False, li=''):
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        if elem.attrib.get('id') is not None and elem.attrib['id'] in [href for href, title in subitems]:
            if self.curSubItem is not None and self.curSubItem != elem.attrib['id']:
                self.curSubItem = elem.attrib['id']
                text.append(u'\n\n%s%s\n\n' % (CALIBRE_SNB_BM_TAG, self.curSubItem))

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = barename(elem.tag)
        in_block = False

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            in_block = True
            if not end.endswith(u'\n\n') and hasattr(elem, 'text') and elem.text:
                text.append(u'\n\n')

        if tag in SPACE_TAGS:
            if not end.endswith('u ') and hasattr(elem, 'text') and elem.text:
                text.append(u' ')

        if tag == 'img':
            text.append(u'\n\n%s%s\n\n' % (CALIBRE_SNB_IMG_TAG, ProcessFileName(elem.attrib['src'])))

        if tag == 'br':
            text.append(u'\n\n')

        if tag == 'li':
            li = '- '

        pre = (tag == 'pre' or pre)
        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            if pre:
                text.append((u'\n\n%s' % CALIBRE_SNB_PRE_TAG).join((li + elem.text).splitlines()))
            else:
                text.append(li + elem.text)
            li = ''

        for item in elem:
            en = u''
            if len(text) >= 2:
                en = text[-1][-2:]
            t = self.dump_text(subitems, item, stylizer, en, pre, li)[0]
            text += t

        if in_block:
            text.append(u'\n\n')

        if hasattr(elem, 'tail') and elem.tail:
            if pre:
                text.append((u'\n\n%s' % CALIBRE_SNB_PRE_TAG).join(elem.tail.splitlines()))
            else:
                text.append(li + elem.tail)
            li = ''

        return text, li
Exemple #7
0
    def dump_text(self, elem, stylizer):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = []
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        # Soft scene breaks.
        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
            ems = int(round(float(style.marginTop) / style.fontSize) - 1)
            if ems >= 1:
                text.append(u'\n\n' * ems)

        bq = '> ' * self.blockquotes
        # Block level elements
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
            h_tag = ''
            if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                h_tag = '#' * int(tag[1]) + ' '
            text.append('\n' + bq + h_tag)
            tags.append('\n')
            self.remove_space_after_newline = True

        if style['font-style'] == 'italic' or tag in ('i', 'em'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
                if self.style_italic == False:  # noqa
                    text.append('*')
                    tags.append('*')
                    self.style_italic = True
        if style['font-weight'] in ('bold', 'bolder') or tag in ('b',
                                                                 'strong'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
                if self.style_bold == False:  # noqa
                    text.append('**')
                    tags.append('**')
                    self.style_bold = True
        if tag == 'br':
            text.append('  \n')
            self.remove_space_after_newline = True
        if tag == 'blockquote':
            self.blockquotes += 1
            tags.append('>')
            text.append('> ' * self.blockquotes)
        elif tag == 'code':
            if not self.in_pre and not self.in_code:
                text.append('`')
                tags.append('`')
                self.in_code = True
        elif tag == 'pre':
            if not self.in_pre:
                text.append('\n')
                tags.append('pre')
                self.in_pre = True
        elif tag == 'hr':
            text.append('\n* * *')
            tags.append('\n')
        elif tag == 'a':
            # Only write links with absolute (external) urls.
            if self.opts.keep_links and attribs.has_key(
                    'href') and '://' in attribs['href']:  # noqa
                title = ''
                if attribs.has_key('title'):  # noqa
                    title = ' "' + attribs['title'] + '"'
                    remove_space = self.remove_space_after_newline
                    title = self.remove_newlines(title)
                    self.remove_space_after_newline = remove_space
                text.append('[')
                tags.append('](' + attribs['href'] + title + ')')
        elif tag == 'img':
            if self.opts.keep_image_references:
                txt = '!'
                if attribs.has_key('alt'):  # noqa
                    remove_space = self.remove_space_after_newline
                    txt += '[' + self.remove_newlines(attribs['alt']) + ']'
                    self.remove_space_after_newline = remove_space
                txt += '(' + attribs['src'] + ')'
                text.append(txt)
        elif tag in ('ol', 'ul'):
            tags.append(tag)
            # Add the list to our lists of lists so we can track
            # nested lists.
            self.list.append({'name': tag, 'num': 0})
        elif tag == 'li':
            # Get the last list from our list of lists
            if self.list:
                li = self.list[-1]
            else:
                li = {'name': 'ul', 'num': 0}
            # Add a new line to start the item
            text.append('\n')
            # Add indent if we have nested lists.
            list_count = len(self.list)
            # We only care about indenting nested lists.
            if (list_count - 1) > 0:
                text.append('\t' * (list_count - 1))
            # Add blockquote if we have a blockquote in a list item.
            text.append(bq)
            # Write the proper sign for ordered and unorded lists.
            if li['name'] == 'ul':
                text.append('+ ')
            elif li['name'] == 'ol':
                li['num'] += 1
                text.append(unicode(li['num']) + '. ')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
            if self.in_pre:
                txt = self.prepare_string_for_pre(txt)
            elif self.in_code:
                txt = self.remove_newlines(txt)
            else:
                txt = self.prepare_string_for_markdown(
                    self.remove_newlines(txt))
            text.append(txt)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t in ('pre', 'ul', 'ol', '>'):
                if t == 'pre':
                    self.in_pre = False
                    text.append('\n')
                elif t == '>':
                    self.blockquotes -= 1
                elif t in ('ul', 'ol'):
                    if self.list:
                        self.list.pop()
                    text.append('\n')
            else:
                if t == '**':
                    self.style_bold = False
                elif t == '*':
                    self.style_italic = False
                elif t == '`':
                    self.in_code = False
                text.append('%s' % t)

        # Soft scene breaks.
        if 'margin-bottom' in style.cssdict(
        ) and style['margin-bottom'] != 'auto':
            ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
            if ems >= 1:
                text.append(u'\n\n' * ems)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            tail = elem.tail
            if self.in_pre:
                tail = self.prepare_string_for_pre(tail)
            elif self.in_code:
                tail = self.remove_newlines(tail)
            else:
                tail = self.prepare_string_for_markdown(
                    self.remove_newlines(tail))
            text.append(tail)

        return text
Exemple #8
0
    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        style_a = '%s' % style
        style_a = style_a if style_a else ''
        if tag == 'body':
            # Change the body to a div so we can merge multiple files.
            tag = 'div'
            # Add page-break-brefore: always because renders typically treat a new file (we're merging files)
            # as a page break and remove all other page break types that might be set.
            style_a = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', style_a)
        # Remove unnecessary spaces.
        style_a = re.sub(r'\s{2,}', ' ', style_a).strip()
        tags.append(tag)

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Turn style into strings for putting in the tag.
        style_t = ''
        if style_a:
            style_t = ' style="%s"' % style_a.replace('"', "'")

        # Write the tag.
        text.append('<%s%s%s' % (tag, at, style_t))
        if tag in SELF_CLOSING_TAGS:
            text.append(' />')
        else:
            text.append('>')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
Exemple #9
0
    def dump_text(self, elem, stylizer, page, tag_stack=[]):
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return []

        text = []
        tags = []
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return []

        tag = barename(elem.tag)

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            tags.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument.
        if tag in IMAGE_TAGS:
            if elem.attrib.get('src', None):
                if page.abshref(elem.attrib['src']) not in self.image_hrefs.keys():
                    if len(self.image_hrefs.keys()) == 0:
                        self.image_hrefs[page.abshref(elem.attrib['src'])] = 'cover.png'
                    else:
                        self.image_hrefs[page.abshref(elem.attrib['src'])] = image_name('%s.png' % len(self.image_hrefs.keys()), self.image_hrefs.keys()).strip('\x00')
                text.append('\\m="%s"' % self.image_hrefs[page.abshref(elem.attrib['src'])])
        elif tag == 'hr':
            w = '\\w'
            width = elem.get('width')
            if width:
                if not width.endswith('%'):
                    width += '%'
                w += '="%s"' % width
            else:
                w += '="50%"'
            text.append(w)
        elif tag == 'br':
            text.append('\n\\c \n\\c\n')

        # TOC markers.
        toc_name = elem.attrib.get('name', None)
        toc_id = elem.attrib.get('id', None)
        # Only write the TOC marker if the tag isn't a heading and we aren't in one.
        if (toc_id or toc_name) and tag not in ('h1', 'h2','h3','h4','h5','h6') and \
            'x' not in tag_stack+tags and 'X0' not in tag_stack+tags and \
            'X1' not in tag_stack+tags and 'X2' not in tag_stack+tags and \
            'X3' not in tag_stack+tags and 'X4' not in tag_stack+tags:

            toc_page = page.href
            if self.toc.get(toc_page, None):
                for toc_x in (toc_name, toc_id):
                    toc_title, toc_depth = self.toc[toc_page].get(toc_x, (None, 0))
                    if toc_title:
                        toc_depth = max(min(toc_depth, 4), 0)
                        text.append('\\C%s="%s"' % (toc_depth, toc_title))

        # Process style information that needs holds a single tag.
        # Commented out because every page in an OEB book starts with this style.
        if style['page-break-before'] == 'always':
            text.append('\\p')

        # Process basic PML tags.
        pml_tag = TAG_MAP.get(tag, None)
        if pml_tag and pml_tag not in tag_stack+tags:
            text.append('\\%s' % pml_tag)
            tags.append(pml_tag)

        # Special processing of tags that require an argument.
        # Anchors links
        if tag in LINK_TAGS and 'q' not in tag_stack+tags:
            href = elem.get('href')
            if href:
                href = page.abshref(href)
                if '://' not in href:
                    if '#' not in href:
                        href += '#'
                    if href not in self.link_hrefs.keys():
                        self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
                    href = '#%s' % self.link_hrefs[href]
                    text.append('\\q="%s"' % href)
                    tags.append('q')

        # Anchor ids
        id_name = elem.get('id')
        name_name = elem.get('name')
        for name_x in (id_name, name_name):
            if name_x:
                text.append(self.get_anchor(page, name_x))

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack+tags:
                text.append('\\%s' % style_tag)
                tags.append(style_tag)

        # margin left
        try:
            mms = int(float(style['margin-left']) * 100 / style.height)
            if mms:
                text.append('\\T="%s%%"' % mms)
        except:
            pass

        # Soft scene breaks.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n\\c \n\\c\n')
        except:
            pass

        # Proccess text within this tag.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_pml(elem.text))

        # Process inner tags
        for item in elem:
            text += self.dump_text(item, stylizer, page, tag_stack+tags)

        # Close opened tags.
        tags.reverse()
        text += self.close_tags(tags)

        #if tag in SEPARATE_TAGS:
        #    text.append('\n\n')

        if style['page-break-after'] == 'always':
            text.append('\\p')

        # Process text after this tag but not within another.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_pml(elem.tail))

        return text
Exemple #10
0
    def dump_text(self, elem, stylizer):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = []
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib
        
        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        # Soft scene breaks.
        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
            ems = int(round(float(style.marginTop) / style.fontSize) - 1)
            if ems >= 1:
                text.append(u'\n\n' * ems)

        bq = '> ' * self.blockquotes
        # Block level elements
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
            h_tag = ''
            if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                h_tag = '#' * int(tag[1]) + ' '
            text.append('\n' + bq + h_tag)
            tags.append('\n')
            self.remove_space_after_newline = True

        if style['font-style'] == 'italic' or tag in ('i', 'em'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
                if self.style_italic == False:
                    text.append('*')
                    tags.append('*')
                    self.style_italic = True
        if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
                if self.style_bold == False:
                    text.append('**')
                    tags.append('**')
                    self.style_bold = True
        if tag == 'br':
            text.append('  \n')
            self.remove_space_after_newline = True
        if tag == 'blockquote':
            self.blockquotes += 1
            tags.append('>')
            text.append('> ' * self.blockquotes)
        elif tag == 'code':
            if not self.in_pre and not self.in_code:
                text.append('`')
                tags.append('`')
                self.in_code = True
        elif tag == 'pre':
            if not self.in_pre:
                text.append('\n')
                tags.append('pre')
                self.in_pre = True
        elif tag == 'hr':
            text.append('\n* * *')
            tags.append('\n')
        elif tag == 'a':
            # Only write links with absolute (external) urls.
            if self.opts.keep_links and attribs.has_key('href') and '://' in attribs['href']:
                title = ''
                if attribs.has_key('title'):
                    title = ' "' + attribs['title'] + '"'
                    remove_space = self.remove_space_after_newline
                    title = self.remove_newlines(title)
                    self.remove_space_after_newline = remove_space
                text.append('[')
                tags.append('](' + attribs['href'] + title + ')')
        elif tag == 'img':
            if self.opts.keep_image_references:
                txt = '!'
                if attribs.has_key('alt'):
                    remove_space = self.remove_space_after_newline
                    txt += '[' + self.remove_newlines(attribs['alt']) + ']'
                    self.remove_space_after_newline = remove_space
                txt += '(' + attribs['src'] + ')'
                text.append(txt)
        elif tag in ('ol', 'ul'):
            tags.append(tag)
            # Add the list to our lists of lists so we can track
            # nested lists.
            self.list.append({'name': tag, 'num': 0})
        elif tag == 'li':
            # Get the last list from our list of lists
            if self.list:
                li = self.list[-1]
            else:
                li = {'name': 'ul', 'num': 0}
            # Add a new line to start the item
            text.append('\n')
            # Add indent if we have nested lists.
            list_count = len(self.list)
            # We only care about indenting nested lists.
            if (list_count - 1) > 0:
                text.append('\t' * (list_count - 1))
            # Add blockquote if we have a blockquote in a list item.
            text.append(bq)
            # Write the proper sign for ordered and unorded lists.
            if li['name'] == 'ul':
                text.append('+ ')
            elif li['name'] == 'ol':
                li['num'] += 1
                text.append(unicode(li['num']) + '. ')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
            if self.in_pre:
                txt = self.prepare_string_for_pre(txt)
            elif self.in_code:
                txt = self.remove_newlines(txt)
            else:
                txt = self.prepare_string_for_markdown(self.remove_newlines(txt))
            text.append(txt)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t in ('pre', 'ul', 'ol', '>'):
                if t == 'pre':
                    self.in_pre = False
                    text.append('\n')
                elif t == '>':
                    self.blockquotes -= 1
                elif t in ('ul', 'ol'):
                    if self.list:
                        self.list.pop()
                    text.append('\n')
            else:
                if t == '**':
                    self.style_bold = False
                elif t == '*':
                    self.style_italic = False
                elif t == '`':
                    self.in_code = False
                text.append('%s' % t)

        # Soft scene breaks.
        if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
            ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
            if ems >= 1:
                text.append(u'\n\n' * ems)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            tail = elem.tail
            if self.in_pre:
                tail = self.prepare_string_for_pre(tail)
            elif self.in_code:
                tail = self.remove_newlines(tail)
            else:
                tail = self.prepare_string_for_markdown(self.remove_newlines(tail))
            text.append(tail)

        return text
Exemple #11
0
    def mobimlize_elem(self, elem, stylizer, bstate, istates,
            ignore_valign=False):
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            id_ = elem.get('id', None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set('id', id_)
                elem.tail = tail
                elem.tag = XHTML('a')
            else:
                return
        tag = barename(elem.tag)
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == 'ol' and 'start' in elem.attrib:
            try:
                istate.list_num = int(elem.attrib['start'])-1
            except:
                pass
        istates.append(istate)
        left = 0
        display = style['display']
        if display == 'table-cell':
            display = 'inline'
        elif display.startswith('table'):
            display = 'block'
        isblock = (not display.startswith('inline') and style['display'] !=
                'none')
        isblock = isblock and style['float'] == 'none'
        isblock = isblock and tag != 'br'
        if isblock:
            bstate.para = None
            istate.halign = style['text-align']
            istate.indent = style['text-indent']
            if style['margin-left'] == 'auto' \
               and style['margin-right'] == 'auto':
                istate.halign = 'center'
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            if tag != 'body':
                left = margin + padding
            istate.left += left
            vmargin = asfloat(style['margin-top'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-top'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            margin = asfloat(style['margin-left'])
            padding = asfloat(style['padding-left'])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style['font-size']))
                elem.text = (u'\xa0' * spaces) + (elem.text or '')
            margin = asfloat(style['margin-right'])
            padding = asfloat(style['padding-right'])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style['font-size']))
                if len(elem) == 0:
                    elem.text = (elem.text or '') + (u'\xa0' * spaces)
                else:
                    last = elem[-1]
                    last.text = (last.text or '') + (u'\xa0' * spaces)
        if bstate.content and style['page-break-before'] in PAGE_BREAKS:
            bstate.pbreak = True
        istate.fsize = self.mobimlize_font(style['font-size'])
        istate.italic = True if style['font-style'] == 'italic' else False
        weight = style['font-weight']
        istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
        istate.preserve = (style['white-space'] in ('pre', 'pre-wrap'))
        istate.bgcolor  = style['background-color']
        istate.fgcolor  = style['color']
        istate.strikethrough = style.effective_text_decoration == 'line-through'
        istate.underline = style.effective_text_decoration == 'underline'
        ff = style['font-family'].lower() if style['font-family'] else ''
        if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
            istate.family = 'monospace'
        elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff or
                'arial' in ff or 'helvetica' in ff):
            istate.family = 'sans-serif'
        else:
            istate.family = 'serif'
        if 'id' in elem.attrib:
            istate.ids.add(elem.attrib['id'])
        if 'name' in elem.attrib:
            istate.ids.add(elem.attrib['name'])
        if tag == 'a' and 'href' in elem.attrib:
            istate.href = elem.attrib['href']
        istate.attrib.clear()
        if tag == 'img' and 'src' in elem.attrib:
            istate.attrib['src'] = elem.attrib['src']
            istate.attrib['align'] = 'baseline'
            cssdict = style.cssdict()
            valign = cssdict.get('vertical-align', None)
            if valign in ('top', 'bottom', 'middle'):
                istate.attrib['align'] = valign
            for prop in ('width', 'height'):
                if cssdict[prop] != 'auto':
                    value = style[prop]
                    if value == getattr(self.profile, prop):
                        result = '100%'
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(round(float(value) / \
                                (72./self.profile.dpi)))
                        except:
                            continue
                        result = str(pixs)
                    istate.attrib[prop] = result
            if 'width' not in istate.attrib or 'height' not in istate.attrib:
                href = self.current_spine_item.abshref(elem.attrib['src'])
                try:
                    item = self.oeb.manifest.hrefs[urlnormalize(href)]
                except:
                    self.oeb.logger.warn('Failed to find image:',
                            href)
                else:
                    try:
                        width, height = identify_data(item.data)[:2]
                    except:
                        self.oeb.logger.warn('Invalid image:', href)
                    else:
                        if 'width' not in istate.attrib and 'height' not in \
                                    istate.attrib:
                            istate.attrib['width'] = str(width)
                            istate.attrib['height'] = str(height)
                        else:
                            ar = float(width)/float(height)
                            if 'width' not in istate.attrib:
                                try:
                                    width = int(istate.attrib['height'])*ar
                                except:
                                    pass
                                istate.attrib['width'] = str(int(width))
                            else:
                                try:
                                    height = int(istate.attrib['width'])/ar
                                except:
                                    pass
                                istate.attrib['height'] = str(int(height))
                        item.unload_data_from_memory()
        elif tag == 'hr' and asfloat(style['width']) > 0:
            prop = style['width'] / self.profile.width
            istate.attrib['width'] = "%d%%" % int(round(prop * 100))
        elif display == 'table':
            tag = 'table'
        elif display == 'table-row':
            tag = 'tr'
        elif display == 'table-cell':
            tag = 'td'
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = 'span' if tag == 'td' else 'div'

        if tag in ('table', 'td', 'tr'):
            col = style.backgroundColor
            if col:
                elem.set('bgcolor', col)
            css = style.cssdict()
            if 'border' in css or 'border-width' in css:
                elem.set('border', '1')
        if tag in TABLE_TAGS:
            for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                    'bgcolor'):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == 'q':
            t = elem.text
            if not t:
                t = ''
            elem.text = u'\u201c' + t
            t = elem.tail
            if not t:
                t = ''
            elem.tail = u'\u201d' + t
        text = None
        if elem.text:
            if istate.preserve:
                text = elem.text
            elif len(elem) > 0 and isspace(elem.text):
                text = None
            else:
                text = COLLAPSE.sub(' ', elem.text)
        valign = style['vertical-align']
        not_baseline = valign in ('super', 'sub', 'text-top',
                'text-bottom', 'top', 'bottom') or (
                isinstance(valign, (float, int)) and abs(valign) != 0)
        issup = valign in ('super', 'text-top', 'top') or (
            isinstance(valign, (float, int)) and valign > 0)
        vtag = 'sup' if issup else 'sub'
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
            vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
            self.mobimlize_elem(elem, stylizer, vbstate, istates,
                    ignore_valign=True)
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                self.mobimlize_content('span', '', bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, XHTML(vtag))
                vtag = etree.SubElement(vtag, XHTML('small'))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    for child in vbstate.para:
                        vtag.append(child)
                return

        if tag == 'blockquote':
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False

        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
            # We have an id but no text and no children, the id should still
            # be added.
            istate.ids and tag in ('a', 'span', 'i', 'b', 'u') and
            len(elem)==0)):
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            tail = None
            if child.tail:
                if istate.preserve:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)

        if tag == 'blockquote':
            self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style['page-break-after'] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            if para is not None and para.text == u'\xa0' and len(para) < 1:
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(XHTML('br')))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            vmargin = asfloat(style['margin-bottom'])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-bottom'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        istates.pop()
Exemple #12
0
    def dump_text(self, elem, stylizer, tag_stack=[]):
        from calibre.ebooks.oeb.base import (XHTML_NS, namespace, barename,
                                             urlnormalize)

        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return elem.tail
            return ''

        text = ''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return elem.tail
            return ''

        tag = barename(elem.tag)
        tag_count = 0

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if 'block' not in tag_stack:
                tag_count += 1
                tag_stack.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = elem.get('src')
            if src:
                src = urlnormalize(self.currently_dumping_item.abshref(src))
                block_start = ''
                block_end = ''
                if 'block' not in tag_stack:
                    block_start = r'{\par\pard\hyphpar '
                    block_end = '}'
                text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start,
                                                               src, block_end)

        single_tag = SINGLE_TAGS.get(tag, None)
        if single_tag:
            text += single_tag

        rtf_tag = TAGS.get(tag, None)
        if rtf_tag and rtf_tag not in tag_stack:
            tag_count += 1
            text += '{%s\n' % rtf_tag
            tag_stack.append(rtf_tag)

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack:
                tag_count += 1
                text += '{%s\n' % style_tag
                tag_stack.append(style_tag)

        # Proccess tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text += txt2rtf(elem.text)

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        for i in range(0, tag_count):
            end_tag = tag_stack.pop()
            if end_tag != 'block':
                if tag in BLOCK_TAGS:
                    text += r'\par\pard\plain\hyphpar}'
                else:
                    text += '}'

        if hasattr(elem, 'tail') and elem.tail:
            if 'block' in tag_stack:
                text += '%s' % txt2rtf(elem.tail)
            else:
                text += r'{\par\pard\hyphpar %s}' % txt2rtf(elem.tail)

        return text
Exemple #13
0
    def mobimlize_elem(self, elem, stylizer, bstate, istates, ignore_valign=False):
        if not isinstance(elem.tag, basestring) or namespace(elem.tag) != XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        if style["display"] in ("none", "oeb-page-head", "oeb-page-foot") or style["visibility"] == "hidden":
            id_ = elem.get("id", None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set("id", id_)
                elem.tail = tail
                elem.tag = XHTML("a")
            else:
                return
        tag = barename(elem.tag)
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == "ol" and "start" in elem.attrib:
            try:
                istate.list_num = int(elem.attrib["start"]) - 1
            except:
                pass
        istates.append(istate)
        left = 0
        display = style["display"]
        if display == "table-cell":
            display = "inline"
        elif display.startswith("table"):
            display = "block"
        isblock = not display.startswith("inline") and style["display"] != "none"
        isblock = isblock and style["float"] == "none"
        isblock = isblock and tag != "br"
        if isblock:
            bstate.para = None
            istate.halign = style["text-align"]
            rawti = style._get("text-indent")
            istate.indent = style["text-indent"]
            if hasattr(rawti, "strip") and "%" in rawti:
                # We have a percentage text indent, these can come out looking
                # too large if the user chooses a wide output profile like
                # tablet
                istate.indent = min(style._unit_convert(rawti, base=500), istate.indent)
            if style["margin-left"] == "auto" and style["margin-right"] == "auto":
                istate.halign = "center"
            margin = asfloat(style["margin-left"])
            padding = asfloat(style["padding-left"])
            if tag != "body":
                left = margin + padding
            istate.left += left
            vmargin = asfloat(style["margin-top"])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style["padding-top"])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            margin = asfloat(style["margin-left"])
            padding = asfloat(style["padding-left"])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style["font-size"]))
                elem.text = (u"\xa0" * spaces) + (elem.text or "")
            margin = asfloat(style["margin-right"])
            padding = asfloat(style["padding-right"])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style["font-size"]))
                if len(elem) == 0:
                    elem.text = (elem.text or "") + (u"\xa0" * spaces)
                else:
                    last = elem[-1]
                    last.text = (last.text or "") + (u"\xa0" * spaces)
        if bstate.content and style["page-break-before"] in PAGE_BREAKS:
            bstate.pbreak = True
        istate.fsize = self.mobimlize_font(style["font-size"])
        istate.italic = True if style["font-style"] == "italic" else False
        weight = style["font-weight"]
        istate.bold = weight in ("bold", "bolder") or asfloat(weight) > 400
        istate.preserve = style["white-space"] == "pre"
        istate.pre_wrap = style["white-space"] == "pre-wrap"
        istate.bgcolor = style["background-color"]
        istate.fgcolor = style["color"]
        istate.strikethrough = style.effective_text_decoration == "line-through"
        istate.underline = style.effective_text_decoration == "underline"
        ff = style["font-family"].lower() if hasattr(style["font-family"], "lower") else ""
        if "monospace" in ff or "courier" in ff or ff.endswith(" mono"):
            istate.family = "monospace"
        elif "sans-serif" in ff or "sansserif" in ff or "verdana" in ff or "arial" in ff or "helvetica" in ff:
            istate.family = "sans-serif"
        else:
            istate.family = "serif"
        if "id" in elem.attrib:
            istate.ids.add(elem.attrib["id"])
        if "name" in elem.attrib:
            istate.ids.add(elem.attrib["name"])
        if tag == "a" and "href" in elem.attrib:
            istate.href = elem.attrib["href"]
        istate.attrib.clear()
        if tag == "img" and "src" in elem.attrib:
            istate.attrib["src"] = elem.attrib["src"]
            istate.attrib["align"] = "baseline"
            cssdict = style.cssdict()
            valign = cssdict.get("vertical-align", None)
            if valign in ("top", "bottom", "middle"):
                istate.attrib["align"] = valign
            for prop in ("width", "height"):
                if cssdict[prop] != "auto":
                    value = style[prop]
                    if value == getattr(self.profile, prop):
                        result = "100%"
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(round(float(value) / (72.0 / self.profile.dpi)))
                        except:
                            continue
                        result = str(pixs)
                    istate.attrib[prop] = result
            if "width" not in istate.attrib or "height" not in istate.attrib:
                href = self.current_spine_item.abshref(elem.attrib["src"])
                try:
                    item = self.oeb.manifest.hrefs[urlnormalize(href)]
                except:
                    self.oeb.logger.warn("Failed to find image:", href)
                else:
                    try:
                        width, height = identify_data(item.data)[:2]
                    except:
                        self.oeb.logger.warn("Invalid image:", href)
                    else:
                        if "width" not in istate.attrib and "height" not in istate.attrib:
                            istate.attrib["width"] = str(width)
                            istate.attrib["height"] = str(height)
                        else:
                            ar = float(width) / float(height)
                            if "width" not in istate.attrib:
                                try:
                                    width = int(istate.attrib["height"]) * ar
                                except:
                                    pass
                                istate.attrib["width"] = str(int(width))
                            else:
                                try:
                                    height = int(istate.attrib["width"]) / ar
                                except:
                                    pass
                                istate.attrib["height"] = str(int(height))
                        item.unload_data_from_memory()
        elif tag == "hr" and asfloat(style["width"]) > 0:
            prop = style["width"] / self.profile.width
            istate.attrib["width"] = "%d%%" % int(round(prop * 100))
        elif display == "table":
            tag = "table"
        elif display == "table-row":
            tag = "tr"
        elif display == "table-cell":
            tag = "td"
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = "span" if tag == "td" else "div"

        if tag in ("table", "td", "tr"):
            col = style.backgroundColor
            if col:
                elem.set("bgcolor", col)
            css = style.cssdict()
            if "border" in css or "border-width" in css:
                elem.set("border", "1")
        if tag in TABLE_TAGS:
            for attr in ("rowspan", "colspan", "width", "border", "scope", "bgcolor"):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == "q":
            t = elem.text
            if not t:
                t = ""
            elem.text = u"\u201c" + t
            t = elem.tail
            if not t:
                t = ""
            elem.tail = u"\u201d" + t
        text = None
        if elem.text:
            if istate.preserve or istate.pre_wrap:
                text = elem.text
            elif (
                len(elem) > 0
                and isspace(elem.text)
                and hasattr(elem[0].tag, "rpartition")
                and elem[0].tag.rpartition("}")[-1] not in INLINE_TAGS
            ):
                text = None
            else:
                text = COLLAPSE.sub(" ", elem.text)
        valign = style["vertical-align"]
        not_baseline = valign in ("super", "sub", "text-top", "text-bottom", "top", "bottom") or (
            isinstance(valign, (float, int)) and abs(valign) != 0
        )
        issup = valign in ("super", "text-top", "top") or (isinstance(valign, (float, int)) and valign > 0)
        vtag = "sup" if issup else "sub"
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            nroot = etree.Element(XHTML("html"), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, XHTML("body")))
            vbstate.para = etree.SubElement(vbstate.body, XHTML("p"))
            self.mobimlize_elem(elem, stylizer, vbstate, istates, ignore_valign=True)
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                self.mobimlize_content("span", "", bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, XHTML(vtag))
                vtag = etree.SubElement(vtag, XHTML("small"))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    if vbstate.para.text:
                        vtag.text = vbstate.para.text
                    for child in vbstate.para:
                        vtag.append(child)
                return

        if tag == "blockquote":
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False

        if (
            text
            or tag in CONTENT_TAGS
            or tag in NESTABLE_TAGS
            or (
                # We have an id but no text and no children, the id should still
                # be added.
                istate.ids
                and tag in ("a", "span", "i", "b", "u")
                and len(elem) == 0
            )
        ):
            if tag == "li" and len(istates) > 1 and "value" in elem.attrib:
                try:
                    value = int(elem.attrib["value"])
                    istates[-2].list_num = value - 1
                except:
                    pass
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            tail = None
            if child.tail:
                if istate.preserve or istate.pre_wrap:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(" ", child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)

        if tag == "blockquote":
            self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style["page-break-after"] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            if para is not None and para.text == u"\xa0" and len(para) < 1:
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(XHTML("br")))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            vmargin = asfloat(style["margin-bottom"])
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style["padding-bottom"])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        istates.pop()
Exemple #14
0
    def dump_text(self, elem, stylizer):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, str) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, str) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        # Soft scene breaks.
        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
            ems = min(int(round(float(style.marginTop) / style.fontSize) - 1), self.MAX_EM)
            if ems >= 1:
                text.append('\n\n\xa0' * ems)

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
            if tag == 'div':
                tag = 'p'
            text.append(self.build_block(tag, style, attribs, stylizer))
            text.append('. ')
            tags.append('\n')

        if style['font-style'] == 'italic' or tag in ('i', 'em'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
                if self.style_italic == False:  # noqa
                    if self.in_a_link:
                        text.append('_')
                        tags.append('_')
                    else:
                        text.append('[_')
                        tags.append('_]')
                    self.style_embed.append('_')
                    self.style_italic = True
        if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
                if self.style_bold == False:  # noqa
                    if self.in_a_link:
                        text.append('*')
                        tags.append('*')
                    else:
                        text.append('[*')
                        tags.append('*]')
                    self.style_embed.append('*')
                    self.style_bold = True
        if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
            if tag != 'a':
                if self.style_under == False:  # noqa
                    text.append('[+')
                    tags.append('+]')
                    self.style_embed.append('+')
                    self.style_under = True
        if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
            if self.style_strike == False:  # noqa
                text.append('[-')
                tags.append('-]')
                self.style_embed.append('-')
                self.style_strike = True
        if tag == 'br':
            for i in reversed(self.style_embed):
                text.append(i)
            text.append('\n')
            for i in self.style_embed:
                text.append(i)
            tags.append('')
            self.remove_space_after_newline = True
        if tag == 'blockquote':
            text.append('\nbq. ')
            tags.append('\n')
        elif tag in ('abbr', 'acronym'):
            text.append('')
            txt = attribs['title']
            tags.append('(' + txt + ')')
        elif tag == 'sup':
            text.append('^')
            tags.append('^')
        elif tag == 'sub':
            text.append('~')
            tags.append('~')
        elif tag == 'code':
            if self.in_pre:
                text.append('\nbc. ')
                tags.append('')
            else:
                text.append('@')
                tags.append('@')
        elif tag == 'cite':
            text.append('??')
            tags.append('??')
        elif tag == 'hr':
            text.append('\n***')
            tags.append('\n')
        elif tag == 'pre':
            self.in_pre = True
            text.append('\npre. ')
            tags.append('pre\n')
        elif tag == 'a':
            if self.opts.keep_links:
                if 'href' in attribs:  # noqa
                    text.append('"')
                    tags.append('a')
                    tags.append('":' + attribs['href'])
                    self.our_links.append(attribs['href'])
                    if 'title' in attribs:  # noqa
                        tags.append('(' + attribs['title'] + ')')
                    self.in_a_link = True
                else:
                    text.append('%')
                    tags.append('%')
        elif tag == 'img':
            if self.opts.keep_image_references:
                txt = '!' + self.check_halign(style)
                txt += self.check_valign(style)
                txt += attribs['src']
                text.append(txt)
                if 'alt' in attribs:  # noqa
                    txt = attribs['alt']
                    if txt != '':
                        text.append('(' + txt + ')')
                tags.append('!')
        elif tag in ('ol', 'ul'):
            self.list.append({'name': tag, 'num': 0})
            text.append('')
            tags.append(tag)
        elif tag == 'li':
            if self.list:
                li = self.list[-1]
            else:
                li = {'name': 'ul', 'num': 0}
            text.append('\n')
            if li['name'] == 'ul':
                text.append('*' * len(self.list) + ' ')
            elif li['name'] == 'ol':
                text.append('#' * len(self.list) + ' ')
            tags.append('')
        elif tag == 'dl':
            text.append('\n')
            tags.append('')
        elif tag == 'dt':
            text.append('')
            tags.append('\n')
        elif tag == 'dd':
            text.append('    ')
            tags.append('')
        elif tag == 'dd':
            text.append('')
            tags.append('\n')
        elif tag == 'table':
            txt = self.build_block(tag, style, attribs, stylizer)
            txt += '. \n'
            if txt != '\ntable. \n':
                text.append(txt)
            else:
                text.append('\n')
            tags.append('')
        elif tag == 'tr':
            txt = self.build_block('', style, attribs, stylizer)
            txt += '. '
            if txt != '\n. ':
                txt = re.sub('\n', '', txt)
                text.append(txt)
            tags.append('|\n')
        elif tag == 'td':
            text.append('|')
            txt = ''
            txt += self.check_halign(style)
            txt += self.check_valign(style)
            if 'colspan' in attribs:  # noqa
                txt += '\\' + attribs['colspan']
            if 'rowspan' in attribs:  # noqa
                txt += '/' + attribs['rowspan']
            txt += self.check_styles(style)
            if txt != '':
                text.append(txt + '. ')
            tags.append('')
        elif tag == 'th':
            text.append('|_. ')
            tags.append('')
        elif tag == 'span':
            if style['font-variant'] == 'small-caps':
                if self.style_smallcap == False:  # noqa
                    text.append('&')
                    tags.append('&')
                    self.style_smallcap = True
            else:
                if self.in_a_link == False:  # noqa
                    txt = '%'
                    if self.opts.keep_links:
                        txt += self.check_id_tag(attribs)
                        txt += self.check_styles(style)
                    if txt != '%':
                        text.append(txt)
                        tags.append('%')

        if self.opts.keep_links and 'id' in attribs:  # noqa
            if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
                text.append(self.check_id_tag(attribs))

        # Process the styles for any that we want to keep
        if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img',
                'span', 'table', 'tr', 'td'):
            if not self.in_a_link:
                text.append(self.check_styles(style))

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
            if not self.in_pre:
                txt = self.prepare_string_for_textile(self.remove_newlines(txt))
            text.append(txt)
            self.id_no_text = ''

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t in ('pre', 'ul', 'ol', 'li', 'table'):
                if t == 'pre':
                    self.in_pre = False
                elif t in ('ul', 'ol'):
                    if self.list:
                        self.list.pop()
                    if not self.list:
                        text.append('\n')
            else:
                if t == 'a':
                    self.in_a_link = False
                    t = ''
                text.append(self.id_no_text)
                self.id_no_text = ''
                if t in ('*]', '*'):
                    self.style_bold = False
                elif t in ('_]', '_'):
                    self.style_italic = False
                elif t == '+]':
                    self.style_under = False
                elif t == '-]':
                    self.style_strike = False
                elif t == '&':
                    self.style_smallcap = False
                if t in ('*]', '_]', '+]', '-]', '*', '_'):
                    txt = self.style_embed.pop()
                text.append('%s' % t)

        # Soft scene breaks.
        if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
            ems = min(int(round((float(style.marginBottom) / style.fontSize) - 1)), self.MAX_EM)
            if ems >= 1:
                text.append('\n\n\xa0' * ems)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            tail = elem.tail
            if not self.in_pre:
                tail = self.prepare_string_for_textile(self.remove_newlines(tail))
            text.append(tail)

        return text
Exemple #15
0
    def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
        '''
        This function is intended to be used in a recursive manner. dump_text will
        run though all elements in the elem_tree and call itself on each element.

        self.image_hrefs will be populated by calling this function.

        @param elem_tree: etree representation of XHTML content to be transformed.
        @param stylizer: Used to track the style of elements within the tree.
        @param page: OEB page used to determine absolute urls.
        @param tag_stack: List of open FB2 tags to take into account.

        @return: List of string representing the XHTML converted to FB2 markup.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace
        elem = elem_tree

        # Ensure what we are converting is not a string and that the fist tag is part of the XHTML namespace.
        if not isinstance(elem_tree.tag, string_or_bytes) or namespace(
                elem_tree.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return []

        style = stylizer.style(elem_tree)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return []

        # FB2 generated output.
        fb2_out = []
        # FB2 tags in the order they are opened. This will be used to close the tags.
        tags = []
        # First tag in tree
        tag = barename(elem_tree.tag)
        # Number of blank lines above tag
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems < 0:
                ems = 0
        except:
            ems = 0

        # Convert TOC entries to <title>s and add <section>s
        if self.opts.sectionize == 'toc':
            # A section cannot be a child of any other element than another section,
            # so leave the tag alone if there are parents
            if not tag_stack:
                # There are two reasons to start a new section here: the TOC pointed to
                # this page (then we use the first non-<body> on the page as a <title>), or
                # the TOC pointed to a specific element
                newlevel = 0
                toc_entry = self.toc.get(page.href, None)
                if toc_entry is not None:
                    if None in toc_entry:
                        if tag != 'body' and hasattr(
                                elem_tree, 'text') and elem_tree.text:
                            newlevel = 1
                            self.toc[page.href] = None
                    if not newlevel and elem_tree.attrib.get('id',
                                                             None) is not None:
                        newlevel = toc_entry.get(
                            elem_tree.attrib.get('id', None), None)

                # Start a new section if necessary
                if newlevel:
                    while newlevel <= self.section_level:
                        fb2_out.append('</section>')
                        self.section_level -= 1
                    fb2_out.append('<section>')
                    self.section_level += 1
                    fb2_out.append('<title>')
                    tags.append('title')
            if self.section_level == 0:
                # If none of the prior processing made a section, make one now to be FB2 spec compliant
                fb2_out.append('<section>')
                self.section_level += 1

        # Process the XHTML tag and styles. Converted to an FB2 tag.
        # Use individual if statement not if else. There can be
        # only one XHTML tag but it can have multiple styles.
        if tag == 'img':
            if elem_tree.attrib.get('src', None):
                # Only write the image tag if it is in the manifest.
                ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
                if ihref in self.oeb_book.manifest.hrefs:
                    if ihref not in self.image_hrefs:
                        self.image_hrefs[ihref] = '_%s.jpg' % len(
                            self.image_hrefs)
                    p_txt, p_tag = self.ensure_p()
                    fb2_out += p_txt
                    tags += p_tag
                    fb2_out.append('<image xlink:href="#%s" />' %
                                   self.image_hrefs[ihref])
                else:
                    self.log.warn(u'Ignoring image not in manifest: %s' %
                                  ihref)
        if tag in ('br', 'hr') or ems >= 1:
            if ems < 1:
                multiplier = 1
            else:
                multiplier = ems
            if self.in_p:
                closed_tags = []
                open_tags = tag_stack + tags
                open_tags.reverse()
                for t in open_tags:
                    fb2_out.append('</%s>' % t)
                    closed_tags.append(t)
                    if t == 'p':
                        break
                fb2_out.append('<empty-line />' * multiplier)
                closed_tags.reverse()
                for t in closed_tags:
                    fb2_out.append('<%s>' % t)
            else:
                fb2_out.append('<empty-line />' * multiplier)
        if tag in ('div', 'li', 'p'):
            p_text, added_p = self.close_open_p(tag_stack + tags)
            fb2_out += p_text
            if added_p:
                tags.append('p')
        if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
            s_out, s_tags = self.handle_simple_tag('strong', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'i' or style['font-style'] == 'italic':
            s_out, s_tags = self.handle_simple_tag('emphasis',
                                                   tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag in ('del',
                   'strike') or style['text-decoration'] == 'line-through':
            s_out, s_tags = self.handle_simple_tag('strikethrough',
                                                   tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sub':
            s_out, s_tags = self.handle_simple_tag('sub', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sup':
            s_out, s_tags = self.handle_simple_tag('sup', tag_stack + tags)
            fb2_out += s_out
            tags += s_tags

        # Process element text.
        if hasattr(elem_tree, 'text') and elem_tree.text:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.text))
            if not self.in_p:
                fb2_out.append('</p>')

        # Process sub-elements.
        for item in elem_tree:
            fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

        # Close open FB2 tags.
        tags.reverse()
        fb2_out += self.close_tags(tags)

        # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
        if hasattr(elem_tree, 'tail') and elem_tree.tail:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.tail))
            if not self.in_p:
                fb2_out.append('</p>')

        return fb2_out
Exemple #16
0
    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        style_a = '%s' % style
        style_a = style_a if style_a else ''
        if tag == 'body':
            # Change the body to a div so we can merge multiple files.
            tag = 'div'
            # Add page-break-brefore: always because renders typically treat a new file (we're merging files)
            # as a page break and remove all other page break types that might be set.
            style_a = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', style_a)
        # Remove unnecessary spaces.
        style_a = re.sub(r'\s{2,}', ' ', style_a).strip()
        tags.append(tag)

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Turn style into strings for putting in the tag.
        style_t = ''
        if style_a:
            style_t = ' style="%s"' % style_a.replace('"', "'")

        # Write the tag.
        text.append('<%s%s%s' % (tag, at, style_t))
        if tag in SELF_CLOSING_TAGS:
            text.append(' />')
        else:
            text.append('>')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
Exemple #17
0
    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: OEB page used to determine absolute urls.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = barename(elem.tag)
        tag_id = elem.attrib.get('id', None)
        in_block = False
        in_heading = False

        # Are we in a heading?
        # This can either be a heading tag or a TOC item.
        if tag in HEADING_TAGS or '%s#%s' % (page.href,
                                             tag_id) in self.toc_ids:
            in_heading = True
            if not self.last_was_heading:
                text.append('\n\n\n\n\n\n')

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if self.opts.remove_paragraph_spacing and not in_heading:
                text.append('\t')
            in_block = True

        if tag in SPACE_TAGS:
            text.append(' ')

        # Hard scene breaks.
        if tag == 'hr':
            text.append('\n\n* * *\n\n')
        # Soft scene breaks.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n' * ems)
        except:
            pass

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(elem.text)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        if in_block:
            text.append('\n\n')
        if in_heading:
            text.append('\n')
            self.last_was_heading = True
        else:
            self.last_was_heading = False

        if hasattr(elem, 'tail') and elem.tail:
            text.append(elem.tail)

        return text
Exemple #18
0
    def dump_text(self, subitems, elem, stylizer, end='', pre=False, li=''):
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        if elem.attrib.get('id') is not None and elem.attrib['id'] in [href for href, title in subitems]:
            if self.curSubItem is not None and self.curSubItem != elem.attrib['id']:
                self.curSubItem = elem.attrib['id']
                text.append(f'\n\n{CALIBRE_SNB_BM_TAG}{self.curSubItem}\n\n')

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = barename(elem.tag)
        in_block = False

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            in_block = True
            if not end.endswith('\n\n') and hasattr(elem, 'text') and elem.text:
                text.append('\n\n')

        if tag in SPACE_TAGS:
            if not end.endswith('u ') and hasattr(elem, 'text') and elem.text:
                text.append(' ')

        if tag == 'img':
            text.append('\n\n{}{}\n\n'.format(CALIBRE_SNB_IMG_TAG, ProcessFileName(elem.attrib['src'])))

        if tag == 'br':
            text.append('\n\n')

        if tag == 'li':
            li = '- '

        pre = (tag == 'pre' or pre)
        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            if pre:
                text.append(('\n\n%s' % CALIBRE_SNB_PRE_TAG).join((li + elem.text).splitlines()))
            else:
                text.append(li + elem.text)
            li = ''

        for item in elem:
            en = ''
            if len(text) >= 2:
                en = text[-1][-2:]
            t = self.dump_text(subitems, item, stylizer, en, pre, li)[0]
            text += t

        if in_block:
            text.append('\n\n')

        if hasattr(elem, 'tail') and elem.tail:
            if pre:
                text.append(('\n\n%s' % CALIBRE_SNB_PRE_TAG).join(elem.tail.splitlines()))
            else:
                text.append(li + elem.tail)
            li = ''

        return text, li
Exemple #19
0
    def dump_text(self, elem, stylizer):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        # Soft scene breaks.
        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
            ems = min(int(round(float(style.marginTop) / style.fontSize) - 1), self.MAX_EM)
            if ems >= 1:
                text.append(u'\n\n\xa0' * ems)

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
            if tag == 'div':
                tag = 'p'
            text.append(self.build_block(tag, style, attribs, stylizer))
            text.append('. ')
            tags.append('\n')

        if style['font-style'] == 'italic' or tag in ('i', 'em'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
                if self.style_italic == False:
                    if self.in_a_link:
                        text.append('_')
                        tags.append('_')
                    else:
                        text.append('[_')
                        tags.append('_]')
                    self.style_embed.append('_')
                    self.style_italic = True
        if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
                if self.style_bold == False:
                    if self.in_a_link:
                        text.append('*')
                        tags.append('*')
                    else:
                        text.append('[*')
                        tags.append('*]')
                    self.style_embed.append('*')
                    self.style_bold = True
        if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
            if tag != 'a':
                if self.style_under == False:
                    text.append('[+')
                    tags.append('+]')
                    self.style_embed.append('+')
                    self.style_under = True
        if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
            if self.style_strike == False:
                text.append('[-')
                tags.append('-]')
                self.style_embed.append('-')
                self.style_strike = True
        if tag == 'br':
            for i in reversed(self.style_embed):
                text.append(i)
            text.append('\n')
            for i in self.style_embed:
                text.append(i)
            tags.append('')
            self.remove_space_after_newline = True
        if tag == 'blockquote':
            text.append('\nbq. ')
            tags.append('\n')
        elif tag in ('abbr', 'acronym'):
            text.append('')
            txt = attribs['title']
            tags.append('(' + txt + ')')
        elif tag == 'sup':
            text.append('^')
            tags.append('^')
        elif tag == 'sub':
            text.append('~')
            tags.append('~')
        elif tag == 'code':
            if self.in_pre:
                text.append('\nbc. ')
                tags.append('')
            else:
                text.append('@')
                tags.append('@')
        elif tag == 'cite':
            text.append('??')
            tags.append('??')
        elif tag == 'hr':
            text.append('\n***')
            tags.append('\n')
        elif tag == 'pre':
            self.in_pre = True
            text.append('\npre. ')
            tags.append('pre\n')
        elif tag == 'a':
            if self.opts.keep_links:
                if attribs.has_key('href'):
                    text.append('"')
                    tags.append('a')
                    tags.append('":' + attribs['href'])
                    self.our_links.append(attribs['href'])
                    if attribs.has_key('title'):
                        tags.append('(' + attribs['title'] + ')')
                    self.in_a_link = True
                else:
                    text.append('%')
                    tags.append('%')
        elif tag == 'img':
            if self.opts.keep_image_references:
                txt = '!' + self.check_halign(style)
                txt += self.check_valign(style)
                txt += attribs['src']
                text.append(txt)
                if attribs.has_key('alt'):
                    txt = attribs['alt']
                    if txt != '':
                        text.append('(' + txt + ')')
                tags.append('!')
        elif tag in ('ol', 'ul'):
            self.list.append({'name': tag, 'num': 0})
            text.append('')
            tags.append(tag)
        elif tag == 'li':
            if self.list: li = self.list[-1]
            else: li = {'name': 'ul', 'num': 0}
            text.append('\n')
            if li['name'] == 'ul':
                text.append('*' * len(self.list) + ' ')
            elif li['name'] == 'ol':
                text.append('#' * len(self.list) + ' ')
            tags.append('')
        elif tag == 'dl':
            text.append('\n')
            tags.append('')
        elif tag == 'dt':
            text.append('')
            tags.append('\n')
        elif tag == 'dd':
            text.append('    ')
            tags.append('')
        elif tag == 'dd':
            text.append('')
            tags.append('\n')
        elif tag == 'table':
            txt = self.build_block(tag, style, attribs, stylizer)
            txt += '. \n'
            if txt != '\ntable. \n':
                text.append(txt)
            else:
                text.append('\n')
            tags.append('')
        elif tag == 'tr':
            txt = self.build_block('', style, attribs, stylizer)
            txt += '. '
            if txt != '\n. ':
                txt = re.sub ('\n', '', txt)
                text.append(txt)
            tags.append('|\n')
        elif tag == 'td':
            text.append('|')
            txt = ''
            txt += self.check_halign(style)
            txt += self.check_valign(style)
            if attribs.has_key ('colspan'):
                txt += '\\' + attribs['colspan']
            if attribs.has_key ('rowspan'):
                txt += '/' + attribs['rowspan']
            txt += self.check_styles(style)
            if txt != '':
                text.append(txt + '. ')
            tags.append('')
        elif tag == 'th':
            text.append('|_. ')
            tags.append('')
        elif tag == 'span':
            if style['font-variant'] == 'small-caps':
                if self.style_smallcap == False:
                    text.append('&')
                    tags.append('&')
                    self.style_smallcap = True
            else:
                if self.in_a_link == False:
                    txt = '%'
                    if self.opts.keep_links:
                        txt += self.check_id_tag(attribs)
                        txt += self.check_styles(style)
                    if txt != '%':
                        text.append(txt)
                        tags.append('%')

        if self.opts.keep_links and attribs.has_key('id'):
            if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
                text.append(self.check_id_tag(attribs))

        # Process the styles for any that we want to keep
        if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', \
                'span', 'table', 'tr', 'td'):
            if not self.in_a_link:
                text.append(self.check_styles(style))

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
            if not self.in_pre:
                txt = self.prepare_string_for_textile(self.remove_newlines(txt))
            text.append(txt)
            self.id_no_text = u''

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t in ('pre', 'ul', 'ol', 'li', 'table'):
                if t == 'pre':
                    self.in_pre = False
                elif t in ('ul', 'ol'):
                    if self.list: self.list.pop()
                    if not self.list: text.append('\n')
            else:
                if t == 'a':
                    self.in_a_link = False
                    t = ''
                text.append(self.id_no_text)
                self.id_no_text = u''
                if t in ('*]', '*'):
                    self.style_bold = False
                elif t in ('_]', '_'):
                    self.style_italic = False
                elif t == '+]':
                    self.style_under = False
                elif t == '-]':
                    self.style_strike = False
                elif t == '&':
                    self.style_smallcap = False
                if t in ('*]', '_]', '+]', '-]', '*', '_'):
                    txt = self.style_embed.pop()
                text.append('%s' % t)

        # Soft scene breaks.
        if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
            ems = min(int(round((float(style.marginBottom) / style.fontSize) - 1)), self.MAX_EM)
            if ems >= 1:
                text.append(u'\n\n\xa0' * ems)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            tail = elem.tail
            if not self.in_pre:
                tail = self.prepare_string_for_textile(self.remove_newlines(tail))
            text.append(tail)

        return text
Exemple #20
0
    def dump_text(self, elem, stylizer, page, tag_stack=[]):
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag,
                          string_or_bytes) or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return []

        text = []
        tags = []
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return []

        tag = barename(elem.tag)

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            tags.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument.
        if tag in IMAGE_TAGS:
            if elem.attrib.get('src', None):
                if page.abshref(
                        elem.attrib['src']) not in self.image_hrefs.keys():
                    if len(self.image_hrefs.keys()) == 0:
                        self.image_hrefs[page.abshref(
                            elem.attrib['src'])] = 'cover.png'
                    else:
                        self.image_hrefs[page.abshref(
                            elem.attrib['src'])] = image_name(
                                '%s.png' % len(self.image_hrefs.keys()),
                                self.image_hrefs.keys()).strip('\x00')
                text.append('\\m="%s"' %
                            self.image_hrefs[page.abshref(elem.attrib['src'])])
        elif tag == 'hr':
            w = r'\w'
            width = elem.get('width')
            if width:
                if not width.endswith('%'):
                    width += '%'
                w += '="%s"' % width
            else:
                w += '="50%"'
            text.append(w)
        elif tag == 'br':
            text.append('\n\\c \n\\c\n')

        # TOC markers.
        toc_name = elem.attrib.get('name', None)
        toc_id = elem.attrib.get('id', None)
        # Only write the TOC marker if the tag isn't a heading and we aren't in one.
        if (toc_id or toc_name) and tag not in ('h1', 'h2','h3','h4','h5','h6') and \
            'x' not in tag_stack+tags and 'X0' not in tag_stack+tags and \
            'X1' not in tag_stack+tags and 'X2' not in tag_stack+tags and \
            'X3' not in tag_stack+tags and 'X4' not in tag_stack+tags:

            toc_page = page.href
            if self.toc.get(toc_page, None):
                for toc_x in (toc_name, toc_id):
                    toc_title, toc_depth = self.toc[toc_page].get(
                        toc_x, (None, 0))
                    if toc_title:
                        toc_depth = max(min(toc_depth, 4), 0)
                        text.append(r'\C{}="{}"'.format(toc_depth, toc_title))

        # Process style information that needs holds a single tag.
        # Commented out because every page in an OEB book starts with this style.
        if style['page-break-before'] == 'always':
            text.append(r'\p')

        # Process basic PML tags.
        pml_tag = TAG_MAP.get(tag, None)
        if pml_tag and pml_tag not in tag_stack + tags:
            text.append(r'\%s' % pml_tag)
            tags.append(pml_tag)

        # Special processing of tags that require an argument.
        # Anchors links
        if tag in LINK_TAGS and 'q' not in tag_stack + tags:
            href = elem.get('href')
            if href:
                href = page.abshref(href)
                if '://' not in href:
                    if '#' not in href:
                        href += '#'
                    if href not in self.link_hrefs.keys():
                        self.link_hrefs[href] = 'calibre_link-%s' % len(
                            self.link_hrefs.keys())
                    href = '#%s' % self.link_hrefs[href]
                    text.append(r'\q="%s"' % href)
                    tags.append('q')

        # Anchor ids
        id_name = elem.get('id')
        name_name = elem.get('name')
        for name_x in (id_name, name_name):
            if name_x:
                text.append(self.get_anchor(page, name_x))

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack + tags:
                text.append(r'\%s' % style_tag)
                tags.append(style_tag)

        # margin left
        try:
            mms = int(float(style['margin-left']) * 100 / style.height)
            if mms:
                text.append(r'\T="%s%%"' % mms)
        except:
            pass

        # Soft scene breaks.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n\\c \n\\c\n')
        except:
            pass

        # Proccess text within this tag.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_pml(elem.text))

        # Process inner tags
        for item in elem:
            text += self.dump_text(item, stylizer, page, tag_stack + tags)

        # Close opened tags.
        tags.reverse()
        text += self.close_tags(tags)

        # if tag in SEPARATE_TAGS:
        #    text.append('\n\n')

        if style['page-break-after'] == 'always':
            text.append(r'\p')

        # Process text after this tag but not within another.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_pml(elem.tail))

        return text
    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        style_a = '%s' % style
        if tag == 'body':
            tag = 'div'
            if not style['page-break-before'] == 'always':
                style_a = 'page-break-before: always;' + ' ' if style_a else '' + style_a
        tags.append(tag)

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Turn style into strings for putting in the tag.
        style_t = ''
        if style_a:
            style_t = ' style="%s"' % style_a

        # Write the tag.
        text.append('<%s%s%s>' % (tag, at, style_t))

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
Exemple #22
0
    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
        tags.append(tag)

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return ['']

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Write the tag.
        text.append('<%s%s' % (tag, at))
        if tag in SELF_CLOSING_TAGS:
            text.append(' />')
        else:
            text.append('>')

        # Turn styles into tags.
        if style['font-weight'] in ('bold', 'bolder'):
            text.append('<b>')
            tags.append('b')
        if style['font-style'] == 'italic':
            text.append('<i>')
            tags.append('i')
        if style['text-decoration'] == 'underline':
            text.append('<u>')
            tags.append('u')
        if style['text-decoration'] == 'line-through':
            text.append('<s>')
            tags.append('s')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
Exemple #23
0
    def flatten_node(self, node, stylizer, names, styles, psize, item_id):
        if not isinstance(node.tag, basestring) \
           or namespace(node.tag) != XHTML_NS:
               return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style['font-size']
        except:
            font_size = self.sbase if self.sbase is not None else \
                self.context.source.fbase
        if 'align' in node.attrib:
            if tag != 'img':
                cssdict['text-align'] = node.attrib['align']
            else:
                val = node.attrib['align']
                if val in ('middle', 'bottom', 'top'):
                    cssdict['vertical-align'] = val
                elif val in ('left', 'right'):
                    cssdict['text-align'] = val
            del node.attrib['align']
        if node.tag == XHTML('font'):
            tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
                'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
            tag = 'div' if XPath('|'.join(tags))(node) else 'span'
            node.tag = XHTML(tag)
            if 'size' in node.attrib:
                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))
                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict['font-size'] = '%.1fpt'%font_size
                del node.attrib['size']
            if 'face' in node.attrib:
                cssdict['font-family'] = node.attrib['face']
                del node.attrib['face']
        if 'color' in node.attrib:
            cssdict['color'] = node.attrib['color']
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            cssdict['background-color'] = node.attrib['bgcolor']
            del node.attrib['bgcolor']
        if cssdict.get('font-weight', '').lower() == 'medium':
            cssdict['font-weight'] = 'normal' # ADE chokes on font-weight medium

        fsize = font_size
        if not self.context.disable_font_rescaling:
            _sbase = self.sbase if self.sbase is not None else \
                self.context.source.fbase
            dyn_rescale = dynamic_rescale_factor(node)
            if dyn_rescale is not None:
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict['font-size'] = '%0.5fem'%(fsize/psize)
                psize = fsize
            elif 'font-size' in cssdict or tag == 'body':
                fsize = self.fmap[font_size]
                try:
                    cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    cssdict['font-size'] = '%.1fpt'%fsize
                psize = fsize

        try:
            minlh = self.context.minimum_line_height / 100.
            if style['line-height'] < minlh * fsize:
                cssdict['line-height'] = str(minlh)
        except:
            self.oeb.logger.exception('Failed to set minimum line-height')

        if cssdict:
            for x in self.filter_css:
                cssdict.pop(x, None)

        if cssdict:
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

        if (self.context.remove_paragraph_spacing or
                self.context.insert_blank_line) and tag in ('p', 'div'):
            if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
                for prop in ('margin', 'padding', 'border'):
                    for edge in ('top', 'bottom'):
                        cssdict['%s-%s'%(prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents and
                cssdict.get('text-align', None) not in ('center', 'right')):
                cssdict['text-indent'] =  "%1.1fem" % indent_size

        if cssdict:
            items = cssdict.items()
            items.sort()
            css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
            classes = node.get('class', '').strip() or 'calibre'
            klass = STRIPNUM.sub('', classes.split()[0].replace('_', ''))
            if css in styles:
                match = styles[css]
            else:
                match = klass + str(names[klass] or '')
                styles[css] = match
                names[klass] += 1
            node.attrib['class'] = match
        elif 'class' in node.attrib:
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        for child in node:
            self.flatten_node(child, stylizer, names, styles, psize, item_id)
Exemple #24
0
    def dump_text(self, elem, stylizer, tag_stack=[]):
        from calibre.ebooks.oeb.base import XHTML_NS, namespace, barename, urlnormalize

        if not isinstance(elem.tag, basestring) or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS and elem.tail:
                return elem.tail
            return u""

        text = u""
        style = stylizer.style(elem)

        if style["display"] in ("none", "oeb-page-head", "oeb-page-foot") or style["visibility"] == "hidden":
            if hasattr(elem, "tail") and elem.tail:
                return elem.tail
            return u""

        tag = barename(elem.tag)
        tag_count = 0

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style["display"] in BLOCK_STYLES:
            if "block" not in tag_stack:
                tag_count += 1
                tag_stack.append("block")

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == "img":
            src = elem.get("src")
            if src:
                src = urlnormalize(self.currently_dumping_item.abshref(src))
                block_start = ""
                block_end = ""
                if "block" not in tag_stack:
                    block_start = "{\\par\\pard\\hyphpar "
                    block_end = "}"
                text += "%s SPECIAL_IMAGE-%s-REPLACE_ME %s" % (block_start, src, block_end)

        single_tag = SINGLE_TAGS.get(tag, None)
        if single_tag:
            text += single_tag

        rtf_tag = TAGS.get(tag, None)
        if rtf_tag and rtf_tag not in tag_stack:
            tag_count += 1
            text += "{%s\n" % rtf_tag
            tag_stack.append(rtf_tag)

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack:
                tag_count += 1
                text += "{%s\n" % style_tag
                tag_stack.append(style_tag)

        # Proccess tags that contain text.
        if hasattr(elem, "text") and elem.text:
            text += txt2rtf(elem.text)

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        for i in range(0, tag_count):
            end_tag = tag_stack.pop()
            if end_tag != "block":
                if tag in BLOCK_TAGS:
                    text += u"\\par\\pard\\plain\\hyphpar}"
                else:
                    text += u"}"

        if hasattr(elem, "tail") and elem.tail:
            if "block" in tag_stack:
                text += "%s" % txt2rtf(elem.tail)
            else:
                text += "{\\par\\pard\\hyphpar %s}" % txt2rtf(elem.tail)

        return text
Exemple #25
0
    def dump_text(self, elem_tree, stylizer, page, tag_stack=[]):
        '''
        This function is intended to be used in a recursive manner. dump_text will
        run though all elements in the elem_tree and call itself on each element.

        self.image_hrefs will be populated by calling this function.

        @param elem_tree: etree representation of XHTML content to be transformed.
        @param stylizer: Used to track the style of elements within the tree.
        @param page: OEB page used to determine absolute urls.
        @param tag_stack: List of open FB2 tags to take into account.

        @return: List of string representing the XHTML converted to FB2 markup.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace
        elem = elem_tree

        # Ensure what we are converting is not a string and that the fist tag is part of the XHTML namespace.
        if not isinstance(elem_tree.tag, basestring) or namespace(elem_tree.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return []

        style = stylizer.style(elem_tree)
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return []

        # FB2 generated output.
        fb2_out = []
        # FB2 tags in the order they are opened. This will be used to close the tags.
        tags = []
        # First tag in tree
        tag = barename(elem_tree.tag)
        # Number of blank lines above tag
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems < 0:
                ems = 0
        except:
            ems = 0

        # Convert TOC entries to <title>s and add <section>s
        if self.opts.sectionize == 'toc':
            # A section cannot be a child of any other element than another section,
            # so leave the tag alone if there are parents
            if not tag_stack:
                # There are two reasons to start a new section here: the TOC pointed to
                # this page (then we use the first non-<body> on the page as a <title>), or
                # the TOC pointed to a specific element
                newlevel = 0
                toc_entry = self.toc.get(page.href, None)
                if toc_entry is not None:
                    if None in toc_entry:
                        if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text:
                            newlevel = 1
                            self.toc[page.href] = None
                    if not newlevel and elem_tree.attrib.get('id', None) is not None:
                        newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None)

                # Start a new section if necessary
                if newlevel:
                    while newlevel <= self.section_level:
                        fb2_out.append('</section>')
                        self.section_level -= 1
                    fb2_out.append('<section>')
                    self.section_level += 1
                    fb2_out.append('<title>')
                    tags.append('title')
            if self.section_level == 0:
                # If none of the prior processing made a section, make one now to be FB2 spec compliant
                fb2_out.append('<section>')
                self.section_level += 1

        # Process the XHTML tag and styles. Converted to an FB2 tag.
        # Use individual if statement not if else. There can be
        # only one XHTML tag but it can have multiple styles.
        if tag == 'img':
            if elem_tree.attrib.get('src', None):
                # Only write the image tag if it is in the manifest.
                ihref = urlnormalize(page.abshref(elem_tree.attrib['src']))
                if ihref in self.oeb_book.manifest.hrefs:
                    if ihref not in self.image_hrefs:
                        self.image_hrefs[ihref] = '_%s.jpg' % len(self.image_hrefs)
                    p_txt, p_tag = self.ensure_p()
                    fb2_out += p_txt
                    tags += p_tag
                    fb2_out.append('<image xlink:href="#%s" />' % self.image_hrefs[ihref])
                else:
                    self.log.warn(u'Ignoring image not in manifest: %s'%ihref)
        if tag in ('br', 'hr') or ems >= 1:
            if ems < 1:
                multiplier = 1
            else:
                multiplier = ems
            if self.in_p:
                closed_tags = []
                open_tags = tag_stack+tags
                open_tags.reverse()
                for t in open_tags:
                    fb2_out.append('</%s>' % t)
                    closed_tags.append(t)
                    if t == 'p':
                        break
                fb2_out.append('<empty-line />' * multiplier)
                closed_tags.reverse()
                for t in closed_tags:
                    fb2_out.append('<%s>' % t)
            else:
                fb2_out.append('<empty-line />' * multiplier)
        if tag in ('div', 'li', 'p'):
            p_text, added_p = self.close_open_p(tag_stack+tags)
            fb2_out += p_text
            if added_p:
                tags.append('p')
        if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
            s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'i' or style['font-style'] == 'italic':
            s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags)
            fb2_out += s_out
            tags += s_tags
        if tag in ('del', 'strike') or style['text-decoration'] == 'line-through':
            s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack+tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sub':
            s_out, s_tags = self.handle_simple_tag('sub', tag_stack+tags)
            fb2_out += s_out
            tags += s_tags
        if tag == 'sup':
            s_out, s_tags = self.handle_simple_tag('sup', tag_stack+tags)
            fb2_out += s_out
            tags += s_tags

        # Process element text.
        if hasattr(elem_tree, 'text') and elem_tree.text:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.text))
            if not self.in_p:
                fb2_out.append('</p>')

        # Process sub-elements.
        for item in elem_tree:
            fb2_out += self.dump_text(item, stylizer, page, tag_stack+tags)

        # Close open FB2 tags.
        tags.reverse()
        fb2_out += self.close_tags(tags)

        # Process element text that comes after the close of the XHTML tag but before the next XHTML tag.
        if hasattr(elem_tree, 'tail') and elem_tree.tail:
            if not self.in_p:
                fb2_out.append('<p>')
            fb2_out.append(prepare_string_for_xml(elem_tree.tail))
            if not self.in_p:
                fb2_out.append('</p>')

        return fb2_out
Exemple #26
0
    def dump_text(self, elem, stylizer, tag_stack=[]):
        from calibre.ebooks.oeb.base import (XHTML_NS, namespace, barename,
                urlnormalize)

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return elem.tail
            return u''

        text = u''
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return elem.tail
            return u''

        tag = barename(elem.tag)
        tag_count = 0

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if 'block' not in tag_stack:
                tag_count += 1
                tag_stack.append('block')

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag == 'img':
            src = elem.get('src')
            if src:
                src = urlnormalize(self.currently_dumping_item.abshref(src))
                block_start = ''
                block_end = ''
                if 'block' not in tag_stack:
                    block_start = '{\\par\\pard\\hyphpar '
                    block_end = '}'
                text += '%s SPECIAL_IMAGE-%s-REPLACE_ME %s' % (block_start, src, block_end)

        single_tag = SINGLE_TAGS.get(tag, None)
        if single_tag:
            text += single_tag

        rtf_tag = TAGS.get(tag, None)
        if rtf_tag and rtf_tag not in tag_stack:
            tag_count += 1
            text += '{%s\n' % rtf_tag
            tag_stack.append(rtf_tag)

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag and style_tag not in tag_stack:
                tag_count += 1
                text += '{%s\n' % style_tag
                tag_stack.append(style_tag)

        # Proccess tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text += txt2rtf(elem.text)

        for item in elem:
            text += self.dump_text(item, stylizer, tag_stack)

        for i in range(0, tag_count):
            end_tag =  tag_stack.pop()
            if end_tag != 'block':
                if tag in BLOCK_TAGS:
                    text += u'\\par\\pard\\plain\\hyphpar}'
                else:
                    text += u'}'

        if hasattr(elem, 'tail') and elem.tail:
            if 'block' in tag_stack:
                text += '%s' % txt2rtf(elem.tail)
            else:
                text += '{\\par\\pard\\hyphpar %s}' % txt2rtf(elem.tail)

        return text
Exemple #27
0
    def mobimlize_elem(self,
                       elem,
                       stylizer,
                       bstate,
                       istates,
                       ignore_valign=False):
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            return
        style = stylizer.style(elem)
        # <mbp:frame-set/> does not exist lalalala
        if ((style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot')
             or style['visibility'] == 'hidden') and
                elem.get('data-calibre-jacket-searchable-tags', None) != '1'):
            id_ = elem.get('id', None)
            if id_:
                # Keep anchors so people can use display:none
                # to generate hidden TOCs
                tail = elem.tail
                elem.clear()
                elem.text = None
                elem.set('id', id_)
                elem.tail = tail
                elem.tag = XHTML('a')
            else:
                return
        tag = barename(elem.tag)
        istate = copy.copy(istates[-1])
        istate.rendered = False
        istate.list_num = 0
        if tag == 'ol' and 'start' in elem.attrib:
            try:
                istate.list_num = int(elem.attrib['start']) - 1
            except:
                pass
        istates.append(istate)
        left = 0
        display = style['display']
        if display == 'table-cell':
            display = 'inline'
        elif display.startswith('table'):
            display = 'block'
        isblock = (not display.startswith('inline')
                   and style['display'] != 'none')
        isblock = isblock and style['float'] == 'none'
        isblock = isblock and tag != 'br'
        if isblock:
            bstate.para = None
            istate.halign = style['text-align']
            rawti = style._get('text-indent')
            try:
                istate.indent = style['text-indent']
            except Exception:
                istate.indent = 0
            if hasattr(rawti, 'strip') and '%' in rawti:
                # We have a percentage text indent, these can come out looking
                # too large if the user chooses a wide output profile like
                # tablet
                istate.indent = min(style._unit_convert(rawti, base=500),
                                    istate.indent)
            if style['margin-left'] == 'auto' \
               and style['margin-right'] == 'auto':
                istate.halign = 'center'
            margin = convert_margin(style, 'margin-left')
            padding = asfloat(style['padding-left'])
            if tag != 'body':
                left = margin + padding
            istate.left += left
            vmargin = convert_margin(style, 'margin-top')
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-top'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        elif not istate.href:
            margin = convert_margin(style, 'margin-left')
            padding = asfloat(style['padding-left'])
            lspace = margin + padding
            if lspace > 0:
                spaces = int(round((lspace * 3) / style['font-size']))
                elem.text = ('\xa0' * spaces) + (elem.text or '')
            margin = convert_margin(style, 'margin-right')
            padding = asfloat(style['padding-right'])
            rspace = margin + padding
            if rspace > 0:
                spaces = int(round((rspace * 3) / style['font-size']))
                if len(elem) == 0:
                    elem.text = (elem.text or '') + ('\xa0' * spaces)
                else:
                    last = elem[-1]
                    last.text = (last.text or '') + ('\xa0' * spaces)
        if bstate.content and style['page-break-before'] in PAGE_BREAKS:
            bstate.pbreak = True
        istate.fsize = self.mobimlize_font(style['font-size'])
        istate.italic = True if style['font-style'] == 'italic' else False
        weight = style['font-weight']
        istate.bold = weight in ('bold', 'bolder') or asfloat(weight) > 400
        istate.preserve = style['white-space'] == 'pre'
        istate.pre_wrap = style['white-space'] == 'pre-wrap'
        istate.bgcolor = style['background-color']
        istate.fgcolor = style['color']
        istate.strikethrough = style.effective_text_decoration == 'line-through'
        istate.underline = style.effective_text_decoration == 'underline'
        ff = style['font-family'].lower() if hasattr(style['font-family'],
                                                     'lower') else ''
        if 'monospace' in ff or 'courier' in ff or ff.endswith(' mono'):
            istate.family = 'monospace'
        elif ('sans-serif' in ff or 'sansserif' in ff or 'verdana' in ff
              or 'arial' in ff or 'helvetica' in ff):
            istate.family = 'sans-serif'
        else:
            istate.family = 'serif'
        if 'id' in elem.attrib:
            istate.ids.add(elem.attrib['id'])
        if 'name' in elem.attrib:
            istate.ids.add(elem.attrib['name'])
        if tag == 'a' and 'href' in elem.attrib:
            istate.href = elem.attrib['href']
        istate.attrib.clear()
        if tag == 'img' and 'src' in elem.attrib:
            istate.attrib['src'] = elem.attrib['src']
            istate.attrib['align'] = 'baseline'
            cssdict = style.cssdict()
            valign = cssdict.get('vertical-align', None)
            if valign in ('top', 'bottom', 'middle'):
                istate.attrib['align'] = valign
            for prop in ('width', 'height'):
                if cssdict[prop] != 'auto':
                    value = style[prop]
                    if value == getattr(self.profile, prop):
                        result = '100%'
                    else:
                        # Amazon's renderer does not support
                        # img sizes in units other than px
                        # See #7520 for test case
                        try:
                            pixs = int(
                                round(float(value) / (72 / self.profile.dpi)))
                        except:
                            continue
                        result = str(pixs)
                    istate.attrib[prop] = result
            if 'width' not in istate.attrib or 'height' not in istate.attrib:
                href = self.current_spine_item.abshref(elem.attrib['src'])
                try:
                    item = self.oeb.manifest.hrefs[urlnormalize(href)]
                except:
                    self.oeb.logger.warn('Failed to find image:', href)
                else:
                    try:
                        width, height = identify(item.data)[1:]
                    except Exception:
                        self.oeb.logger.warn('Invalid image:', href)
                    else:
                        if 'width' not in istate.attrib and 'height' not in \
                                    istate.attrib:
                            istate.attrib['width'] = str(width)
                            istate.attrib['height'] = str(height)
                        else:
                            ar = width / height
                            if 'width' not in istate.attrib:
                                try:
                                    width = int(istate.attrib['height']) * ar
                                except:
                                    pass
                                istate.attrib['width'] = str(int(width))
                            else:
                                try:
                                    height = int(istate.attrib['width']) / ar
                                except:
                                    pass
                                istate.attrib['height'] = str(int(height))
                        item.unload_data_from_memory()
        elif tag == 'hr' and asfloat(style['width']) > 0 and style._get(
                'width') not in {'100%', 'auto'}:
            raww = style._get('width')
            if hasattr(raww, 'strip') and '%' in raww:
                istate.attrib['width'] = raww
            else:
                prop = style['width'] / self.profile.width
                istate.attrib['width'] = "%d%%" % int(round(prop * 100))
        elif display == 'table':
            tag = 'table'
        elif display == 'table-row':
            tag = 'tr'
        elif display == 'table-cell':
            tag = 'td'
        if tag in TABLE_TAGS and self.ignore_tables:
            tag = 'span' if tag == 'td' else 'div'

        if tag in ('table', 'td', 'tr'):
            col = style.backgroundColor
            if col:
                elem.set('bgcolor', col)
            css = style.cssdict()
            if 'border' in css or 'border-width' in css:
                elem.set('border', '1')
        if tag in TABLE_TAGS:
            for attr in ('rowspan', 'colspan', 'width', 'border', 'scope',
                         'bgcolor'):
                if attr in elem.attrib:
                    istate.attrib[attr] = elem.attrib[attr]
        if tag == 'q':
            t = elem.text
            if not t:
                t = ''
            elem.text = '\u201c' + t
            t = elem.tail
            if not t:
                t = ''
            elem.tail = '\u201d' + t
        text = None
        if elem.text:
            if istate.preserve or istate.pre_wrap:
                text = elem.text
            elif (len(elem) > 0 and isspace(elem.text)
                  and hasattr(elem[0].tag, 'rpartition')
                  and elem[0].tag.rpartition('}')[-1] not in INLINE_TAGS):
                text = None
            else:
                text = COLLAPSE.sub(' ', elem.text)
        valign = style['vertical-align']
        not_baseline = valign in ('super', 'sub', 'text-top', 'text-bottom',
                                  'top', 'bottom') or (isinstance(
                                      valign, numbers.Number)
                                                       and abs(valign) != 0)
        issup = valign in ('super', 'text-top', 'top') or (isinstance(
            valign, numbers.Number) and valign > 0)
        vtag = 'sup' if issup else 'sub'
        if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock:
            nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
            vbstate = BlockState(etree.SubElement(nroot, XHTML('body')))
            vbstate.para = etree.SubElement(vbstate.body, XHTML('p'))
            self.mobimlize_elem(elem,
                                stylizer,
                                vbstate,
                                istates,
                                ignore_valign=True)
            if len(istates) > 0:
                istates.pop()
            if len(istates) == 0:
                istates.append(FormatState())
            at_start = bstate.para is None
            if at_start:
                self.mobimlize_content('span', '', bstate, istates)
            parent = bstate.para if bstate.inline is None else bstate.inline
            if parent is not None:
                vtag = etree.SubElement(parent, XHTML(vtag))
                vtag = etree.SubElement(vtag, XHTML('small'))
                # Add anchors
                for child in vbstate.body:
                    if child is not vbstate.para:
                        vtag.append(child)
                    else:
                        break
                if vbstate.para is not None:
                    if vbstate.para.text:
                        vtag.text = vbstate.para.text
                    for child in vbstate.para:
                        vtag.append(child)
                return

        if tag == 'blockquote':
            old_mim = self.opts.mobi_ignore_margins
            self.opts.mobi_ignore_margins = False

        if (text or tag in CONTENT_TAGS or tag in NESTABLE_TAGS or (
                # We have an id but no text and no children, the id should still
                # be added.
                istate.ids and tag in ('a', 'span', 'i', 'b', 'u')
                and len(elem) == 0)):
            if tag == 'li' and len(istates) > 1 and 'value' in elem.attrib:
                try:
                    value = int(elem.attrib['value'])
                    istates[-2].list_num = value - 1
                except:
                    pass
            self.mobimlize_content(tag, text, bstate, istates)
        for child in elem:
            self.mobimlize_elem(child, stylizer, bstate, istates)
            tail = None
            if child.tail:
                if istate.preserve or istate.pre_wrap:
                    tail = child.tail
                elif bstate.para is None and isspace(child.tail):
                    tail = None
                else:
                    tail = COLLAPSE.sub(' ', child.tail)
            if tail:
                self.mobimlize_content(tag, tail, bstate, istates)

        if tag == 'blockquote':
            self.opts.mobi_ignore_margins = old_mim

        if bstate.content and style['page-break-after'] in PAGE_BREAKS:
            bstate.pbreak = True
        if isblock:
            para = bstate.para
            if para is not None and para.text == '\xa0' and len(para) < 1:
                if style.height > 2:
                    para.getparent().replace(para, etree.Element(XHTML('br')))
                else:
                    # This is too small to be rendered effectively, drop it
                    para.getparent().remove(para)
            bstate.para = None
            bstate.istate = None
            vmargin = convert_margin(style, 'margin-bottom')
            bstate.vmargin = max((bstate.vmargin, vmargin))
            vpadding = asfloat(style['padding-bottom'])
            if vpadding > 0:
                bstate.vpadding += bstate.vmargin
                bstate.vmargin = 0
                bstate.vpadding += vpadding
        if bstate.nested and bstate.nested[-1].tag == elem.tag:
            bstate.nested.pop()
        istates.pop()
Exemple #28
0
    def dump_text(self, elem, stylizer, page, tag_stack=[]):
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag,
                          string_or_bytes) or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = barename(elem.tag)
        tag_count = 0

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag in IMAGE_TAGS:
            if elem.attrib.get('src', None):
                if page.abshref(
                        elem.attrib['src']) not in self.name_map.keys():
                    self.name_map[page.abshref(
                        elem.attrib['src'])] = unique_name(
                            '%s' % len(self.name_map.keys()),
                            self.name_map.keys())
                text.append('<IMG SRC="%s">' %
                            self.name_map[page.abshref(elem.attrib['src'])])

        rb_tag = tag.upper() if tag in TAGS else None
        if rb_tag:
            tag_count += 1
            text.append('<%s>' % rb_tag)
            tag_stack.append(rb_tag)

        # Anchors links
        if tag in LINK_TAGS:
            href = elem.get('href')
            if href:
                href = page.abshref(href)
                if '://' not in href:
                    if '#' not in href:
                        href += '#'
                    if href not in self.link_hrefs.keys():
                        self.link_hrefs[href] = 'calibre_link-%s' % len(
                            self.link_hrefs.keys())
                    href = self.link_hrefs[href]
                    text.append('<A HREF="#%s">' % href)
                tag_count += 1
                tag_stack.append('A')

        # Anchor ids
        id_name = elem.get('id')
        if id_name:
            text.append(self.get_anchor(page, id_name))

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag:
                style_tag = style_tag.upper()
                tag_count += 1
                text.append('<%s>' % style_tag)
                tag_stack.append(style_tag)

        # Proccess tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(prepare_string_for_xml(elem.text))

        for item in elem:
            text += self.dump_text(item, stylizer, page, tag_stack)

        close_tag_list = []
        for i in range(0, tag_count):
            close_tag_list.insert(0, tag_stack.pop())

        text += self.close_tags(close_tag_list)

        if hasattr(elem, 'tail') and elem.tail:
            text.append(prepare_string_for_xml(elem.tail))

        return text
Exemple #29
0
    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
        tags.append(tag)

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            return ['']

        # Remove attributes we won't want.
        if 'class' in attribs:
            del attribs['class']
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Write the tag.
        text.append('<%s%s' % (tag, at))
        if tag in SELF_CLOSING_TAGS:
            text.append(' />')
        else:
            text.append('>')

        # Turn styles into tags.
        if style['font-weight'] in ('bold', 'bolder'):
            text.append('<b>')
            tags.append('b')
        if style['font-style'] == 'italic':
            text.append('<i>')
            tags.append('i')
        if style['text-decoration'] == 'underline':
            text.append('<u>')
            tags.append('u')
        if style['text-decoration'] == 'line-through':
            text.append('<s>')
            tags.append('s')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
Exemple #30
0
    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) not in (XHTML_NS, SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) in (XHTML_NS, SVG_NS) \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
        tags.append(tag)

        # Remove attributes we won't want.
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''
        for k, v in attribs.items():
            k = k.split('}')[-1]
            at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))

        # Write the tag.
        text.append('<%s%s' % (tag, at))
        if tag in SELF_CLOSING_TAGS:
            text.append(' />')
        else:
            text.append('>')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
Exemple #31
0
    def dump_text(self, elem, stylizer, page):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: OEB page used to determine absolute urls.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag, basestring) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = barename(elem.tag)
        tag_id = elem.attrib.get('id', None)
        in_block = False
        in_heading = False

        # Are we in a heading?
        # This can either be a heading tag or a TOC item.
        if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
            in_heading = True
            if not self.last_was_heading:
                text.append('\n\n\n\n\n\n')

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if self.opts.remove_paragraph_spacing and not in_heading:
                text.append(u'\t')
            in_block = True

        if tag in SPACE_TAGS:
            text.append(u' ')

        # Hard scene breaks.
        if tag == 'hr':
            text.append('\n\n* * *\n\n')
        # Soft scene breaks.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n' * ems)
        except:
            pass

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(elem.text)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        if in_block:
            text.append(u'\n\n')
        if in_heading:
            text.append(u'\n')
            self.last_was_heading = True
        else:
            self.last_was_heading = False

        if hasattr(elem, 'tail') and elem.tail:
            text.append(elem.tail)

        return text
Exemple #32
0
    def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize,
                     item_id):
        if not isinstance(node.tag, basestring) \
           or namespace(node.tag) != XHTML_NS:
            return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style['font-size']
        except:
            font_size = self.sbase if self.sbase is not None else \
                self.context.source.fbase
        if tag == 'body' and isinstance(font_size, (int, float)):
            stylizer.body_font_size = font_size
        if 'align' in node.attrib:
            if tag != 'img':
                cssdict['text-align'] = node.attrib['align']
            else:
                val = node.attrib['align']
                if val in ('middle', 'bottom', 'top'):
                    cssdict['vertical-align'] = val
                elif val in ('left', 'right'):
                    cssdict['float'] = val
            del node.attrib['align']
        if node.tag == XHTML('font'):
            tags = [
                'descendant::h:%s' % x
                for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                          'h6', 'ol', 'ul', 'dl', 'blockquote')
            ]
            tag = 'div' if XPath('|'.join(tags))(node) else 'span'
            node.tag = XHTML(tag)
            if 'size' in node.attrib:

                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))

                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict['font-size'] = '%.1fpt' % font_size
                del node.attrib['size']
            if 'face' in node.attrib:
                cssdict['font-family'] = node.attrib['face']
                del node.attrib['face']
        if 'color' in node.attrib:
            try:
                cssdict['color'] = Property('color',
                                            node.attrib['color']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            try:
                cssdict['background-color'] = Property(
                    'background-color', node.attrib['bgcolor']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['bgcolor']
        if cssdict.get('font-weight', '').lower() == 'medium':
            cssdict[
                'font-weight'] = 'normal'  # ADE chokes on font-weight medium

        fsize = font_size
        is_drop_cap = (cssdict.get('float', None) == 'left'
                       and 'font-size' in cssdict and len(node) == 0
                       and node.text and len(node.text) == 1)
        # Detect drop caps generated by the docx input plugin
        if (node.tag and node.tag.endswith('}p') and len(node) == 0
                and node.text and len(node.text.strip()) == 1 and not node.tail
                and 'line-height' in cssdict and 'font-size' in cssdict):
            dp = node.getparent()
            if dp.tag and dp.tag.endswith('}div') and len(
                    dp) == 1 and not dp.text:
                if stylizer.style(dp).cssdict().get('float', None) == 'left':
                    is_drop_cap = True
        if not self.context.disable_font_rescaling and not is_drop_cap:
            _sbase = self.sbase if self.sbase is not None else \
                self.context.source.fbase
            dyn_rescale = dynamic_rescale_factor(node)
            if dyn_rescale is not None:
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict['font-size'] = '%0.5fem' % (fsize / psize)
                psize = fsize
            elif 'font-size' in cssdict or tag == 'body':
                fsize = self.fmap[font_size]
                try:
                    cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    cssdict['font-size'] = '%.1fpt' % fsize
                psize = fsize

        try:
            minlh = self.context.minimum_line_height / 100.
            if not is_drop_cap and style['line-height'] < minlh * fsize:
                cssdict['line-height'] = str(minlh)
        except:
            self.oeb.logger.exception('Failed to set minimum line-height')

        if cssdict:
            for x in self.filter_css:
                cssdict.pop(x, None)

        if cssdict:
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

        if (self.context.remove_paragraph_spacing
                or self.context.insert_blank_line) and tag in ('p', 'div'):
            if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
                for prop in ('margin', 'padding', 'border'):
                    for edge in ('top', 'bottom'):
                        cssdict['%s-%s' % (prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents
                    and cssdict.get('text-align',
                                    None) not in ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size

        pseudo_classes = style.pseudo_classes(self.filter_css)
        if cssdict or pseudo_classes:
            keep_classes = set()

            if cssdict:
                items = sorted(cssdict.items())
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                classes = node.get('class', '').strip() or 'calibre'
                klass = ascii_text(
                    STRIPNUM.sub('',
                                 classes.split()[0].replace('_', '')))
                if css in styles:
                    match = styles[css]
                else:
                    match = klass + str(names[klass] or '')
                    styles[css] = match
                    names[klass] += 1
                node.attrib['class'] = match
                keep_classes.add(match)

            for psel, cssdict in pseudo_classes.iteritems():
                items = sorted(cssdict.iteritems())
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                pstyles = pseudo_styles[psel]
                if css in pstyles:
                    match = pstyles[css]
                else:
                    # We have to use a different class for each psel as
                    # otherwise you can have incorrect styles for a situation
                    # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                    # If the pcalibre class for a:hover and a:link is the same,
                    # then the class attribute for a.x tags will contain both
                    # that class and the class for a.x:hover, which is wrong.
                    klass = 'pcalibre'
                    match = klass + str(names[klass] or '')
                    pstyles[css] = match
                    names[klass] += 1
                keep_classes.add(match)
                node.attrib['class'] = ' '.join(keep_classes)

        elif 'class' in node.attrib:
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles,
                              psize, item_id)
Exemple #33
0
    def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id):
        if not isinstance(node.tag, basestring) or namespace(node.tag) != XHTML_NS:
            return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style["font-size"]
        except:
            font_size = self.sbase if self.sbase is not None else self.context.source.fbase
        if "align" in node.attrib:
            if tag != "img":
                cssdict["text-align"] = node.attrib["align"]
            else:
                val = node.attrib["align"]
                if val in ("middle", "bottom", "top"):
                    cssdict["vertical-align"] = val
                elif val in ("left", "right"):
                    cssdict["float"] = val
            del node.attrib["align"]
        if node.tag == XHTML("font"):
            tags = [
                "descendant::h:%s" % x
                for x in ("p", "div", "table", "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul", "dl", "blockquote")
            ]
            tag = "div" if XPath("|".join(tags))(node) else "span"
            node.tag = XHTML(tag)
            if "size" in node.attrib:

                def force_int(raw):
                    return int(re.search(r"([0-9+-]+)", raw).group(1))

                size = node.attrib["size"].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ("+", "-"):
                        # Oh, the warcrimes
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict["font-size"] = "%.1fpt" % font_size
                del node.attrib["size"]
            if "face" in node.attrib:
                cssdict["font-family"] = node.attrib["face"]
                del node.attrib["face"]
        if "color" in node.attrib:
            try:
                cssdict["color"] = Property("color", node.attrib["color"]).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib["color"]
        if "bgcolor" in node.attrib:
            try:
                cssdict["background-color"] = Property("background-color", node.attrib["bgcolor"]).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib["bgcolor"]
        if cssdict.get("font-weight", "").lower() == "medium":
            cssdict["font-weight"] = "normal"  # ADE chokes on font-weight medium

        fsize = font_size
        is_drop_cap = (
            cssdict.get("float", None) == "left"
            and "font-size" in cssdict
            and len(node) == 0
            and node.text
            and len(node.text) == 1
        )
        is_drop_cap = is_drop_cap or (
            # The docx input plugin generates drop caps that look like this
            len(node) == 1
            and not node.text
            and len(node[0]) == 0
            and node[0].text
            and not node[0].tail
            and len(node[0].text) == 1
            and "line-height" in cssdict
            and "font-size" in cssdict
        )
        if not self.context.disable_font_rescaling and not is_drop_cap:
            _sbase = self.sbase if self.sbase is not None else self.context.source.fbase
            dyn_rescale = dynamic_rescale_factor(node)
            if dyn_rescale is not None:
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict["font-size"] = "%0.5fem" % (fsize / psize)
                psize = fsize
            elif "font-size" in cssdict or tag == "body":
                fsize = self.fmap[font_size]
                try:
                    cssdict["font-size"] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    cssdict["font-size"] = "%.1fpt" % fsize
                psize = fsize

        try:
            minlh = self.context.minimum_line_height / 100.0
            if not is_drop_cap and style["line-height"] < minlh * fsize:
                cssdict["line-height"] = str(minlh)
        except:
            self.oeb.logger.exception("Failed to set minimum line-height")

        if cssdict:
            for x in self.filter_css:
                cssdict.pop(x, None)

        if cssdict:
            if self.lineh and self.fbase and tag != "body":
                self.clean_edges(cssdict, style, psize)
            if "display" in cssdict and cssdict["display"] == "in-line":
                cssdict["display"] = "inline"
            if self.unfloat and "float" in cssdict and cssdict.get("display", "none") != "none":
                del cssdict["display"]
            if self.untable and "display" in cssdict and cssdict["display"].startswith("table"):
                display = cssdict["display"]
                if display == "table-cell":
                    cssdict["display"] = "inline"
                else:
                    cssdict["display"] = "block"
            if "vertical-align" in cssdict and cssdict["vertical-align"] == "sup":
                cssdict["vertical-align"] = "super"
        if self.lineh and "line-height" not in cssdict:
            lineh = self.lineh / psize
            cssdict["line-height"] = "%0.5fem" % lineh

        if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) and tag in ("p", "div"):
            if item_id != "calibre_jacket" or self.context.output_profile.name == "Kindle":
                for prop in ("margin", "padding", "border"):
                    for edge in ("top", "bottom"):
                        cssdict["%s-%s" % (prop, edge)] = "0pt"
            if self.context.insert_blank_line:
                cssdict["margin-top"] = cssdict["margin-bottom"] = "%fem" % self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (
                self.context.remove_paragraph_spacing
                and not keep_indents
                and cssdict.get("text-align", None) not in ("center", "right")
            ):
                cssdict["text-indent"] = "%1.1fem" % indent_size

        pseudo_classes = style.pseudo_classes(self.filter_css)
        if cssdict or pseudo_classes:
            keep_classes = set()

            if cssdict:
                items = sorted(cssdict.items())
                css = u";\n".join(u"%s: %s" % (key, val) for key, val in items)
                classes = node.get("class", "").strip() or "calibre"
                klass = ascii_text(STRIPNUM.sub("", classes.split()[0].replace("_", "")))
                if css in styles:
                    match = styles[css]
                else:
                    match = klass + str(names[klass] or "")
                    styles[css] = match
                    names[klass] += 1
                node.attrib["class"] = match
                keep_classes.add(match)

            for psel, cssdict in pseudo_classes.iteritems():
                items = sorted(cssdict.iteritems())
                css = u";\n".join(u"%s: %s" % (key, val) for key, val in items)
                pstyles = pseudo_styles[psel]
                if css in pstyles:
                    match = pstyles[css]
                else:
                    # We have to use a different class for each psel as
                    # otherwise you can have incorrect styles for a situation
                    # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                    # If the pcalibre class for a:hover and a:link is the same,
                    # then the class attribute for a.x tags will contain both
                    # that class and the class for a.x:hover, which is wrong.
                    klass = "pcalibre"
                    match = klass + str(names[klass] or "")
                    pstyles[css] = match
                    names[klass] += 1
                keep_classes.add(match)
                node.attrib["class"] = " ".join(keep_classes)

        elif "class" in node.attrib:
            del node.attrib["class"]
        if "style" in node.attrib:
            del node.attrib["style"]
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
Exemple #34
0
    def dump_text(self, elem, stylizer, page, tag_stack=[]):
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        if not isinstance(elem.tag, basestring) or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, basestring) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return [u'']

        text = [u'']
        style = stylizer.style(elem)

        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return [u'']

        tag = barename(elem.tag)
        tag_count = 0

        # Process tags that need special processing and that do not have inner
        # text. Usually these require an argument
        if tag in IMAGE_TAGS:
            if elem.attrib.get('src', None):
                if page.abshref(elem.attrib['src']) not in self.name_map.keys():
                    self.name_map[page.abshref(elem.attrib['src'])] = unique_name('%s' % len(self.name_map.keys()), self.name_map.keys())
                text.append('<IMG SRC="%s">' % self.name_map[page.abshref(elem.attrib['src'])])

        rb_tag = tag.upper() if tag in TAGS else None
        if rb_tag:
            tag_count += 1
            text.append('<%s>' % rb_tag)
            tag_stack.append(rb_tag)

        # Anchors links
        if tag in LINK_TAGS:
            href = elem.get('href')
            if href:
                href = page.abshref(href)
                if '://' not in href:
                    if '#' not in href:
                        href += '#'
                    if href not in self.link_hrefs.keys():
                        self.link_hrefs[href] = 'calibre_link-%s' % len(self.link_hrefs.keys())
                    href = self.link_hrefs[href]
                    text.append('<A HREF="#%s">' % href)
                tag_count += 1
                tag_stack.append('A')

        # Anchor ids
        id_name = elem.get('id')
        if id_name:
            text.append(self.get_anchor(page, id_name))

        # Processes style information
        for s in STYLES:
            style_tag = s[1].get(style[s[0]], None)
            if style_tag:
                style_tag = style_tag.upper()
                tag_count += 1
                text.append('<%s>' % style_tag)
                tag_stack.append(style_tag)

        # Proccess tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(prepare_string_for_xml(elem.text))

        for item in elem:
            text += self.dump_text(item, stylizer, page, tag_stack)

        close_tag_list = []
        for i in range(0, tag_count):
            close_tag_list.insert(0, tag_stack.pop())

        text += self.close_tags(close_tag_list)

        if hasattr(elem, 'tail') and elem.tail:
            text.append(prepare_string_for_xml(elem.tail))

        return text
Exemple #35
0
    def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id):
        if not isinstance(node.tag, string_or_bytes) \
           or namespace(node.tag) != XHTML_NS:
            return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style['font-size']
        except:
            font_size = self.sbase if self.sbase is not None else \
                self.context.source.fbase
        if tag == 'body' and isinstance(font_size, numbers.Number):
            stylizer.body_font_size = font_size
        if 'align' in node.attrib:
            if tag != 'img':
                cssdict['text-align'] = node.attrib['align']
                if cssdict['text-align'] == 'center':
                    # align=center causes tables to be center aligned,
                    # which text-align does not. And the ever trustworthy Word
                    # uses this construct in its HTML output. See
                    # https://bugs.launchpad.net/bugs/1569583
                    if tag == 'table':
                        if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                            cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
                    else:
                        for table in node.iterchildren(XHTML("table")):
                            ts = stylizer.style(table)
                            if ts.get('margin-left') is None and ts.get('margin-right') is None:
                                ts.set('margin-left', 'auto')
                                ts.set('margin-right', 'auto')
            else:
                val = node.attrib['align']
                if val in ('middle', 'bottom', 'top'):
                    cssdict['vertical-align'] = val
                elif val in ('left', 'right'):
                    cssdict['float'] = val
            del node.attrib['align']
        if 'valign' in node.attrib and tag == 'td':
            if cssdict.get('vertical-align') == 'inherit':
                cssdict['vertical-align'] = node.attrib['valign']
            del node.attrib['valign']
        if node.tag == XHTML('font'):
            tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
                'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
            tag = 'div' if XPath('|'.join(tags))(node) else 'span'
            node.tag = XHTML(tag)
            if 'size' in node.attrib:
                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))
                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict['font-size'] = '%.1fpt'%font_size
                del node.attrib['size']
            if 'face' in node.attrib:
                cssdict['font-family'] = node.attrib['face']
                del node.attrib['face']
        if 'color' in node.attrib:
            try:
                cssdict['color'] = Property('color', node.attrib['color']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            try:
                cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['bgcolor']
        if tag == 'ol' and 'type' in node.attrib:
            del node.attrib['type']
        if cssdict.get('font-weight', '').lower() == 'medium':
            cssdict['font-weight'] = 'normal'  # ADE chokes on font-weight medium

        fsize = font_size
        is_drop_cap = (cssdict.get('float', None) == 'left' and 'font-size' in cssdict and len(node) == 0 and node.text and (
            len(node.text) == 1 or (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
        # Detect drop caps generated by the docx input plugin
        if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
                not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
            dp = node.getparent()
            if dp.tag and dp.tag.endswith('}div') and len(dp) == 1 and not dp.text:
                if stylizer.style(dp).cssdict().get('float', None) == 'left':
                    is_drop_cap = True
        if not self.context.disable_font_rescaling and not is_drop_cap:
            _sbase = self.sbase if self.sbase is not None else \
                self.context.source.fbase
            dyn_rescale = dynamic_rescale_factor(node)
            if dyn_rescale is not None:
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict['font-size'] = '%0.5fem'%(fsize/psize)
                psize = fsize
            elif 'font-size' in cssdict or tag == 'body':
                fsize = self.fmap[font_size]
                try:
                    cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    cssdict['font-size'] = '%.1fpt'%fsize
                psize = fsize

        try:
            minlh = self.context.minimum_line_height / 100.
            if not is_drop_cap and style['line-height'] < minlh * fsize:
                cssdict['line-height'] = str(minlh)
        except:
            self.oeb.logger.exception('Failed to set minimum line-height')

        if cssdict:
            for x in self.filter_css:
                popval = cssdict.pop(x, None)
                if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                    cssdict[x] = popval

        if cssdict:
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

        if (self.context.remove_paragraph_spacing or self.context.insert_blank_line) and tag in ('p', 'div'):
            if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
                for prop in ('margin', 'padding', 'border'):
                    for edge in ('top', 'bottom'):
                        cssdict['%s-%s'%(prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents and cssdict.get('text-align', None) not in ('center', 'right')):
                cssdict['text-indent'] =  "%1.1fem" % indent_size

        pseudo_classes = style.pseudo_classes(self.filter_css)
        if cssdict or pseudo_classes:
            keep_classes = set()

            if cssdict:
                items = sorted(iteritems(cssdict))
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                classes = node.get('class', '').strip() or 'calibre'
                # lower() because otherwise if the document uses the same class
                # name with different case, both cases will apply, leading
                # to incorrect results.
                klass = ascii_text(STRIPNUM.sub('', classes.split()[0])).lower().strip().replace(' ', '_')
                if css in styles:
                    match = styles[css]
                else:
                    match = klass + str(names[klass] or '')
                    styles[css] = match
                    names[klass] += 1
                node.attrib['class'] = match
                keep_classes.add(match)

            for psel, cssdict in iteritems(pseudo_classes):
                items = sorted(iteritems(cssdict))
                css = u';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                pstyles = pseudo_styles[psel]
                if css in pstyles:
                    match = pstyles[css]
                else:
                    # We have to use a different class for each psel as
                    # otherwise you can have incorrect styles for a situation
                    # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                    # If the pcalibre class for a:hover and a:link is the same,
                    # then the class attribute for a.x tags will contain both
                    # that class and the class for a.x:hover, which is wrong.
                    klass = 'pcalibre'
                    match = klass + str(names[klass] or '')
                    pstyles[css] = match
                    names[klass] += 1
                keep_classes.add(match)
                node.attrib['class'] = ' '.join(keep_classes)

        elif 'class' in node.attrib:
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        for child in node:
            self.flatten_node(child, stylizer, names, styles, pseudo_styles, psize, item_id)
Exemple #36
0
    def flatten_node(self,
                     node,
                     stylizer,
                     names,
                     styles,
                     pseudo_styles,
                     psize,
                     item_id,
                     recurse=True):
        if not isinstance(node.tag, string_or_bytes) \
           or namespace(node.tag) != XHTML_NS:
            return
        tag = barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
            font_size = style['font-size']
        except:
            font_size = self.sbase if self.sbase is not None else \
                self.context.source.fbase
        if tag == 'body' and isinstance(font_size, numbers.Number):
            stylizer.body_font_size = font_size
        if 'align' in node.attrib:
            if tag != 'img':
                cssdict['text-align'] = node.attrib['align']
                if cssdict['text-align'] == 'center':
                    # align=center causes tables to be center aligned,
                    # which text-align does not. And the ever trustworthy Word
                    # uses this construct in its HTML output. See
                    # https://bugs.launchpad.net/bugs/1569583
                    if tag == 'table':
                        if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                            cssdict['margin-left'] = cssdict[
                                'margin-right'] = 'auto'
                    else:
                        for table in node.iterchildren(XHTML("table")):
                            ts = stylizer.style(table)
                            if ts.get('margin-left') is None and ts.get(
                                    'margin-right') is None:
                                ts.set('margin-left', 'auto')
                                ts.set('margin-right', 'auto')
            else:
                val = node.attrib['align']
                if val in ('middle', 'bottom', 'top'):
                    cssdict['vertical-align'] = val
                elif val in ('left', 'right'):
                    cssdict['float'] = val
            del node.attrib['align']
        if 'valign' in node.attrib and tag == 'td':
            if cssdict.get('vertical-align') == 'inherit':
                cssdict['vertical-align'] = node.attrib['valign']
            del node.attrib['valign']
        if node.tag == XHTML('font'):
            tags = [
                'descendant::h:%s' % x
                for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5',
                          'h6', 'ol', 'ul', 'dl', 'blockquote')
            ]
            tag = 'div' if XPath('|'.join(tags))(node) else 'span'
            node.tag = XHTML(tag)
            if 'size' in node.attrib:

                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))

                size = node.attrib['size'].strip()
                if size:
                    fnums = self.context.source.fnums
                    if size[0] in ('+', '-'):
                        # Oh, the warcrimes
                        try:
                            esize = 3 + force_int(size)
                        except:
                            esize = 3
                        if esize < 1:
                            esize = 1
                        if esize > 7:
                            esize = 7
                        font_size = fnums[esize]
                    else:
                        try:
                            font_size = fnums[force_int(size)]
                        except:
                            font_size = fnums[3]
                    cssdict['font-size'] = '%.1fpt' % font_size
                del node.attrib['size']
            if 'face' in node.attrib:
                cssdict['font-family'] = node.attrib['face']
                del node.attrib['face']
        if 'color' in node.attrib:
            try:
                cssdict['color'] = Property('color',
                                            node.attrib['color']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['color']
        if 'bgcolor' in node.attrib:
            try:
                cssdict['background-color'] = Property(
                    'background-color', node.attrib['bgcolor']).value
            except (ValueError, SyntaxErr):
                pass
            del node.attrib['bgcolor']
        if tag == 'ol' and 'type' in node.attrib:
            del node.attrib['type']
        if cssdict.get('font-weight', '').lower() == 'medium':
            cssdict[
                'font-weight'] = 'normal'  # ADE chokes on font-weight medium

        fsize = font_size
        is_drop_cap = (
            cssdict.get('float', None) == 'left' and 'font-size' in cssdict
            and len(node) == 0 and node.text and
            (len(node.text) == 1 or
             (len(node.text) == 2 and 0x2000 <= ord(node.text[0]) <= 0x206f)))
        # Detect drop caps generated by the docx input plugin
        if node.tag and node.tag.endswith('}p') and len(node) == 0 and node.text and len(node.text.strip()) == 1 and \
                not node.tail and 'line-height' in cssdict and 'font-size' in cssdict:
            dp = node.getparent()
            if dp.tag and dp.tag.endswith('}div') and len(
                    dp) == 1 and not dp.text:
                if stylizer.style(dp).cssdict().get('float', None) == 'left':
                    is_drop_cap = True
        if not self.context.disable_font_rescaling and not is_drop_cap:
            _sbase = self.sbase if self.sbase is not None else \
                self.context.source.fbase
            dyn_rescale = node.attrib.pop('data-calibre-rescale', None)
            if dyn_rescale is not None:
                try:
                    dyn_rescale = float(dyn_rescale) / 100
                except Exception:
                    dyn_rescale = 1
                fsize = self.fmap[_sbase]
                fsize *= dyn_rescale
                cssdict['font-size'] = '%0.5fem' % (fsize / psize)
                psize = fsize
            elif 'font-size' in cssdict or tag == 'body':
                fsize = self.fmap[font_size]
                try:
                    cssdict['font-size'] = "%0.5fem" % (fsize / psize)
                except ZeroDivisionError:
                    cssdict['font-size'] = '%.1fpt' % fsize
                psize = fsize

        try:
            minlh = self.context.minimum_line_height / 100.
            slh = style['line-height']
            if not is_drop_cap and isinstance(
                    slh, numbers.Number) and slh < minlh * fsize:
                cssdict['line-height'] = unicode_type(minlh)
        except Exception:
            self.oeb.logger.exception('Failed to set minimum line-height')

        if cssdict:
            for x in self.filter_css:
                popval = cssdict.pop(x, None)
                if self.body_font_family and popval and x == 'font-family' \
                    and popval.partition(',')[0][1:-1] == self.body_font_family.partition(',')[0][1:-1]:
                    cssdict[x] = popval

        if cssdict:
            if self.lineh and self.fbase and tag != 'body':
                self.clean_edges(cssdict, style, psize)
            if 'display' in cssdict and cssdict['display'] == 'in-line':
                cssdict['display'] = 'inline'
            if self.unfloat and 'float' in cssdict \
               and cssdict.get('display', 'none') != 'none':
                del cssdict['display']
            if self.untable and 'display' in cssdict \
               and cssdict['display'].startswith('table'):
                display = cssdict['display']
                if display == 'table-cell':
                    cssdict['display'] = 'inline'
                else:
                    cssdict['display'] = 'block'
            if 'vertical-align' in cssdict \
               and cssdict['vertical-align'] == 'sup':
                cssdict['vertical-align'] = 'super'
        if self.lineh and 'line-height' not in cssdict:
            lineh = self.lineh / psize
            cssdict['line-height'] = "%0.5fem" % lineh

        if (self.context.remove_paragraph_spacing
                or self.context.insert_blank_line) and tag in ('p', 'div'):
            if item_id != 'calibre_jacket' or self.context.output_profile.name == 'Kindle':
                for prop in ('margin', 'padding', 'border'):
                    for edge in ('top', 'bottom'):
                        cssdict['%s-%s' % (prop, edge)] = '0pt'
            if self.context.insert_blank_line:
                cssdict['margin-top'] = cssdict['margin-bottom'] = \
                    '%fem'%self.context.insert_blank_line_size
            indent_size = self.context.remove_paragraph_spacing_indent_size
            keep_indents = indent_size < 0.0
            if (self.context.remove_paragraph_spacing and not keep_indents
                    and cssdict.get('text-align',
                                    None) not in ('center', 'right')):
                cssdict['text-indent'] = "%1.1fem" % indent_size

        pseudo_classes = style.pseudo_classes(self.filter_css)
        if cssdict or pseudo_classes:
            keep_classes = set()

            if cssdict:
                items = sorted(iteritems(cssdict))
                css = ';\n'.join(u'%s: %s' % (key, val) for key, val in items)
                classes = node.get('class', '').strip() or 'calibre'
                classes_list = classes.split()
                # lower() because otherwise if the document uses the same class
                # name with different case, both cases will apply, leading
                # to incorrect results.
                klass = ascii_text(STRIPNUM.sub(
                    '', classes_list[0])).lower().strip().replace(' ', '_')
                if css in styles:
                    match = styles[css]
                else:
                    match = klass + unicode_type(names[klass] or '')
                    styles[css] = match
                    names[klass] += 1
                node.attrib['class'] = match
                keep_classes.add(match)

            for psel, cssdict in iteritems(pseudo_classes):
                items = sorted(iteritems(cssdict))
                css = ';\n'.join('%s: %s' % (key, val) for key, val in items)
                pstyles = pseudo_styles[psel]
                if css in pstyles:
                    match = pstyles[css]
                else:
                    # We have to use a different class for each psel as
                    # otherwise you can have incorrect styles for a situation
                    # like: a:hover { color: red } a:link { color: blue } a.x:hover { color: green }
                    # If the pcalibre class for a:hover and a:link is the same,
                    # then the class attribute for a.x tags will contain both
                    # that class and the class for a.x:hover, which is wrong.
                    klass = 'pcalibre'
                    match = klass + unicode_type(names[klass] or '')
                    pstyles[css] = match
                    names[klass] += 1
                keep_classes.add(match)
                node.attrib['class'] = ' '.join(keep_classes)

        elif 'class' in node.attrib:
            del node.attrib['class']
        if 'style' in node.attrib:
            del node.attrib['style']
        if recurse:
            for child in node:
                self.flatten_node(child, stylizer, names, styles,
                                  pseudo_styles, psize, item_id)