Example #1
0
 def postprocess_book(self, oeb, opts, log):
     from ebook_converter.ebooks.oeb.base import XHTML, barename
     for item in oeb.spine:
         if hasattr(item.data, 'xpath'):
             for heading in item.data.iterdescendants(
                     *map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
                 if not len(heading):
                     continue
                 span = heading[0]
                 if not heading.text and not span.text and not len(
                         span) and barename(span.tag) == 'span':
                     if not heading.get('id') and span.get('id'):
                         heading.set('id', span.get('id'))
                         heading.text = span.tail
                         heading.remove(span)
                 if len(heading) == 1 and heading[0].get(
                         'style') == 'text-align: center; margin: auto;':
                     div = heading[0]
                     if barename(div.tag) == 'div' and not len(
                             div) and not div.get('id') and not heading.get(
                                 'style'):
                         heading.text = (heading.text or '') + (
                             div.text or '') + (div.tail or '')
                         heading.remove(div)
                         heading.set('style', 'text-align: center')
Example #2
0
    def dump_text(self, elem, stylizer):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, (str, bytes)) \
           or namespace(elem.tag) != const.XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == const.XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        # Soft scene breaks.
        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
            ems = min(int(round(float(style.marginTop) / style.fontSize) - 1),
                      self.MAX_EM)
            if ems >= 1:
                text.append(u'\n\n\xa0' * ems)

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
            if tag == 'div':
                tag = 'p'
            text.append(self.build_block(tag, style, attribs, stylizer))
            text.append('. ')
            tags.append('\n')

        if style['font-style'] == 'italic' or tag in ('i', 'em'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
                if self.style_italic == False:  # noqa
                    if self.in_a_link:
                        text.append('_')
                        tags.append('_')
                    else:
                        text.append('[_')
                        tags.append('_]')
                    self.style_embed.append('_')
                    self.style_italic = True
        if style['font-weight'] in ('bold', 'bolder') or tag in ('b',
                                                                 'strong'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
                if self.style_bold == False:  # noqa
                    if self.in_a_link:
                        text.append('*')
                        tags.append('*')
                    else:
                        text.append('[*')
                        tags.append('*]')
                    self.style_embed.append('*')
                    self.style_bold = True
        if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
            if tag != 'a':
                if self.style_under == False:  # noqa
                    text.append('[+')
                    tags.append('+]')
                    self.style_embed.append('+')
                    self.style_under = True
        if style['text-decoration'] == 'line-through' or tag in ('strike',
                                                                 'del', 's'):
            if self.style_strike == False:  # noqa
                text.append('[-')
                tags.append('-]')
                self.style_embed.append('-')
                self.style_strike = True
        if tag == 'br':
            for i in reversed(self.style_embed):
                text.append(i)
            text.append('\n')
            for i in self.style_embed:
                text.append(i)
            tags.append('')
            self.remove_space_after_newline = True
        if tag == 'blockquote':
            text.append('\nbq. ')
            tags.append('\n')
        elif tag in ('abbr', 'acronym'):
            text.append('')
            txt = attribs['title']
            tags.append('(' + txt + ')')
        elif tag == 'sup':
            text.append('^')
            tags.append('^')
        elif tag == 'sub':
            text.append('~')
            tags.append('~')
        elif tag == 'code':
            if self.in_pre:
                text.append('\nbc. ')
                tags.append('')
            else:
                text.append('@')
                tags.append('@')
        elif tag == 'cite':
            text.append('??')
            tags.append('??')
        elif tag == 'hr':
            text.append('\n***')
            tags.append('\n')
        elif tag == 'pre':
            self.in_pre = True
            text.append('\npre. ')
            tags.append('pre\n')
        elif tag == 'a':
            if self.opts.keep_links:
                if 'href' in attribs:
                    text.append('"')
                    tags.append('a')
                    tags.append('":' + attribs['href'])
                    self.our_links.append(attribs['href'])
                    if 'title' in attribs:
                        tags.append('(' + attribs['title'] + ')')
                    self.in_a_link = True
                else:
                    text.append('%')
                    tags.append('%')
        elif tag == 'img':
            if self.opts.keep_image_references:
                txt = '!' + self.check_halign(style)
                txt += self.check_valign(style)
                txt += attribs['src']
                text.append(txt)
                if 'alt' in attribs:
                    txt = attribs['alt']
                    if txt != '':
                        text.append('(' + txt + ')')
                tags.append('!')
        elif tag in ('ol', 'ul'):
            self.list.append({'name': tag, 'num': 0})
            text.append('')
            tags.append(tag)
        elif tag == 'li':
            if self.list:
                li = self.list[-1]
            else:
                li = {'name': 'ul', 'num': 0}
            text.append('\n')
            if li['name'] == 'ul':
                text.append('*' * len(self.list) + ' ')
            elif li['name'] == 'ol':
                text.append('#' * len(self.list) + ' ')
            tags.append('')
        elif tag == 'dl':
            text.append('\n')
            tags.append('')
        elif tag == 'dt':
            text.append('')
            tags.append('\n')
        elif tag == 'dd':
            text.append('    ')
            tags.append('')
        elif tag == 'dd':
            text.append('')
            tags.append('\n')
        elif tag == 'table':
            txt = self.build_block(tag, style, attribs, stylizer)
            txt += '. \n'
            if txt != '\ntable. \n':
                text.append(txt)
            else:
                text.append('\n')
            tags.append('')
        elif tag == 'tr':
            txt = self.build_block('', style, attribs, stylizer)
            txt += '. '
            if txt != '\n. ':
                txt = re.sub('\n', '', txt)
                text.append(txt)
            tags.append('|\n')
        elif tag == 'td':
            text.append('|')
            txt = ''
            txt += self.check_halign(style)
            txt += self.check_valign(style)
            if 'colspan' in attribs:
                txt += '\\' + attribs['colspan']
            if 'rowspan' in attribs:
                txt += '/' + attribs['rowspan']
            txt += self.check_styles(style)
            if txt != '':
                text.append(txt + '. ')
            tags.append('')
        elif tag == 'th':
            text.append('|_. ')
            tags.append('')
        elif tag == 'span':
            if style['font-variant'] == 'small-caps':
                if self.style_smallcap == False:  # noqa
                    text.append('&')
                    tags.append('&')
                    self.style_smallcap = True
            else:
                if self.in_a_link == False:  # noqa
                    txt = '%'
                    if self.opts.keep_links:
                        txt += self.check_id_tag(attribs)
                        txt += self.check_styles(style)
                    if txt != '%':
                        text.append(txt)
                        tags.append('%')

        if self.opts.keep_links and 'id' in attribs:
            if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                           'p', 'span', 'table'):
                text.append(self.check_id_tag(attribs))

        # Process the styles for any that we want to keep
        if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p',
                       'hr', 'a', 'img', 'span', 'table', 'tr', 'td'):
            if not self.in_a_link:
                text.append(self.check_styles(style))

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
            if not self.in_pre:
                txt = self.prepare_string_for_textile(
                    self.remove_newlines(txt))
            text.append(txt)
            self.id_no_text = u''

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t in ('pre', 'ul', 'ol', 'li', 'table'):
                if t == 'pre':
                    self.in_pre = False
                elif t in ('ul', 'ol'):
                    if self.list:
                        self.list.pop()
                    if not self.list:
                        text.append('\n')
            else:
                if t == 'a':
                    self.in_a_link = False
                    t = ''
                text.append(self.id_no_text)
                self.id_no_text = u''
                if t in ('*]', '*'):
                    self.style_bold = False
                elif t in ('_]', '_'):
                    self.style_italic = False
                elif t == '+]':
                    self.style_under = False
                elif t == '-]':
                    self.style_strike = False
                elif t == '&':
                    self.style_smallcap = False
                if t in ('*]', '_]', '+]', '-]', '*', '_'):
                    txt = self.style_embed.pop()
                text.append('%s' % t)

        # Soft scene breaks.
        if 'margin-bottom' in style.cssdict(
        ) and style['margin-bottom'] != 'auto':
            ems = min(
                int(round((float(style.marginBottom) / style.fontSize) - 1)),
                self.MAX_EM)
            if ems >= 1:
                text.append(u'\n\n\xa0' * ems)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            tail = elem.tail
            if not self.in_pre:
                tail = self.prepare_string_for_textile(
                    self.remove_newlines(tail))
            text.append(tail)

        return text
Example #3
0
    def dump_text(self, elem, stylizer):
        '''
        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        '''

        # We can only processes tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, (str, bytes)) \
           or namespace(elem.tag) != const.XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == const.XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        # Setup our variables.
        text = []
        style = stylizer.style(elem)
        tags = []
        tag = barename(elem.tag)
        attribs = elem.attrib

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        # Soft scene breaks.
        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
            ems = int(round(float(style.marginTop) / style.fontSize) - 1)
            if ems >= 1:
                text.append(u'\n\n' * ems)

        bq = '> ' * self.blockquotes
        # Block level elements
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
            h_tag = ''
            if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
                h_tag = '#' * int(tag[1]) + ' '
            text.append('\n' + bq + h_tag)
            tags.append('\n')
            self.remove_space_after_newline = True

        if style['font-style'] == 'italic' or tag in ('i', 'em'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
                if self.style_italic == False:  # noqa
                    text.append('*')
                    tags.append('*')
                    self.style_italic = True
        if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
                if self.style_bold == False:  # noqa
                    text.append('**')
                    tags.append('**')
                    self.style_bold = True
        if tag == 'br':
            text.append('  \n')
            self.remove_space_after_newline = True
        if tag == 'blockquote':
            self.blockquotes += 1
            tags.append('>')
            text.append('> ' * self.blockquotes)
        elif tag == 'code':
            if not self.in_pre and not self.in_code:
                text.append('`')
                tags.append('`')
                self.in_code = True
        elif tag == 'pre':
            if not self.in_pre:
                text.append('\n')
                tags.append('pre')
                self.in_pre = True
        elif tag == 'hr':
            text.append('\n* * *')
            tags.append('\n')
        elif tag == 'a':
            # Only write links with absolute (external) urls.
            if self.opts.keep_links and 'href' in attribs and '://' in attribs['href']:
                title = ''
                if 'title' in attribs:
                    title = ' "' + attribs['title'] + '"'
                    remove_space = self.remove_space_after_newline
                    title = self.remove_newlines(title)
                    self.remove_space_after_newline = remove_space
                text.append('[')
                tags.append('](' + attribs['href'] + title + ')')
        elif tag == 'img':
            if self.opts.keep_image_references:
                txt = '!'
                if 'alt' in attribs:
                    remove_space = self.remove_space_after_newline
                    txt += '[' + self.remove_newlines(attribs['alt']) + ']'
                    self.remove_space_after_newline = remove_space
                txt += '(' + attribs['src'] + ')'
                text.append(txt)
        elif tag in ('ol', 'ul'):
            tags.append(tag)
            # Add the list to our lists of lists so we can track
            # nested lists.
            self.list.append({'name': tag, 'num': 0})
        elif tag == 'li':
            # Get the last list from our list of lists
            if self.list:
                li = self.list[-1]
            else:
                li = {'name': 'ul', 'num': 0}
            # Add a new line to start the item
            text.append('\n')
            # Add indent if we have nested lists.
            list_count = len(self.list)
            # We only care about indenting nested lists.
            if (list_count - 1) > 0:
                text.append('\t' * (list_count - 1))
            # Add blockquote if we have a blockquote in a list item.
            text.append(bq)
            # Write the proper sign for ordered and unorded lists.
            if li['name'] == 'ul':
                text.append('+ ')
            elif li['name'] == 'ol':
                li['num'] += 1
                text.append(str(li['num']) + '. ')

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            txt = elem.text
            if self.in_pre:
                txt = self.prepare_string_for_pre(txt)
            elif self.in_code:
                txt = self.remove_newlines(txt)
            else:
                txt = self.prepare_string_for_markdown(self.remove_newlines(txt))
            text.append(txt)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer)

        # Close all open tags.
        tags.reverse()
        for t in tags:
            if t in ('pre', 'ul', 'ol', '>'):
                if t == 'pre':
                    self.in_pre = False
                    text.append('\n')
                elif t == '>':
                    self.blockquotes -= 1
                elif t in ('ul', 'ol'):
                    if self.list:
                        self.list.pop()
                    text.append('\n')
            else:
                if t == '**':
                    self.style_bold = False
                elif t == '*':
                    self.style_italic = False
                elif t == '`':
                    self.in_code = False
                text.append('%s' % t)

        # Soft scene breaks.
        if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
            ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
            if ems >= 1:
                text.append(u'\n\n' * ems)

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            tail = elem.tail
            if self.in_pre:
                tail = self.prepare_string_for_pre(tail)
            elif self.in_code:
                tail = self.remove_newlines(tail)
            else:
                tail = self.prepare_string_for_markdown(self.remove_newlines(tail))
            text.append(tail)

        return text