def postprocess_book(self, oeb, opts, log):
    """Tidy up heading markup in every parsed spine document of ``oeb``.

    Each ``h1``..``h6`` element that has at least one child gets two
    independent clean-ups:

    * If the heading has no text of its own and its first child is an
      empty ``span``, and the heading lacks an ``id`` while the span has
      one, the id is moved onto the heading, the span's tail becomes the
      heading text and the span is removed.
    * If the heading's only child is a childless, id-less ``div`` styled
      exactly ``text-align: center; margin: auto;`` and the heading has no
      style of its own, the div's text and tail are folded into the
      heading, the div is removed and the heading is centred instead.

    ``opts`` and ``log`` are accepted for interface compatibility and are
    not used here. Spine items whose ``data`` has no ``xpath`` attribute
    are skipped.
    """
    from ebook_converter.ebooks.oeb.base import XHTML, barename

    centered_wrapper_style = 'text-align: center; margin: auto;'

    for item in oeb.spine:
        root = item.data
        if not hasattr(root, 'xpath'):
            continue

        heading_tags = [XHTML(name) for name in 'h1 h2 h3 h4 h5 h6'.split()]
        for heading in root.iterdescendants(*heading_tags):
            if len(heading) == 0:
                continue

            # Step 1: promote the id of an empty leading <span>.
            first = heading[0]
            leading_empty_span = (
                not heading.text
                and not first.text
                and len(first) == 0
                and barename(first.tag) == 'span'
            )
            if leading_empty_span:
                span_id = first.get('id')
                if not heading.get('id') and span_id:
                    heading.set('id', span_id)
                    # Save the tail before removing the span so the text
                    # that followed it is kept as the heading text.
                    heading.text = first.tail
                    heading.remove(first)

            # Step 2: unwrap a lone centring <div>. The child count is
            # re-checked because step 1 may have removed a child.
            if len(heading) != 1:
                continue
            wrapper = heading[0]
            if wrapper.get('style') != centered_wrapper_style:
                continue
            if (barename(wrapper.tag) == 'div'
                    and len(wrapper) == 0
                    and not wrapper.get('id')
                    and not heading.get('style')):
                pieces = (heading.text, wrapper.text, wrapper.tail)
                heading.text = ''.join(piece or '' for piece in pieces)
                heading.remove(wrapper)
                heading.set('style', 'text-align: center')
def dump_text(self, elem, stylizer):
    '''
    Render elem, its children and its tail as a list of Textile fragments.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.

    Returns a list of strings. Opening markup is appended to ``text``
    while the matching closing markers are pushed onto ``tags``; after the
    children have been rendered recursively the markers are emitted in
    reverse order. Converter state on ``self`` (``in_pre``, ``in_a_link``,
    ``list``, ``style_embed``, the ``style_*`` flags, ``id_no_text``,
    ``our_links`` and ``remove_space_after_newline``) is updated along the
    way.
    '''
    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, (str, bytes)) \
            or namespace(elem.tag) != const.XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, (str, bytes)) \
                and namespace(p.tag) == const.XHTML_NS and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    text = ['']
    style = stylizer.style(elem)
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib
    headings = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    # Soft scene breaks.
    if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
        ems = min(int(round(float(style.marginTop) / style.fontSize) - 1),
                  self.MAX_EM)
        if ems >= 1:
            text.append(u'\n\n\xa0' * ems)

    # Block level elements. A div is written as a paragraph, and is
    # treated as 'p' by every check further down.
    if tag in headings + ('p', 'div'):
        if tag == 'div':
            tag = 'p'
        text.append(self.build_block(tag, style, attribs, stylizer))
        text.append('. ')
        tags.append('\n')

    # Inline emphasis. Each style is opened only once, however deeply it
    # is nested; style_embed remembers the open ones so that <br> can
    # close and re-open them around the line break.
    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in headings + ('cite',):
            if not self.style_italic:
                if self.in_a_link:
                    text.append('_')
                    tags.append('_')
                else:
                    text.append('[_')
                    tags.append('_]')
                self.style_embed.append('_')
                self.style_italic = True
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in headings + ('th',):
            if not self.style_bold:
                if self.in_a_link:
                    text.append('*')
                    tags.append('*')
                else:
                    text.append('[*')
                    tags.append('*]')
                self.style_embed.append('*')
                self.style_bold = True
    if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
        if tag != 'a':
            if not self.style_under:
                text.append('[+')
                tags.append('+]')
                self.style_embed.append('+')
                self.style_under = True
    if style['text-decoration'] == 'line-through' \
            or tag in ('strike', 'del', 's'):
        if not self.style_strike:
            text.append('[-')
            tags.append('-]')
            self.style_embed.append('-')
            self.style_strike = True

    if tag == 'br':
        # Close the open inline styles, break the line, then re-open them.
        for marker in reversed(self.style_embed):
            text.append(marker)
        text.append('\n')
        for marker in self.style_embed:
            text.append(marker)
        tags.append('')
        self.remove_space_after_newline = True

    if tag == 'blockquote':
        text.append('\nbq. ')
        tags.append('\n')
    elif tag in ('abbr', 'acronym'):
        text.append('')
        # The title attribute is optional; without this check an
        # abbreviation lacking one raised KeyError.
        if 'title' in attribs:
            tags.append('(' + attribs['title'] + ')')
        else:
            tags.append('')
    elif tag == 'sup':
        text.append('^')
        tags.append('^')
    elif tag == 'sub':
        text.append('~')
        tags.append('~')
    elif tag == 'code':
        if self.in_pre:
            text.append('\nbc. ')
            tags.append('')
        else:
            text.append('@')
            tags.append('@')
    elif tag == 'cite':
        text.append('??')
        tags.append('??')
    elif tag == 'hr':
        text.append('\n***')
        tags.append('\n')
    elif tag == 'pre':
        self.in_pre = True
        text.append('\npre. ')
        # The marker has to be exactly 'pre' so the closing loop below
        # recognises it and leaves preformatted mode again.
        tags.append('pre')
    elif tag == 'a':
        if self.opts.keep_links:
            if 'href' in attribs:
                text.append('"')
                # Pushed in this order so that, once reversed, the output
                # reads: (title) ":href and finally the 'a' sentinel.
                tags.append('a')
                tags.append('":' + attribs['href'])
                self.our_links.append(attribs['href'])
                if 'title' in attribs:
                    tags.append('(' + attribs['title'] + ')')
                self.in_a_link = True
            else:
                text.append('%')
                tags.append('%')
    elif tag == 'img':
        # An image without a src cannot be referenced, so it is skipped
        # rather than raising KeyError.
        if self.opts.keep_image_references and 'src' in attribs:
            txt = '!' + self.check_halign(style)
            txt += self.check_valign(style)
            txt += attribs['src']
            text.append(txt)
            if 'alt' in attribs:
                txt = attribs['alt']
                if txt != '':
                    text.append('(' + txt + ')')
            tags.append('!')
    elif tag in ('ol', 'ul'):
        # Track the list so nested items know their depth.
        self.list.append({'name': tag, 'num': 0})
        text.append('')
        tags.append(tag)
    elif tag == 'li':
        if self.list:
            li = self.list[-1]
        else:
            li = {'name': 'ul', 'num': 0}
        text.append('\n')
        if li['name'] == 'ul':
            text.append('*' * len(self.list) + ' ')
        elif li['name'] == 'ol':
            text.append('#' * len(self.list) + ' ')
        tags.append('')
    elif tag == 'dl':
        text.append('\n')
        tags.append('')
    elif tag == 'dt':
        text.append('')
        tags.append('\n')
    elif tag == 'dd':
        text.append(' ')
        tags.append('')
    elif tag == 'table':
        txt = self.build_block(tag, style, attribs, stylizer)
        txt += '. \n'
        if txt != '\ntable. \n':
            text.append(txt)
        else:
            text.append('\n')
        tags.append('')
    elif tag == 'tr':
        txt = self.build_block('', style, attribs, stylizer)
        txt += '. '
        if txt != '\n. ':
            txt = re.sub('\n', '', txt)
            text.append(txt)
        tags.append('|\n')
    elif tag == 'td':
        text.append('|')
        txt = ''
        txt += self.check_halign(style)
        txt += self.check_valign(style)
        if 'colspan' in attribs:
            txt += '\\' + attribs['colspan']
        if 'rowspan' in attribs:
            txt += '/' + attribs['rowspan']
        txt += self.check_styles(style)
        if txt != '':
            text.append(txt + '. ')
        tags.append('')
    elif tag == 'th':
        text.append('|_. ')
        tags.append('')
    elif tag == 'span':
        if style['font-variant'] == 'small-caps':
            if not self.style_smallcap:
                text.append('&')
                tags.append('&')
                self.style_smallcap = True
        else:
            if not self.in_a_link:
                txt = '%'
                if self.opts.keep_links:
                    txt += self.check_id_tag(attribs)
                    txt += self.check_styles(style)
                # Only emit the span when it carries an id or a style.
                if txt != '%':
                    text.append(txt)
                    tags.append('%')

    if self.opts.keep_links and 'id' in attribs:
        if tag not in ('body', 'div') + headings + ('p', 'span', 'table'):
            text.append(self.check_id_tag(attribs))

    # Process the styles for any that we want to keep
    if tag not in ('body', 'div') + headings + (
            'p', 'hr', 'a', 'img', 'span', 'table', 'tr', 'td'):
        if not self.in_a_link:
            text.append(self.check_styles(style))

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if not self.in_pre:
            txt = self.prepare_string_for_textile(
                self.remove_newlines(txt))
        text.append(txt)
        self.id_no_text = u''

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if t == 'pre':
            self.in_pre = False
            text.append('\n')
        elif t in ('ul', 'ol'):
            if self.list:
                self.list.pop()
            # Only the outermost list is followed by a blank line.
            if not self.list:
                text.append('\n')
        elif t in ('li', 'table'):
            # Structural sentinels that produce no output of their own.
            continue
        else:
            if t == 'a':
                self.in_a_link = False
                t = ''
            text.append(self.id_no_text)
            self.id_no_text = u''
            if t in ('*]', '*'):
                self.style_bold = False
            elif t in ('_]', '_'):
                self.style_italic = False
            elif t == '+]':
                self.style_under = False
            elif t == '-]':
                self.style_strike = False
            elif t == '&':
                self.style_smallcap = False
            if t in ('*]', '_]', '+]', '-]', '*', '_'):
                self.style_embed.pop()
            text.append(t)

    # Soft scene breaks.
    if 'margin-bottom' in style.cssdict() \
            and style['margin-bottom'] != 'auto':
        ems = min(
            int(round((float(style.marginBottom) / style.fontSize) - 1)),
            self.MAX_EM)
        if ems >= 1:
            text.append(u'\n\n\xa0' * ems)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if not self.in_pre:
            tail = self.prepare_string_for_textile(
                self.remove_newlines(tail))
        text.append(tail)

    return text
def dump_text(self, elem, stylizer):
    '''
    Render elem, its children and its tail as a list of Markdown fragments.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.

    Returns a list of strings. Opening markup is appended to ``text``
    while the matching closing markers are pushed onto ``tags``; after the
    children have been rendered recursively the markers are emitted in
    reverse order. Converter state on ``self`` (``in_pre``, ``in_code``,
    ``blockquotes``, ``list``, ``style_bold``, ``style_italic`` and
    ``remove_space_after_newline``) is updated along the way.
    '''
    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, (str, bytes)) \
            or namespace(elem.tag) != const.XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, (str, bytes)) \
                and namespace(p.tag) == const.XHTML_NS and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    text = []
    style = stylizer.style(elem)
    tags = []
    tag = barename(elem.tag)
    attribs = elem.attrib
    headings = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    # Soft scene breaks.
    if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
        ems = int(round(float(style.marginTop) / style.fontSize) - 1)
        if ems >= 1:
            text.append(u'\n\n' * ems)

    # Prefix that keeps lines inside the currently open blockquotes.
    bq = '> ' * self.blockquotes

    # Block level elements
    if tag in headings + ('p', 'div'):
        h_tag = ''
        if tag in headings:
            # 'h3' -> '### '
            h_tag = '#' * int(tag[1]) + ' '
        text.append('\n' + bq + h_tag)
        tags.append('\n')
        self.remove_space_after_newline = True

    # Inline emphasis; each style is opened only once however deeply it
    # is nested.
    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in headings + ('cite',):
            if not self.style_italic:
                text.append('*')
                tags.append('*')
                self.style_italic = True
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in headings + ('th',):
            if not self.style_bold:
                text.append('**')
                tags.append('**')
                self.style_bold = True

    if tag == 'br':
        # A Markdown hard line break needs at least two trailing spaces
        # before the newline; a single space is rendered as a plain space.
        text.append('  \n')
        self.remove_space_after_newline = True

    if tag == 'blockquote':
        self.blockquotes += 1
        tags.append('>')
        text.append('> ' * self.blockquotes)
    elif tag == 'code':
        if not self.in_pre and not self.in_code:
            text.append('`')
            tags.append('`')
            self.in_code = True
    elif tag == 'pre':
        if not self.in_pre:
            text.append('\n')
            tags.append('pre')
            self.in_pre = True
    elif tag == 'hr':
        text.append('\n* * *')
        tags.append('\n')
    elif tag == 'a':
        # Only write links with absolute (external) urls.
        if self.opts.keep_links and 'href' in attribs \
                and '://' in attribs['href']:
            title = ''
            if 'title' in attribs:
                title = ' "' + attribs['title'] + '"'
                # The flag is saved and restored around remove_newlines
                # so that cleaning the title leaves it unchanged.
                remove_space = self.remove_space_after_newline
                title = self.remove_newlines(title)
                self.remove_space_after_newline = remove_space
            text.append('[')
            tags.append('](' + attribs['href'] + title + ')')
    elif tag == 'img':
        # An image without a src cannot be referenced, so it is skipped
        # rather than raising KeyError.
        if self.opts.keep_image_references and 'src' in attribs:
            txt = '!'
            if 'alt' in attribs:
                remove_space = self.remove_space_after_newline
                txt += '[' + self.remove_newlines(attribs['alt']) + ']'
                self.remove_space_after_newline = remove_space
            txt += '(' + attribs['src'] + ')'
            text.append(txt)
    elif tag in ('ol', 'ul'):
        tags.append(tag)
        # Add the list to our lists of lists so we can track
        # nested lists.
        self.list.append({'name': tag, 'num': 0})
    elif tag == 'li':
        # Get the last list from our list of lists
        if self.list:
            li = self.list[-1]
        else:
            li = {'name': 'ul', 'num': 0}
        # Add a new line to start the item
        text.append('\n')
        # Add indent if we have nested lists; only levels below the
        # outermost one are indented.
        list_count = len(self.list)
        if (list_count - 1) > 0:
            text.append('\t' * (list_count - 1))
        # Add blockquote if we have a blockquote in a list item.
        text.append(bq)
        # Write the proper sign for ordered and unordered lists.
        if li['name'] == 'ul':
            text.append('+ ')
        elif li['name'] == 'ol':
            li['num'] += 1
            text.append(str(li['num']) + '. ')

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if self.in_pre:
            txt = self.prepare_string_for_pre(txt)
        elif self.in_code:
            txt = self.remove_newlines(txt)
        else:
            txt = self.prepare_string_for_markdown(
                self.remove_newlines(txt))
        text.append(txt)

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if t == 'pre':
            self.in_pre = False
            text.append('\n')
        elif t == '>':
            self.blockquotes -= 1
        elif t in ('ul', 'ol'):
            if self.list:
                self.list.pop()
            text.append('\n')
        else:
            if t == '**':
                self.style_bold = False
            elif t == '*':
                self.style_italic = False
            elif t == '`':
                self.in_code = False
            text.append(t)

    # Soft scene breaks.
    if 'margin-bottom' in style.cssdict() \
            and style['margin-bottom'] != 'auto':
        ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
        if ems >= 1:
            text.append(u'\n\n' * ems)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if self.in_pre:
            tail = self.prepare_string_for_pre(tail)
        elif self.in_code:
            tail = self.remove_newlines(tail)
        else:
            tail = self.prepare_string_for_markdown(
                self.remove_newlines(tail))
        text.append(tail)

    return text