def get_text(self):
    """Render every spine item as FictionBook2 body markup.

    Returns one string: ``<body>`` wrapping the fragments produced by
    ``self.dump_text`` for each spine document. ``<section>`` elements are
    opened according to ``self.opts.sectionize`` and ``self.toc``, and
    ``self.section_level`` is adjusted for every section opened or closed
    here.
    """
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer

    parts = ['<body>']

    # With sectionize == 'nothing', a single section wraps everything.
    if self.opts.sectionize == 'nothing':
        parts.append('<section>')
        self.section_level += 1

    for spine_item in self.oeb_book.spine:
        self.log.debug('Converting %s to FictionBook2 XML' % spine_item.href)
        styles = Stylizer(spine_item.data, spine_item.href, self.oeb_book,
                          self.opts, self.opts.output_profile)

        # A per-page section is needed when sectionizing by file, or when
        # the TOC mapping holds a None entry for this page.
        wrap_page = (self.opts.sectionize == 'files'
                     or None in self.toc.get(spine_item.href, ()))
        if wrap_page:
            parts.append('<section>')
            self.section_level += 1

        page_body = spine_item.data.find(XHTML('body'))
        parts.extend(self.dump_text(page_body, styles, spine_item))

        if wrap_page:
            parts.append('</section>')
            self.section_level -= 1

    # Balance whatever sections are still open.
    while self.section_level > 0:
        parts.append('</section>')
        self.section_level -= 1

    return ''.join(parts) + '</body>'
def stylize_spine(self):
    """Normalise each item's <body> style and build a Stylizer for it.

    For every entry in ``self.items`` the inline style of ``<html>`` (if any)
    is moved onto ``<body>``, a fixed set of margin/padding declarations plus
    a few option-driven ones are appended, and a ``Stylizer`` is created and
    stored in ``self.stylizers`` keyed by the item.
    """
    self.stylizers = {}
    profile = self.context.source
    css = ''
    for item in self.items:
        root = item.data
        body = root.find(XHTML('body'))

        # Fold a style attribute on <html> into <body>, ahead of body's own.
        if 'style' in root.attrib:
            own_style = body.attrib.get('style', '')
            body.set('style', root.get('style') + ';' + own_style)
            del root.attrib['style']

        decls = body.get('style', '').split(';')
        decls.append('margin-top: 0pt')
        decls.append('margin-bottom: 0pt')

        # Negative margins mean "leave the margin alone".
        left = float(self.context.margin_left)
        if left >= 0:
            decls.append('margin-left : %gpt' % left)
        right = float(self.context.margin_right)
        if right >= 0:
            decls.append('margin-right : %gpt' % right)

        decls.append('padding-left: 0pt')
        decls.append('padding-right: 0pt')
        if self.page_break_on_body:
            decls.append('page-break-before: always')
        justification = self.context.change_justification
        if justification != 'original':
            decls.append('text-align: ' + justification)
        if self.body_font_family:
            decls.append('font-family: ' + self.body_font_family)
        body.set('style', '; '.join(decls))

        self.stylizers[item] = Stylizer(
            root, item.href, self.oeb, self.context, profile,
            user_css=self.context.extra_css, extra_css=css)
def stylizer(self, item):
    """Return the Stylizer for *item*, creating and caching it on first use.

    A cached value of ``None`` is treated the same as a missing entry.
    """
    cached = self.stylizer_cache.get(item, None)
    if cached is not None:
        return cached
    created = Stylizer(item.data, item.href, self.oeb, self.opts,
                       self.profile, base_css=self.base_css)
    self.stylizer_cache[item] = created
    return created
def mlize(self):
    """Convert ``self.item`` into a dict of ``snbc`` element trees.

    One tree (``<snbc><head><title/></head><body/></snbc>``) is created per
    ``(subitem, subtitle)`` pair in ``self.subitems``. The text returned by
    ``self.dump_text`` is then walked line by line: lines starting with the
    bookmark tag switch the tree that receives output, lines starting with
    the image tag become ``<img>`` elements, lines containing the "pre" tag
    are stored verbatim, and every other non-empty line becomes ``<text>``.

    Returns the mapping of subitem -> tree.
    """
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.utils.xml_parse import safe_xml_fromstring

    chunks = [u'']
    styles = Stylizer(self.item.data, self.item.href, self.oeb_book,
                      self.opts, self.opts.output_profile)
    body_markup = etree.tostring(self.item.data.find(XHTML('body')),
                                 encoding='unicode')

    # Build the empty skeleton tree for each sub-item.
    trees = {}
    for key, title in self.subitems:
        root = etree.Element("snbc")
        head = etree.SubElement(root, "head")
        etree.SubElement(head, "title").text = title
        if self.opts and self.opts.snb_hide_chapter_name:
            etree.SubElement(head, "hidetitle").text = "true"
        etree.SubElement(root, "body")
        trees[key] = root

    chunks.append('%s%s\n\n' % (CALIBRE_SNB_BM_TAG, ""))
    chunks += self.dump_text(self.subitems, safe_xml_fromstring(body_markup),
                             styles)[0]
    flat_text = self.cleanup_text(''.join(chunks))

    # Output starts in the tree registered under the empty key.
    current = ''
    target = trees[current].find(".//body")
    for raw in flat_text.splitlines():
        pre_at = raw.find(CALIBRE_SNB_PRE_TAG)
        if pre_at != -1:
            # Preformatted line: keep everything after the marker untouched.
            etree.SubElement(target, "text").text = \
                etree.CDATA(raw[pre_at + len(CALIBRE_SNB_PRE_TAG):])
            continue

        line = raw.strip(' \t\n\r\u3000')
        if not line:
            continue

        if line.startswith(CALIBRE_SNB_IMG_TAG):
            image_name = line[len(CALIBRE_SNB_IMG_TAG):]
            prefix = ProcessFileName(os.path.dirname(self.item.href))
            if prefix != '':
                image_name = prefix + '_' + image_name
            etree.SubElement(target, "img").text = image_name
        elif line.startswith(CALIBRE_SNB_BM_TAG):
            current = line[len(CALIBRE_SNB_BM_TAG):]
            target = trees[current].find(".//body")
        else:
            indent = ''
            if self.opts and not self.opts.snb_dont_indent_first_line:
                indent = '\u3000\u3000'
            etree.SubElement(target, "text").text = \
                etree.CDATA(unicode_type(indent + line))
            if self.opts and self.opts.snb_insert_empty_line:
                etree.SubElement(target, "text").text = etree.CDATA('')
    return trees
def mlize_spine(self, oeb_book):
    """Concatenate the ``dump_text`` output of every spine item.

    Ids and links in each document are rewritten first; each document's
    output is followed by a blank line. Returns the joined string.
    """
    pieces = ['']
    for page in oeb_book.spine:
        self.log.debug('Converting %s to Markdown formatted TXT...' % page.href)
        self.rewrite_ids(page.data, page)
        link_rewriter = partial(self.rewrite_link, page=page)
        rewrite_links(page.data, link_rewriter)
        styles = Stylizer(page.data, page.href, oeb_book, self.opts,
                          self.opts.output_profile)
        pieces.extend(self.dump_text(page.data.find(XHTML('body')), styles))
        pieces.append('\n\n')
    return ''.join(pieces)
def mangle_spine(self):
    """Add the case-mangling stylesheet to the book and process each page.

    The stylesheet is registered in the manifest once; every spine document
    then gets a ``<link>`` to it in its ``<head>`` and has its ``<body>``
    passed to ``self.mangle_elem`` together with a fresh Stylizer.
    """
    manifest = self.oeb.manifest
    css_id, css_href = manifest.generate('manglecase', 'manglecase.css')
    manifest.add(css_id, css_href, CSS_MIME, data=CASE_MANGLER_CSS)
    for page in self.oeb.spine:
        root = page.data
        # The link target must be relative to the page that holds it.
        link_target = page.relhref(css_href)
        etree.SubElement(root.find(XHTML('head')), XHTML('link'),
                         rel='stylesheet', href=link_target, type=CSS_MIME)
        styles = Stylizer(root, page.href, self.oeb, self.opts, self.profile)
        self.mangle_elem(root.find(XHTML('body')), styles)
def get_text(self):
    """Return the RocketBook HTML for the whole spine as one string.

    Each spine item contributes its page anchor followed by the fragments
    that ``self.dump_text`` returns for its ``<body>``.
    """
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.ebooks.oeb.base import XHTML

    fragments = [u'']
    for page in self.oeb_book.spine:
        self.log.debug('Converting %s to RocketBook HTML...' % page.href)
        styles = Stylizer(page.data, page.href, self.oeb_book, self.opts,
                          self.opts.output_profile)
        fragments.append(self.add_page_anchor(page))
        page_body = page.data.find(XHTML('body'))
        fragments.extend(self.dump_text(page_body, styles, page))
    return ''.join(fragments)
def mobimlize_spine(self):
    """Convert each spine document to MOBIML, replacing ``item.data``.

    A new ``<html><body/></html>`` tree is created per item, filled by
    ``self.mobimlize_elem`` from the original ``<body>``, and then assigned
    back to the item. ``self.current_spine_item`` tracks the item in progress.
    """
    for page in self.oeb.spine:
        styles = Stylizer(page.data, page.href, self.oeb, self.opts,
                          self.profile)
        source_body = page.data.find(XHTML('body'))
        new_root = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP)
        new_body = etree.SubElement(new_root, XHTML('body'))
        self.current_spine_item = page
        block_state = BlockState(new_body)
        format_stack = [FormatState()]
        self.mobimlize_elem(source_body, styles, block_state, format_stack)
        page.data = new_root
def get_text(self):
    """Return the PML markup for the whole spine as one string.

    Every spine document is serialised, passed through ``self.prepare_text``
    and re-parsed before being styled and dumped; the page anchor for the
    item precedes its dumped text.
    """
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.ebooks.oeb.base import XHTML

    fragments = ['']
    for page in self.oeb_book.spine:
        self.log.debug('Converting %s to PML markup...' % page.href)
        serialized = etree.tostring(page.data, encoding='unicode')
        reparsed = safe_xml_fromstring(self.prepare_text(serialized))
        styles = Stylizer(reparsed, page.href, self.oeb_book, self.opts,
                          self.opts.output_profile)
        fragments.append(self.add_page_anchor(page))
        fragments.extend(
            self.dump_text(reparsed.find(XHTML('body')), styles, page))
    return ''.join(fragments)
def __init__(self, root, item, oeb, opts, map=HTML_MAP):
    """Serialise *root* immediately and keep the results on the instance.

    *map* is a ``(tags, tattrs)`` pair. The Stylizer and the ``ahc``/``aht``
    tables are only built when *map* is the ``HTML_MAP`` object itself;
    otherwise those attributes are ``None``.
    """
    html_mode = map is HTML_MAP

    self.item = item
    self.logger = oeb.logger
    self.manifest = oeb.manifest
    self.tags, self.tattrs = map
    self.buf = StringIO()
    self.anchors = []
    self.page_breaks = []
    self.is_html = html_mode

    if html_mode:
        self.stylizer = Stylizer(root, item.href, oeb, opts)
    else:
        self.stylizer = None

    # Walking the tree fills self.buf, self.anchors and self.page_breaks.
    self.tree_to_binary(root)
    self.content = self.buf.getvalue()

    if html_mode:
        self.ahc = self.build_ahc()
        self.aht = self.build_aht()
    else:
        self.ahc = None
        self.aht = None
def mlize_spine(self, oeb_book):
    """Return the whole spine rendered as a single HTML document string.

    Ids and links in each spine document are rewritten, the document's
    ``<body>`` is dumped through ``self.dump_text`` and followed by a blank
    line. The result is wrapped in an ``<html>`` skeleton whose ``<head>``
    carries the UTF-8 content-type declaration.

    The skeleton places ``<head>`` before ``<body>``; it previously opened
    ``<body>`` first and nested ``<head>`` inside it, which is not valid
    HTML document structure.
    """
    output = [
        u'<html><head><meta http-equiv="Content-Type" '
        u'content="text/html;charset=utf-8" /></head><body>'
    ]
    for item in oeb_book.spine:
        self.log.debug('Converting %s to HTML...' % item.href)
        self.rewrite_ids(item.data, item)
        rewrite_links(item.data, partial(self.rewrite_link, page=item))
        stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
        output += self.dump_text(item.data.find(XHTML('body')), stylizer, item)
        output.append('\n\n')
    output.append('</body></html>')
    return ''.join(output)
def mlize_spine(self):
    """Return the RTF markup for the book.

    The result is ``self.header()``, an optional title page (only when the
    guide's title page is not part of the spine), every spine document, and
    ``self.footer()``; each dumped document is followed by a page break. The
    assembled text is finally passed through ``self.insert_images`` and
    ``self.clean_text``.
    """
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.utils.xml_parse import safe_xml_fromstring

    book = self.oeb_book
    output = self.header()

    if 'titlepage' in book.guide:
        title_href = book.guide['titlepage'].href
        title_item = book.manifest.hrefs[title_href]
        if title_item.spine_position is None:
            styles = Stylizer(title_item.data, title_item.href, self.oeb_book,
                              self.opts, self.opts.output_profile)
            self.currently_dumping_item = title_item
            output += self.dump_text(title_item.data.find(XHTML('body')), styles)
            output += r'{\page }'

    for page in self.oeb_book.spine:
        self.log.debug('Converting %s to RTF markup...' % page.href)
        serialized = etree.tostring(page.data, encoding='unicode')
        # Comments containing "--" can make fromstring() fail, so all
        # comments are dropped before re-parsing.
        markup = re.sub('<!--.*?-->', '', serialized, flags=re.DOTALL)
        markup = self.remove_newlines(markup)
        markup = self.remove_tabs(markup)
        tree = safe_xml_fromstring(markup)
        styles = Stylizer(tree, page.href, self.oeb_book, self.opts,
                          self.opts.output_profile)
        self.currently_dumping_item = page
        output += self.dump_text(tree.find(XHTML('body')), styles)
        output += r'{\page }'

    output += self.footer()
    output = self.insert_images(output)
    return self.clean_text(output)
def get_cover_page(self):
    """Return the markup for the cover image and the title page.

    An ``<IMG>`` tag is emitted when the guide has a cover whose href has a
    truthy entry in ``self.name_map``. The title page is dumped only when the
    guide's title page item is not part of the spine.
    """
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.ebooks.oeb.base import XHTML

    guide = self.oeb_book.guide
    output = u''

    if 'cover' in guide:
        cover_href = guide['cover'].href
        if self.name_map.get(cover_href, None):
            output += '<IMG SRC="%s">' % self.name_map[cover_href]

    if 'titlepage' in guide:
        self.log.debug('Generating cover page...')
        title_href = guide['titlepage'].href
        title_item = self.oeb_book.manifest.hrefs[title_href]
        if title_item.spine_position is None:
            styles = Stylizer(title_item.data, title_item.href, self.oeb_book,
                              self.opts, self.opts.output_profile)
            title_body = title_item.data.find(XHTML('body'))
            output += ''.join(self.dump_text(title_body, styles, title_item))

    return output
def get_cover_page(self):
    """Return the PML for the cover image reference and the title page.

    When the guide has a cover, a ``\\m="cover.png"`` line is emitted and the
    cover's href is registered in ``self.image_hrefs`` as ``cover.png``. The
    title page is dumped only when its item is not part of the spine.
    """
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.ebooks.oeb.base import XHTML

    guide = self.oeb_book.guide
    output = ''

    if 'cover' in guide:
        output += '\\m="cover.png"\n'
        self.image_hrefs[guide['cover'].href] = 'cover.png'

    if 'titlepage' in guide:
        self.log.debug('Generating title page...')
        title_href = guide['titlepage'].href
        title_item = self.oeb_book.manifest.hrefs[title_href]
        if title_item.spine_position is None:
            styles = Stylizer(title_item.data, title_item.href, self.oeb_book,
                              self.opts, self.opts.output_profile)
            title_body = title_item.data.find(XHTML('body'))
            output += ''.join(self.dump_text(title_body, styles, title_item))

    return output
def mlize_spine(self, oeb_book):
    """Return the spine rendered as one HTML document with CSS and title.

    The body is the concatenated ``dump_text`` output of every spine item.
    The head links to ``style.css`` when ``htmlz_class_style`` is
    ``'external'`` and otherwise embeds ``self.get_css(oeb_book)`` inline;
    the title is the XML-escaped ``self.book_title``.
    """
    body_parts = []
    for page in oeb_book.spine:
        self.log.debug('Converting %s to HTML...' % page.href)
        self.rewrite_ids(page.data, page)
        rewrite_links(page.data, partial(self.rewrite_link, page=page))
        styles = Stylizer(page.data, page.href, oeb_book, self.opts)
        body_parts.extend(
            self.dump_text(page.data.find(XHTML('body')), styles, page))
        body_parts.append('\n\n')

    css = (
        '<link href="style.css" rel="stylesheet" type="text/css" />'
        if self.opts.htmlz_class_style == 'external'
        else '<style type="text/css">' + self.get_css(oeb_book) + '</style>'
    )
    title = '<title>%s</title>' % prepare_string_for_xml(self.book_title)

    document = [
        '<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" />',
        css,
        title,
        '</head><body>',
    ]
    document.extend(body_parts)
    document.append('</body></html>')
    return ''.join(document)
def mlize_spine(self):
    """Return the plain-text rendering of the table of contents and spine.

    Each spine document is serialised, stripped of newlines via
    ``self.remove_newlines`` and re-parsed before being dumped; six newlines
    follow every document. Trailing whitespace is removed from every line
    and the result is passed through ``self.cleanup_text``.

    Uses the Python 2 ``unicode`` builtin.
    """
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer

    pieces = [u'', self.get_toc()]
    for page in self.oeb_book.spine:
        self.log.debug('Converting %s to TXT...' % page.href)
        markup = unicode(etree.tostring(page.data, encoding=unicode))
        tree = etree.fromstring(self.remove_newlines(markup))
        styles = Stylizer(tree, page.href, self.oeb_book, self.opts,
                          self.opts.output_profile)
        pieces.extend(self.dump_text(tree.find(XHTML('body')), styles, page))
        pieces.append('\n\n\n\n\n\n')

    joined = u''.join(pieces)
    stripped = u'\n'.join(line.rstrip() for line in joined.splitlines())
    return self.cleanup_text(stripped)
def mlize_spine(self):
    """Return the plain-text rendering of the table of contents and spine.

    Before a document is serialised, ``--`` inside any comment is replaced
    with ``__`` (this modifies ``item.data`` in place). The serialised markup
    has its newlines removed and is re-parsed, styled and dumped; six
    newlines follow every document. Trailing whitespace is removed from every
    line and the result is passed through ``self.cleanup_text``.
    """
    from calibre.ebooks.oeb.base import XHTML
    from calibre.ebooks.oeb.stylizer import Stylizer
    from calibre.utils.xml_parse import safe_xml_fromstring

    pieces = [u'', self.get_toc()]
    for page in self.oeb_book.spine:
        self.log.debug('Converting %s to TXT...' % page.href)

        for comment in page.data.iterdescendants(etree.Comment):
            comment_text = comment.text
            if comment_text and '--' in comment_text:
                comment.text = comment_text.replace('--', '__')

        markup = etree.tostring(page.data, encoding='unicode')
        tree = safe_xml_fromstring(self.remove_newlines(markup))
        styles = Stylizer(tree, page.href, self.oeb_book, self.opts,
                          self.opts.output_profile)
        pieces.extend(self.dump_text(tree.find(XHTML('body')), styles, page))
        pieces.append('\n\n\n\n\n\n')

    joined = ''.join(pieces)
    stripped = '\n'.join(line.rstrip() for line in joined.splitlines())
    return self.cleanup_text(stripped)
class ReBinary(object):
    """Serialise an element tree into a tokenised byte stream.

    Construction does all the work: the tree is walked once, the encoded
    stream ends up in ``self.content``, element ids/names are collected in
    ``self.anchors`` and page-break positions in ``self.page_breaks``.

    Relies on Python 2 builtins (``long``, ``unichr``, ``basestring``).
    """

    # Reverse namespace map (namespace URI -> prefix) used as the starting
    # point for every walk; it is copied before being modified.
    NSRMAP = {'': None, XML_NS: 'xml'}

    def __init__(self, root, item, oeb, opts, map=HTML_MAP):
        """Serialise *root*; *map* is a ``(tags, tattrs)`` pair.

        The Stylizer and the ``ahc``/``aht`` tables are only built when *map*
        is the ``HTML_MAP`` object itself.
        """
        self.item = item
        self.logger = oeb.logger
        self.manifest = oeb.manifest
        self.tags, self.tattrs = map
        self.buf = StringIO()
        self.anchors = []
        self.page_breaks = []
        self.is_html = is_html = map is HTML_MAP
        self.stylizer = Stylizer(root, item.href, oeb, opts) if is_html else None
        self.tree_to_binary(root)
        self.content = self.buf.getvalue()
        self.ahc = self.build_ahc() if is_html else None
        self.aht = self.build_aht() if is_html else None

    def write(self, *values):
        """Append each value to the buffer as UTF-8.

        Integers are first converted to the character with that code point;
        on ``OverflowError`` a warning is logged and ``?`` is written instead.
        """
        for value in values:
            if isinstance(value, (int, long)):
                try:
                    value = unichr(value)
                except OverflowError:
                    self.logger.warn('Unicode overflow for integer:', value)
                    value = u'?'
            self.buf.write(value.encode('utf-8'))

    def is_block(self, style):
        """Return True unless the style's display is inline or inline-block."""
        return style['display'] not in ('inline', 'inline-block')

    def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=None,
                       inhead=False, preserve=False):
        """Recursively write *elem* and its descendants to the buffer.

        *nsrmap* is the reverse namespace map in effect, *parents* the list
        of buffer offsets of the enclosing elements, *inhead* whether the
        element is inside ``<head>``, and *preserve* whether whitespace is
        currently preserved.

        *parents* defaults to a fresh list for every top-level call. A shared
        mutable default would keep stale offsets around if an earlier walk
        raised part-way through.
        """
        if parents is None:
            parents = []
        if not isinstance(elem.tag, basestring):
            # Don't emit any comments or raw entities
            return
        nsrmap = copy.copy(nsrmap)
        attrib = dict(elem.attrib)
        style = self.stylizer.style(elem) if self.stylizer else None
        # Namespace declarations not already in effect become attributes.
        for key, value in elem.nsmap.items():
            if value not in nsrmap or nsrmap[value] != key:
                xmlns = ('xmlns:' + key) if key else 'xmlns'
                attrib[xmlns] = value
            nsrmap[value] = key
        tag = prefixname(elem.tag, nsrmap)
        tag_offset = self.buf.tell()
        if tag == 'head':
            inhead = True
        flags = FLAG_OPENING
        if not elem.text and len(elem) == 0:
            # No text and no children: opening and closing in one token.
            flags |= FLAG_CLOSING
        if inhead:
            flags |= FLAG_HEAD
        if style and self.is_block(style):
            flags |= FLAG_BLOCK
        self.write(0, flags)
        tattrs = self.tattrs[0]
        if tag in self.tags:
            index = self.tags[tag]
            self.write(index)
            if self.tattrs[index]:
                tattrs = self.tattrs[index]
        else:
            self.write(FLAG_CUSTOM, len(tag) + 1, tag)
        last_break = self.page_breaks[-1][0] if self.page_breaks else None
        # Avoid recording two page breaks at the same offset.
        if style and last_break != tag_offset \
                and style['page-break-before'] in PAGE_BREAKS:
            self.page_breaks.append((tag_offset, list(parents)))
        for attr, value in attrib.items():
            attr = prefixname(attr, nsrmap)
            if attr in ('href', 'src'):
                value = urlnormalize(value)
                path, frag = urldefrag(value)
                if self.item:
                    path = self.item.abshref(path)
                prefix = unichr(3)
                if path in self.manifest.hrefs:
                    # Targets in the manifest are referenced by their id.
                    prefix = unichr(2)
                    value = self.manifest.hrefs[path].id
                    if frag:
                        value = '#'.join((value, frag))
                value = prefix + value
            elif attr in ('id', 'name'):
                self.anchors.append((value, tag_offset))
            elif attr.startswith('ms--'):
                attr = '%' + attr[4:]
            elif tag == 'link' and attr == 'type' and value in OEB_STYLES:
                value = CSS_MIME
            if attr in tattrs:
                self.write(tattrs[attr])
            else:
                self.write(FLAG_CUSTOM, len(attr) + 1, attr)
            # Integer-valued attributes get a numeric encoding.
            try:
                self.write(ATTR_NUMBER, int(value) + 1)
            except ValueError:
                self.write(len(value) + 1, value)
        self.write(0)
        old_preserve = preserve
        if style:
            preserve = (style['white-space'] in ('pre', 'pre-wrap'))
        # An explicit xml:space attribute overrides the CSS white-space value.
        xml_space = elem.get(XML('space'))
        if xml_space == 'preserve':
            preserve = True
        elif xml_space == 'normal':
            preserve = False
        if elem.text:
            if preserve:
                self.write(elem.text)
            elif len(elem) == 0 or not elem.text.isspace():
                self.write(COLLAPSE.sub(' ', elem.text))
            # else: whitespace-only text before children is dropped
        parents.append(tag_offset)
        child = cstyle = nstyle = None
        # Iterate one step ahead so each child is processed with knowledge
        # of the style of the sibling that follows it.
        for upcoming in chain(elem, [None]):
            if self.stylizer:
                nstyle = None if upcoming is None else self.stylizer.style(upcoming)
            if child is not None:
                if not preserve \
                        and (inhead or not nstyle
                             or self.is_block(cstyle)
                             or self.is_block(nstyle)) \
                        and child.tail and child.tail.isspace():
                    child.tail = None
                self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
            child, cstyle = upcoming, nstyle
        parents.pop()
        preserve = old_preserve
        if not flags & FLAG_CLOSING:
            self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0)
        if elem.tail and tag != 'html':
            tail = elem.tail
            if not preserve:
                tail = COLLAPSE.sub(' ', tail)
            self.write(tail)
        if style and style['page-break-after'] not in ('avoid', 'auto'):
            self.page_breaks.append((self.buf.tell(), list(parents)))

    def build_ahc(self):
        """Return the serialised anchor table.

        The table is the anchor count followed, per anchor, by the name
        length, the name, and the offset packed as little-endian uint32.
        A warning is logged when there are more than six anchors.
        """
        if len(self.anchors) > 6:
            self.logger.warn("More than six anchors in file %r. "
                             "Some links may not work properly." % self.item.href)
        data = StringIO()
        data.write(unichr(len(self.anchors)).encode('utf-8'))
        for anchor, offset in self.anchors:
            data.write(unichr(len(anchor)).encode('utf-8'))
            data.write(anchor)
            data.write(pack('<I', offset))
        return data.getvalue()

    def build_aht(self):
        """Return a packed little-endian uint32 zero."""
        return pack('<I', 0)
class ReBinary(object):
    """Serialise an element tree into a tokenised byte stream.

    Construction does all the work: the tree is walked once, the encoded
    stream ends up in ``self.content``, element ids/names are collected in
    ``self.anchors`` and page-break positions in ``self.page_breaks``.

    Relies on Python 2 builtins (``long``, ``unichr``, ``basestring``).
    """

    # Reverse namespace map (namespace URI -> prefix) used as the starting
    # point for every walk; it is copied before being modified.
    NSRMAP = {'': None, XML_NS: 'xml'}

    def __init__(self, root, item, oeb, opts, map=HTML_MAP):
        """Serialise *root*; *map* is a ``(tags, tattrs)`` pair.

        The Stylizer and the ``ahc``/``aht`` tables are only built when *map*
        is the ``HTML_MAP`` object itself.
        """
        self.item = item
        self.logger = oeb.logger
        self.manifest = oeb.manifest
        self.tags, self.tattrs = map
        self.buf = StringIO()
        self.anchors = []
        self.page_breaks = []
        self.is_html = is_html = map is HTML_MAP
        self.stylizer = Stylizer(root, item.href, oeb, opts) if is_html else None
        self.tree_to_binary(root)
        self.content = self.buf.getvalue()
        self.ahc = self.build_ahc() if is_html else None
        self.aht = self.build_aht() if is_html else None

    def write(self, *values):
        """Append each value to the buffer as UTF-8.

        Integers are first converted to the character with that code point;
        on ``OverflowError`` a warning is logged and ``?`` is written instead.
        """
        for value in values:
            if isinstance(value, (int, long)):
                try:
                    value = unichr(value)
                except OverflowError:
                    self.logger.warn('Unicode overflow for integer:', value)
                    value = u'?'
            self.buf.write(value.encode('utf-8'))

    def is_block(self, style):
        """Return True unless the style's display is inline or inline-block."""
        return style['display'] not in ('inline', 'inline-block')

    def tree_to_binary(self, elem, nsrmap=NSRMAP, parents=None,
                       inhead=False, preserve=False):
        """Recursively write *elem* and its descendants to the buffer.

        *nsrmap* is the reverse namespace map in effect, *parents* the list
        of buffer offsets of the enclosing elements, *inhead* whether the
        element is inside ``<head>``, and *preserve* whether whitespace is
        currently preserved.

        *parents* defaults to a fresh list for every top-level call. A shared
        mutable default would keep stale offsets around if an earlier walk
        raised part-way through.
        """
        if parents is None:
            parents = []
        if not isinstance(elem.tag, basestring):
            # Don't emit any comments or raw entities
            return
        nsrmap = copy.copy(nsrmap)
        attrib = dict(elem.attrib)
        style = self.stylizer.style(elem) if self.stylizer else None
        # Namespace declarations not already in effect become attributes.
        for key, value in elem.nsmap.items():
            if value not in nsrmap or nsrmap[value] != key:
                xmlns = ('xmlns:' + key) if key else 'xmlns'
                attrib[xmlns] = value
            nsrmap[value] = key
        tag = prefixname(elem.tag, nsrmap)
        tag_offset = self.buf.tell()
        if tag == 'head':
            inhead = True
        flags = FLAG_OPENING
        if not elem.text and len(elem) == 0:
            # No text and no children: opening and closing in one token.
            flags |= FLAG_CLOSING
        if inhead:
            flags |= FLAG_HEAD
        if style and self.is_block(style):
            flags |= FLAG_BLOCK
        self.write(0, flags)
        tattrs = self.tattrs[0]
        if tag in self.tags:
            index = self.tags[tag]
            self.write(index)
            if self.tattrs[index]:
                tattrs = self.tattrs[index]
        else:
            self.write(FLAG_CUSTOM, len(tag)+1, tag)
        last_break = self.page_breaks[-1][0] if self.page_breaks else None
        # Avoid recording two page breaks at the same offset.
        if style and last_break != tag_offset \
                and style['page-break-before'] in PAGE_BREAKS:
            self.page_breaks.append((tag_offset, list(parents)))
        for attr, value in attrib.items():
            attr = prefixname(attr, nsrmap)
            if attr in ('href', 'src'):
                value = urlnormalize(value)
                path, frag = urldefrag(value)
                if self.item:
                    path = self.item.abshref(path)
                prefix = unichr(3)
                if path in self.manifest.hrefs:
                    # Targets in the manifest are referenced by their id.
                    prefix = unichr(2)
                    value = self.manifest.hrefs[path].id
                    if frag:
                        value = '#'.join((value, frag))
                value = prefix + value
            elif attr in ('id', 'name'):
                self.anchors.append((value, tag_offset))
            elif attr.startswith('ms--'):
                attr = '%' + attr[4:]
            elif tag == 'link' and attr == 'type' and value in OEB_STYLES:
                value = CSS_MIME
            if attr in tattrs:
                self.write(tattrs[attr])
            else:
                self.write(FLAG_CUSTOM, len(attr)+1, attr)
            # Integer-valued attributes get a numeric encoding.
            try:
                self.write(ATTR_NUMBER, int(value)+1)
            except ValueError:
                self.write(len(value)+1, value)
        self.write(0)
        old_preserve = preserve
        if style:
            preserve = (style['white-space'] in ('pre', 'pre-wrap'))
        # An explicit xml:space attribute overrides the CSS white-space value.
        xml_space = elem.get(XML('space'))
        if xml_space == 'preserve':
            preserve = True
        elif xml_space == 'normal':
            preserve = False
        if elem.text:
            if preserve:
                self.write(elem.text)
            elif len(elem) == 0 or not elem.text.isspace():
                self.write(COLLAPSE.sub(' ', elem.text))
            # else: whitespace-only text before children is dropped
        parents.append(tag_offset)
        child = cstyle = nstyle = None
        # Iterate one step ahead so each child is processed with knowledge
        # of the style of the sibling that follows it.
        for upcoming in chain(elem, [None]):
            if self.stylizer:
                nstyle = None if upcoming is None else self.stylizer.style(upcoming)
            if child is not None:
                if not preserve \
                        and (inhead or not nstyle
                             or self.is_block(cstyle)
                             or self.is_block(nstyle)) \
                        and child.tail and child.tail.isspace():
                    child.tail = None
                self.tree_to_binary(child, nsrmap, parents, inhead, preserve)
            child, cstyle = upcoming, nstyle
        parents.pop()
        preserve = old_preserve
        if not flags & FLAG_CLOSING:
            self.write(0, (flags & ~FLAG_OPENING) | FLAG_CLOSING, 0)
        if elem.tail and tag != 'html':
            tail = elem.tail
            if not preserve:
                tail = COLLAPSE.sub(' ', tail)
            self.write(tail)
        if style and style['page-break-after'] not in ('avoid', 'auto'):
            self.page_breaks.append((self.buf.tell(), list(parents)))

    def build_ahc(self):
        """Return the serialised anchor table.

        The table is the anchor count followed, per anchor, by the name
        length, the name, and the offset packed as little-endian uint32.
        A warning is logged when there are more than six anchors.
        """
        if len(self.anchors) > 6:
            self.logger.warn("More than six anchors in file %r. "
                             "Some links may not work properly." % self.item.href)
        data = StringIO()
        data.write(unichr(len(self.anchors)).encode('utf-8'))
        for anchor, offset in self.anchors:
            data.write(unichr(len(anchor)).encode('utf-8'))
            data.write(anchor)
            data.write(pack('<I', offset))
        return data.getvalue()

    def build_aht(self):
        """Return a packed little-endian uint32 zero."""
        return pack('<I', 0)