def convert(self, stream, options, file_ext, log, accelerators): from ebook_converter.ebooks.metadata.toc import TOC from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.utils.zipfile import ZipFile self.options = options self.log = log pages, images = [], [] toc = TOC() if file_ext == 'pmlz': log.debug('De-compressing content to temporary directory...') with TemporaryDirectory('_unpmlz') as tdir: zf = ZipFile(stream) zf.extractall(tdir) pmls = glob.glob(os.path.join(tdir, '*.pml')) for pml in pmls: html_name = os.path.splitext( os.path.basename(pml))[0] + '.html' html_path = os.path.join(os.getcwd(), html_name) pages.append(html_name) log.debug('Processing PML item %s...', pml) ttoc = self.process_pml(pml, html_path) toc += ttoc images = self.get_images(stream, tdir, True) else: toc = self.process_pml(stream, 'index.html') pages.append('index.html') if hasattr(stream, 'name'): images = self.get_images( stream, os.path.abspath(os.path.dirname(stream.name))) # We want pages to be orded alphabetically. pages.sort() manifest_items = [] for item in pages + images: manifest_items.append((item, None)) from ebook_converter.ebooks.metadata.meta import get_metadata log.debug('Reading metadata from input file...') mi = get_metadata(stream, 'pml') if 'images/cover.png' in images: mi.cover = 'images/cover.png' opf = OPFCreator(os.getcwd(), mi) log.debug('Generating manifest...') opf.create_manifest(manifest_items) opf.create_spine(pages) opf.set_toc(toc) with open('metadata.opf', 'wb') as opffile: with open('toc.ncx', 'wb') as tocfile: opf.render(opffile, tocfile, 'toc.ncx') return os.path.join(os.getcwd(), 'metadata.opf')
def create_opf(self, output_dir, images, toc): with directory.CurrentDir(output_dir): if 'cover.png' in images: self.mi.cover = os.path.join('images', 'cover.png') opf = OPFCreator(output_dir, self.mi) manifest = [('index.html', None)] for i in images: manifest.append((os.path.join('images', i), None)) opf.create_manifest(manifest) opf.create_spine(['index.html']) opf.set_toc(toc) with open('metadata.opf', 'wb') as opffile: with open('toc.ncx', 'wb') as tocfile: opf.render(opffile, tocfile, 'toc.ncx') return os.path.join(output_dir, 'metadata.opf')
def create_opf(self, output_dir, images): with directory.CurrentDir(output_dir): opf = OPFCreator(output_dir, self.mi) manifest = [('index.html', None)] for i in images: manifest.append((os.path.join('images/', i), None)) opf.create_manifest(manifest) opf.create_spine(['index.html']) with open('metadata.opf', 'wb') as opffile: opf.render(opffile) return os.path.join(output_dir, 'metadata.opf')
def __call__(self, stream, odir, log): from ebook_converter.utils.zipfile import ZipFile from ebook_converter.ebooks.metadata.odt import get_metadata from ebook_converter.ebooks.metadata.opf2 import OPFCreator if not os.path.exists(odir): os.makedirs(odir) with directory.CurrentDir(odir): log.info('Extracting ODT file...') stream.seek(0) mi = get_metadata(stream, 'odt') if not mi.title: mi.title = 'Unknown' if not mi.authors: mi.authors = ['Unknown'] self.filter_load(stream, mi, log) # NOTE(gryf): Here is a workaround for ODF2XHTML.xhtml() method, # which expects, that all lines are strings. html = ''.join([str(l) for l in self.lines]) # A blanket img specification like this causes problems # with EPUB output as the containing element often has # an absolute height and width set that is larger than # the available screen real estate html = html.replace('img { width: 100%; height: 100%; }', '') # odf2xhtml creates empty title tag html = html.replace('<title></title>', '<title>%s</title>' % (mi.title, )) try: html = self.fix_markup(html, log) except: log.exception('Failed to filter CSS, conversion may be slow') with open('index.xhtml', 'wb') as f: f.write(polyglot.as_bytes(html)) zf = ZipFile(stream, 'r') self.extract_pictures(zf) opf = OPFCreator(os.path.abspath(os.getcwd()), mi) opf.create_manifest([(os.path.abspath(os.path.join(r, f2)), None) for r, _, fnames in os.walk(os.getcwd()) for f2 in fnames]) opf.create_spine([os.path.abspath('index.xhtml')]) with open('metadata.opf', 'wb') as f: opf.render(f) return os.path.abspath('metadata.opf')
def write(self, doc): toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace) raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>') with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: f.write(raw) css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub) if css: with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f: f.write(css.encode('utf-8')) opf = OPFCreator(self.dest_dir, self.mi) opf.toc = toc opf.create_manifest_from_files_in([self.dest_dir]) for item in opf.manifest: if item.media_type == 'text/html': item.media_type = mimetypes.guess_type('a.xhtml')[0] opf.create_spine(['index.html']) if self.cover_image is not None: opf.guide.set_cover(self.cover_image) def process_guide(E, guide): if self.toc_anchor is not None: guide.append( E.reference(href='index.html#' + self.toc_anchor, title='Table of Contents', type='toc')) toc_file = os.path.join(self.dest_dir, 'toc.ncx') with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx: opf.render(of, ncx, 'toc.ncx', process_guide=process_guide) if os.path.getsize(toc_file) == 0: os.remove(toc_file) return os.path.join(self.dest_dir, 'metadata.opf')
def convert(self, stream, options, file_ext, log, accelerators): from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml log.debug('Converting file to html...') # The main html file will be named index.html self.opts, self.log = options, log if options.new_pdf_engine: return self.convert_new(stream, accelerators) pdftohtml(os.getcwd(), stream.name, options.no_images) from ebook_converter.ebooks.metadata.meta import get_metadata log.debug('Retrieving document metadata...') mi = get_metadata(stream, 'pdf') opf = OPFCreator(os.getcwd(), mi) manifest = [('index.html', None)] images = os.listdir(os.getcwd()) images.remove('index.html') for i in images: manifest.append((i, None)) log.debug('Generating manifest...') opf.create_manifest(manifest) opf.create_spine(['index.html']) log.debug('Rendering manifest...') with open('metadata.opf', 'wb') as opffile: opf.render(opffile) if os.path.exists('toc.ncx'): ncxid = opf.manifest.id_for_path('toc.ncx') if ncxid: with open('metadata.opf', 'r+b') as f: raw = f.read().replace( b'<spine', b'<spine toc="%s"' % polyglot.as_bytes(ncxid)) f.seek(0) f.write(raw) return os.path.join(os.getcwd(), 'metadata.opf')
def opf_writer(path, opf_name, manifest, spine, mi): opf = OPFCreator(path, mi) opf.create_manifest(manifest) opf.create_spine(spine) with open(os.path.join(path, opf_name), 'wb') as opffile: opf.render(opffile)
def convert(self, stream, opts, file_ext, log, accelerators): from ebook_converter.ebooks.metadata import MetaInformation from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.metadata.toc import TOC self.opts, self.log = opts, log if file_ext == 'cbc': comics_ = self.get_comics_from_collection(stream) else: comics_ = [['Comic', os.path.abspath(stream.name)]] stream.close() comics = [] for i, x in enumerate(comics_): title, fname = x cdir = 'comic_%d' % (i + 1) if len(comics_) > 1 else '.' cdir = os.path.abspath(cdir) if not os.path.exists(cdir): os.makedirs(cdir) pages = self.get_pages(fname, cdir) if not pages: continue if self.for_viewer: comics.append( (title, pages, [self.create_viewer_wrapper(pages)])) else: wrappers = self.create_wrappers(pages) comics.append((title, pages, wrappers)) if not comics: raise ValueError('No comic pages found in %s' % stream.name) mi = MetaInformation( os.path.basename(stream.name).rpartition('.')[0], ['Unknown']) opf = OPFCreator(os.getcwd(), mi) entries = [] def href(x): if len(comics) == 1: return os.path.basename(x) return '/'.join(x.split(os.sep)[-2:]) cover_href = None for comic in comics: pages, wrappers = comic[1:] page_entries = [(x, None) for x in map(href, pages)] entries += [(w, None) for w in map(href, wrappers)] + page_entries if cover_href is None and page_entries: cover_href = page_entries[0][0] opf.create_manifest(entries) spine = [] for comic in comics: spine.extend(map(href, comic[2])) self._images = [] for comic in comics: self._images.extend(comic[1]) opf.create_spine(spine) if self.for_viewer and cover_href: opf.guide.set_cover(cover_href) toc = TOC() if len(comics) == 1: wrappers = comics[0][2] for i, x in enumerate(wrappers): toc.add_item(href(x), None, 'Page %d' % (i + 1), play_order=i) else: po = 0 for comic in comics: po += 1 wrappers = comic[2] stoc = toc.add_item(href(wrappers[0]), None, comic[0], play_order=po) if not opts.dont_add_comic_pages_to_toc: for i, x in enumerate(wrappers): stoc.add_item(href(x), None, 'Page %d' % (i + 1), play_order=po) po += 1 opf.set_toc(toc) with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n: opf.render(m, n, 'toc.ncx') return os.path.abspath('metadata.opf')
def convert(self, stream, options, file_ext, log, accelerators): from ebook_converter.ebooks.metadata.meta import get_metadata from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.rtf2xml.ParseRtf import \ RtfInvalidCodeException from ebook_converter.ebooks.rtf.input import InlineClass self.opts = options self.log = log self.log('Converting RTF to XML...') try: xml = self.generate_xml(stream.name) except RtfInvalidCodeException as e: self.log.exception('Unable to parse RTF') raise ValueError('This RTF file has a feature calibre does not ' 'support. Convert it to HTML first and then try ' 'it.\n%s' % e) d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf')) if d: imap = {} try: imap = self.extract_images(d[0]) except Exception: self.log.exception('Failed to extract images...') self.log('Parsing XML...') doc = etree.fromstring(xml) border_styles = self.convert_borders(doc) for pict in doc.xpath( '//rtf:pict[@num]', namespaces={'rtf': 'http://rtf2xml.sourceforge.net/'}): num = int(pict.get('num')) name = imap.get(num, None) if name is not None: pict.set('num', name) self.log('Converting XML to HTML...') inline_class = InlineClass(self.log) with open( pkg_resources.resource_filename('ebook_converter', 'data/rtf.xsl')) as fobj: styledoc = etree.fromstring(fobj.read()) extensions = {('calibre', 'inline-class'): inline_class} transform = etree.XSLT(styledoc, extensions=extensions) result = transform(doc) html = u'index.xhtml' with open(html, 'wb') as f: res = as_bytes(transform.tostring(result)) # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:] # clean multiple \n res = re.sub(b'\n+', b'\n', res) # Replace newlines inserted by the 'empty_paragraphs' option in # rtf2xml with html blank lines # res = re.sub('\s*<body>', '<body>', res) # res = re.sub('(?<=\n)\n{2}', # u'<p>\u00a0</p>\n'.encode('utf-8'), res) f.write(res) self.write_inline_css(inline_class, border_styles) stream.seek(0) mi = get_metadata(stream, 'rtf') if not mi.title: mi.title = 'Unknown' if not mi.authors: mi.authors = ['Unknown'] opf = OPFCreator(os.getcwd(), mi) opf.create_manifest([(u'index.xhtml', None)]) opf.create_spine([u'index.xhtml']) opf.render(open(u'metadata.opf', 'wb')) return os.path.abspath(u'metadata.opf')
def convert(self, stream, options, file_ext, log, accelerators): from ebook_converter.ebooks.metadata.fb2 import ensure_namespace from ebook_converter.ebooks.metadata.fb2 import get_fb2_data from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.metadata.meta import get_metadata from ebook_converter.ebooks.chardet import xml_to_unicode self.log = log log.debug('Parsing XML...') raw = get_fb2_data(stream)[0] raw = raw.replace(b'\0', b'') raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)[0] try: doc = etree.fromstring(raw) except etree.XMLSyntaxError: doc = etree.fromstring(raw.replace('& ', '&')) if doc is None: raise ValueError('The FB2 file is not valid XML') doc = ensure_namespace(doc) try: fb_ns = doc.nsmap[doc.prefix] except Exception: fb_ns = FB2NS NAMESPACES = {'f': fb_ns, 'l': const.XLINK_NS} stylesheets = doc.xpath('//*[local-name() = "stylesheet" and ' '@type="text/css"]') css = '' for s in stylesheets: css += etree.tostring( s, encoding='unicode', method='text', with_tail=False) + '\n\n' if css: import css_parser import logging parser = css_parser.CSSParser(fetcher=None, log=logging.getLogger('calibre.css')) XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % const.XHTML_NS text = XHTML_CSS_NAMESPACE + css log.debug('Parsing stylesheet...') stylesheet = parser.parseString(text) stylesheet.namespaces['h'] = const.XHTML_NS css = stylesheet.cssText if isinstance(css, bytes): css = css.decode('utf-8', 'replace') css = css.replace('h|style', 'h|span') css = re.sub(r'name\s*=\s*', 'class=', css) self.extract_embedded_content(doc) log.debug('Converting XML to HTML...') with open( pkg_resources.resource_filename('ebook_converter', 'data/fb2.xsl')) as f: ss = f.read() ss = ss.replace("__FB_NS__", fb_ns) if options.no_inline_fb2_toc: log.info('Disabling generation of inline FB2 TOC') ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->', re.DOTALL).sub('', ss) styledoc = etree.fromstring(ss) transform = etree.XSLT(styledoc) result = transform(doc) # Handle links of type note and cite notes = { a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#') } cites = { a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '') } all_ids = {x for x in result.xpath('//*/@id')} for cite, a in cites.items(): note = notes.get(cite, None) if note: c = 1 while 'cite%d' % c in all_ids: c += 1 if not note.get('id', None): note.set('id', 'cite%d' % c) all_ids.add(note.get('id')) a.set('href', '#%s' % note.get('id')) for x in result.xpath('//*[@link_note or @link_cite]'): x.attrib.pop('link_note', None) x.attrib.pop('link_cite', None) for img in result.xpath('//img[@src]'): src = img.get('src') img.set('src', self.binary_map.get(src, src)) index = transform.tostring(result) with open('index.xhtml', 'wb') as f: f.write(index.encode('utf-8')) with open('inline-styles.css', 'wb') as f: f.write(css.encode('utf-8')) stream.seek(0) mi = get_metadata(stream, 'fb2') if not mi.title: mi.title = 'Unknown' if not mi.authors: mi.authors = ['Unknown'] cpath = None if mi.cover_data and mi.cover_data[1]: with open('fb2_cover_calibre_mi.jpg', 'wb') as f: f.write(mi.cover_data[1]) cpath = os.path.abspath('fb2_cover_calibre_mi.jpg') else: for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): href = img.get('{%s}href' % const.XLINK_NS, img.get('href', None)) if href is not None: if href.startswith('#'): href = href[1:] cpath = os.path.abspath(href) break opf = OPFCreator(os.getcwd(), mi) entries = [(f2, mimetypes.guess_type(f2)[0]) for f2 in os.listdir(u'.')] opf.create_manifest(entries) opf.create_spine(['index.xhtml']) if cpath: opf.guide.set_cover(cpath) with open('metadata.opf', 'wb') as f: opf.render(f) return os.path.join(os.getcwd(), 'metadata.opf')
def create_opf(self, htmlfile, guide=None, root=None): mi = getattr(self.book_header.exth, 'mi', self.embedded_mi) if mi is None: mi = MetaInformation(self.book_header.title, ['Unknown']) opf = OPFCreator(os.path.dirname(htmlfile), mi) if hasattr(self.book_header.exth, 'cover_offset'): opf.cover = 'images/%05d.jpg' % (self.book_header .exth.cover_offset + 1) elif mi.cover is not None: opf.cover = mi.cover else: opf.cover = 'images/%05d.jpg' % 1 if not os.path.exists(os.path.join(os.path.dirname(htmlfile), * opf.cover.split('/'))): opf.cover = None cover = opf.cover cover_copied = None if cover is not None: cover = cover.replace('/', os.sep) if os.path.exists(cover): ncover = 'images'+os.sep+'calibre_cover.jpg' if os.path.exists(ncover): os.remove(ncover) shutil.copyfile(cover, ncover) cover_copied = os.path.abspath(ncover) opf.cover = ncover.replace(os.sep, '/') manifest = [(htmlfile, 'application/xhtml+xml'), (os.path.abspath('styles.css'), 'text/css')] bp = os.path.dirname(htmlfile) added = set() for i in getattr(self, 'image_names', []): path = os.path.join(bp, 'images', i) added.add(path) manifest.append((path, mimetypes.guess_type(path)[0] or 'image/jpeg')) if cover_copied is not None: manifest.append((cover_copied, 'image/jpeg')) opf.create_manifest(manifest) opf.create_spine([os.path.basename(htmlfile)]) toc = None if guide is not None: opf.create_guide(guide) for ref in opf.guide: if ref.type.lower() == 'toc': toc = ref.href() ncx_manifest_entry = None if toc: ncx_manifest_entry = 'toc.ncx' elems = root.xpath('//*[@id="%s"]' % toc.partition('#')[-1]) tocobj = None ent_pat = re.compile(r'&(\S+?);') if elems: tocobj = TOC() found = False reached = False for x in root.iter(): if x == elems[-1]: reached = True continue if reached and x.tag == 'a': href = x.get('href', '') if href and re.match(r'\w+://', href) is None: try: text = ' '.join([t.strip() for t in x.xpath('descendant:' ':text()')]) except Exception: text = '' text = ent_pat.sub(entities.entity_to_unicode, text) item = tocobj.add_item(toc.partition('#')[0], href[1:], text) item.left_space = int(self.get_left_whitespace(x)) found = True if (reached and found and x.get('class', None) == 'mbp_pagebreak'): break if tocobj is not None: tocobj = self.structure_toc(tocobj) opf.set_toc(tocobj) return opf, ncx_manifest_entry
def write_opf(self, guide, toc, spine, resource_map): mi = self.header.exth.mi if (self.cover_offset is not None and self.cover_offset < len(resource_map)): mi.cover = resource_map[self.cover_offset] if len(list(toc)) < 2: self.log.warning('KF8 has no metadata Table of Contents') for ref in guide: if ref.type == 'toc': href = ref.href() href, frag = urllib.parse.urldefrag(href) if os.path.exists(href.replace('/', os.sep)): try: toc = self.read_inline_toc(href, frag) except Exception: self.log.exception('Failed to read inline ToC') opf = OPFCreator(os.getcwd(), mi) opf.guide = guide def exclude(path): return os.path.basename(path) == 'debug-raw.html' # If there are no images then the azw3 input plugin dumps all # binary records as .unknown images, remove them if (self.for_tweak and os.path.exists('images') and os.path.isdir('images')): files = os.listdir('images') unknown = [x for x in files if x.endswith('.unknown')] if len(files) == len(unknown): [os.remove('images/' + f) for f in files] if self.for_tweak: try: os.remove('debug-raw.html') except Exception: pass opf.create_manifest_from_files_in([os.getcwd()], exclude=exclude) for entry in opf.manifest: if entry.mime_type == 'text/html': entry.mime_type = 'application/xhtml+xml' opf.create_spine(spine) opf.set_toc(toc) ppd = getattr(self.header.exth, 'page_progression_direction', None) if ppd in {'ltr', 'rtl', 'default'}: opf.page_progression_direction = ppd pwm = getattr(self.header.exth, 'primary_writing_mode', None) if pwm is not None: opf.primary_writing_mode = pwm with open('metadata.opf', 'wb') as of, open('toc.ncx', 'wb') as ncx: opf.render(of, ncx, 'toc.ncx') return 'metadata.opf'