def _metadata_from_opf(self, opf):
    """Fill ``self.oeb.metadata`` from the OPF document ``opf``.

    ``opf`` is serialized with ``etree.tostring`` and re-read through the
    ``OPF`` metadata class; the resulting book metadata is then copied
    into the OEB metadata. Fallbacks are supplied for a missing language,
    book producer, title and author, and a newly generated UUID
    identifier is added and stored as ``self.oeb.uid``.
    """
    from ebook_converter.ebooks.metadata.opf2 import OPF
    from ebook_converter.ebooks.oeb.transforms.metadata import \
        meta_info_to_oeb_metadata

    serialized = etree.tostring(opf, xml_declaration=True, encoding='utf-8')
    package = OPF(io.BytesIO(serialized))

    writing_mode = package.primary_writing_mode
    if writing_mode:
        self.oeb.metadata.primary_writing_mode = writing_mode

    book_meta = package.to_book_metadata()
    if not book_meta.language:
        # Locale style 'xx_YY' is rewritten with a hyphen separator.
        book_meta.language = get_lang().replace('_', '-')
    self.oeb.metadata.add('language', book_meta.language)
    if not book_meta.book_producer:
        producer_fields = {'a': __appname__, 'v': __version__}
        book_meta.book_producer = (
            '%(a)s (%(v)s) [http://%(a)s-ebook.com]' % producer_fields)
    meta_info_to_oeb_metadata(book_meta, self.oeb.metadata, self.logger)

    oeb_meta = self.oeb.metadata
    # The most recently added identifier becomes the book's uid.
    oeb_meta.add('identifier', str(uuid.uuid4()), id='uuid_id',
                 scheme='uuid')
    self.oeb.uid = oeb_meta.identifier[-1]

    if not oeb_meta.title:
        oeb_meta.add('title', self.oeb.translate('Unknown'))

    # A creator with no role, or with the 'aut' role, counts as an author.
    has_author = any(
        getattr(creator, 'role', '').lower() in ('', 'aut')
        for creator in oeb_meta.creator)
    if not has_author:
        oeb_meta.add('creator', self.oeb.translate('Unknown'), role='aut')
def read_embedded_metadata(self, root, elem, guide):
    """Read metadata embedded in the document and detect its cover image.

    ``elem`` is serialized, wrapped in a ``<package>`` element and parsed
    with ``OPF``; the resulting metadata is stored on ``self.embedded_mi``.

    If ``guide`` is given, its first ``reference`` whose ``type`` contains
    ``cover`` is examined. When that reference's ``href`` (minus a leading
    ``#``) matches the ``id`` of an element in ``root``, the first ``img``
    at or after that element in document order supplies
    ``self.embedded_mi.cover`` (its ``src``) and is removed from the tree.

    :param root: element tree of the document body (must support
        ``xpath`` and ``iter``).
    :param elem: element holding the embedded metadata.
    :param guide: guide element, or ``None`` when the document has none.
    """
    raw = (b'<?xml version="1.0" encoding="utf-8" ?>\n<package>' +
           html.tostring(elem, encoding='utf-8') + b'</package>')
    opf = OPF(io.BytesIO(raw))
    self.embedded_mi = opf.to_book_metadata()

    if guide is None:
        return

    for ref in guide.xpath('descendant::reference'):
        if 'cover' not in ref.get('type', '').lower():
            continue
        href = ref.get('href', '')
        if href.startswith('#'):
            href = href[1:]
        # Pass the id as an XPath variable instead of formatting it into
        # the expression: the href comes from the document, and a quote
        # character in it would otherwise yield an invalid expression.
        anchors = root.xpath('//*[@id=$anchor_id]', anchor_id=href)
        if anchors:
            cpos = anchors[0]
            reached = False
            # Walk in document order; the cover is the first <img> found
            # once the anchor element has been reached.
            for node in root.iter():
                if node is cpos:
                    reached = True
                if reached and node.tag == 'img':
                    self.embedded_mi.cover = node.get('src', None)
                    node.getparent().remove(node)
                    break
        # Only the first cover reference is considered.
        break
def opf_metadata(opfpath):
    """Return book metadata read from an OPF file, or ``None``.

    :param opfpath: path to an OPF file, or an object with a ``read``
        method. For a stream, its ``name`` attribute (falling back to the
        current working directory) is used as the path from which the
        base directory is derived.
    :return: the metadata object produced by ``OPF.to_book_metadata()``
        when the OPF has an application id; ``None`` when it has none or
        when reading fails (the traceback is printed in that case). If
        the OPF names a readable cover file, ``cover_data`` is set to a
        ``(format, bytes)`` tuple.

    A file opened here from a path is always closed before returning; a
    stream supplied by the caller is left open.
    """
    if hasattr(opfpath, 'read'):
        stream = opfpath
        opfpath = getattr(stream, 'name', os.getcwd())
        owns_stream = False
    else:
        stream = open(opfpath, 'rb')
        owns_stream = True

    try:
        base_dir = os.path.dirname(opfpath)
        opf = OPF(stream, base_dir)
        if opf.application_id is not None:
            mi = opf.to_book_metadata()
            if hasattr(opf, 'cover') and opf.cover:
                cpath = os.path.join(base_dir, opf.cover)
                if os.access(cpath, os.R_OK):
                    # The cover format is taken from the file extension.
                    fmt = cpath.rpartition('.')[-1]
                    with open(cpath, 'rb') as cover_file:
                        data = cover_file.read()
                    mi.cover_data = (fmt, data)
            return mi
    except Exception:
        # Best effort: report the problem and fall through to None.
        import traceback
        traceback.print_exc()
    finally:
        if owns_stream:
            stream.close()
    return None
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an HTMLZ archive into an OEB book.

    The archive in ``stream`` is extracted into the current working
    directory, a single top level HTML file is selected, re-encoded as
    UTF-8 and run through the HTML input plugin. Metadata read from
    ``stream`` is then applied, and a cover referenced by a top level OPF
    file is added to the manifest and guide.

    :param stream: file-like object containing the HTMLZ (zip) archive.
    :param options: conversion options; ``input_encoding`` and the HTML
        plugin's recommended option values are written onto it.
    :param file_ext: file extension passed on to the metadata reader.
    :param log: logger used for warnings and handed to the HTML plugin.
    :param accelerators: not used by this method.
    :return: the OEB book produced by the HTML input plugin.
    :raises Exception: if no top level HTML file exists or it is empty.
    """
    from ebook_converter.ebooks.chardet import xml_to_unicode
    from ebook_converter.ebooks.metadata.opf2 import OPF
    from ebook_converter.utils.zipfile import ZipFile

    self.log = log

    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()

    # Find the HTML file in the archive. It needs to be top level, so
    # collect all top level files first.
    top_levels = [x for x in os.listdir(u'.') if os.path.isfile(x)]

    # Try to find an index file.
    index = u''
    for x in top_levels:
        if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
            index = x
            break

    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    multiple_html = False
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
            if not index:
                # Use the first HTML file found if there is no file
                # called index.
                index = x
            elif x != index:
                # The chosen index file itself must not count as a
                # second HTML file, otherwise single-file archives
                # would trigger the warning below.
                multiple_html = True

    # Warn the user if there are multiple HTML files in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn('Multiple HTML files found in the archive. Only %s will '
                 'be used.' % index)

    if not index:
        raise Exception('No top level HTML file found.')
    with open(index, 'rb') as tf:
        raw_html = tf.read()
    if not raw_html:
        raise Exception('Top level HTML file %s is empty' % index)

    # Encoding
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        ienc = xml_to_unicode(raw_html[:4096])[-1]
    decoded_html = raw_html.decode(ienc, 'replace')

    # Run the HTML through the html processing plugin.
    from ebook_converter.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'

    # Pick a file name that does not clash with an extracted file; every
    # candidate is anchored in the same base directory.
    base = os.getcwd()
    htmlfile = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(htmlfile):
        c += 1
        htmlfile = os.path.join(base, u'index%d.html' % c)
    with open(htmlfile, 'wb') as f:
        f.write(decoded_html.encode('utf-8'))

    # Generate oeb from html conversion. The debug pipeline is disabled
    # for the nested conversion; restore it and remove the temporary file
    # even if the plugin raises.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    try:
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log, {})
    finally:
        options.debug_pipeline = odi
        os.remove(htmlfile)

    # Set metadata from file.
    from ebook_converter.customize.ui import get_file_type_metadata
    from ebook_converter.ebooks.oeb.transforms.metadata import \
        meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=os.getcwd())
        cover_path = opf.raster_cover or opf.cover

    # Set the cover.
    if cover_path:
        with open(os.path.join(os.getcwd(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        cover_id, cover_href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(cover_id, cover_href,
                         mimetypes.guess_type(cover_name)[0], data=cdata)
        oeb.guide.add('cover', 'Cover', cover_href)

    return oeb
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    """Write ``oeb_book`` to ``output_path`` as an HTMLZ archive.

    The book is rendered to a single HTML file in a temporary directory,
    together with an optional external stylesheet, the images referenced
    by the HTML, an optional ``cover.jpg`` and a ``metadata.opf``. The
    directory is then added to a zip archive at ``output_path``.

    :param oeb_book: the OEB book to serialize.
    :param output_path: destination path of the HTMLZ file.
    :param input_plugin: not used by this method.
    :param opts: conversion options; reads ``htmlz_css_type``,
        ``htmlz_class_style`` and ``htmlz_title_filename``.
    :param log: logger handed to the HTML generator.
    """
    from lxml import etree
    from ebook_converter.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from ebook_converter.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.utils.filenames import ascii_filename

    # HTML
    if opts.htmlz_css_type == 'inline':
        from ebook_converter.ebooks.htmlz.oeb2html import \
            OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == 'tag':
        from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from ebook_converter.ebooks.htmlz.oeb2html import \
            OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory(u'_htmlz_output') as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = u'index'
        if opts.htmlz_title_filename:
            from ebook_converter.utils.filenames import \
                shorten_components_to
            title = ascii_filename(str(oeb_book.metadata.title[0]))
            fname = shorten_components_to(100, (title,))[0]
        with open(os.path.join(tdir, fname + u'.html'), 'wb') as tf:
            if isinstance(html, str):
                html = html.encode('utf-8')
            tf.write(html)

        # CSS
        if (opts.htmlz_css_type == 'class' and
                opts.htmlz_class_style == 'external'):
            css = htmlizer.get_css(oeb_book)
            # The file is opened in binary mode, so text must be encoded
            # first (same handling as the HTML above).
            if isinstance(css, str):
                css = css.encode('utf-8')
            with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                tf.write(css)

        # Images
        images = htmlizer.images
        if images:
            images_dir = os.path.join(tdir, u'images')
            if not os.path.exists(images_dir):
                os.makedirs(images_dir)
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        # SVG items are element trees; serialize to
                        # bytes because the target file is binary.
                        data = etree.tostring(item.data, encoding='utf-8')
                    else:
                        data = item.data
                    image_path = os.path.join(images_dir, images[item.href])
                    with open(image_path, 'wb') as img:
                        img.write(data)

        # Cover
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from ebook_converter.utils.img import save_cover_data_to
                cover_path = os.path.join(tdir, u'cover.jpg')
                # Create the (empty) target file before saving into it.
                with open(cover_path, 'w') as cf:
                    cf.write('')
                save_cover_data_to(cover_data, cover_path)
        except Exception:
            # A missing or broken cover must not abort the conversion.
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
            opf = OPF(io.BytesIO(etree.tostring(
                oeb_book.metadata.to_opf1(), encoding='UTF-8')))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = u'cover.jpg'
            mdataf.write(metadata_to_opf(mi))

        # Close the archive explicitly so it is finalized before the
        # temporary directory is removed, rather than whenever the object
        # happens to be garbage collected.
        htmlz = ZipFile(output_path, 'w')
        try:
            htmlz.add_dir(tdir)
        finally:
            htmlz.close()