def __enter__(self, *args):
    """
    Add this plugin to the python path so that its contents become
    directly importable. Useful when bundling large python libraries
    into the plugin. Use it like this::

        with plugin:
            import something
    """
    if self.plugin_path is not None:
        from ebook_converter.utils.zipfile import ZipFile
        zf = ZipFile(self.plugin_path)
        try:
            # Compiled extension modules cannot be imported from inside a
            # zip file, so the plugin is only "zip safe" when the archive
            # contains none of them.
            extensions = {x.rpartition('.')[-1].lower()
                          for x in zf.namelist()}
            zip_safe = extensions.isdisjoint({'pyd', 'so', 'dll', 'dylib'})
            if zip_safe:
                # Import directly from the zip file.
                sys.path.insert(0, self.plugin_path)
                self.sys_insertion_path = self.plugin_path
            else:
                # Unpack to a temporary directory and import from there.
                from ebook_converter.ptempfile import TemporaryDirectory
                self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
                self.sys_insertion_path = (self._sys_insertion_tdir.
                                           __enter__(*args))
                zf.extractall(self.sys_insertion_path)
                sys.path.insert(0, self.sys_insertion_path)
        finally:
            # The original code leaked the ZipFile if namelist()/extractall()
            # raised; always close it.
            zf.close()
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert PML/PMLZ input to the intermediate OPF + HTML form.

    For a ``pmlz`` archive every contained ``*.pml`` file is converted
    to an HTML page in the current working directory; a bare ``pml``
    stream becomes a single ``index.html``.

    :returns: absolute path to the generated ``metadata.opf``.
    """
    from ebook_converter.ebooks.metadata.toc import TOC
    from ebook_converter.ebooks.metadata.opf2 import OPFCreator
    from ebook_converter.utils.zipfile import ZipFile

    self.options = options
    self.log = log
    pages, images = [], []
    toc = TOC()

    if file_ext == 'pmlz':
        log.debug('De-compressing content to temporary directory...')
        with TemporaryDirectory('_unpmlz') as tdir:
            zf = ZipFile(stream)
            try:
                zf.extractall(tdir)
            finally:
                # The archive was never closed in the original code.
                zf.close()
            pmls = glob.glob(os.path.join(tdir, '*.pml'))
            for pml in pmls:
                html_name = os.path.splitext(
                    os.path.basename(pml))[0] + '.html'
                html_path = os.path.join(os.getcwd(), html_name)
                pages.append(html_name)
                log.debug('Processing PML item %s...', pml)
                # process_pml writes the HTML file and returns its TOC.
                ttoc = self.process_pml(pml, html_path)
                toc += ttoc
            images = self.get_images(stream, tdir, True)
    else:
        toc = self.process_pml(stream, 'index.html')
        pages.append('index.html')
        if hasattr(stream, 'name'):
            images = self.get_images(
                stream, os.path.abspath(os.path.dirname(stream.name)))

    # We want pages to be ordered alphabetically.
    pages.sort()

    manifest_items = [(item, None) for item in pages + images]

    from ebook_converter.ebooks.metadata.meta import get_metadata
    log.debug('Reading metadata from input file...')
    mi = get_metadata(stream, 'pml')
    if 'images/cover.png' in images:
        mi.cover = 'images/cover.png'

    opf = OPFCreator(os.getcwd(), mi)
    log.debug('Generating manifest...')
    opf.create_manifest(manifest_items)
    opf.create_spine(pages)
    opf.set_toc(toc)
    with open('metadata.opf', 'wb') as opffile, \
            open('toc.ncx', 'wb') as tocfile:
        opf.render(opffile, tocfile, 'toc.ncx')

    return os.path.join(os.getcwd(), 'metadata.opf')
def extract(self, stream):
    """Unpack *stream* (a DOCX zip) into a persistent temporary
    directory and build ``self.names``: a mapping from zip-style
    relative paths (forward slashes) to absolute on-disk paths.
    """
    self.tdir = PersistentTemporaryDirectory('docx_container')
    try:
        archive = ZipFile(stream)
        archive.extractall(self.tdir)
    except Exception:
        # Malformed archive: retry with the lenient extractor.
        self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                           ' more forgiving ZIP parser')
        from ebook_converter.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream, self.tdir)
    self.names = {
        os.path.relpath(path, self.tdir).replace(os.sep, '/'): path
        for path in walk(self.tdir)
    }
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an HTMLZ archive to an OEB book object.

    Extracts the zip into the current working directory, locates the
    single top-level HTML file, runs it through the regular HTML input
    plugin, then applies metadata and cover from the archive.

    :returns: the OEB book produced by the HTML input plugin.
    """
    from ebook_converter.ebooks.chardet import xml_to_unicode
    from ebook_converter.ebooks.metadata.opf2 import OPF
    from ebook_converter.utils.zipfile import ZipFile

    self.log = log
    html = u''
    top_levels = []

    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()

    # Find the HTML file in the archive. It needs to be
    # top level.
    index = u''
    multiple_html = False
    # Get a list of all top level files in the archive.
    for x in os.listdir(u'.'):
        if os.path.isfile(x):
            top_levels.append(x)
    # Try to find an index file.
    for x in top_levels:
        if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
            index = x
            break
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', '.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = x
            else:
                multiple_html = True
    # Warn the user if there are multiple HTML files in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # Multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn('Multiple HTML files found in the archive. Only %s will '
                 'be used.' % index)

    if index:
        with open(index, 'rb') as tf:
            html = tf.read()
    else:
        raise Exception('No top level HTML file found.')

    if not html:
        raise Exception('Top level HTML file %s is empty' % index)

    # Encoding: honor a user-specified encoding, else sniff from the
    # first 4KB of the document.
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        ienc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(ienc, 'replace')

    # Run the HTML through the html processing plugin.
    from ebook_converter.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        # Use the HTML plugin's recommended defaults for all its options.
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwd()
    # Pick a scratch file name that does not clash with extracted files.
    htmlfile = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(htmlfile):
        c += 1
        # NOTE(review): this fallback name is relative, not joined with
        # `base` — works only because the CWD is `base`; confirm intended.
        htmlfile = u'index%d.html' % c
    with open(htmlfile, 'wb') as f:
        f.write(html.encode('utf-8'))
    # Suppress debug-pipeline dumps during the nested conversion.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile)

    # Set metadata from file.
    from ebook_converter.customize.ui import get_file_type_metadata
    from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=os.getcwd())
        cover_path = opf.raster_cover or opf.cover
    # Set the cover by adding it to the manifest and guide.
    if cover_path:
        cdata = None
        with open(os.path.join(os.getcwd(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(id, href, mimetypes.guess_type(cover_name)[0],
                         data=cdata)
        oeb.guide.add('cover', 'Cover', href)

    return oeb
def convert(self, recipe_or_file, opts, file_ext, log, accelerators):
    """Compile a news recipe, download its content and return the OPF.

    Accepts a pre-downloaded recipe zip (``downloaded_recipe``), a
    recipe urn via the CALIBRE_RECIPE_URN environment variable, a
    readable recipe file, or a builtin recipe title.

    :returns: absolute path to the OPF produced by the download.
    :raises ValueError: when no valid recipe can be resolved.
    :raises RecipeDisabled: when the resolved recipe is disabled.
    """
    from ebook_converter.web.feeds.recipes import compile_recipe
    opts.output_profile.flow_size = 0

    if file_ext == 'downloaded_recipe':
        from ebook_converter.utils.zipfile import ZipFile
        zf = ZipFile(recipe_or_file, 'r')
        zf.extractall()
        zf.close()
        with open('download.recipe', 'rb') as f:
            self.recipe_source = f.read()
        recipe = compile_recipe(self.recipe_source)
        recipe.needs_subscription = False
        self.recipe_object = recipe(opts, log, self.report_progress)
    else:
        if os.environ.get('CALIBRE_RECIPE_URN'):
            from ebook_converter.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
            urn = os.environ['CALIBRE_RECIPE_URN']
            log('Downloading recipe urn: ' + urn)
            # urn looks like "<type>:<id>"; keep only the two parts.
            rtype, recipe_id = urn.partition(':')[::2]
            if not recipe_id:
                raise ValueError('Invalid recipe urn: ' + urn)
            if rtype == 'custom':
                self.recipe_source = get_custom_recipe(recipe_id)
            else:
                self.recipe_source = get_builtin_recipe_by_id(
                    urn, log=log, download_recipe=True)
            if not self.recipe_source:
                raise ValueError('Could not find recipe with urn: ' + urn)
            if not isinstance(self.recipe_source, bytes):
                self.recipe_source = self.recipe_source.encode('utf-8')
            recipe = compile_recipe(self.recipe_source)
        elif os.access(recipe_or_file, os.R_OK):
            with open(recipe_or_file, 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            log('Using custom recipe')
        else:
            from ebook_converter.web.feeds.recipes.collection import (
                get_builtin_recipe_by_title, get_builtin_recipe_titles)
            # Derive the builtin recipe title from the original input
            # argument (falling back to the raw value).
            title = getattr(opts, 'original_recipe_input_arg',
                            recipe_or_file)
            title = os.path.basename(title).rpartition('.')[0]
            titles = frozenset(get_builtin_recipe_titles())
            if title not in titles:
                title = getattr(opts, 'original_recipe_input_arg',
                                recipe_or_file)
                title = title.rpartition('.')[0]

            raw = get_builtin_recipe_by_title(
                title, log=log,
                download_recipe=not opts.dont_download_recipe)
            builtin = False
            try:
                recipe = compile_recipe(raw)
                self.recipe_source = raw
                if recipe.requires_version > numeric_version:
                    # requires_version is a tuple of ints; the original
                    # '.'.join(recipe.requires_version) raised TypeError
                    # and silently forced the builtin fallback.
                    log.warn(
                        'Downloaded recipe needs calibre version at '
                        'least: %s' %
                        ('.'.join(map(str, recipe.requires_version))))
                    builtin = True
            except Exception:
                # Narrowed from a bare except; still best-effort: any
                # compile failure falls back to the bundled recipe.
                log.exception('Failed to compile downloaded recipe. '
                              'Falling back to builtin one')
                builtin = True
            if builtin:
                log('Using bundled builtin recipe')
                raw = get_builtin_recipe_by_title(title, log=log,
                                                  download_recipe=False)
                if raw is None:
                    raise ValueError(
                        'Failed to find builtin recipe: ' + title)
                recipe = compile_recipe(raw)
                self.recipe_source = raw
            else:
                log('Using downloaded builtin recipe')

    if recipe is None:
        raise ValueError('%r is not a valid recipe file or builtin recipe'
                         % recipe_or_file)
    disabled = getattr(recipe, 'recipe_disabled', None)
    if disabled is not None:
        raise RecipeDisabled(disabled)
    ro = recipe(opts, log, self.report_progress)
    ro.download()
    self.recipe_object = ro
    # Recipes may override conversion options.
    for key, val in self.recipe_object.conversion_options.items():
        setattr(opts, key, val)

    # The download leaves an OPF in (or under) the current directory.
    for f in os.listdir('.'):
        if f.endswith('.opf'):
            return os.path.abspath(f)
    for f in walk('.'):
        if f.endswith('.opf'):
            return os.path.abspath(f)
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an EPUB to the intermediate OPF form.

    Extracts the zip into the CWD, locates the OPF, handles encryption
    metadata (raising :class:`DRMError` for DRMed books), normalizes
    hrefs for nested OPFs, removes covers/non-spine items, and writes
    the rewritten OPF.

    :returns: absolute path to the written ``content.opf``.
    """
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks import DRMError

    _path_or_stream = getattr(stream, 'name', 'stream')
    try:
        zf = ZipFile(stream)
        zf.extractall(os.getcwd())
    except Exception:
        log.exception('EPUB appears to be invalid ZIP file, trying a '
                      'more forgiving ZIP parser')
        from ebook_converter.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)

    encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
    opf = self.find_opf()
    if opf is None:
        # Fallback scan of the extracted tree. The original nested-loop
        # version only broke out of the inner loop, so a later OPF could
        # overwrite the first match; stop at the first candidate instead.
        candidates = (os.path.join(root, name)
                      for root, _, fnames in os.walk('.')
                      for name in fnames)
        for f in candidates:
            if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                    not os.path.basename(f).startswith('.'):
                opf = os.path.abspath(f)
                break
    if opf is None:
        raise ValueError('%s is not a valid EPUB file (could not find '
                         'opf)' % _path_or_stream)

    opf = os.path.relpath(opf, os.getcwd())
    parts = os.path.split(opf)
    opf = opf_meta.OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self._encrypted_font_uris = []
    if os.path.exists(encfile):
        if not self.process_encryption(encfile, opf, log):
            raise DRMError(os.path.basename(_path_or_stream))
    self.encrypted_fonts = self._encrypted_font_uris

    # NOTE(gryf): check if opf is nested on the directory(ies), if so,
    # update the links for guide and manifest.
    if len(parts) > 1 and parts[0]:
        path = os.path.join(parts[0])
        for elem in opf.itermanifest():
            elem.set('href', os.path.join(path, elem.get('href')))
        for elem in opf.iterguide():
            elem.set('href', os.path.join(path, elem.get('href')))

    if opf.package_version >= 3.0:
        f = self.rationalize_cover3
    else:
        f = self.rationalize_cover2
    self.removed_cover = f(opf, log)
    if self.removed_cover:
        self.removed_items_to_ignore = (self.removed_cover, )
    epub3_nav = opf.epub3_nav
    if epub3_nav is not None:
        self.convert_epub3_nav(epub3_nav, opf, log, options)

    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                'EPUB files with DTBook markup are not supported')

    # Collect manifest items that must not appear in the spine.
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_:
            mt = y.get('media-type', None)
            if mt in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml',
                    'application/text'
            }:
                not_for_spine.add(id_)
            ext = y.get('href', '').rpartition('.')[-1].lower()
            if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                # some epub authoring software sets font mime types to
                # text/plain
                not_for_spine.add(id_)
                y.set('media-type', 'application/font')

    # Drop spine entries that are duplicates or excluded above.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if len(list(opf.iterspine())) == 0:
        raise ValueError('No valid entries in the spine of this EPUB')

    with open('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath('content.opf')
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert TXT/TXTZ/Markdown/Textile input to an OEB book.

    Reads the raw text (concatenating all ``.txt`` files for txtz),
    detects encoding/paragraph/formatting type, reformats the text,
    converts it to HTML and runs it through the HTML input plugin.

    :returns: the OEB book produced by the HTML input plugin.
    """
    from ebook_converter.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from ebook_converter.ebooks.chardet import detect
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line,
        separate_paragraphs_print_formatted, preserve_spaces,
        detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = b''
    log.debug('Reading text from file...')
    length = 0
    base_dir = self.output_dir = os.getcwd()

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        try:
            zf.extractall('.')
        finally:
            # The archive was never closed in the original code.
            zf.close()
        for root, _, fnames in os.walk('.'):
            for x in fnames:
                x = os.path.join(root, x)
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
    else:
        if getattr(stream, 'name', None):
            base_dir = os.path.dirname(stream.name)
        txt = stream.read()

    if file_ext in {'md', 'textile', 'markdown'}:
        options.formatting_type = {
            'md': 'markdown'
        }.get(file_ext, file_ext)
        log.info(
            'File extension indicates particular formatting. '
            'Forcing formatting type to: %s', options.formatting_type)
        options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s', ienc)
    else:
        det_encoding = detect(txt[:4096])
        det_encoding, confidence = det_encoding['encoding'], det_encoding[
            'confidence']
        if det_encoding and det_encoding.lower().replace(
                '_', '-').strip() in ('gb2312', 'chinese',
                                      'csiso58gb231280', 'euc-cn', 'euccn',
                                      'eucgb2312-cn', 'gb2312-1980',
                                      'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug(
            'Detected input encoding as %s with a confidence of '
            '%s%%', ienc, confidence * 100)
    if not ienc:
        ienc = 'utf-8'
        log.debug(
            'No input encoding specified and could not auto detect '
            'using %s', ienc)

    # Remove BOM from start of txt as its presence can confuse markdown
    import codecs
    for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break

    txt = txt.decode(ienc, 'replace')
    # Replace entities
    txt = entities.ENT_PAT.sub(entities.xml_entity_to_unicode, txt)
    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using '
                      'block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s',
                      options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s',
                  options.formatting_type)

    if options.formatting_type == 'heuristic':
        setattr(options, 'enable_heuristics', True)
        setattr(options, 'unwrap_lines', False)
        setattr(options, 'smarten_punctuation', True)

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print at transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options,
                                          log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(
            options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        if not length:
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # Preserve spaces will replace multiple spaces to a space
    # followed by the entity.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    self.shifted_files = []
    try:
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(
                    txt,
                    extensions=[
                        x.strip()
                        for x in options.markdown_extensions.split(',')
                        if x.strip()
                    ])
            except RuntimeError:
                raise ValueError(
                    'This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                )
            html = self.fix_resources(html, base_dir)
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
            html = self.fix_resources(html, base_dir)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        htmlfile = self.shift_file('index.html', html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion. Use a context manager so the
        # file handle is not leaked (the original passed a bare open()).
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log, {})
        options.debug_pipeline = odi
    finally:
        for x in self.shifted_files:
            os.remove(x)

    # Set metadata from file.
    if input_mi is None:
        from ebook_converter.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title

    return oeb