def write_mobi(self, input_plugin, output_path, kf8, resources):
    """Apply the MOBI-specific transform pipeline to ``self.oeb`` and
    serialize the result to ``output_path``.

    :param input_plugin: input plugin that produced the book; CBZ input
        suppresses forced page breaks after each spine item.
    :param output_path: path the MOBI file is written to.
    :param kf8: KF8 writer object (or None), forwarded to MobiWriter.
    :param resources: shared resource container for images.
    """
    from ebook_converter.ebooks.mobi.mobiml import MobiMLizer
    from ebook_converter.ebooks.oeb.transforms.manglecase import CaseMangler
    from ebook_converter.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
    from ebook_converter.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
    from ebook_converter.customize.ui import plugin_for_input_format

    opts, oeb = self.opts, self.oeb
    # Optionally insert an inline HTML table of contents.
    if not opts.no_inline_toc:
        toc_adder = HTMLTOCAdder(
            title=opts.toc_title,
            position='start' if opts.mobi_toc_at_start else 'end')
        toc_adder(oeb, opts)
    case_mangler = CaseMangler()
    case_mangler(oeb, opts)
    try:
        svg_rasterizer = SVGRasterizer()
        svg_rasterizer(oeb, opts)
    except Unavailable:
        self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
    else:
        # Add rasterized SVG images
        resources.add_extra_images()
    if hasattr(self.oeb, 'inserted_metadata_jacket'):
        self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
    mobi_mlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
    mobi_mlizer(oeb, opts)
    # CBZ pages are images, so no page break is forced after each item.
    page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
    from ebook_converter.ebooks.mobi.writer2.main import MobiWriter
    writer = MobiWriter(opts, resources, kf8,
                        write_page_breaks_after_item=page_breaks_after_item)
    writer(oeb, output_path)
    extract_mobi(output_path, opts)
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a CHM file to an OEB book.

    The CHM archive is unpacked into a temporary directory, converted
    to an HTML tree, and then assembled into an OEB book through the
    HTML input machinery.
    """
    # NOTE(gryf): for some reason, those import cannot be moved to the
    # top of module.
    from ebook_converter.ebooks.chm.metadata import get_metadata_from_reader
    from ebook_converter.customize.ui import plugin_for_input_format

    self.opts = options
    log.debug('Processing CHM...')
    with TemporaryDirectory('_chm2oeb') as tdir:
        if not isinstance(tdir, str):
            tdir = tdir.decode(filesystem_encoding)
        # Seed the conversion options with the HTML plugin defaults.
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        no_images = False  # options.no_images
        chm_name = stream.name
        # chm_data = stream.read()

        # closing stream so CHM can be opened by external library
        stream.close()
        log.debug('tdir=%s', tdir)
        log.debug('stream.name=%s', stream.name)

        debug_dump = False
        saved_odi = options.debug_pipeline
        if saved_odi:
            debug_dump = os.path.join(saved_odi, 'input')
        main_name = self._chmtohtml(tdir, chm_name, no_images, log,
                                    debug_dump=debug_dump)
        main_path = os.path.join(tdir, main_name)

        try:
            metadata = get_metadata_from_reader(self._chm_reader)
        except Exception:
            log.exception('Failed to read metadata, using filename')
            from ebook_converter.ebooks.metadata.book.base import Metadata
            metadata = Metadata(os.path.basename(chm_name))
        encoding = (self._chm_reader.get_encoding() or
                    options.input_encoding or 'cp1252')
        self._chm_reader.CloseCHM()
        options.debug_pipeline = None
        options.input_encoding = 'utf-8'

        # Files that were re-encoded during extraction are already UTF-8.
        uenc = encoding
        if os.path.abspath(main_path) in self._chm_reader.re_encoded_files:
            uenc = 'utf-8'
        htmlpath, toc = self._create_html_root(main_path, log, uenc)
        oeb = self._create_oebbook_html(htmlpath, tdir, options, log,
                                        metadata)
        options.debug_pipeline = saved_odi
        if toc.count() > 1:
            # Prefer the CHM's own TOC over the auto generated one.
            oeb.toc = self.parse_html_toc(oeb.spine[0])
            oeb.manifest.remove(oeb.spine[0])
            oeb.auto_generated_toc = False
        return oeb
def do_rebuild(opf, dest_path):
    """Rebuild an AZW3 file at ``dest_path`` from an unpacked ``opf``.

    Runs the book through the AZW3 output plugin in passthrough mode so
    the content is repacked without re-conversion.
    """
    plumber = Plumber(opf, dest_path, default_log)
    plumber.setup_options()
    plumber.opts.mobi_passthrough = True
    azw3_input = plugin_for_input_format('azw3')
    azw3_output = plugin_for_output_format('azw3')
    book = create_oebbook(default_log, opf, plumber.opts)
    set_cover(book)
    azw3_output.convert(book, dest_path, azw3_input, plumber.opts,
                        default_log)
def extract_content(self, output_dir):
    """Reassemble the PDF stored across the PDB sections into a temporary
    file and hand it to the PDF input plugin for conversion.

    :param output_dir: unused here; kept for interface compatibility
        with the other reader classes.
    :return: whatever the PDF input plugin's ``convert`` returns.
    """
    self.log.info('Extracting PDF...')
    # Create a named temporary file we can reopen by path.
    tmp = PersistentTemporaryFile('.pdf')
    tmp.close()
    # BUG FIX: the original passed the (closed) file *object* to open(),
    # but open() requires a path (str/bytes/os.PathLike) — use .name.
    with open(tmp.name, 'wb') as pdf:
        for n in range(self.header.section_count()):
            pdf.write(self.header.section_data(n))
    from ebook_converter.customize.ui import plugin_for_input_format
    pdf_plugin = plugin_for_input_format('pdf')
    # Only fill in PDF-plugin options not already set on self.options.
    for opt in pdf_plugin.options:
        if not hasattr(self.options, opt.option.name):
            setattr(self.options, opt.option.name, opt.recommended_value)
    return pdf_plugin.convert(open(tmp.name, 'rb'), self.options, 'pdf',
                              self.log, {})
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a DJVU file's embedded text layer to an OEB book.

    The extracted text is turned into basic HTML, written to a scratch
    ``index*.html`` file and pushed through the HTML input plugin.
    Raises ValueError when the DJVU contains no text at all.
    """
    from ebook_converter.ebooks.djvu.djvu import DJVUFile
    from ebook_converter.ebooks.txt.processor import convert_basic

    buf = BytesIO()
    DJVUFile(stream).get_text(buf)
    raw_text = buf.getvalue()
    if not raw_text:
        raise ValueError(
            'The DJVU file contains no text, only images, probably page'
            ' scans. calibre only supports conversion of DJVU files with'
            ' actual text in them.')

    # Newlines become spaces; \037 separators become paragraph breaks.
    html = convert_basic(raw_text.replace(b'\n', b' ')
                         .replace(b'\037', b'\n\n'))

    # Run the HTMLized text through the html processing plugin.
    from ebook_converter.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'

    # Pick a scratch filename that does not clash with existing files.
    base = os.getcwd()
    htmlfile = os.path.join(base, 'index.html')
    counter = 0
    while os.path.exists(htmlfile):
        counter += 1
        htmlfile = os.path.join(base, 'index%d.html' % counter)
    with open(htmlfile, 'wb') as fh:
        fh.write(html.encode('utf-8'))

    saved_odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile, 'rb') as fh:
        oeb = html_input.convert(fh, options, 'html', log, {})
    options.debug_pipeline = saved_odi
    os.remove(htmlfile)

    # Set metadata from file.
    from ebook_converter.customize.ui import get_file_type_metadata
    from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)
    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    """Decompress a TCR file and delegate the result to the TXT input
    plugin.

    :return: the OEB book produced by the TXT plugin.
    """
    from ebook_converter.ebooks.compression.tcr import decompress

    log.info('Decompressing text...')
    raw_txt = decompress(stream)

    log.info('Converting text to OEB...')
    stream = BytesIO(raw_txt)
    from ebook_converter.customize.ui import plugin_for_input_format
    txt_plugin = plugin_for_input_format('txt')
    for opt in txt_plugin.options:
        # BUG FIX: the original checked hasattr(self.options, ...) while
        # setting the attribute on `options`; self.options may not even
        # exist at this point, and the check/set must target the same
        # object for the "only fill missing" logic to work.
        if not hasattr(options, opt.option.name):
            setattr(options, opt.option.name, opt.recommended_value)
    stream.seek(0)
    return txt_plugin.convert(stream, options, 'txt', log, accelerators)
def extract_content(self, output_dir):
    """Decompress every text record of the book and delegate conversion
    to the TXT input plugin.

    :return: the OEB book produced by the TXT plugin.
    """
    self.log.info('Decompressing text...')
    chunks = []
    for section in range(1, self.header_record.num_records + 1):
        self.log.debug('\tDecompressing text section %i', section)
        chunks.append(self.decompress_text(section))
    raw_txt = b''.join(chunks)

    self.log.info('Converting text to OEB...')
    stream = io.BytesIO(raw_txt)
    from ebook_converter.customize.ui import plugin_for_input_format
    txt_plugin = plugin_for_input_format('txt')
    # Only fill in option values that are not already set.
    for opt in txt_plugin.options:
        if not hasattr(self.options, opt.option.name):
            setattr(self.options, opt.option.name, opt.recommended_value)
    stream.seek(0)
    return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an HTMLZ archive (single HTML file plus resources in a
    zip) to an OEB book.

    The archive is extracted into the current directory, the top-level
    HTML file is located and pushed through the HTML input plugin, and
    metadata/cover are applied from the bundled OPF if present.
    """
    from ebook_converter.ebooks.chardet import xml_to_unicode
    from ebook_converter.ebooks.metadata.opf2 import OPF
    from ebook_converter.utils.zipfile import ZipFile

    self.log = log
    html = ''

    # Extract content from zip archive.
    archive = ZipFile(stream)
    archive.extractall()

    # Find the HTML file in the archive. It needs to be top level.
    multiple_html = False
    # Get a list of all top level files in the archive.
    top_levels = [name for name in os.listdir('.') if os.path.isfile(name)]
    # Try to find an index. file.
    index = next((name for name in top_levels
                  if name.lower() in ('index.html', 'index.xhtml',
                                      'index.htm')), '')
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for name in top_levels:
        if os.path.splitext(name)[1].lower() in ('.html', '.xhtml', '.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = name
            else:
                multiple_html = True
    # Warn the user if there multiple HTML file in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # Multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn('Multiple HTML files found in the archive. Only %s will '
                 'be used.' % index)

    if not index:
        raise Exception('No top level HTML file found.')
    with open(index, 'rb') as fh:
        html = fh.read()
    if not html:
        raise Exception('Top level HTML file %s is empty' % index)

    # Encoding
    if options.input_encoding:
        enc = options.input_encoding
    else:
        enc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(enc, 'replace')

    # Run the HTML through the html processing plugin.
    from ebook_converter.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'

    # Pick a scratch filename that does not clash with existing files.
    base = os.getcwd()
    htmlfile = os.path.join(base, 'index.html')
    counter = 0
    while os.path.exists(htmlfile):
        counter += 1
        htmlfile = 'index%d.html' % counter
    with open(htmlfile, 'wb') as fh:
        fh.write(html.encode('utf-8'))

    saved_odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile, 'rb') as fh:
        oeb = html_input.convert(fh, options, 'html', log, {})
    options.debug_pipeline = saved_odi
    os.remove(htmlfile)

    # Set metadata from file.
    from ebook_converter.customize.ui import get_file_type_metadata
    from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = next((name for name in top_levels
                if os.path.splitext(name)[1].lower() == '.opf'), None)
    if opf:
        opf = OPF(opf, basedir=os.getcwd())
        cover_path = opf.raster_cover or opf.cover
    # Set the cover.
    if cover_path:
        with open(os.path.join(os.getcwd(), cover_path), 'rb') as cf:
            cover_data = cf.read()
        cover_name = os.path.basename(cover_path)
        item_id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(item_id, href,
                         mimetypes.guess_type(cover_name)[0],
                         data=cover_data)
        oeb.guide.add('cover', 'Cover', href)
    return oeb
def extract_content(self, output_dir):
    """Extract a Plucker book into ``output_dir`` as per-record HTML
    files plus an ``images/`` directory, then build an OEB book from
    home.html via the HTML input plugin.

    :param output_dir: directory the HTML and image files are written to.
    :return: the OEB book produced by the HTML input plugin.
    """
    # Each text record is independent (unless the continuation
    # value is set in the previous record). Put each converted
    # text recored into a separate file. We will reference the
    # home.html file as the first file and let the HTML input
    # plugin assemble the order based on hyperlinks.
    with CurrentDir(output_dir):
        for uid, num in self.uid_text_secion_number.items():
            self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
            with open('%s.html' % uid, 'wb') as htmlf:
                html = u'<html><body>'
                section_header, section_data = self.sections[num]
                if section_header.type == DATATYPE_PHTML:
                    # NOTE(review): unlike the compressed branch below this
                    # result is not decoded — presumably process_phtml
                    # returns str here; confirm.
                    html += self.process_phtml(
                        section_data.data,
                        section_data.header.paragraph_offsets)
                elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                    d = self.decompress_phtml(section_data.data)
                    html += self.process_phtml(
                        d, section_data.header.paragraph_offsets).decode(
                            self.get_text_uid_encoding(section_header.uid),
                            'replace')
                html += '</body></html>'
                htmlf.write(html.encode('utf-8'))

    # Images.
    # Cache the image sizes in case they are used by a composite image.
    images = set()
    if not os.path.exists(os.path.join(output_dir, 'images/')):
        os.makedirs(os.path.join(output_dir, 'images/'))
    with CurrentDir(os.path.join(output_dir, 'images/')):
        # Single images.
        for uid, num in self.uid_image_section_number.items():
            section_header, section_data = self.sections[num]
            if section_data:
                idata = None
                if section_header.type == DATATYPE_TBMP:
                    idata = section_data
                elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                    if self.header_record.compression == 1:
                        idata = decompress_doc(section_data)
                    elif self.header_record.compression == 2:
                        idata = zlib.decompress(section_data)
                try:
                    save_cover_data_to(idata, '%s.jpg' % uid,
                                       compression_quality=70)
                    images.add(uid)
                    self.log.debug('Wrote image with uid %s to '
                                   'images/%s.jpg' % (uid, uid))
                except Exception as e:
                    self.log.error('Failed to write image with uid %s: %s'
                                   % (uid, e))
            else:
                self.log.error('Failed to write image with uid %s: '
                               'No data.' % uid)
        # Composite images.
        # We're going to use the already compressed .jpg images here.
        for uid, num in self.uid_composite_image_section_number.items():
            try:
                section_header, section_data = self.sections[num]
                # Get the final width and height.
                width = 0
                height = 0
                for row in section_data.layout:
                    row_width = 0
                    col_height = 0
                    for col in row:
                        if col not in images:
                            raise Exception('Image with uid: %s missing.'
                                            % col)
                        w, h = identify(open('%s.jpg' % col, 'rb'))[1:]
                        row_width += w
                        if col_height < h:
                            col_height = h
                    if width < row_width:
                        width = row_width
                    height += col_height
                # Create a new image the total size of all image
                # parts. Put the parts into the new image.
                with Canvas(width, height) as canvas:
                    y_off = 0
                    for row in section_data.layout:
                        x_off = 0
                        largest_height = 0
                        for col in row:
                            im = image_from_data(
                                open('%s.jpg' % col, 'rb').read())
                            canvas.compose(im, x_off, y_off)
                            w, h = im.width(), im.height()
                            x_off += w
                            if largest_height < h:
                                largest_height = h
                        y_off += largest_height
                    # BUG FIX: the original opened this file in the default
                    # text *read* mode ('r') and then wrote bytes to it;
                    # must be 'wb'.
                    with open('%s.jpg' % uid, 'wb') as out:
                        out.write(canvas.export(compression_quality=70))
                self.log.debug('Wrote composite image with uid %s to '
                               'images/%s.jpg' % (uid, uid))
            except Exception as e:
                self.log.error('Failed to write composite image with '
                               'uid %s: %s' % (uid, e))

    # Run the HTML through the html processing plugin.
    from ebook_converter.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(self.options, opt.option.name, opt.recommended_value)
    self.options.input_encoding = 'utf-8'
    odi = self.options.debug_pipeline
    self.options.debug_pipeline = None

    # Determine the home.html record uid. This should be set in the
    # reserved values in the metadata recored. home.html is the first
    # text record (should have hyper link references to other records)
    # in the document.
    try:
        home_html = self.header_record.home_html
        if not home_html:
            # BUG FIX: dict views are not subscriptable in Python 3;
            # the original used .items()[0][0]. Take the first key.
            home_html = next(iter(self.uid_text_secion_number))
    except Exception:
        raise Exception('Could not determine home.html')

    # Generate oeb from html conversion.
    oeb = html_input.convert(open('%s.html' % home_html, 'rb'),
                             self.options, 'html', self.log, {})
    self.options.debug_pipeline = odi
    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a TXT/TXTZ/Markdown/Textile file to an OEB book.

    Reads the raw text (from a zip archive for txtz), determines the
    encoding, paragraph and formatting types, applies the requested
    text transforms, renders to HTML and runs the result through the
    HTML input plugin. Metadata comes from the markdown front matter
    when available, otherwise from the file itself.
    """
    from ebook_converter.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from ebook_converter.ebooks.chardet import detect
    from ebook_converter.utils.zipfile import ZipFile
    from ebook_converter.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line,
        separate_paragraphs_print_formatted, preserve_spaces,
        detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = b''
    log.debug('Reading text from file...')
    length = 0
    base_dir = self.output_dir = os.getcwd()

    # Extract content from zip archive.
    if file_ext == 'txtz':
        archive = ZipFile(stream)
        archive.extractall('.')
        # Concatenate every .txt/.text file found anywhere in the zip.
        for root, _, fnames in os.walk('.'):
            for fname in fnames:
                fname = os.path.join(root, fname)
                if os.path.splitext(fname)[1].lower() in ('.txt', '.text'):
                    with open(fname, 'rb') as fh:
                        txt += fh.read() + b'\n\n'
    else:
        if getattr(stream, 'name', None):
            base_dir = os.path.dirname(stream.name)
        txt = stream.read()
        if file_ext in {'md', 'textile', 'markdown'}:
            options.formatting_type = {'md': 'markdown'}.get(file_ext,
                                                             file_ext)
            log.info('File extension indicates particular formatting. '
                     'Forcing formatting type to: %s',
                     options.formatting_type)
            options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s', ienc)
    else:
        det_encoding = detect(txt[:4096])
        det_encoding, confidence = (det_encoding['encoding'],
                                    det_encoding['confidence'])
        if det_encoding and det_encoding.lower().replace(
                '_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn',
                    'euccn', 'eucgb2312-cn', 'gb2312-1980', 'gb2312-80',
                    'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly
            # set to gb2312 instead of gbk. gbk is a superset of gb2312,
            # anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of '
                  '%s%%', ienc, confidence * 100)
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto '
                  'detect using %s', ienc)

    # Remove BOM from start of txt as its presence can confuse markdown
    import codecs
    for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE,
                codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break
    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = entities.ENT_PAT.sub(entities.xml_entity_to_unicode, txt)
    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type '
                      'using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s',
                      options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s',
                  options.formatting_type)
    if options.formatting_type == 'heuristic':
        setattr(options, 'enable_heuristics', True)
        setattr(options, 'unwrap_lines', False)
        setattr(options, 'smarten_punctuation', True)

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print at transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        analysis = DocAnalysis('txt', txt)
        length = analysis.line_length(.5)
        preprocessor = HeuristicProcessor(options,
                                          log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(
            options, 'dehyphenate', False):
        analysis = DocAnalysis('txt', txt)
        if not length:
            length = analysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # Preserve spaces will replace multiple spaces to a space
    # followed by the entity.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    self.shifted_files = []
    try:
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(
                    txt,
                    extensions=[
                        ext.strip()
                        for ext in options.markdown_extensions.split(',')
                        if ext.strip()])
            except RuntimeError:
                raise ValueError(
                    'This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.'
                    'net/projects/markdown/syntax')
            html = self.fix_resources(html, base_dir)
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
            html = self.fix_resources(html, base_dir)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from ebook_converter.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        htmlfile = self.shift_file('index.html', html.encode('utf-8'))
        saved_odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html',
                                 log, {})
        options.debug_pipeline = saved_odi
    finally:
        for shifted in self.shifted_files:
            os.remove(shifted)

    # Set metadata from file.
    if input_mi is None:
        from ebook_converter.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title
    return oeb