def bulk_defaults_for_input_format(fmt):
    """Return the saved bulk-conversion defaults for input format *fmt*.

    Resolves the input plugin for the format, then its configuration
    widget, and loads the defaults stored under the widget's COMMIT_NAME.
    An empty dict is returned when either lookup fails.
    """
    plugin = plugin_for_input_format(fmt)
    if plugin is None:
        return {}
    widget = config_widget_for_input_plugin(plugin)
    if widget is None:
        return {}
    return load_defaults(widget.COMMIT_NAME)
def queue_convert_jobs(self, jobs, changed, bad, rows, previous,
        converted_func, extra_job_args=None, rows_are_ids=False):
    """Submit prepared conversion *jobs* to the GUI job manager.

    :param jobs: iterable of (func, args, desc, fmt, id, temp_files) tuples
    :param changed: when true, refresh the library view rows afterwards
    :param bad: ids whose jobs must not be queued
    :param rows: row indices (or ids when rows_are_ids) to refresh
    :param previous: previously-current index, passed to current_changed
    :param converted_func: callback dispatched when a job finishes
    :param extra_job_args: extra values appended to each job's stored args
    :param rows_are_ids: interpret *rows* as book ids instead of view rows
    """
    # BUG FIX: extra_job_args previously defaulted to a mutable list ([]),
    # which is shared across calls; use None and create a fresh list.
    if extra_job_args is None:
        extra_job_args = []
    for func, args, desc, fmt, id, temp_files in jobs:
        # func may carry flags after a colon, e.g. 'name:flag1;flag2'
        func, _, parts = func.partition(':')
        parts = {x for x in parts.split(';')}
        input_file = args[0]
        input_fmt = os.path.splitext(input_file)[1]
        core_usage = 1
        if input_fmt:
            input_fmt = input_fmt[1:]
            # The input plugin knows how CPU-hungry its conversion is.
            plugin = plugin_for_input_format(input_fmt)
            if plugin is not None:
                core_usage = plugin.core_usage
        if id not in bad:
            job = self.gui.job_manager.run_job(Dispatcher(converted_func), func, args=args,
                    description=desc, core_usage=core_usage)
            job.conversion_of_same_fmt = 'same_fmt' in parts
            job.manually_fine_tune_toc = 'manually_fine_tune_toc' in parts
            args = [temp_files, fmt, id] + extra_job_args
            self.conversion_jobs[job] = tuple(args)

    if changed:
        m = self.gui.library_view.model()
        if rows_are_ids:
            m.refresh_ids(rows)
        else:
            m.refresh_rows(rows)
        current = self.gui.library_view.currentIndex()
        self.gui.library_view.model().current_changed(current, previous)
def book_manifest(ctx, rd, book_id, fmt):
    # Serve the cached viewer manifest for one book format, or schedule
    # the background job that builds it and report the job's status.
    db, library_id = get_library_data(ctx, rd)[:2]
    force_reload = rd.query.get('force_reload') == '1'
    # Only formats that have an input plugin can be prepared for viewing.
    if plugin_for_input_format(fmt) is None:
        raise HTTPNotFound('The format %s cannot be viewed' % fmt.upper())
    if book_id not in ctx.allowed_book_ids(rd, db):
        raise HTTPNotFound('No book with id: %s in library: %s' % (book_id, library_id))
    with db.safe_read_lock:
        fm = db.format_metadata(book_id, fmt)
        if not fm:
            raise HTTPNotFound('No %s format for the book (id:%s) in the library: %s' % (fmt, book_id, library_id))
        # mtime is scaled by 10 before truncation — presumably to retain
        # sub-second resolution in the hash key; TODO confirm against
        # book_hash's other callers.
        size, mtime = map(int, (fm['size'], time.mktime(fm['mtime'].utctimetuple())*10))
        bhash = book_hash(db.library_id, book_id, fmt, size, mtime)
        with cache_lock:
            mpath = abspath(os.path.join(books_cache_dir(), 'f', bhash, 'calibre-book-manifest.json'))
            if force_reload:
                # Client asked for a rebuild: drop the cached manifest.
                safe_remove(mpath, True)
            try:
                # Touch first (marks recent use); raises ENOENT when no
                # cached manifest exists yet.
                os.utime(mpath, None)
                with lopen(mpath, 'rb') as f:
                    ans = jsonlib.load(f)
                ans['metadata'] = book_as_json(db, book_id)
                return ans
            except EnvironmentError as e:
                if e.errno != errno.ENOENT:
                    raise
            # No cached manifest: report a failed build, reuse an already
            # queued build, or queue a fresh background conversion.
            x = failed_jobs.pop(bhash, None)
            if x is not None:
                return {'aborted':x[0], 'traceback':x[1], 'job_status':'finished'}
            job_id = queued_jobs.get(bhash)
            if job_id is None:
                job_id = queue_job(ctx, partial(db.copy_format_to, book_id, fmt), bhash, fmt, book_id, size, mtime)
    status, result, tb, aborted = ctx.job_status(job_id)
    return {'aborted': aborted, 'traceback':tb, 'job_status':status, 'job_id':job_id}
def write_mobi(self, input_plugin, output_path, kf8, resources):
    # Run the MOBI-specific transform pipeline on self.oeb and write the
    # result to output_path via MobiWriter.
    from calibre.ebooks.mobi.mobiml import MobiMLizer
    from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
    from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
    from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
    from calibre.customize.ui import plugin_for_input_format
    opts, oeb = self.opts, self.oeb
    if not opts.no_inline_toc:
        # Insert an HTML table of contents page into the spine.
        tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
                opts.mobi_toc_at_start else 'end')
        tocadder(oeb, opts)
    mangler = CaseMangler()
    mangler(oeb, opts)
    try:
        rasterizer = SVGRasterizer()
        rasterizer(oeb, opts)
    except Unavailable:
        self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
    else:
        # Add rasterized SVG images
        resources.add_extra_images()
    if hasattr(self.oeb, 'inserted_metadata_jacket'):
        self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
    mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
    mobimlizer(oeb, opts)
    # Comic (CBZ) input already encodes page breaks in its structure, so
    # no extra per-item page breaks are written for it.
    write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
    from calibre.ebooks.mobi.writer2.main import MobiWriter
    writer = MobiWriter(opts, resources, kf8,
                    write_page_breaks_after_item=write_page_breaks_after_item)
    writer(oeb, output_path)
    extract_mobi(output_path, opts)
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a DJVU file to an OEB book.

    Tries the external djvutxt tool first (much faster), falling back to
    the pure python DJVU text extractor on any failure. The extracted text
    is HTMLized and fed through the HTML input plugin.
    """
    from calibre.ebooks.txt.processor import convert_basic
    stdout = StringIO()
    ppdjvu = True
    # using djvutxt is MUCH faster, should make it an option
    if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
        from calibre.ptempfile import PersistentTemporaryFile
        try:
            fp = PersistentTemporaryFile(suffix='.djvu', prefix='djv_input')
            filename = fp._name
            fp.write(stream.read())
            fp.close()
            cmd = ['djvutxt', filename]
            stdout.write(
                Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
            os.remove(filename)
            ppdjvu = False
        except Exception:
            # FIX: was a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit. On failure, rewind and retry
            # with the pure python converter below.
            stream.seek(0)
    if ppdjvu:
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)
    html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
        b'\037', b'\n\n'))
    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwdu()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        # BUG FIX: the fallback name must remain inside *base*; previously
        # the directory was dropped, so the existence check and the write
        # happened against the current working directory instead.
        fname = os.path.join(base, 'index%d.html' % c)
    htmlfile = open(fname, 'wb')
    with htmlfile:
        htmlfile.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile.name, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile.name)
    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)
    return oeb
def book_manifest(ctx, rd, book_id, fmt):
    # Serve the cached viewer manifest for one book format, or schedule
    # the background job that builds it and report the job's status.
    db, library_id = get_library_data(ctx, rd)[:2]
    # Only formats that have an input plugin can be prepared for viewing.
    if plugin_for_input_format(fmt) is None:
        raise HTTPNotFound('The format %s cannot be viewed' % fmt.upper())
    if book_id not in ctx.allowed_book_ids(rd, db):
        raise HTTPNotFound('No book with id: %s in library: %s' % (book_id, library_id))
    with db.safe_read_lock:
        fm = db.format_metadata(book_id, fmt)
        if not fm:
            raise HTTPNotFound('No %s format for the book (id:%s) in the library: %s' % (fmt, book_id, library_id))
        # mtime is scaled by 10 before truncation — presumably to retain
        # sub-second resolution in the hash key; TODO confirm.
        size, mtime = map(int, (fm['size'], time.mktime(fm['mtime'].utctimetuple())*10))
        bhash = book_hash(db.library_id, book_id, fmt, size, mtime)
        with cache_lock:
            mpath = abspath(os.path.join(books_cache_dir(), 'f', bhash, 'calibre-book-manifest.json'))
            try:
                # Touch first (marks recent use); raises ENOENT when no
                # cached manifest exists yet.
                os.utime(mpath, None)
                with lopen(mpath, 'rb') as f:
                    ans = jsonlib.load(f)
                ans['metadata'] = book_as_json(db, book_id)
                return ans
            except EnvironmentError as e:
                if e.errno != errno.ENOENT:
                    raise
            # No cached manifest: report a failed build, reuse an already
            # queued build, or queue a fresh background conversion.
            x = failed_jobs.pop(bhash, None)
            if x is not None:
                return {'aborted':x[0], 'traceback':x[1], 'job_status':'finished'}
            job_id = queued_jobs.get(bhash)
            if job_id is None:
                job_id = queue_job(ctx, partial(db.copy_format_to, book_id, fmt), bhash, fmt, book_id, size, mtime)
    status, result, tb, aborted = ctx.job_status(job_id)
    return {'aborted': aborted, 'traceback':tb, 'job_status':status, 'job_id':job_id}
def do_print():
    """Read a msgpack-encoded print job from stdin and run ebook-convert.

    The job dict supplies input/output paths, paper size, page margins and
    a page-number flag; the conversion is delegated to the ebook-convert
    command line entry point.
    """
    from calibre.customize.ui import plugin_for_input_format
    raw = getattr(sys.stdin, 'buffer', sys.stdin).read()
    job = msgpack_loads(raw)
    ext = job['input'].lower().rpartition('.')[-1]
    plugin = plugin_for_input_format(ext)
    if plugin is None:
        raise ValueError('Not a supported file type: {}'.format(ext.upper()))
    cmdline = [
        'ebook-convert', job['input'], job['output'],
        '--paper-size', job['paper_size'],
        '--pdf-add-toc', '--disable-remove-fake-margins',
        '--chapter-mark', 'none', '-vv',
    ]
    if plugin.is_image_collection:
        cmdline.append('--no-process')
    else:
        cmdline.append('--disable-font-rescaling')
        cmdline.append('--page-breaks-before=/')
    if job['page_numbers']:
        cmdline.append('--pdf-page-numbers')
    # Margins arrive in inches; ebook-convert expects points (1in = 72pt).
    for edge in ('left', 'top', 'right', 'bottom'):
        cmdline.append('--pdf-page-margin-' + edge)
        cmdline.append('%.1f' % (job['margin_' + edge] * 72))
    from calibre.ebooks.conversion.cli import main
    main(cmdline)
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a DJVU file to an OEB book.

    Tries the external djvutxt tool first (much faster), falling back to
    the pure python DJVU text extractor on any failure. The extracted text
    is HTMLized and fed through the HTML input plugin.
    """
    from calibre.ebooks.txt.processor import convert_basic
    stdout = StringIO()
    ppdjvu = True
    # using djvutxt is MUCH faster, should make it an option
    if options.use_djvutxt and os.path.exists('/usr/bin/djvutxt'):
        from calibre.ptempfile import PersistentTemporaryFile
        try:
            fp = PersistentTemporaryFile(suffix='.djvu', prefix='djv_input')
            filename = fp._name
            fp.write(stream.read())
            fp.close()
            cmd = ['djvutxt', filename]
            stdout.write(Popen(cmd, stdout=PIPE, close_fds=True).communicate()[0])
            os.remove(filename)
            ppdjvu = False
        except Exception:
            # FIX: was a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit. On failure, rewind and retry
            # with the pure python converter below.
            stream.seek(0)
    if ppdjvu:
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)
    html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
        b'\037', b'\n\n'))
    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwdu()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        # BUG FIX: the fallback name must remain inside *base*; previously
        # the directory was dropped, so the existence check and the write
        # happened against the current working directory instead.
        fname = os.path.join(base, 'index%d.html' % c)
    htmlfile = open(fname, 'wb')
    with htmlfile:
        htmlfile.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile.name, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile.name)
    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)
    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    # Convert a CHM file to an OEB book by extracting it to HTML in a
    # temporary directory and delegating to the HTML input plugin.
    from calibre.ebooks.chm.metadata import get_metadata_from_reader
    from calibre.customize.ui import plugin_for_input_format
    self.opts = options
    log.debug('Processing CHM...')
    with TemporaryDirectory('_chm2oeb') as tdir:
        if not isinstance(tdir, unicode_type):
            tdir = tdir.decode(filesystem_encoding)
        html_input = plugin_for_input_format('html')
        # Adopt the HTML plugin's recommended values for all its options.
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        no_images = False  # options.no_images
        chm_name = stream.name
        # chm_data = stream.read()
        # closing stream so CHM can be opened by external library
        stream.close()
        log.debug('tdir=%s' % tdir)
        log.debug('stream.name=%s' % stream.name)
        debug_dump = False
        odi = options.debug_pipeline
        if odi:
            debug_dump = os.path.join(odi, 'input')
        mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                debug_dump=debug_dump)
        mainpath = os.path.join(tdir, mainname)
        try:
            metadata = get_metadata_from_reader(self._chm_reader)
        except Exception:
            # Best effort: fall back to a title derived from the filename.
            log.exception('Failed to read metadata, using filename')
            from calibre.ebooks.metadata.book.base import Metadata
            metadata = Metadata(os.path.basename(chm_name))
        encoding = self._chm_reader.get_encoding(
        ) or options.input_encoding or 'cp1252'
        self._chm_reader.CloseCHM()
        # print((tdir, mainpath))
        # from calibre import ipython
        # ipython()
        options.debug_pipeline = None
        options.input_encoding = 'utf-8'
        uenc = encoding
        # Files the reader re-encoded are already UTF-8 on disk.
        if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
            uenc = 'utf-8'
        htmlpath, toc = self._create_html_root(mainpath, log, uenc)
        oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
        options.debug_pipeline = odi
        if toc.count() > 1:
            # Use the CHM's own TOC; drop the generated HTML TOC page.
            oeb.toc = self.parse_html_toc(oeb.spine[0])
            oeb.manifest.remove(oeb.spine[0])
            oeb.auto_generated_toc = False
        return oeb
def options_for_input_fmt(fmt):
    """Return (plugin_full_name, option_names) for input format *fmt*.

    The full name is the plugin's display name, lower-cased with spaces
    turned into underscores; the option names come from the
    OPTIONS['input'] table keyed by the full name minus its trailing
    '_'-separated suffix. (None, ()) when no plugin handles the format.
    """
    from calibre.customize.ui import plugin_for_input_format
    inp = plugin_for_input_format(fmt.lower())
    if inp is None:
        return None, ()
    key = inp.name.lower().replace(' ', '_')
    return key, OPTIONS['input'].get(key.rpartition('_')[0], ())
def opf_to_azw3(opf, outpath, log):
    # Rebuild an AZW3 file at *outpath* from the OPF at *opf*, using the
    # conversion pipeline in passthrough mode (no content transforms).
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    plumber = Plumber(opf, outpath, log)
    plumber.setup_options()
    inp = plugin_for_input_format('azw3')
    outp = plugin_for_output_format('azw3')
    # Passthrough keeps the book's markup as-is during the conversion.
    plumber.opts.mobi_passthrough = True
    oeb = create_oebbook(log, opf, plumber.opts)
    set_cover(oeb)
    outp.convert(oeb, outpath, inp, plumber.opts, log)
def do_rebuild(opf, dest_path):
    # Rebuild an AZW3 file at *dest_path* from the OPF at *opf*, using the
    # conversion pipeline in passthrough mode (no content transforms).
    plumber = Plumber(opf, dest_path, default_log)
    plumber.setup_options()
    inp = plugin_for_input_format('azw3')
    outp = plugin_for_output_format('azw3')
    # Passthrough keeps the book's markup as-is during the conversion.
    plumber.opts.mobi_passthrough = True
    oeb = create_oebbook(default_log, opf, plumber.opts)
    set_cover(oeb)
    outp.convert(oeb, dest_path, inp, plumber.opts, default_log)
def book_manifest(ctx, rd, book_id, fmt):
    # Serve the cached viewer manifest for one book format — including the
    # requesting user's reading positions and annotations — or schedule
    # the background job that builds it and report the job's status.
    db, library_id = get_library_data(ctx, rd)[:2]
    force_reload = rd.query.get('force_reload') == '1'
    # Only formats that have an input plugin can be prepared for viewing.
    if plugin_for_input_format(fmt) is None:
        raise HTTPNotFound('The format %s cannot be viewed' % fmt.upper())
    if not ctx.has_id(rd, db, book_id):
        raise BookNotFound(book_id, db)
    with db.safe_read_lock:
        fm = db.format_metadata(book_id, fmt, allow_cache=False)
        if not fm:
            raise HTTPNotFound(
                'No %s format for the book (id:%s) in the library: %s' %
                (fmt, book_id, library_id))
        # mtime is scaled by 10 before truncation — presumably to retain
        # sub-second resolution in the hash key; TODO confirm.
        size, mtime = map(
            int, (fm['size'], time.mktime(fm['mtime'].utctimetuple()) * 10))
        bhash = book_hash(db.library_id, book_id, fmt, size, mtime)
        with cache_lock:
            mpath = abspath(
                os.path.join(books_cache_dir(), 'f', bhash,
                             'calibre-book-manifest.json'))
            if force_reload:
                # Client asked for a rebuild: drop the cached manifest.
                safe_remove(mpath, True)
            try:
                # Touch first (marks recent use); raises ENOENT when no
                # cached manifest exists yet.
                os.utime(mpath, None)
                with lopen(mpath, 'rb') as f:
                    ans = jsonlib.load(f)
                ans['metadata'] = book_as_json(db, book_id)
                user = rd.username or None
                # Per-user state; anonymous requests get no positions and
                # the '*' wildcard user for annotations.
                ans['last_read_positions'] = db.get_last_read_positions(
                    book_id, fmt, user) if user else []
                ans['annotations_map'] = db.annotations_map_for_book(
                    book_id, fmt, user_type='web', user=user or '*')
                return ans
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
            # No cached manifest: report a failed build, reuse an already
            # queued build, or queue a fresh background conversion.
            x = failed_jobs.pop(bhash, None)
            if x is not None:
                return {
                    'aborted': x[0],
                    'traceback': x[1],
                    'job_status': 'finished'
                }
            job_id = queued_jobs.get(bhash)
            if job_id is None:
                job_id = queue_job(ctx, partial(db.copy_format_to, book_id, fmt),
                                   bhash, fmt, book_id, size, mtime)
    status, result, tb, aborted = ctx.job_status(job_id)
    return {
        'aborted': aborted,
        'traceback': tb,
        'job_status': status,
        'job_id': job_id
    }
def convert(self, stream, options, file_ext, log, accelerators):
    # Convert a CHM file to an OEB book by extracting it to HTML in a
    # temporary directory and delegating to the HTML input plugin.
    from calibre.ebooks.chm.metadata import get_metadata_from_reader
    from calibre.customize.ui import plugin_for_input_format
    self.opts = options
    log.debug('Processing CHM...')
    with TemporaryDirectory('_chm2oeb') as tdir:
        if not isinstance(tdir, unicode):
            tdir = tdir.decode(filesystem_encoding)
        html_input = plugin_for_input_format('html')
        # Adopt the HTML plugin's recommended values for all its options.
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        no_images = False  # options.no_images
        chm_name = stream.name
        # chm_data = stream.read()
        # closing stream so CHM can be opened by external library
        stream.close()
        log.debug('tdir=%s' % tdir)
        log.debug('stream.name=%s' % stream.name)
        debug_dump = False
        odi = options.debug_pipeline
        if odi:
            debug_dump = os.path.join(odi, 'input')
        mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                debug_dump=debug_dump)
        mainpath = os.path.join(tdir, mainname)
        try:
            metadata = get_metadata_from_reader(self._chm_reader)
        except Exception:
            # Best effort: fall back to a title derived from the filename.
            log.exception('Failed to read metadata, using filename')
            from calibre.ebooks.metadata.book.base import Metadata
            metadata = Metadata(os.path.basename(chm_name))
        encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
        self._chm_reader.CloseCHM()
        # print tdir, mainpath
        # from calibre import ipython
        # ipython()
        options.debug_pipeline = None
        options.input_encoding = 'utf-8'
        uenc = encoding
        # Files the reader re-encoded are already UTF-8 on disk.
        if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
            uenc = 'utf-8'
        htmlpath, toc = self._create_html_root(mainpath, log, uenc)
        oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
        options.debug_pipeline = odi
        if toc.count() > 1:
            # Use the CHM's own TOC; drop the generated HTML TOC page.
            oeb.toc = self.parse_html_toc(oeb.spine[0])
            oeb.manifest.remove(oeb.spine[0])
            oeb.auto_generated_toc = False
        return oeb
def commit(self, outpath=None, keep_parsed=False):
    # Write the container back to disk, then rebuild the AZW3 file from
    # its OPF via the conversion pipeline in passthrough mode.
    super(AZW3Container, self).commit(keep_parsed=keep_parsed)
    if outpath is None:
        outpath = self.pathtoazw3
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook
    opf = self.name_path_map[self.opf_name]
    plumber = Plumber(opf, outpath, self.log)
    plumber.setup_options()
    inp = plugin_for_input_format('azw3')
    outp = plugin_for_output_format('azw3')
    # Passthrough keeps the book's markup as-is during the conversion.
    plumber.opts.mobi_passthrough = True
    # NOTE(review): Plumber gets self.log while create_oebbook/convert use
    # default_log — looks inconsistent; confirm which logger is intended.
    oeb = create_oebbook(default_log, opf, plumber.opts)
    set_cover(oeb)
    outp.convert(oeb, outpath, inp, plumber.opts, default_log)
def convert(self, stream, options, file_ext, log, accelerators):
    # Convert a CHM file to an OEB book by extracting it to HTML in a
    # temporary directory and delegating to the HTML input plugin.
    from calibre.ebooks.chm.metadata import get_metadata_from_reader
    from calibre.customize.ui import plugin_for_input_format
    self.opts = options
    log.debug('Processing CHM...')
    with TemporaryDirectory('_chm2oeb') as tdir:
        if not isinstance(tdir, unicode):
            tdir = tdir.decode(filesystem_encoding)
        html_input = plugin_for_input_format('html')
        # Adopt the HTML plugin's recommended values for all its options.
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        no_images = False  # options.no_images
        chm_name = stream.name
        # chm_data = stream.read()
        # closing stream so CHM can be opened by external library
        stream.close()
        log.debug('tdir=%s' % tdir)
        log.debug('stream.name=%s' % stream.name)
        debug_dump = False
        odi = options.debug_pipeline
        if odi:
            debug_dump = os.path.join(odi, 'input')
        mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                debug_dump=debug_dump)
        mainpath = os.path.join(tdir, mainname)
        metadata = get_metadata_from_reader(self._chm_reader)
        self._chm_reader.CloseCHM()
        # print tdir
        # from calibre import ipython
        # ipython()
        options.debug_pipeline = None
        options.input_encoding = 'utf-8'
        # try a custom conversion:
        # oeb = self._create_oebbook(mainpath, tdir, options, log, metadata)
        # try using html converter:
        htmlpath = self._create_html_root(mainpath, log)
        oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
        options.debug_pipeline = odi
        # log.debug('DEBUG: Not removing tempdir %s' % tdir)
        return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    # Convert a DJVU file to an OEB book: extract the embedded text with
    # the pure python DJVU reader, HTMLize it, and delegate to the HTML
    # input plugin. Raises ValueError for image-only (scanned) files.
    from calibre.ebooks.txt.processor import convert_basic
    stdout = BytesIO()
    from calibre.ebooks.djvu.djvu import DJVUFile
    x = DJVUFile(stream)
    x.get_text(stdout)
    raw_text = stdout.getvalue()
    if not raw_text:
        raise ValueError(
            'The DJVU file contains no text, only images, probably page scans.'
            ' calibre only supports conversion of DJVU files with actual text in them.'
        )
    # \037 (unit separator) marks page boundaries in the extracted text
    # stream — presumably; rendered as paragraph breaks. TODO confirm.
    html = convert_basic(
        raw_text.replace(b"\n", b' ').replace(b'\037', b'\n\n'))
    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwd()
    # Pick a scratch HTML filename that does not clobber existing files.
    htmlfile = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(htmlfile):
        c += 1
        htmlfile = os.path.join(base, 'index%d.html' % c)
    with open(htmlfile, 'wb') as f:
        f.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile)
    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)
    return oeb
def extract_content(self, output_dir):
    """Assemble the PDF payload from the container's sections and convert
    it with the PDF input plugin.

    Returns whatever the PDF plugin's convert() returns (an OEB book).
    """
    self.log.info('Extracting PDF...')
    tf = PersistentTemporaryFile('.pdf')
    tf.close()
    # BUG FIX: open() needs the file *path*; the old code rebound the
    # temporary-file object and passed it to open() directly (twice),
    # which fails since a file object is not a path.
    pdf_path = tf.name
    with open(pdf_path, 'wb') as pdf:
        for x in xrange(self.header.section_count()):
            pdf.write(self.header.section_data(x))
    from calibre.customize.ui import plugin_for_input_format
    pdf_plugin = plugin_for_input_format('pdf')
    # Fill in defaults for any PDF-input options we do not already carry.
    for opt in pdf_plugin.options:
        if not hasattr(self.options, opt.option.name):
            setattr(self.options, opt.option.name, opt.recommended_value)
    return pdf_plugin.convert(open(pdf_path, 'rb'), self.options, 'pdf',
                              self.log, {})
def convert(self, stream, options, file_ext, log, accelerators):
    # Decompress a TCR archive to plain text and delegate the rest of the
    # conversion to the TXT input plugin.
    from calibre.ebooks.compression.tcr import decompress
    log.info('Decompressing text...')
    raw_txt = decompress(stream)
    log.info('Converting text to OEB...')
    stream = BytesIO(raw_txt)
    from calibre.customize.ui import plugin_for_input_format
    txt_plugin = plugin_for_input_format('txt')
    for opt in txt_plugin.options:
        # NOTE(review): the guard reads self.options but the assignment
        # targets the local *options* parameter — looks inconsistent;
        # confirm whether both should refer to the same object.
        if not hasattr(self.options, opt.option.name):
            setattr(options, opt.option.name, opt.recommended_value)
    stream.seek(0)
    return txt_plugin.convert(stream, options, 'txt', log, accelerators)
def do_print():
    """Read a msgpack-encoded print job from stdin and run ebook-convert.

    The job dict supplies input/output paths, paper size, page margins and
    a page-number flag; the conversion is delegated to the ebook-convert
    command line entry point.
    """
    from calibre.customize.ui import plugin_for_input_format
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)
    data = msgpack_loads(stdin.read())
    ext = data['input'].lower().rpartition('.')[-1]
    input_plugin = plugin_for_input_format(ext)
    if input_plugin is None:
        # BUG FIX: previously a missing plugin crashed below with an
        # AttributeError on is_image_collection; fail with a clear error
        # (consistent with the sibling implementation of this function).
        raise ValueError('Not a supported file type: {}'.format(ext.upper()))
    args = ['ebook-convert', data['input'], data['output'], '--paper-size',
            data['paper_size'], '--pdf-add-toc', '--disable-remove-fake-margins',
            '--chapter-mark', 'none', '-vv']
    if input_plugin.is_image_collection:
        args.append('--no-process')
    else:
        args.append('--disable-font-rescaling')
        args.append('--page-breaks-before=/')
    if data['page_numbers']:
        args.append('--pdf-page-numbers')
    # Margins arrive in inches; ebook-convert expects points (1in = 72pt).
    for edge in 'left top right bottom'.split():
        args.append('--pdf-page-margin-' + edge)
        args.append('%.1f' % (data['margin_' + edge] * 72))
    from calibre.ebooks.conversion.cli import main
    main(args)
def extract_content(self, output_dir):
    """Decompress every text record in the book and hand the concatenated
    result to the TXT input plugin for conversion to OEB."""
    self.log.info('Decompressing text...')
    sections = []
    for num in range(1, self.header_record.num_records + 1):
        self.log.debug('\tDecompressing text section %i' % num)
        sections.append(self.decompress_text(num))
    raw_txt = b''.join(sections)
    self.log.info('Converting text to OEB...')
    stream = io.BytesIO(raw_txt)
    from calibre.customize.ui import plugin_for_input_format
    txt_plugin = plugin_for_input_format('txt')
    # Adopt the TXT plugin's recommended defaults for any missing options.
    for opt in txt_plugin.options:
        if not hasattr(self.options, opt.option.name):
            setattr(self.options, opt.option.name, opt.recommended_value)
    stream.seek(0)
    return txt_plugin.convert(stream, self.options, 'txt', self.log, {})
def convert(self, stream, options, file_ext, log, accelerators):
    # Convert a DJVU file to an OEB book: extract the embedded text with
    # the pure python DJVU reader, HTMLize it, and delegate to the HTML
    # input plugin.
    from calibre.ebooks.txt.processor import convert_basic
    stdout = BytesIO()
    from calibre.ebooks.djvu.djvu import DJVUFile
    x = DJVUFile(stream)
    x.get_text(stdout)
    # \037 (unit separator) marks page boundaries in the extracted text
    # stream — presumably; rendered as paragraph breaks. TODO confirm.
    html = convert_basic(stdout.getvalue().replace(b"\n", b' ').replace(
        b'\037', b'\n\n'))
    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = getcwd()
    # Pick a scratch HTML filename that does not clobber existing files.
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        fname = os.path.join(base, 'index%d.html'%c)
    htmlfile = open(fname, 'wb')
    with htmlfile:
        htmlfile.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile.name, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile.name)
    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)
    return oeb
def extract_content(self, output_dir):
    # Pull the raw PDF payload out of an AZW4 container and convert it
    # with the PDF input plugin.
    self.log.info('Extracting PDF from AZW4 Container...')
    self.stream.seek(0)
    raw_data = self.stream.read()
    # NOTE(review): '' is a str but mo.group() yields bytes — this is
    # Python 2 era code (os.getcwdu below), where the two are the same
    # type; under Python 3 the empty-match branch would be inconsistent.
    data = ''
    # The PDF is stored verbatim inside the container: everything from
    # the %PDF header to the final %%EOF marker.
    mo = re.search(br'%PDF.+%%EOF', raw_data, flags=re.DOTALL)
    if mo:
        data = mo.group()
    pdf_n = os.path.join(os.getcwdu(), 'tmp.pdf')
    with open(pdf_n, 'wb') as pdf:
        pdf.write(data)
    from calibre.customize.ui import plugin_for_input_format
    pdf_plugin = plugin_for_input_format('pdf')
    # Fill in defaults for any PDF-input options we do not already carry.
    for opt in pdf_plugin.options:
        if not hasattr(self.options, opt.option.name):
            setattr(self.options, opt.option.name, opt.recommended_value)
    return pdf_plugin.convert(open(pdf_n, 'rb'), self.options, 'pdf',
        self.log, {})
def extract_content(self, output_dir):
    """Locate the PDF payload embedded in the AZW4 container, write it to
    a scratch file and convert it with the PDF input plugin."""
    self.log.info('Extracting PDF from AZW4 Container...')
    self.stream.seek(0)
    container = self.stream.read()
    # The PDF is stored verbatim: everything between %PDF and %%EOF.
    match = re.search(br'%PDF.+%%EOF', container, flags=re.DOTALL)
    payload = match.group() if match is not None else b''
    pdf_path = os.path.join(os.getcwd(), 'tmp.pdf')
    with open(pdf_path, 'wb') as out:
        out.write(payload)
    from calibre.customize.ui import plugin_for_input_format
    pdf_plugin = plugin_for_input_format('pdf')
    # Fill in defaults for any PDF-input options we do not already carry.
    for opt in pdf_plugin.options:
        if not hasattr(self.options, opt.option.name):
            setattr(self.options, opt.option.name, opt.recommended_value)
    return pdf_plugin.convert(open(pdf_path, 'rb'), self.options, 'pdf',
        self.log, {})
def convert_vtt_files(self):
    # Convert the collected VTT subtitle files to HTML, add the result to
    # the calibre library, then queue a conversion job to the chosen
    # output format.
    main_lang = self.main_lang_combo.currentData()
    if main_lang == '-':
        QMessageBox.about(self, 'Information',
                          'Select the main language before conversion.')
        return
    sub_lang = str(self.sub_lang_combo.currentData())
    # if user does not select sub_lang, set it to main_lang, so that when
    # converting, it won't generate sub language.
    if sub_lang == '-':
        sub_lang = main_lang
    # convert to html
    if hasattr(self, 'cover_file_path'):
        cover_file_path = self.cover_file_path
    else:
        cover_file_path = None
    self.book_id = self.convert_to_html_add_to_library(
        self.vtt_dir, main_lang, sub_lang, cover_file_path)
    # add html to epub conversion job
    self.jobs, changed, bad = convert_single_ebook(
        self.gui, self.gui.library_view.model().db, [self.book_id], True,
        self.outputFmt)
    func, args, desc, fmt, id, temp_files = self.jobs[0]
    # The HTML input plugin knows how CPU-hungry its conversion is.
    core_usage = 1
    plugin = plugin_for_input_format('html')
    if plugin is not None:
        core_usage = plugin.core_usage
    self.gui.job_manager.run_job(Dispatcher(self.converted_func), func,
                                 args=args, description=desc,
                                 core_usage=core_usage)
    self.close()
def __init__(self, input, output, log, report_progress=DummyReporter(),
        dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
        override_input_metadata=False):
    '''
    Set up the conversion pipeline: normalize the input/output paths,
    build the list of format-independent conversion options, detect the
    input and output formats and look up the plugins that handle them.

    :param input: Path to input file.
    :param output: Path to output file/directory
    '''
    # NOTE(review): DummyReporter() as a default argument is evaluated once
    # and shared across calls -- harmless only if it is stateless; verify.
    if isbytestring(input):
        input = input.decode(filesystem_encoding)
    if isbytestring(output):
        output = output.decode(filesystem_encoding)
    self.original_input_arg = input
    self.input = os.path.abspath(input)
    self.output = os.path.abspath(output)
    self.log = log
    self.ui_reporter = report_progress
    self.abort_after_input_dump = abort_after_input_dump
    self.override_input_metadata = override_input_metadata

    # Pipeline options {{{
    # Initialize the conversion options that are independent of input and
    # output formats. The input and output plugins can still disable these
    # options via recommendations.
    self.pipeline_options = [

        OptionRecommendation(name='verbose',
            recommended_value=0, level=OptionRecommendation.LOW,
            short_switch='v',
            help=_('Level of verbosity. Specify multiple times for greater '
                   'verbosity.')
        ),

        OptionRecommendation(name='debug_pipeline',
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='d',
            help=_('Save the output from different stages of the conversion '
                   'pipeline to the specified '
                   'directory. Useful if you are unsure at which stage '
                   'of the conversion process a bug is occurring.')
        ),

        OptionRecommendation(name='input_profile',
            recommended_value='default', level=OptionRecommendation.LOW,
            choices=[x.short_name for x in input_profiles()],
            help=_('Specify the input profile. The input profile gives the '
                   'conversion system information on how to interpret '
                   'various information in the input document. For '
                   'example resolution dependent lengths (i.e. lengths in '
                   'pixels). '
                   'Choices are:') + ', '.join([x.short_name for x in input_profiles()])
        ),

        OptionRecommendation(name='output_profile',
            recommended_value='default', level=OptionRecommendation.LOW,
            choices=[x.short_name for x in output_profiles()],
            help=_('Specify the output profile. The output profile '
                   'tells the conversion system how to optimize the '
                   'created document for the specified device. In some cases, '
                   'an output profile is required to produce documents that '
                   'will work on a device. For example EPUB on the SONY reader. '
                   'Choices are:') + ', '.join([x.short_name for x in output_profiles()])
        ),

        # Font handling options.
        OptionRecommendation(name='base_font_size',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_('The base font size in pts. All font sizes in the produced book '
                   'will be rescaled based on this size. By choosing a larger '
                   'size you can make the fonts in the output bigger and vice '
                   'versa. By default, the base font size is chosen based on '
                   'the output profile you chose.')
        ),

        OptionRecommendation(name='font_size_mapping',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Mapping from CSS font names to font sizes in pts. '
                   'An example setting is 12,12,14,16,18,20,22,24. '
                   'These are the mappings for the sizes xx-small to xx-large, '
                   'with the final size being for huge fonts. The font '
                   'rescaling algorithm uses these sizes to intelligently '
                   'rescale fonts. The default is to use a mapping based on '
                   'the output profile you chose.')
        ),

        OptionRecommendation(name='disable_font_rescaling',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Disable all rescaling of font sizes.')
        ),

        OptionRecommendation(name='minimum_line_height',
            recommended_value=120.0, level=OptionRecommendation.LOW,
            help=_(
                'The minimum line height, as a percentage of the element\'s '
                'calculated font size. calibre will ensure that every element '
                'has a line height of at least this setting, irrespective of '
                'what the input document specifies. Set to zero to disable. '
                'Default is 120%. Use this setting in preference to '
                'the direct line height specification, unless you know what '
                'you are doing. For example, you can achieve "double spaced" '
                'text by setting this to 240.')
        ),

        OptionRecommendation(name='line_height',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_(
                'The line height in pts. Controls spacing between consecutive '
                'lines of text. Only applies to elements that do not define '
                'their own line height. In most cases, the minimum line height '
                'option is more useful. '
                'By default no line height manipulation is performed.')
        ),

        OptionRecommendation(name='linearize_tables',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Some badly designed documents use tables to control the '
                   'layout of text on the page. When converted these documents '
                   'often have text that runs off the page and other artifacts. '
                   'This option will extract the content from the tables and '
                   'present it in a linear fashion.')
        ),

        # Table of Contents / structure detection options.
        OptionRecommendation(name='level1_toc',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('XPath expression that specifies all tags that '
                   'should be added to the Table of Contents at level one. If '
                   'this is specified, it takes precedence over other forms '
                   'of auto-detection.'
                   ' See the XPath Tutorial in the calibre User Manual for examples.')
        ),

        OptionRecommendation(name='level2_toc',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('XPath expression that specifies all tags that should be '
                   'added to the Table of Contents at level two. Each entry is added '
                   'under the previous level one entry.'
                   ' See the XPath Tutorial in the calibre User Manual for examples.')
        ),

        OptionRecommendation(name='level3_toc',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('XPath expression that specifies all tags that should be '
                   'added to the Table of Contents at level three. Each entry '
                   'is added under the previous level two entry.'
                   ' See the XPath Tutorial in the calibre User Manual for examples.')
        ),

        OptionRecommendation(name='use_auto_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Normally, if the source file already has a Table of '
                   'Contents, it is used in preference to the auto-generated one. '
                   'With this option, the auto-generated one is always used.')
        ),

        OptionRecommendation(name='no_chapters_in_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_("Don't add auto-detected chapters to the Table of "
                   'Contents.')
        ),

        OptionRecommendation(name='toc_threshold',
            recommended_value=6, level=OptionRecommendation.LOW,
            help=_(
                'If fewer than this number of chapters is detected, then links '
                'are added to the Table of Contents. Default: %default')
        ),

        OptionRecommendation(name='max_toc_links',
            recommended_value=50, level=OptionRecommendation.LOW,
            help=_('Maximum number of links to insert into the TOC. Set to 0 '
                   'to disable. Default is: %default. Links are only added to the '
                   'TOC if less than the threshold number of chapters were detected.')
        ),

        OptionRecommendation(name='toc_filter',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Remove entries from the Table of Contents whose titles '
                   'match the specified regular expression. Matching entries and all '
                   'their children are removed.')
        ),

        OptionRecommendation(name='duplicate_links_in_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('When creating a TOC from links in the input document, '
                   'allow duplicate entries, i.e. allow more than one entry '
                   'with the same text, provided that they point to a '
                   'different location.')
        ),

        OptionRecommendation(name='chapter',
            recommended_value="//*[((name()='h1' or name()='h2') and "
            r"re:test(., '\s*((chapter|book|section|part)\s+)|((prolog|prologue|epilogue)(\s+|$))', 'i')) or @class "
            "= 'chapter']",
            level=OptionRecommendation.LOW,
            help=_('An XPath expression to detect chapter titles. The default '
                   'is to consider <h1> or <h2> tags that contain the words '
                   '"chapter","book","section", "prologue", "epilogue", or "part" as chapter titles as '
                   'well as any tags that have class="chapter". The expression '
                   'used must evaluate to a list of elements. To disable chapter '
                   'detection, use the expression "/". See the XPath Tutorial '
                   'in the calibre User Manual for further help on using this '
                   'feature.')
        ),

        OptionRecommendation(name='chapter_mark',
            recommended_value='pagebreak', level=OptionRecommendation.LOW,
            choices=['pagebreak', 'rule', 'both', 'none'],
            help=_('Specify how to mark detected chapters. A value of '
                   '"pagebreak" will insert page breaks before chapters. '
                   'A value of "rule" will insert a line before chapters. '
                   'A value of "none" will disable chapter marking and a '
                   'value of "both" will use both page breaks and lines '
                   'to mark chapters.')
        ),

        # CSS / layout options.
        OptionRecommendation(name='extra_css',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Either the path to a CSS stylesheet or raw CSS. '
                   'This CSS will be appended to the style rules from '
                   'the source file, so it can be used to override those '
                   'rules.')
        ),

        OptionRecommendation(name='filter_css',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('A comma separated list of CSS properties that '
                   'will be removed from all CSS style rules. This is useful '
                   'if the presence of some style information prevents it '
                   'from being overridden on your device. '
                   'For example: '
                   'font-family,color,margin-left,margin-right')
        ),

        OptionRecommendation(name='page_breaks_before',
            recommended_value="//*[name()='h1' or name()='h2']",
            level=OptionRecommendation.LOW,
            help=_('An XPath expression. Page breaks are inserted '
                   'before the specified elements.')
        ),

        OptionRecommendation(name='remove_fake_margins',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Some documents specify page margins by '
                   'specifying a left and right margin on each individual '
                   'paragraph. calibre will try to detect and remove these '
                   'margins. Sometimes, this can cause the removal of '
                   'margins that should not have been removed. In this '
                   'case you can disable the removal.')
        ),

        OptionRecommendation(name='margin_top',
            recommended_value=5.0, level=OptionRecommendation.LOW,
            help=_('Set the top margin in pts. Default is %default. '
                   'Setting this to less than zero will cause no margin to be set. '
                   'Note: 72 pts equals 1 inch')),

        OptionRecommendation(name='margin_bottom',
            recommended_value=5.0, level=OptionRecommendation.LOW,
            help=_('Set the bottom margin in pts. Default is %default. '
                   'Setting this to less than zero will cause no margin to be set. '
                   'Note: 72 pts equals 1 inch')),

        OptionRecommendation(name='margin_left',
            recommended_value=5.0, level=OptionRecommendation.LOW,
            help=_('Set the left margin in pts. Default is %default. '
                   'Setting this to less than zero will cause no margin to be set. '
                   'Note: 72 pts equals 1 inch')),

        OptionRecommendation(name='margin_right',
            recommended_value=5.0, level=OptionRecommendation.LOW,
            help=_('Set the right margin in pts. Default is %default. '
                   'Setting this to less than zero will cause no margin to be set. '
                   'Note: 72 pts equals 1 inch')),

        OptionRecommendation(name='change_justification',
            recommended_value='original', level=OptionRecommendation.LOW,
            choices=['left','justify','original'],
            help=_('Change text justification. A value of "left" converts all'
                   ' justified text in the source to left aligned (i.e. '
                   'unjustified) text. A value of "justify" converts all '
                   'unjustified text to justified. A value of "original" '
                   '(the default) does not change justification in the '
                   'source file. Note that only some output formats support '
                   'justification.')),

        OptionRecommendation(name='remove_paragraph_spacing',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Remove spacing between paragraphs. Also sets an indent on '
                   'paragraphs of 1.5em. Spacing removal will not work '
                   'if the source file does not use paragraphs (<p> or <div> tags).')
        ),

        OptionRecommendation(name='remove_paragraph_spacing_indent_size',
            recommended_value=1.5, level=OptionRecommendation.LOW,
            help=_('When calibre removes blank lines between paragraphs, it automatically '
                   'sets a paragraph indent, to ensure that paragraphs can be easily '
                   'distinguished. This option controls the width of that indent (in em). '
                   'If you set this value negative, then the indent specified in the input '
                   'document is used, that is, calibre does not change the indentation.')
        ),

        OptionRecommendation(name='prefer_metadata_cover',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Use the cover detected from the source file in preference '
                   'to the specified cover.')
        ),

        OptionRecommendation(name='insert_blank_line',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Insert a blank line between paragraphs. Will not work '
                   'if the source file does not use paragraphs (<p> or <div> tags).')
        ),

        OptionRecommendation(name='insert_blank_line_size',
            recommended_value=0.5, level=OptionRecommendation.LOW,
            help=_('Set the height of the inserted blank lines (in em).'
                   ' The height of the lines between paragraphs will be twice the value'
                   ' set here.')
        ),

        OptionRecommendation(name='remove_first_image',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Remove the first image from the input ebook. Useful if the '
                   'input document has a cover image that is not identified as a cover. '
                   'In this case, if you set a cover in calibre, the output document will '
                   'end up with two cover images if you do not specify this option.')
        ),

        OptionRecommendation(name='insert_metadata',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Insert the book metadata at the start of '
                   'the book. This is useful if your ebook reader does not support '
                   'displaying/searching metadata directly.')
        ),

        OptionRecommendation(name='smarten_punctuation',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Convert plain quotes, dashes and ellipsis to their '
                   'typographically correct equivalents. For details, see '
                   'http://daringfireball.net/projects/smartypants')
        ),

        OptionRecommendation(name='unsmarten_punctuation',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Convert fancy quotes, dashes and ellipsis to their '
                   'plain equivalents.')
        ),

        OptionRecommendation(name='read_metadata_from_opf',
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='m',
            help=_('Read metadata from the specified OPF file. Metadata read '
                   'from this file will override any metadata in the source '
                   'file.')
        ),

        OptionRecommendation(name='asciiize',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=(_('Transliterate unicode characters to an ASCII '
                    'representation. Use with care because this will replace '
                    'unicode characters with ASCII. For instance it will replace "%s" '
                    'with "Mikhail Gorbachiov". Also, note that in '
                    'cases where there are multiple representations of a character '
                    '(characters shared by Chinese and Japanese for instance) the '
                    'representation based on the current calibre interface language will be '
                    'used.') %
                  u'\u041c\u0438\u0445\u0430\u0438\u043b '
                  u'\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432')
        ),

        OptionRecommendation(name='keep_ligatures',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Preserve ligatures present in the input document. '
                   'A ligature is a special rendering of a pair of '
                   'characters like ff, fi, fl et cetera. '
                   'Most readers do not have support for '
                   'ligatures in their default fonts, so they are '
                   'unlikely to render correctly. By default, calibre '
                   'will turn a ligature into the corresponding pair of normal '
                   'characters. This option will preserve them instead.')
        ),

        # Metadata override options.
        OptionRecommendation(name='title',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the title.')),

        OptionRecommendation(name='authors',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the authors. Multiple authors should be separated by '
                   'ampersands.')),

        OptionRecommendation(name='title_sort',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('The version of the title to be used for sorting. ')),

        OptionRecommendation(name='author_sort',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('String to be used when sorting by author. ')),

        OptionRecommendation(name='cover',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the cover to the specified file or URL')),

        OptionRecommendation(name='comments',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the ebook description.')),

        OptionRecommendation(name='publisher',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the ebook publisher.')),

        OptionRecommendation(name='series',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the series this ebook belongs to.')),

        OptionRecommendation(name='series_index',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the index of the book in this series.')),

        OptionRecommendation(name='rating',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the rating. Should be a number between 1 and 5.')),

        OptionRecommendation(name='isbn',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the ISBN of the book.')),

        OptionRecommendation(name='tags',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the tags for the book. Should be a comma separated list.')),

        OptionRecommendation(name='book_producer',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the book producer.')),

        OptionRecommendation(name='language',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the language.')),

        OptionRecommendation(name='pubdate',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the publication date.')),

        OptionRecommendation(name='timestamp',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_('Set the book timestamp (no longer used anywhere)')),

        # Heuristic-processing options (only applied when enable_heuristics
        # is set).
        OptionRecommendation(name='enable_heuristics',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Enable heuristic processing. This option must be set for any '
                   'heuristic processing to take place.')),

        OptionRecommendation(name='markup_chapter_headings',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Detect unformatted chapter headings and sub headings. Change '
                   'them to h2 and h3 tags. This setting will not create a TOC, '
                   'but can be used in conjunction with structure detection to create '
                   'one.')),

        OptionRecommendation(name='italicize_common_cases',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Look for common words and patterns that denote '
                   'italics and italicize them.')),

        OptionRecommendation(name='fix_indents',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Turn indentation created from multiple non-breaking space entities '
                   'into CSS indents.')),

        OptionRecommendation(name='html_unwrap_factor',
            recommended_value=0.40, level=OptionRecommendation.LOW,
            help=_('Scale used to determine the length at which a line should '
                   'be unwrapped. Valid values are a decimal between 0 and 1. The '
                   'default is 0.4, just below the median line length. If only a '
                   'few lines in the document require unwrapping this value should '
                   'be reduced')),

        OptionRecommendation(name='unwrap_lines',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Unwrap lines using punctuation and other formatting clues.')),

        OptionRecommendation(name='delete_blank_paragraphs',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Remove empty paragraphs from the document when they exist between '
                   'every other paragraph')),

        OptionRecommendation(name='format_scene_breaks',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Left aligned scene break markers are center aligned. '
                   'Replace soft scene breaks that use multiple blank lines with '
                   'horizontal rules.')),

        OptionRecommendation(name='replace_scene_breaks',
            recommended_value='', level=OptionRecommendation.LOW,
            help=_('Replace scene breaks with the specified text. By default, the '
                   'text from the input document is used.')),

        OptionRecommendation(name='dehyphenate',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Analyze hyphenated words throughout the document. The '
                   'document itself is used as a dictionary to determine whether hyphens '
                   'should be retained or removed.')),

        OptionRecommendation(name='renumber_headings',
            recommended_value=True, level=OptionRecommendation.LOW,
            help=_('Looks for occurrences of sequential <h1> or <h2> tags. '
                   'The tags are renumbered to prevent splitting in the middle '
                   'of chapter headings.')),

        # Search & replace options.
        OptionRecommendation(name='sr1_search',
            recommended_value='', level=OptionRecommendation.LOW,
            help=_('Search pattern (regular expression) to be replaced with '
                   'sr1-replace.')),

        OptionRecommendation(name='sr1_replace',
            recommended_value='', level=OptionRecommendation.LOW,
            help=_('Replacement to replace the text found with sr1-search.')),

        OptionRecommendation(name='sr2_search',
            recommended_value='', level=OptionRecommendation.LOW,
            help=_('Search pattern (regular expression) to be replaced with '
                   'sr2-replace.')),

        OptionRecommendation(name='sr2_replace',
            recommended_value='', level=OptionRecommendation.LOW,
            help=_('Replacement to replace the text found with sr2-search.')),

        OptionRecommendation(name='sr3_search',
            recommended_value='', level=OptionRecommendation.LOW,
            help=_('Search pattern (regular expression) to be replaced with '
                   'sr3-replace.')),

        OptionRecommendation(name='sr3_replace',
            recommended_value='', level=OptionRecommendation.LOW,
            help=_('Replacement to replace the text found with sr3-search.')),

        OptionRecommendation(name='search_replace',
            recommended_value=None, level=OptionRecommendation.LOW,
            help=_(
                'Path to a file containing search and replace regular expressions. '
                'The file must contain alternating lines of regular expression '
                'followed by replacement pattern (which can be an empty line). '
                'The regular expression must be in the python regex syntax and '
                'the file must be UTF-8 encoded.')),
    ]
    # }}}

    # Determine the input format from the file extension.
    input_fmt = os.path.splitext(self.input)[1]
    if not input_fmt:
        raise ValueError('Input file must have an extension')
    input_fmt = input_fmt[1:].lower().replace('original_', '')
    self.archive_input_tdir = None
    # Archive inputs (e.g. zip/rar) are unpacked first; the real input file
    # and format are taken from the archive contents.
    if input_fmt in ARCHIVE_FMTS:
        self.log('Processing archive...')
        tdir = PersistentTemporaryDirectory('_plumber_archive')
        self.input, input_fmt = self.unarchive(self.input, tdir)
        self.archive_input_tdir = tdir
    # Give file-type plugins a chance to preprocess the input; they may
    # replace it with a different file (and hence a different format).
    if os.access(self.input, os.R_OK):
        nfp = run_plugins_on_preprocess(self.input, input_fmt)
        if nfp != self.input:
            self.input = nfp
            input_fmt = os.path.splitext(self.input)[1]
            if not input_fmt:
                raise ValueError('Input file must have an extension')
            input_fmt = input_fmt[1:].lower()

    # A directory output means an unpacked OEB book.
    if os.path.exists(self.output) and os.path.isdir(self.output):
        output_fmt = 'oeb'
    else:
        output_fmt = os.path.splitext(self.output)[1]
        if not output_fmt:
            output_fmt = '.oeb'
        output_fmt = output_fmt[1:].lower()

    self.input_plugin = plugin_for_input_format(input_fmt)
    self.output_plugin = plugin_for_output_format(output_fmt)

    if self.input_plugin is None:
        raise ValueError('No plugin to handle input format: '+input_fmt)

    if self.output_plugin is None:
        raise ValueError('No plugin to handle output format: '+output_fmt)

    self.input_fmt = input_fmt
    self.output_fmt = output_fmt

    self.all_format_options = set()
    self.input_options = set()
    self.output_options = set()
    # Build set of all possible options. Two options are equal if their
    # names are the same.
    if not dummy:
        # Normal conversion: only the detected plugins' options matter.
        self.input_options = self.input_plugin.options.union(
            self.input_plugin.common_options)
        self.output_options = self.output_plugin.options.union(
            self.output_plugin.common_options)
    else:
        # Dummy mode: collect the options of every available input and
        # output plugin instead.
        for fmt in available_input_formats():
            input_plugin = plugin_for_input_format(fmt)
            if input_plugin:
                self.all_format_options = self.all_format_options.union(
                    input_plugin.options.union(input_plugin.common_options))
        for fmt in available_output_formats():
            output_plugin = plugin_for_output_format(fmt)
            if output_plugin:
                self.all_format_options = self.all_format_options.union(
                    output_plugin.options.union(output_plugin.common_options))

    # Remove the options that have been disabled by recommendations from the
    # plugins.
    # Clone every option so later mutation does not affect the plugins'
    # shared OptionRecommendation instances.
    for w in ('input_options', 'output_options', 'all_format_options'):
        temp = set([])
        for x in getattr(self, w):
            temp.add(x.clone())
        setattr(self, w, temp)
    if merge_plugin_recs:
        self.merge_plugin_recommendations()
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a TXT/TXTZ/Markdown/Textile input into an OEB book.

    Reads the raw bytes, detects/decodes the character encoding, normalizes
    paragraph structure, optionally applies heuristic cleanups, renders the
    text to HTML (via markdown, textile or basic conversion) and hands the
    HTML to the HTML input plugin to produce the OEB book.

    :param stream: binary file-like object with the input document.
    :param options: conversion options namespace (mutated in place).
    :param file_ext: lower-case input extension, e.g. 'txt', 'txtz', 'md'.
    :return: the OEB book produced by the HTML input plugin.
    """
    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from calibre.ebooks.chardet import detect
    from calibre.utils.zipfile import ZipFile
    from calibre.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line, separate_paragraphs_print_formatted,
        preserve_spaces, detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    # BUGFIX: accumulate *bytes*, not str -- the txtz branch appends
    # tf.read() (bytes), and txt is later matched against codecs.BOM_*
    # byte constants and decoded with txt.decode(ienc). Initializing with
    # '' raised TypeError on Python 3 for txtz inputs.
    txt = b''
    log.debug('Reading text from file...')
    length = 0

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')

        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                with open(x, 'rb') as tf:
                    # BUGFIX: separator must be bytes to match tf.read().
                    txt += tf.read() + b'\n\n'
    else:
        txt = stream.read()
        if file_ext in {'md', 'textile', 'markdown'}:
            options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
            log.info('File extension indicates particular formatting. '
                     'Forcing formatting type to: %s' % options.formatting_type)
            options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s' % ienc)
    else:
        det_encoding = detect(txt)
        det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
        if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto detect using %s' % ienc)
    # Remove BOM from start of txt as its presence can confuse markdown
    import codecs
    for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break
    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = _ent_pat.sub(xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s' % options.formatting_type)

    if options.formatting_type == 'heuristic':
        setattr(options, 'enable_heuristics', True)
        setattr(options, 'unwrap_lines', False)
        setattr(options, 'smarten_punctuation', True)

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print at transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        if not length:
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # Preserve spaces will replace multiple spaces to a space
    # followed by the entity.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    html = ''
    input_mi = None
    if options.formatting_type == 'markdown':
        log.debug('Running text through markdown conversion...')
        try:
            input_mi, html = convert_markdown_with_metadata(
                txt,
                extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
        except RuntimeError:
            raise ValueError(
                'This txt file has malformed markup, it cannot be'
                ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
    elif options.formatting_type == 'textile':
        log.debug('Running text through textile conversion...')
        html = convert_textile(txt)
    else:
        log.debug('Running text through basic conversion...')
        flow_size = getattr(options, 'flow_size', 0)
        html = convert_basic(txt, epub_split_size_kb=flow_size)

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    # BUGFIX: os.getcwdu() only exists on Python 2; os.getcwd() already
    # returns unicode on Python 3.
    base = os.getcwd()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    fname = os.path.join(base, 'index.html')
    c = 0
    # Pick a file name that does not collide with existing files.
    while os.path.exists(fname):
        c += 1
        fname = 'index%d.html' % c
    htmlfile = open(fname, 'wb')
    with htmlfile:
        htmlfile.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile.name)

    # Set metadata from file.
    if input_mi is None:
        from calibre.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title

    return oeb
def __init__(self, book_fmt, opfpath, input_fmt, tdir, log=None, book_hash=None,
             save_bookmark_data=False, book_metadata=None, allow_no_cover=True,
             virtualize_resources=True):
    '''
    Prepare an exploded book for the in-browser viewer.

    Opens the book rooted at ``tdir`` via ``ContainerBase``, builds the cover
    page and table of contents, virtualizes resources, and writes a
    ``calibre-book-manifest.json`` file into the container root describing
    every remaining file.

    :param book_fmt: Format name recorded in the manifest (``book_format``).
    :param opfpath: Path to the OPF file of the exploded book.
    :param input_fmt: Original input format; used to pick cover handling and
        to detect image-collection (comic) input plugins.
    :param tdir: Directory containing the exploded book.
    :param log: Optional logger; falls back to ``default_log``.
    :param book_hash: Opaque hash stored verbatim in the manifest.
    :param save_bookmark_data: If True, reads META-INF/calibre_bookmarks.txt
        into ``self.bookmark_data`` before processing.
    :param book_metadata: Metadata object used for generated covers.
    :param allow_no_cover: Passed through to cover-page creation.
    :param virtualize_resources: Forwarded to ``self.transform_all``.
    '''
    log = log or default_log
    self.allow_no_cover = allow_no_cover
    ContainerBase.__init__(self, tdir, opfpath, log)
    self.book_metadata = book_metadata
    input_plugin = plugin_for_input_format(input_fmt)
    self.is_comic = bool(getattr(input_plugin, 'is_image_collection', False))
    if save_bookmark_data:
        bm_file = 'META-INF/calibre_bookmarks.txt'
        self.bookmark_data = None
        if self.exists(bm_file):
            with self.open(bm_file, 'rb') as f:
                self.bookmark_data = f.read()
    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name for name, mt in iteritems(self.mime_map) if
        name == self.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype' or
        not self.has_name_and_is_not_empty(name)}
    raster_cover_name, titlepage_name = self.create_cover_page(input_fmt.lower())

    toc = get_toc(self).to_dict(count())
    if not toc or not toc.get('children'):
        # No usable ToC in the book: synthesize one from heading tags.
        toc = from_xpaths(self, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
    spine = [name for name, is_linear in self.spine_names]
    spineq = frozenset(spine)
    landmarks = [l for l in get_landmarks(self) if l['dest'] in spineq]

    self.book_render_data = data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'book_format': book_fmt,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': self.is_comic,
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,
        'total_length': 0,
        'spine_length': 0,
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }
    # Mark the spine as dirty since we have to ensure it is normalized
    for name in data['spine']:
        self.parsed(name), self.dirty(name)
    self.virtualized_names = set()
    self.transform_all(virtualize_resources)

    def manifest_data(name):
        # Per-file manifest entry. Also accumulates the book-wide totals
        # (total_length, spine_length, has_maths) as a side effect.
        mt = (self.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(self.name_path_map[name]),
            'is_virtualized': name in self.virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            root = self.parsed(name)
            ans['length'] = l = get_length(root)
            self.book_render_data['total_length'] += l
            if name in data['spine']:
                self.book_render_data['spine_length'] += l
            ans['has_maths'] = hm = check_for_maths(root)
            if hm:
                self.book_render_data['has_maths'] = True
            ans['anchor_map'] = anchor_map(root)
        return ans

    data['files'] = {name: manifest_data(name) for name in set(self.name_path_map) - excluded_names}
    self.commit()
    # Physically delete the files we decided not to serve.
    for name in excluded_names:
        os.remove(self.name_path_map[name])
    data = json.dumps(self.book_render_data, ensure_ascii=False)
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    with lopen(os.path.join(self.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(data)
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert a TXT/TXTZ/Markdown/Textile stream to an OEB book.

    Reads the raw text (extracting TXTZ archives into the cwd), detects
    encoding, paragraph style and formatting, applies the requested text
    transforms, converts to HTML with the appropriate processor, and then
    delegates to the HTML input plugin to produce the OEB.

    :param stream: File-like object containing the input document.
    :param options: Conversion options object; several attributes are
        mutated in place (formatting_type, paragraph_type, input_encoding,
        plus the HTML input plugin's recommended values).
    :param file_ext: Lower-case input extension (e.g. 'txt', 'txtz', 'md').
    :return: The OEB book produced by the HTML input plugin.
    :raises ValueError: If markdown conversion fails on malformed markup.
    '''
    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from calibre.ebooks.chardet import detect
    from calibre.utils.zipfile import ZipFile
    from calibre.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line, separate_paragraphs_print_formatted,
        preserve_spaces, detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = ''
    log.debug('Reading text from file...')
    length = 0

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')
        # Concatenate every text file found anywhere inside the archive.
        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                with open(x, 'rb') as tf:
                    txt += tf.read() + '\n\n'
    else:
        txt = stream.read()
        if file_ext in {'md', 'textile', 'markdown'}:
            options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
            log.info('File extension indicates particular formatting. '
                     'Forcing formatting type to: %s'%options.formatting_type)
            options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s' % ienc)
    else:
        # Auto-detect from the first 4KB only, for speed.
        det_encoding = detect(txt[:4096])
        det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
        if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto detect using %s' % ienc)
    # Remove BOM from start of txt as its presence can confuse markdown
    import codecs
    for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break
    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = _ent_pat.sub(xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s' % options.formatting_type)

    if options.formatting_type == 'heuristic':
        setattr(options, 'enable_heuristics', True)
        setattr(options, 'unwrap_lines', False)
        setattr(options, 'smarten_punctuation', True)

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print at transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        if not length:
            # length may already have been computed in the 'unformatted' branch.
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt,'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # Preserve spaces will replace multiple spaces to a space
    # followed by the entity.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    html = ''
    input_mi = None
    if options.formatting_type == 'markdown':
        log.debug('Running text through markdown conversion...')
        try:
            input_mi, html = convert_markdown_with_metadata(
                txt,
                extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
        except RuntimeError:
            raise ValueError('This txt file has malformed markup, it cannot be'
                ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
    elif options.formatting_type == 'textile':
        log.debug('Running text through textile conversion...')
        html = convert_textile(txt)
    else:
        log.debug('Running text through basic conversion...')
        flow_size = getattr(options, 'flow_size', 0)
        html = convert_basic(txt, epub_split_size_kb=flow_size)

    # Run the HTMLized text through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwdu()
    if file_ext != 'txtz' and hasattr(stream, 'name'):
        base = os.path.dirname(stream.name)
    # Find an index*.html name that does not already exist in base.
    fname = os.path.join(base, 'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        fname = 'index%d.html'%c
    htmlfile = open(fname, 'wb')
    with htmlfile:
        htmlfile.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile.name)

    # Set metadata from file.
    if input_mi is None:
        from calibre.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title

    return oeb
def extract_content(self, output_dir):
    '''
    Extract text records and images from the Plucker sections into
    ``output_dir``, then hand the generated HTML to the HTML input plugin.

    Each text record becomes ``<uid>.html``; single images become
    ``images/<uid>.jpg`` and composite images are assembled from the
    already-written single images. Returns the OEB book produced by
    converting ``<home_html>.html``.

    :param output_dir: Directory to write the intermediate HTML/images to.
    :raises Exception: If the home.html record cannot be determined.
    '''
    # Each text record is independent (unless the continuation
    # value is set in the previous record). Put each converted
    # text recored into a separate file. We will reference the
    # home.html file as the first file and let the HTML input
    # plugin assemble the order based on hyperlinks.
    with CurrentDir(output_dir):
        for uid, num in self.uid_text_secion_number.items():
            self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
            with open('%s.html' % uid, 'wb') as htmlf:
                html = u'<html><body>'
                section_header, section_data = self.sections[num]
                if section_header.type == DATATYPE_PHTML:
                    html += self.process_phtml(section_data.data, section_data.header.paragraph_offsets)
                elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                    d = self.decompress_phtml(section_data.data)
                    html += self.process_phtml(d, section_data.header.paragraph_offsets).decode(
                        self.get_text_uid_encoding(section_header.uid), 'replace')
                html += '</body></html>'
                htmlf.write(html.encode('utf-8'))

    # Images.
    # Cache the image sizes in case they are used by a composite image.
    images = set()
    if not os.path.exists(os.path.join(output_dir, 'images/')):
        os.makedirs(os.path.join(output_dir, 'images/'))
    with CurrentDir(os.path.join(output_dir, 'images/')):
        # Single images.
        for uid, num in self.uid_image_section_number.items():
            section_header, section_data = self.sections[num]
            if section_data:
                idata = None
                if section_header.type == DATATYPE_TBMP:
                    idata = section_data
                elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                    if self.header_record.compression == 1:
                        idata = decompress_doc(section_data)
                    elif self.header_record.compression == 2:
                        idata = zlib.decompress(section_data)
                try:
                    save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
                    images.add(uid)
                    self.log.debug('Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    # Best-effort: a bad image must not abort the conversion.
                    self.log.error('Failed to write image with uid %s: %s' % (uid, e))
            else:
                self.log.error('Failed to write image with uid %s: No data.' % uid)
        # Composite images.
        # We're going to use the already compressed .jpg images here.
        for uid, num in self.uid_composite_image_section_number.items():
            try:
                section_header, section_data = self.sections[num]
                # Get the final width and height.
                width = 0
                height = 0
                for row in section_data.layout:
                    row_width = 0
                    col_height = 0
                    for col in row:
                        if col not in images:
                            raise Exception('Image with uid: %s missing.' % col)
                        w, h = identify(lopen('%s.jpg' % col, 'rb'))[1:]
                        row_width += w
                        if col_height < h:
                            col_height = h
                    if width < row_width:
                        width = row_width
                    height += col_height
                # Create a new image the total size of all image
                # parts. Put the parts into the new image.
                with Canvas(width, height) as canvas:
                    y_off = 0
                    for row in section_data.layout:
                        x_off = 0
                        largest_height = 0
                        for col in row:
                            im = image_from_data(lopen('%s.jpg' % col, 'rb').read())
                            canvas.compose(im, x_off, y_off)
                            w, h = im.width(), im.height()
                            x_off += w
                            if largest_height < h:
                                largest_height = h
                        y_off += largest_height
                    # FIX: the output file must be opened for binary
                    # writing; previously it was opened in the default
                    # read mode, so out.write() failed and every
                    # composite image was silently dropped by the
                    # except clause below.
                    with lopen('%s.jpg' % uid, 'wb') as out:
                        out.write(canvas.export(compression_quality=70))
                self.log.debug('Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
            except Exception as e:
                self.log.error('Failed to write composite image with uid %s: %s' % (uid, e))

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(self.options, opt.option.name, opt.recommended_value)
    self.options.input_encoding = 'utf-8'
    odi = self.options.debug_pipeline
    self.options.debug_pipeline = None
    # Determine the home.html record uid. This should be set in the
    # reserved values in the metadata recored. home.html is the first
    # text record (should have hyper link references to other records)
    # in the document.
    try:
        home_html = self.header_record.home_html
        if not home_html:
            home_html = self.uid_text_secion_number.items()[0][0]
    except:
        raise Exception('Could not determine home.html')
    # Generate oeb from html conversion.
    oeb = html_input.convert(open('%s.html' % home_html, 'rb'), self.options, 'html', self.log, {})
    self.options.debug_pipeline = odi
    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert an HTMLZ archive to an OEB book.

    Extracts the zip into the current directory, locates the single top
    level HTML file (preferring one named index.*), decodes it, runs it
    through the HTML input plugin, then applies metadata and cover from an
    OPF file if one is present at the top level of the archive.

    :param stream: File-like object containing the HTMLZ archive.
    :param options: Conversion options; the HTML input plugin's recommended
        values are written onto it in place.
    :return: The OEB book produced by the HTML input plugin.
    :raises Exception: If no top level HTML file is found or it is empty.
    '''
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.utils.zipfile import ZipFile

    self.log = log
    html = u''
    top_levels = []

    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()

    # Find the HTML file in the archive. It needs to be
    # top level.
    index = u''
    multiple_html = False
    # Get a list of all top level files in the archive.
    for x in os.listdir(u'.'):
        if os.path.isfile(x):
            top_levels.append(x)
    # Try to find an index. file.
    for x in top_levels:
        if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
            index = x
            break
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = x
            else:
                multiple_html = True
    # Warn the user if there multiple HTML file in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # Multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)

    if index:
        with open(index, 'rb') as tf:
            html = tf.read()
    else:
        raise Exception(_('No top level HTML file found.'))

    if not html:
        raise Exception(_('Top level HTML file %s is empty') % index)

    # Encoding
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        # Sniff encoding from the first 4KB of the document.
        ienc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(ienc, 'replace')

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = os.getcwdu()
    # Find an index*.html name that does not collide with extracted files.
    fname = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        fname = u'index%d.html'%c
    htmlfile = open(fname, 'wb')
    with htmlfile:
        htmlfile.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile.name)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=os.getcwdu())
        cover_path = opf.raster_cover or opf.cover
    # Set the cover.
    if cover_path:
        cdata = None
        with open(os.path.join(os.getcwdu(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
        oeb.guide.add('cover', 'Cover', href)

    return oeb
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert an HTMLZ archive to an OEB book.

    Extracts the zip into the current directory, locates the single top
    level HTML file (preferring one named index.*), decodes it, runs it
    through the HTML input plugin, then applies metadata and cover from an
    OPF file if one is present at the top level of the archive.

    :param stream: File-like object containing the HTMLZ archive.
    :param options: Conversion options; the HTML input plugin's recommended
        values are written onto it in place.
    :return: The OEB book produced by the HTML input plugin.
    :raises Exception: If no top level HTML file is found or it is empty.
    '''
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.utils.zipfile import ZipFile

    self.log = log
    html = u''
    top_levels = []

    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()

    # Find the HTML file in the archive. It needs to be
    # top level.
    index = u''
    multiple_html = False
    # Get a list of all top level files in the archive.
    for x in os.listdir(u'.'):
        if os.path.isfile(x):
            top_levels.append(x)
    # Try to find an index. file.
    for x in top_levels:
        if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
            index = x
            break
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = x
            else:
                multiple_html = True
    # Warn the user if there multiple HTML file in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # Multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)

    if index:
        with open(index, 'rb') as tf:
            html = tf.read()
    else:
        raise Exception(_('No top level HTML file found.'))

    if not html:
        raise Exception(_('Top level HTML file %s is empty') % index)

    # Encoding
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        # Sniff encoding from the first 4KB of the document.
        ienc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(ienc, 'replace')

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = getcwd()
    # Find an index*.html name that does not collide with extracted files.
    fname = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        fname = u'index%d.html' % c
    htmlfile = open(fname, 'wb')
    with htmlfile:
        htmlfile.write(html.encode('utf-8'))
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    oeb = html_input.convert(open(htmlfile.name, 'rb'), options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile.name)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=getcwd())
        cover_path = opf.raster_cover or opf.cover
    # Set the cover.
    if cover_path:
        cdata = None
        with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
        oeb.guide.add('cover', 'Cover', href)

    return oeb
def extract_content(self, output_dir):
    '''
    Extract text records and images from the Plucker sections into
    ``output_dir``, then hand the generated HTML to the HTML input plugin.

    Each text record becomes ``<uid>.html``; single images become
    ``images/<uid>.jpg`` and composite images are assembled from the
    already-written single images. Returns the OEB book produced by
    converting ``<home_html>.html``.

    :param output_dir: Directory to write the intermediate HTML/images to.
    :raises Exception: If the home.html record cannot be determined.
    '''
    # Each text record is independent (unless the continuation
    # value is set in the previous record). Put each converted
    # text recored into a separate file. We will reference the
    # home.html file as the first file and let the HTML input
    # plugin assemble the order based on hyperlinks.
    with CurrentDir(output_dir):
        for uid, num in self.uid_text_secion_number.items():
            self.log.debug('Writing record with uid: %s as %s.html' % (uid, uid))
            with open('%s.html' % uid, 'wb') as htmlf:
                html = u'<html><body>'
                section_header, section_data = self.sections[num]
                if section_header.type == DATATYPE_PHTML:
                    html += self.process_phtml(
                        section_data.data,
                        section_data.header.paragraph_offsets)
                elif section_header.type == DATATYPE_PHTML_COMPRESSED:
                    d = self.decompress_phtml(section_data.data)
                    html += self.process_phtml(
                        d, section_data.header.paragraph_offsets).decode(
                            self.get_text_uid_encoding(section_header.uid), 'replace')
                html += '</body></html>'
                htmlf.write(html.encode('utf-8'))

    # Images.
    # Cache the image sizes in case they are used by a composite image.
    images = set()
    if not os.path.exists(os.path.join(output_dir, 'images/')):
        os.makedirs(os.path.join(output_dir, 'images/'))
    with CurrentDir(os.path.join(output_dir, 'images/')):
        # Single images.
        for uid, num in self.uid_image_section_number.items():
            section_header, section_data = self.sections[num]
            if section_data:
                idata = None
                if section_header.type == DATATYPE_TBMP:
                    idata = section_data
                elif section_header.type == DATATYPE_TBMP_COMPRESSED:
                    if self.header_record.compression == 1:
                        idata = decompress_doc(section_data)
                    elif self.header_record.compression == 2:
                        idata = zlib.decompress(section_data)
                try:
                    save_cover_data_to(idata, '%s.jpg' % uid, compression_quality=70)
                    images.add(uid)
                    self.log.debug(
                        'Wrote image with uid %s to images/%s.jpg' % (uid, uid))
                except Exception as e:
                    # Best-effort: a bad image must not abort the conversion.
                    self.log.error(
                        'Failed to write image with uid %s: %s' % (uid, e))
            else:
                self.log.error(
                    'Failed to write image with uid %s: No data.' % uid)
        # Composite images.
        # We're going to use the already compressed .jpg images here.
        for uid, num in self.uid_composite_image_section_number.items():
            try:
                section_header, section_data = self.sections[num]
                # Get the final width and height.
                width = 0
                height = 0
                for row in section_data.layout:
                    row_width = 0
                    col_height = 0
                    for col in row:
                        if col not in images:
                            raise Exception('Image with uid: %s missing.' % col)
                        w, h = identify(lopen('%s.jpg' % col, 'rb'))[1:]
                        row_width += w
                        if col_height < h:
                            col_height = h
                    if width < row_width:
                        width = row_width
                    height += col_height
                # Create a new image the total size of all image
                # parts. Put the parts into the new image.
                with Canvas(width, height) as canvas:
                    y_off = 0
                    for row in section_data.layout:
                        x_off = 0
                        largest_height = 0
                        for col in row:
                            im = image_from_data(
                                lopen('%s.jpg' % col, 'rb').read())
                            canvas.compose(im, x_off, y_off)
                            w, h = im.width(), im.height()
                            x_off += w
                            if largest_height < h:
                                largest_height = h
                        y_off += largest_height
                    # FIX: the output file must be opened for binary
                    # writing; previously it was opened in the default
                    # read mode, so out.write() failed and every
                    # composite image was silently dropped by the
                    # except clause below.
                    with lopen('%s.jpg' % uid, 'wb') as out:
                        out.write(canvas.export(compression_quality=70))
                self.log.debug(
                    'Wrote composite image with uid %s to images/%s.jpg' % (uid, uid))
            except Exception as e:
                self.log.error(
                    'Failed to write composite image with uid %s: %s' % (uid, e))

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(self.options, opt.option.name, opt.recommended_value)
    self.options.input_encoding = 'utf-8'
    odi = self.options.debug_pipeline
    self.options.debug_pipeline = None
    # Determine the home.html record uid. This should be set in the
    # reserved values in the metadata recored. home.html is the first
    # text record (should have hyper link references to other records)
    # in the document.
    try:
        home_html = self.header_record.home_html
        if not home_html:
            # FIX: dict views are not indexable on Python 3; the old
            # .items()[0][0] raised TypeError there, which the bare
            # except turned into 'Could not determine home.html'.
            # next(iter(...)) yields the same first key on both
            # Python 2 and 3.
            home_html = next(iter(self.uid_text_secion_number))
    except:
        raise Exception('Could not determine home.html')
    # Generate oeb from html conversion.
    oeb = html_input.convert(open('%s.html' % home_html, 'rb'),
                             self.options, 'html', self.log, {})
    self.options.debug_pipeline = odi
    return oeb
def process_exploded_book(
    book_fmt, opfpath, input_fmt, tdir, render_manager, log=None, book_hash=None,
    save_bookmark_data=False, book_metadata=None, virtualize_resources=True
):
    '''
    Render an exploded book in ``tdir`` for the in-browser viewer, using a
    pool of worker processes managed by ``render_manager``.

    Builds the cover page and table of contents, farms out the
    HTML/CSS/SVG files to the workers, merges their results, and writes a
    ``calibre-book-manifest.json`` into the container root.

    :param book_fmt: Format name recorded in the manifest (``book_format``).
    :param opfpath: Path to the OPF file of the exploded book.
    :param input_fmt: Original input format; determines cover handling and
        comic (image-collection) detection.
    :param tdir: Directory containing the exploded book.
    :param render_manager: Callable/worker-pool object with a
        ``launch_workers`` method; called to render the files.
    :param log: Optional logger; falls back to ``default_log``.
    :param book_hash: Opaque hash stored verbatim in the manifest.
    :param save_bookmark_data: If True, the raw contents of
        META-INF/calibre_bookmarks.txt are returned as the second element.
    :param book_metadata: Metadata used when generating a cover.
    :param virtualize_resources: Forwarded to the worker processes.
    :return: ``(container, bookmark_data)`` where bookmark_data is bytes or
        None.
    '''
    log = log or default_log
    container = SimpleContainer(tdir, opfpath, log)
    input_plugin = plugin_for_input_format(input_fmt)
    is_comic = bool(getattr(input_plugin, 'is_image_collection', False))

    def needs_work(mt):
        # Only markup/style/SVG files are processed by the workers.
        return mt in OEB_STYLES or mt in OEB_DOCS or mt == 'image/svg+xml'

    def work_priority(name):
        # ensure workers with large files or stylesheets
        # have the less names
        # NOTE(review): the trailing comma makes size a 1-tuple; it is only
        # ever used as part of a sort key, where tuple comparison works.
        size = os.path.getsize(container.name_path_map[name]),
        is_html = container.mime_map.get(name) in OEB_DOCS
        return (0 if is_html else 1), size

    if not is_comic:
        render_manager.launch_workers(
            tuple(n for n, mt in iteritems(container.mime_map) if needs_work(mt)), container)

    bookmark_data = None
    if save_bookmark_data:
        bm_file = 'META-INF/calibre_bookmarks.txt'
        if container.exists(bm_file):
            with container.open(bm_file, 'rb') as f:
                bookmark_data = f.read()

    # We do not add zero byte sized files as the IndexedDB API in the
    # browser has no good way to distinguish between zero byte files and
    # load failures.
    excluded_names = {
        name for name, mt in iteritems(container.mime_map) if
        name == container.opf_name or mt == guess_type('a.ncx') or
        name.startswith('META-INF/') or name == 'mimetype' or
        not container.has_name_and_is_not_empty(name)}

    raster_cover_name, titlepage_name = create_cover_page(
        container, input_fmt.lower(), is_comic, book_metadata)

    toc = get_toc(container, verify_destinations=False).to_dict(count())
    if not toc or not toc.get('children'):
        # No usable ToC in the book: synthesize one from heading tags.
        toc = from_xpaths(container, ['//h:h1', '//h:h2', '//h:h3']).to_dict(count())
    spine = [name for name, is_linear in container.spine_names]
    spineq = frozenset(spine)
    landmarks = [l for l in get_landmarks(container) if l['dest'] in spineq]

    book_render_data = {
        'version': RENDER_VERSION,
        'toc': toc,
        'book_format': book_fmt,
        'spine': spine,
        'link_uid': uuid4(),
        'book_hash': book_hash,
        'is_comic': is_comic,
        'raster_cover_name': raster_cover_name,
        'title_page_name': titlepage_name,
        'has_maths': False,
        'total_length': 0,
        'spine_length': 0,
        'toc_anchor_map': toc_anchor_map(toc),
        'landmarks': landmarks,
        'link_to_map': {},
    }

    names = sorted(
        (n for n, mt in iteritems(container.mime_map) if needs_work(mt)),
        key=work_priority)

    results = render_manager(
        names, (tdir, opfpath, virtualize_resources,
                book_render_data['link_uid'], container.data_for_clone()), container)
    ltm = book_render_data['link_to_map']
    html_data = {}
    virtualized_names = set()

    def merge_ltm(dest, src):
        # Union the anchor sets per link key.
        for k, v in iteritems(src):
            if k in dest:
                dest[k] |= v
            else:
                dest[k] = v

    # Merge the per-worker results into the shared maps.
    for link_to_map, hdata, vnames in results:
        html_data.update(hdata)
        virtualized_names |= vnames
        for k, v in iteritems(link_to_map):
            if k in ltm:
                merge_ltm(ltm[k], v)
            else:
                ltm[k] = v

    def manifest_data(name):
        # Per-file manifest entry. Also accumulates the book-wide totals
        # (total_length, spine_length, has_maths) as a side effect.
        mt = (container.mime_map.get(name) or 'application/octet-stream').lower()
        ans = {
            'size': os.path.getsize(container.name_path_map[name]),
            'is_virtualized': name in virtualized_names,
            'mimetype': mt,
            'is_html': mt in OEB_DOCS,
        }
        if ans['is_html']:
            data = html_data[name]
            ans['length'] = l = data['length']
            book_render_data['total_length'] += l
            if name in book_render_data['spine']:
                book_render_data['spine_length'] += l
            ans['has_maths'] = hm = data['has_maths']
            if hm:
                book_render_data['has_maths'] = True
            ans['anchor_map'] = data['anchor_map']
        return ans

    book_render_data['files'] = {
        name: manifest_data(name) for name in set(container.name_path_map) - excluded_names}
    container.commit()
    # Physically delete the files we decided not to serve.
    for name in excluded_names:
        os.remove(container.name_path_map[name])
    ltm = book_render_data['link_to_map']
    for name, amap in iteritems(ltm):
        for k, v in tuple(iteritems(amap)):
            amap[k] = tuple(v)  # needed for JSON serialization

    data = as_bytes(json.dumps(book_render_data, ensure_ascii=False))
    with lopen(os.path.join(container.root, 'calibre-book-manifest.json'), 'wb') as f:
        f.write(data)

    return container, bookmark_data
def create_cover_page(self, input_fmt):
    '''
    Create (or locate) the raster cover and the titlepage HTML for the book.

    For EPUB input, delegates to ``set_epub_cover``; for other formats it
    finds or generates a cover image and wraps it in a generated titlepage
    that is inserted at the start of the spine.

    :param input_fmt: Lower-case input format name.
    :return: ``(raster_cover_name, titlepage_name)``; either may be None
        (e.g. when ``self.allow_no_cover`` is set and no cover exists, or
        for image-collection input which needs no titlepage).
    '''
    # Titlepage template: letterboxes the cover image in the viewport.
    # %% escapes are required because the template is later %-formatted
    # with the image href.
    templ = '''
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    <head><style>
    html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
    img {
        width: 100%%; height: 100%%;
        object-fit: contain;
        margin-left: auto; margin-right: auto;
        max-width: 100vw; max-height: 100vh;
        top: 50vh; transform: translateY(-50%%);
        position: relative;
    }
    body.cover-fill img { object-fit: fill; }
    </style></head><body><img src="%s"/></body></html>
    '''

    def generic_cover():
        # Fall back to an auto-generated cover (or a blank JPEG when no
        # metadata is available).
        if self.book_metadata is not None:
            from calibre.ebooks.covers import create_cover
            mi = self.book_metadata
            return create_cover(mi.title, mi.authors, mi.series, mi.series_index)
        return BLANK_JPEG

    if input_fmt == 'epub':

        def image_callback(cover_image, wrapped_image):
            # Remember the raw cover data; prefer the explicit cover image
            # over the one extracted from a wrapper page.
            if cover_image:
                image_callback.cover_data = self.raw_data(cover_image, decode=False)
            if wrapped_image and not getattr(image_callback, 'cover_data', None):
                image_callback.cover_data = self.raw_data(wrapped_image, decode=False)

        def cover_path(action, data):
            if action == 'write_image':
                cdata = getattr(image_callback, 'cover_data', None) or generic_cover()
                data.write(cdata)

        if self.allow_no_cover and not has_epub_cover(self):
            return None, None
        raster_cover_name, titlepage_name = set_epub_cover(
            self, cover_path, (lambda *a: None), options={'template': templ},
            image_callback=image_callback)
    else:
        raster_cover_name = find_cover_image(self, strict=True)
        if raster_cover_name is None:
            if self.allow_no_cover:
                return None, None
            # Generate a cover file since none exists in the book.
            item = self.generate_item(name='cover.jpeg', id_prefix='cover')
            raster_cover_name = self.href_to_name(item.get('href'), self.opf_name)
            with self.open(raster_cover_name, 'wb') as dest:
                dest.write(generic_cover())
        input_plugin = plugin_for_input_format(input_fmt)
        if getattr(input_plugin, 'is_image_collection', False):
            # Comics get no titlepage; the cover image alone suffices.
            return raster_cover_name, None
        # Wrap the cover image in a generated titlepage and prepend it to
        # the spine.
        item = self.generate_item(name='titlepage.html', id_prefix='titlepage')
        titlepage_name = self.href_to_name(item.get('href'), self.opf_name)
        raw = templ % prepare_string_for_xml(
            self.name_to_href(raster_cover_name, titlepage_name), True)
        with self.open(titlepage_name, 'wb') as f:
            f.write(raw.encode('utf-8'))
        spine = self.opf_xpath('//opf:spine')[0]
        ref = spine.makeelement(OPF('itemref'), idref=item.get('id'))
        self.insert_into_xml(spine, ref, index=0)
        self.dirty(self.opf_name)
    return raster_cover_name, titlepage_name