def tweak(ebook_file):
    ''' Command line interface to the Tweak Book tool.

    Explodes the book into a temporary directory, lets the user edit it
    (either interactively or, when $EDITOR looks like vim, by feeding the
    editor a zip of the exploded tree), then rebuilds the book in place.
    Exits the process with status 1 on any failure.
    '''
    fmt = ebook_file.rpartition('.')[-1].lower()
    exploder, rebuilder = get_tools(fmt)
    if exploder is None:
        prints('Cannot tweak %s files. Supported formats are: EPUB, HTMLZ, AZW3, MOBI'
                , file=sys.stderr)
        raise SystemExit(1)

    with TemporaryDirectory('_tweak_'+ os.path.basename(ebook_file).rpartition('.')[0]) as tdir:
        try:
            opf = exploder(ebook_file, tdir, question=ask_cli_question)
        except WorkerError as e:
            prints('Failed to unpack', ebook_file)
            prints(e.orig_tb)
            raise SystemExit(1)
        except Error as e:
            prints(as_unicode(e), file=sys.stderr)
            raise SystemExit(1)

        if opf is None:
            # The question was answered with No
            return

        ed = os.environ.get('EDITOR', 'dummy')
        cmd = shlex.split(ed)
        isvim = bool([x for x in cmd[0].split('/') if x.endswith('vim')])

        proceed = False
        prints('Book extracted to', tdir)

        if not isvim:
            prints('Make your tweaks and once you are done,', __appname__,
                    'will rebuild', ebook_file, 'from', tdir)
            print()
            proceed = ask_cli_question('Rebuild ' + ebook_file + '?')
        else:
            # vim can edit a zip archive in place, so hand it one and unpack
            # the (possibly modified) archive back over tdir afterwards.
            base = os.path.basename(ebook_file)
            with TemporaryFile(base+'.zip') as zipf:
                with ZipFile(zipf, 'w') as zf:
                    zf.add_dir(tdir)
                try:
                    subprocess.check_call(cmd + [zipf])
                except Exception:
                    # FIX: was a bare ``except:`` which also caught
                    # KeyboardInterrupt/SystemExit, misreporting Ctrl-C in the
                    # editor as an editor failure.
                    prints(ed, 'failed, aborting...')
                    raise SystemExit(1)
                with ZipFile(zipf, 'r') as zf:
                    shutil.rmtree(tdir)
                    os.mkdir(tdir)
                    zf.extractall(path=tdir)
                proceed = True

        if proceed:
            prints('Rebuilding', ebook_file, 'please wait ...')
            try:
                rebuilder(tdir, ebook_file)
            except WorkerError as e:
                prints('Failed to rebuild', ebook_file)
                prints(e.orig_tb)
                raise SystemExit(1)
            prints(ebook_file, 'successfully tweaked')
def convert(self, recipe_or_file, opts, file_ext, log, accelerators):
    '''Compile/download a recipe, run it, and return the path to the OPF it produced.

    ``recipe_or_file`` may be a packaged downloaded recipe ('downloaded_recipe'),
    a readable .recipe file, or the title of a builtin recipe (optionally
    overridden by the CALIBRE_RECIPE_URN environment variable).
    '''
    from calibre.web.feeds.recipes import compile_recipe
    opts.output_profile.flow_size = 0
    if file_ext == 'downloaded_recipe':
        from calibre.utils.zipfile import ZipFile
        zf = ZipFile(recipe_or_file, 'r')
        zf.extractall()
        zf.close()
        with lopen('download.recipe', 'rb') as f:
            self.recipe_source = f.read()
        recipe = compile_recipe(self.recipe_source)
        recipe.needs_subscription = False
        self.recipe_object = recipe(opts, log, self.report_progress)
    else:
        if os.environ.get('CALIBRE_RECIPE_URN'):
            from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
            urn = os.environ['CALIBRE_RECIPE_URN']
            log('Downloading recipe urn: ' + urn)
            rtype, recipe_id = urn.partition(':')[::2]
            if not recipe_id:
                raise ValueError('Invalid recipe urn: ' + urn)
            if rtype == 'custom':
                self.recipe_source = get_custom_recipe(recipe_id)
            else:
                self.recipe_source = get_builtin_recipe_by_id(
                    urn, log=log, download_recipe=True)
            if not self.recipe_source:
                raise ValueError('Could not find recipe with urn: ' + urn)
            if not isinstance(self.recipe_source, bytes):
                self.recipe_source = self.recipe_source.encode('utf-8')
            recipe = compile_recipe(self.recipe_source)
        elif os.access(recipe_or_file, os.R_OK):
            with lopen(recipe_or_file, 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            log('Using custom recipe')
        else:
            # Treat the input as the title of a builtin recipe.
            from calibre.web.feeds.recipes.collection import (
                get_builtin_recipe_by_title, get_builtin_recipe_titles)
            title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
            title = os.path.basename(title).rpartition('.')[0]
            titles = frozenset(get_builtin_recipe_titles())
            if title not in titles:
                # Retry without stripping the directory part of the argument.
                title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                title = title.rpartition('.')[0]
            raw = get_builtin_recipe_by_title(
                title, log=log,
                download_recipe=not opts.dont_download_recipe)
            builtin = False
            try:
                recipe = compile_recipe(raw)
                self.recipe_source = raw
                if recipe.requires_version > numeric_version:
                    # FIX: requires_version is a tuple of ints (it compares
                    # against numeric_version); str.join needs strings, so the
                    # original '.'.join(recipe.requires_version) raised TypeError.
                    log.warn(
                        'Downloaded recipe needs calibre version at least: %s' %
                        ('.'.join(map(str, recipe.requires_version))))
                    builtin = True
            except Exception:
                # FIX: narrowed from a bare ``except:``; still best-effort —
                # any compile failure falls back to the bundled recipe.
                log.exception(
                    'Failed to compile downloaded recipe. Falling '
                    'back to builtin one')
                builtin = True
            if builtin:
                log('Using bundled builtin recipe')
                raw = get_builtin_recipe_by_title(title, log=log,
                        download_recipe=False)
                if raw is None:
                    raise ValueError('Failed to find builtin recipe: ' + title)
                recipe = compile_recipe(raw)
                self.recipe_source = raw
            else:
                log('Using downloaded builtin recipe')
        if recipe is None:
            raise ValueError(
                '%r is not a valid recipe file or builtin recipe' % recipe_or_file)
        disabled = getattr(recipe, 'recipe_disabled', None)
        if disabled is not None:
            raise RecipeDisabled(disabled)
        ro = recipe(opts, log, self.report_progress)
        ro.download()
        self.recipe_object = ro
    for key, val in self.recipe_object.conversion_options.items():
        setattr(opts, key, val)

    # The recipe run leaves an OPF in the current directory; find and return it.
    for f in os.listdir('.'):
        if f.endswith('.opf'):
            return os.path.abspath(f)
    for f in walk('.'):
        if f.endswith('.opf'):
            return os.path.abspath(f)
def convert(self, stream, options, file_ext, log, accelerators):
    # EPUB input: unzip the book, locate and sanitize its OPF, and return the
    # absolute path of the rewritten 'content.opf' in the current directory.
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF
    try:
        zf = ZipFile(stream)
        zf.extractall(getcwd())
    except:
        # NOTE(review): bare except is deliberate best-effort here — any
        # failure of the strict parser falls through to the forgiving one.
        log.exception('EPUB appears to be invalid ZIP file, trying a'
                ' more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)
    encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
    opf = self.find_opf()
    if opf is None:
        # Fall back to scanning the extracted tree for any plausible OPF,
        # skipping macOS resource-fork junk and hidden files.
        for f in walk(u'.'):
            if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                    not os.path.basename(f).startswith('.'):
                opf = os.path.abspath(f)
                break
    path = getattr(stream, 'name', 'stream')

    if opf is None:
        raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)

    opf = os.path.relpath(opf, getcwd())
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self._encrypted_font_uris = []
    if os.path.exists(encfile):
        if not self.process_encryption(encfile, opf, log):
            raise DRMError(os.path.basename(path))
    self.encrypted_fonts = self._encrypted_font_uris

    # If the OPF lives in a subdirectory, rebase all manifest/guide hrefs
    # so they are relative to the extraction root.
    if len(parts) > 1 and parts[0]:
        delta = '/'.join(parts[:-1])+'/'

        def normpath(x):
            # NOTE(review): intentionally ignores its argument and reads the
            # enclosing loop variable ``elem`` — only valid inside the two
            # loops below.
            return posixpath.normpath(delta + elem.get('href'))

        for elem in opf.itermanifest():
            elem.set('href', normpath(elem.get('href')))
        for elem in opf.iterguide():
            elem.set('href', normpath(elem.get('href')))

    # Cover handling differs between EPUB 2 and EPUB 3.
    f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
    self.removed_cover = f(opf, log)
    if self.removed_cover:
        self.removed_items_to_ignore = (self.removed_cover,)
    epub3_nav = opf.epub3_nav
    if epub3_nav is not None:
        self.convert_epub3_nav(epub3_nav, opf, log, options)

    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                'EPUB files with DTBook markup are not supported')

    # Collect manifest ids that must not appear in the spine: Adobe page
    # templates and mis-typed font files.
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_:
            mt = y.get('media-type', None)
            if mt in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml',
                    'application/text'
            }:
                not_for_spine.add(id_)
            ext = y.get('href', '').rpartition('.')[-1].lower()
            if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                # some epub authoring software sets font mime types to
                # text/plain
                not_for_spine.add(id_)
                y.set('media-type', 'application/font')

    # Drop spine entries that are excluded, duplicated, or have no idref.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if len(list(opf.iterspine())) == 0:
        raise ValueError('No valid entries in the spine of this EPUB')

    with lopen('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath(u'content.opf')
def convert(self, stream, options, file_ext, log, accelerators):
    # TXT input: read plain text (or a txtz archive), detect its encoding,
    # paragraph style and formatting flavor, transform it to HTML and hand
    # it to the HTML input plugin, returning the resulting OEB book.
    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from calibre.ebooks.chardet import detect
    from calibre.utils.zipfile import ZipFile
    from calibre.ebooks.txt.processor import (
        convert_basic, convert_markdown_with_metadata,
        separate_paragraphs_single_line, separate_paragraphs_print_formatted,
        preserve_spaces, detect_paragraph_type, detect_formatting_type,
        normalize_line_endings, convert_textile, remove_indents,
        block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = ''
    log.debug('Reading text from file...')
    length = 0
    # NOTE(review): os.getcwdu() is Python-2-only — verify this module still
    # targets py2 or should use os.getcwd().
    base_dir = os.getcwdu()

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')

        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                with open(x, 'rb') as tf:
                    # NOTE(review): tf.read() is bytes while '\n\n' is str —
                    # on py3 this concatenation would raise TypeError; confirm
                    # intended interpreter.
                    txt += tf.read() + '\n\n'
    else:
        if getattr(stream, 'name', None):
            base_dir = os.path.dirname(stream.name)
        txt = stream.read()

    if file_ext in {'md', 'textile', 'markdown'}:
        options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
        log.info('File extension indicates particular formatting. '
                'Forcing formatting type to: %s' % options.formatting_type)
        options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s' % ienc)
    else:
        det_encoding = detect(txt[:4096])
        det_encoding, confidence = det_encoding['encoding'], det_encoding[
            'confidence']
        if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug(
            'Detected input encoding as %s with a confidence of %s%%' %
            (ienc, confidence * 100))
    if not ienc:
        ienc = 'utf-8'
        log.debug(
            'No input encoding specified and could not auto detect using %s' % ienc)
    # Remove BOM from start of txt as its presence can confuse markdown
    import codecs
    for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break
    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = _ent_pat.sub(xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug(
                'Could not reliably determine paragraph type using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s' %
                      options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s' % options.formatting_type)

    if options.formatting_type == 'heuristic':
        setattr(options, 'enable_heuristics', True)
        setattr(options, 'unwrap_lines', False)
        setattr(options, 'smarten_punctuation', True)

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print are transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options,
                                          log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(
            options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        if not length:
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # Preserve spaces will replace multiple spaces to a space
    # followed by the entity.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    self.shifted_files = []
    try:
        html = ''
        input_mi = None
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(
                    txt,
                    extensions=[
                        x.strip()
                        for x in options.markdown_extensions.split(',')
                        if x.strip()
                    ])
            except RuntimeError:
                raise ValueError(
                    'This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax'
                )
            html = self.fix_resources(html, base_dir)
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
            html = self.fix_resources(html, base_dir)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        htmlfile = self.shift_file(base_dir, 'index.html',
                                   html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log,
                                 {})
        options.debug_pipeline = odi
    finally:
        # Always clean up the temporary files created by shift_file().
        for x in self.shifted_files:
            os.remove(x)

    # Set metadata from file.
    if input_mi is None:
        from calibre.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    self.html_postprocess_title = input_mi.title

    return oeb
def compile_website_translations(self):
    # Compile the website .po files into .mo catalogs, pack the sufficiently
    # translated ones (>= 20%) into translations/website/locales.zip together
    # with a lang-names.json of localized language names, and write the list
    # of shipped locales to the stats directory.
    from calibre.utils.zipfile import ZipFile, ZipInfo, ZIP_STORED
    from calibre.ptempfile import TemporaryDirectory
    from calibre.utils.localization import get_iso639_translator, get_language, get_iso_language
    self.info('Compiling website translations...')
    srcbase = self.j(self.d(self.SRC), 'translations', 'website')
    fmap = {}      # compiled .po path -> locale code
    files = []     # (source .po, destination .mo) pairs for compile_group
    stats = {}     # locale code -> percent translated
    done = []      # locales actually packed into locales.zip

    def handle_stats(src, nums):
        # nums is (translated,) or (translated, untranslated); store the
        # completion percentage for the locale that produced src.
        locale = fmap[src]
        trans = nums[0]
        total = trans if len(nums) == 1 else (trans + nums[1])
        stats[locale] = int(round(100 * trans / total))

    with TemporaryDirectory() as tdir, ZipFile(
            self.j(srcbase, 'locales.zip'), 'w', ZIP_STORED) as zf:
        for f in os.listdir(srcbase):
            if f.endswith('.po'):
                l = f.partition('.')[0]
                pf = l.split('_')[0]
                # English is the source language, nothing to compile.
                if pf in {'en'}:
                    continue
                d = os.path.join(tdir, l + '.mo')
                f = os.path.join(srcbase, f)
                fmap[f] = l
                files.append((f, d))
        self.compile_group(files, handle_stats=handle_stats)

        for locale, translated in iteritems(stats):
            # Only ship locales that are at least 20% translated.
            if translated >= 20:
                with open(os.path.join(tdir, locale + '.mo'), 'rb') as f:
                    raw = f.read()
                zi = ZipInfo(os.path.basename(f.name))
                zi.compress_type = ZIP_STORED
                zf.writestr(zi, raw)
                done.append(locale)
        dl = done + ['en']

        # For every shipped locale, map each locale code to its name as
        # rendered in that locale.
        lang_names = {}
        for l in dl:
            if l == 'en':
                t = get_language
            else:
                t = getattr(get_iso639_translator(l),
                            'gettext' if ispy3 else 'ugettext')
                t = partial(get_iso_language, t)
            lang_names[l] = {x: t(x) for x in dl}
        zi = ZipInfo('lang-names.json')
        zi.compress_type = ZIP_STORED
        zf.writestr(
            zi,
            json.dumps(lang_names, ensure_ascii=False).encode('utf-8'))

    dest = self.j(self.d(self.stats), 'website-languages.txt')
    data = ' '.join(sorted(done))
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    with open(dest, 'wb') as f:
        f.write(data)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''HTMLZ output: render the OEB book to a single HTML file (plus CSS,
    images, cover and an OPF) in a temporary directory, then zip that
    directory into ``output_path``.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from calibre.utils.zipfile import ZipFile
    from calibre.utils.filenames import ascii_filename

    # HTML: pick the CSS strategy requested by the user.
    if opts.htmlz_css_type == 'inline':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == 'tag':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory('_htmlz_output') as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = 'index'
        if opts.htmlz_title_filename:
            from calibre.utils.filenames import shorten_components_to
            fname = shorten_components_to(
                100, (ascii_filename(str(oeb_book.metadata.title[0])), ))[0]
        with open(os.path.join(tdir, fname + '.html'), 'wb') as tf:
            if isinstance(html, str):
                html = html.encode('utf-8')
            tf.write(html)

        # CSS
        if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
            with open(os.path.join(tdir, 'style.css'), 'wb') as tf:
                tf.write(htmlizer.get_css(oeb_book).encode('utf-8'))

        # Images referenced from the generated HTML.
        images = htmlizer.images
        if images:
            if not os.path.exists(os.path.join(tdir, 'images')):
                os.makedirs(os.path.join(tdir, 'images'))
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        data = etree.tostring(
                            item.data, encoding='unicode').encode('utf-8')
                    else:
                        data = item.data
                    fname = os.path.join(tdir, 'images', images[item.href])
                    with open(fname, 'wb') as img:
                        img.write(data)

        # Cover — best effort; a failure here must not abort the conversion.
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from calibre.utils.img import save_cover_data_to
                cover_path = os.path.join(tdir, 'cover.jpg')
                with lopen(cover_path, 'w') as cf:
                    cf.write('')
                save_cover_data_to(cover_data, cover_path)
        except Exception:
            # FIX: narrowed from a bare ``except:`` so KeyboardInterrupt /
            # SystemExit are no longer swallowed.
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
            opf = OPF(
                io.BytesIO(
                    etree.tostring(oeb_book.metadata.to_opf1(),
                                   encoding='UTF-8')))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = 'cover.jpg'
            mdataf.write(metadata_to_opf(mi))

        # FIX: the archive was previously never closed, so the zip central
        # directory might never be flushed to disk; use a context manager.
        with ZipFile(output_path, 'w') as htmlz:
            htmlz.add_dir(tdir)
def books(self, oncard=None, end_session=True):
    '''
    Return a list of ebooks on the device.
    @param oncard: If 'carda' or 'cardb' return a list of ebooks
                   on the specific storage card, otherwise return list of
                   ebooks in main memory of device. If a card is specified
                   and no books are on the card return empty list.
    @return: A BookList.
    '''
    if oncard:
        # iOS devices have no storage cards.
        return BookList()

    self._log_location()
    booklist = BookList()
    cached_books = {}

    # Fetch current assets from Media folder
    assets_profile = self._localize_database_path(self.assets_subpath)

    # Fetch current metadata from iBooks's DB
    db_profile = self._localize_database_path(self.books_subpath)
    con = sqlite3.connect(db_profile['path'])

    # Mount the Media folder
    self.ios.mount_ios_media_folder()

    # Get Books.plist so we can find the covers.
    # Maps book folder path -> cover image path on the device.
    books_plist = {}
    if True:
        raw_plist = XmlPropertyListParser().parse(self.ios.read('/Books/Sync/Books.plist'))['Books']
        for book in raw_plist:
            if not 'Path' in book:
                print(" No 'Path' element found for '%s' by '%s'" % (book['Name'], book['Artist']))
                #print(book)
                #print
                continue
            if 'Cover Path' in book:
                books_plist['/'.join(['/Books', book['Path']])] = unicode('/'.join(['/Books', book['Path'], book['Cover Path']]))
            else:
                books_plist['/'.join(['/Books', book['Path']])] = unicode('/'.join(['/Books', 'Sync', 'Artwork', book['Persistent ID']]))

        # Process any outliers
        raw_plist = XmlPropertyListParser().parse(self.ios.read('/Books/Books.plist'))['Books']
        for book in raw_plist:
            if not 'Path' in book:
                print(" No 'Path' element found for '%s' by '%s'" % (book['Name'], book['Artist']))
                print(book)
                print
                continue
            # Don't overwrite existing cover_paths
            if not '/'.join(['/Books', book['Path']]) in books_plist:
                # FIX: original condition also tested
                # ``not ['/'.join([...])] in book_plist`` — ``book_plist`` is
                # an undefined name and the list-wrapped key is unhashable;
                # the parallel branches show the intended test is simply
                # whether the book carries its own cover path.
                if 'Cover Path' in book:
                    books_plist['/'.join(['/Books', book['Path']])] = unicode('/'.join(['/Books', book['Path'], book['Cover Path']]))
                else:
                    books_plist['/'.join(['/Books', book['Path']])] = unicode('/'.join(['/Books', 'Sync', 'Artwork', book['Persistent ID']]))

        raw_plist = XmlPropertyListParser().parse(self.ios.read('/Books/Purchases/Purchases.plist'))['Books']
        for book in raw_plist:
            if not 'Path' in book:
                print(" No 'Path' element found for '%s' by '%s'" % (book['Name'], book['Artist']))
                print(book)
                print
                continue
            # Don't overwrite existing cover_paths
            if not '/'.join(['/Books', book['Path']]) in books_plist:
                if 'Cover Path' in book:
                    books_plist['/'.join(['/Books/Purchases', book['Path']])] = unicode('/'.join(['/Books/Purchases', book['Path'], book['Cover Path']]))
                else:
                    books_plist['/'.join(['/Books/Purchases', book['Path']])] = unicode('/'.join(['/Books', 'Sync', 'Artwork', book['Persistent ID']]))
    else:
        # Dead branch kept for reference (covers without the Sync plist).
        raw_plist = XmlPropertyListParser().parse(self.ios.read('/Books/Books.plist'))['Books']
        for book in raw_plist:
            if not 'Path' in book:
                print(" No 'Path' element found for '%s' by '%s'" % (book['Name'], book['Artist']))
                print(book)
                print
                continue
            if 'Cover Path' in book:
                books_plist['/'.join(['/Books', book['Path']])] = unicode('/'.join(['/Books', book['Path'], book['Cover Path']]))
            else:
                books_plist['/'.join(['/Books', book['Path']])] = unicode('/'.join(['/Books', 'Sync', 'Artwork', book['Persistent ID']]))

        raw_plist = XmlPropertyListParser().parse(self.ios.read('/Books/Purchases/Purchases.plist'))['Books']
        for book in raw_plist:
            if not 'Path' in book:
                print(" No 'Path' element found for '%s' by '%s'" % (book['Name'], book['Artist']))
                print(book)
                print
                continue
            if 'Cover Path' in book:
                books_plist['/'.join(['/Books/Purchases', book['Path']])] = unicode('/'.join(['/Books/Purchases', book['Path'], book['Cover Path']]))
            else:
                books_plist['/'.join(['/Books/Purchases', book['Path']])] = unicode('/'.join(['/Books', 'Sync', 'Artwork', book['Persistent ID']]))

    print(books_plist)

    with con:
        con.row_factory = sqlite3.Row

        # Build a collection map
        collections_map = {}

        # Get the books
        cur = con.cursor()
        #cur.execute("ATTACH DATABASE '{0}' as 'ASSETS'".format(assets_profile['path'])
        cur.execute('''SELECT ZASSETURL,
                              ZBOOKAUTHOR,
                              ZSORTAUTHOR,
                              ZBOOKTITLE,
                              ZSORTTITLE,
                              ZDATABASEKEY,
                              ZDATEADDED
                       FROM ZBKBOOKINFO
                       WHERE ZASSETURL LIKE 'file://localhost%' AND
                             ZASSETURL LIKE '%.epub/'
                    ''')
        rows = cur.fetchall()
        book_count = len(rows)
        for i, row in enumerate(rows):
            book_id = row[b'ZDATABASEKEY']

            # Get the collection assignments
            collections = []

            # Get the primary metadata
            this_book = Book(row[b'ZBOOKTITLE'], row[b'ZBOOKAUTHOR'])
            original_path = row[b'ZASSETURL']
            path = original_path[original_path.find('Media/') + len('Media'):-1]
            this_book.path = path.replace('%20', ' ')
            timestamp = int(row[b'ZDATEADDED']) + NSTimeIntervalSince1970
            this_book.datetime = datetime.fromtimestamp(timestamp).timetuple()
            this_book.device_collections = collections
            this_book.uuid = None
            this_book.thumbnail = self._generate_thumbnail(this_book, books_plist[this_book.path])

            # Retrieve the folder size from the cache zip, computing and
            # caching it on a miss.
            zfr = None  # FIX: ensure defined even if opening the cache fails
            try:
                zfr = ZipFile(self.folder_archive_path)
                file_size = zfr.read(this_book.path)
                this_book.size = int(file_size)
                self._log_diagnostic("returning folder size from cache")
            except Exception:
                # Cache miss (or cache zip missing): compute and append.
                self._log_diagnostic("opening folder cache for appending")
                zfw = ZipFile(self.folder_archive_path, mode='a')
                stats = self.ios.stat(this_book.path)
                this_book.size = self.ios.get_folder_size(this_book.path)
                zfw.writestr(this_book.path, str(this_book.size))
                zfw.close()
            finally:
                # FIX: ``zfr.close()`` raised NameError whenever the ZipFile
                # constructor itself failed.
                if zfr is not None:
                    zfr.close()

            booklist.add_book(this_book, False)

            if self.report_progress is not None:
                self.report_progress(float((i + 1)*100 / book_count)/100,
                    '%(num)d of %(tot)d' % dict(num=i + 1, tot=book_count))

            cached_books[this_book.path] = {
                'title': this_book.title,
                'author': this_book.author,
                'authors': this_book.author.split(' & '),
                'uuid': this_book.uuid
            }
        cur.close()

    # Close the connection
    self.ios.dismount_ios_media_folder()

    if self.report_progress is not None:
        self.report_progress(1.0, _('finished'))

    self.cached_books = cached_books
    return booklist
def init_zipfile(self, stream):
    """Open *stream* as a zip archive and cache the set of member names."""
    archive = ZipFile(stream)
    self.zipf = archive
    self.names = frozenset(archive.namelist())
def get_metadata(stream, extract_cover=True):
    # Read book metadata from an OpenDocument file's meta.xml, honouring
    # calibre's custom 'opf.*' user-defined fields when present. Returns a
    # MetaInformation object.
    whitespace = re.compile(r'\s+')

    def normalize(s):
        # Collapse runs of whitespace to single spaces.
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            # First matching element's text content, normalized, or None.
            ns, tag = fields[field]
            ans = root.xpath(f'//ns0:{tag}', namespaces={'ns0': ns})
            if ans:
                return normalize(
                    tostring(ans[0], method='text', encoding='unicode',
                             with_tail=False)).strip()

        def find_all(field):
            # Yield the normalized text of every matching element.
            ns, tag = fields[field]
            for x in root.xpath(f'//ns0:{tag}', namespaces={'ns0': ns}):
                yield normalize(
                    tostring(x, method='text', encoding='unicode',
                             with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        keywords = []
        for q in ('keyword', 'keywords'):
            for kw in find_all(q):
                keywords += [x.strip() for x in kw.split(',') if x.strip()]
        mi.tags = uniq(keywords)

        # Collect user-defined fields; calibre stores OPF-style metadata here.
        data = {}
        for tag in root.xpath('//ns0:user-defined',
                              namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val
        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    '''
    Create an empty book in the specified format at the specified location.

    :param mi: Metadata object used to seed the OPF/title page.
    :param path: Destination file path for the created book.
    :param fmt: 'epub' (zip container) or 'azw3'.
    '''
    path = os.path.abspath(path)
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    # Pick the first non-empty language declared in the generated OPF.
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    # Build a minimal manifest (one HTML flow + NCX) and spine.
    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)
    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
'''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')
    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)
    ).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)
    ).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors))
    ).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8',
                         xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)
    if fmt == 'azw3':
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name,
                          DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            # mimetype must be first and stored uncompressed per the EPUB spec.
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            # FIX: was ``0755`` — a Python-2-only octal literal that is a
            # SyntaxError on Python 3; ``0o755`` is valid on both.
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
def run(self, path_to_output, opts, db, notification=DummyReporter()):
    '''
    Entry point for catalog generation: normalise the option set, build the
    catalog sources with CatalogBuilder, then convert them to the output
    format with Plumber.

    :param path_to_output: destination file; its extension selects the format
    :param opts: option namespace mutated in place throughout this method
    :param db: calibre library database
    :param notification: progress reporter callable
    :return: catalog.error list (or 1 / message list on early failure)
    '''
    # NOTE(review): mutable/stateful default argument DummyReporter() is
    # shared across calls — presumably intentional here; confirm before changing.
    from calibre.library.catalogs.epub_mobi_builder import CatalogBuilder
    from calibre.utils.logging import default_log as log
    from calibre.utils.config import JSONConfig

    # If preset specified from the cli, insert stored options from JSON file
    if hasattr(opts, 'preset') and opts.preset:
        available_presets = JSONConfig("catalog_presets")
        if opts.preset not in available_presets:
            if available_presets:
                print(_('Error: Preset "%s" not found.' % opts.preset))
                print(_('Stored presets: %s' % ', '.join(
                    [p for p in sorted(available_presets.keys())])))
            else:
                print(_('Error: No stored presets.'))
            return 1

        # Copy the relevant preset values to the opts object
        for item in available_presets[opts.preset]:
            if item not in ['exclusion_rules_tw', 'format', 'prefix_rules_tw']:
                setattr(opts, item, available_presets[opts.preset][item])

        # Provide an unconnected device
        opts.connected_device = {
            'is_device_connected': False,
            'kind': None,
            'name': None,
            'save_template': None,
            'serial': None,
            'storage': None,
        }

        # Convert prefix_rules and exclusion_rules from JSON lists to tuples
        prs = []
        for rule in opts.prefix_rules:
            prs.append(tuple(rule))
        opts.prefix_rules = tuple(prs)

        ers = []
        for rule in opts.exclusion_rules:
            ers.append(tuple(rule))
        opts.exclusion_rules = tuple(ers)

    opts.log = log
    # Output format is taken from the destination file's extension.
    opts.fmt = self.fmt = path_to_output.rpartition('.')[2]

    # Add local options
    opts.creator = '%s, %s %s, %s' % (strftime('%A'), strftime('%B'),
                                      strftime('%d').lstrip('0'), strftime('%Y'))
    opts.creator_sort_as = '%s %s' % ('calibre', strftime('%Y-%m-%d'))
    opts.connected_kindle = False

    # Finalize output_profile
    op = opts.output_profile
    if op is None:
        op = 'default'

    # Kindle detection: serial prefixes B004/B005 identify a Kindle DX.
    if opts.connected_device['name'] and 'kindle' in opts.connected_device['name'].lower():
        opts.connected_kindle = True
        if opts.connected_device['serial'] and \
                opts.connected_device['serial'][:4] in ['B004', 'B005']:
            op = "kindle_dx"
        else:
            op = "kindle"
    # Clip lengths are shorter on non-DX Kindles.
    opts.description_clip = 380 if op.endswith('dx') or 'kindle' not in op else 100
    opts.author_clip = 100 if op.endswith('dx') or 'kindle' not in op else 60
    opts.output_profile = op

    opts.basename = "Catalog"
    # GUI invocations attach a 'sync' attribute; the CLI does not.
    opts.cli_environment = not hasattr(opts, 'sync')

    # Hard-wired to always sort descriptions by author, with series after non-series
    opts.sort_descriptions_by_author = True

    build_log = []

    build_log.append("%s('%s'): Generating %s %sin %s environment, locale: '%s'" %
                     (self.name, current_library_name(), self.fmt,
                      'for %s ' % opts.output_profile if opts.output_profile else '',
                      'CLI' if opts.cli_environment else 'GUI',
                      calibre_langcode_to_name(canonicalize_lang(get_lang()), localize=False)))

    # If exclude_genre is blank, assume user wants all tags as genres
    if opts.exclude_genre.strip() == '':
        # opts.exclude_genre = '\[^.\]'
        # build_log.append(" converting empty exclude_genre to '\[^.\]'")
        # 'a^' is a regex that can never match, i.e. exclude nothing.
        opts.exclude_genre = 'a^'
        build_log.append(" converting empty exclude_genre to 'a^'")
    if opts.connected_device['is_device_connected'] and \
            opts.connected_device['kind'] == 'device':
        if opts.connected_device['serial']:
            # Mask all but the first four characters of the serial number.
            build_log.append(" connected_device: '%s' #%s%s " %
                             (opts.connected_device['name'],
                              opts.connected_device['serial'][0:4],
                              'x' * (len(opts.connected_device['serial']) - 4)))
            for storage in opts.connected_device['storage']:
                if storage:
                    build_log.append(" mount point: %s" % storage)
        else:
            build_log.append(" connected_device: '%s'" % opts.connected_device['name'])
            try:
                for storage in opts.connected_device['storage']:
                    if storage:
                        build_log.append(" mount point: %s" % storage)
            except:
                build_log.append(" (no mount points)")
    else:
        build_log.append(" connected_device: '%s'" % opts.connected_device['name'])

    opts_dict = vars(opts)
    if opts_dict['ids']:
        build_log.append(" book count: %d" % len(opts_dict['ids']))

    # Collect the enabled sections, in fixed display order.
    sections_list = []
    if opts.generate_authors:
        sections_list.append('Authors')
    if opts.generate_titles:
        sections_list.append('Titles')
    if opts.generate_series:
        sections_list.append('Series')
    if opts.generate_genres:
        sections_list.append('Genres')
    if opts.generate_recently_added:
        sections_list.append('Recently Added')
    if opts.generate_descriptions:
        sections_list.append('Descriptions')

    if not sections_list:
        if opts.cli_environment:
            # CLI: no switches means "everything".
            opts.log.warn('*** No Section switches specified, enabling all Sections ***')
            opts.generate_authors = True
            opts.generate_titles = True
            opts.generate_series = True
            opts.generate_genres = True
            opts.generate_recently_added = True
            opts.generate_descriptions = True
            sections_list = ['Authors', 'Titles', 'Series', 'Genres',
                             'Recently Added', 'Descriptions']
        else:
            # GUI: nothing enabled is a user error; bail out with a message pair.
            opts.log.warn('\n*** No enabled Sections, terminating catalog generation ***')
            return ["No Included Sections",
                    "No enabled Sections.\nCheck E-book options tab\n'Included sections'\n"]
    if opts.fmt == 'mobi' and sections_list == ['Descriptions']:
        warning = _("\n*** Adding 'By authors' section required for MOBI output ***")
        opts.log.warn(warning)
        sections_list.insert(0, 'Authors')
        opts.generate_authors = True

    opts.log(" Sections: %s" % ', '.join(sections_list))
    opts.section_list = sections_list

    # Limit thumb_width to 1.0" - 2.0"
    try:
        if float(opts.thumb_width) < float(self.THUMB_SMALLEST):
            log.warning("coercing thumb_width from '%s' to '%s'" %
                        (opts.thumb_width, self.THUMB_SMALLEST))
            opts.thumb_width = self.THUMB_SMALLEST
        if float(opts.thumb_width) > float(self.THUMB_LARGEST):
            log.warning("coercing thumb_width from '%s' to '%s'" %
                        (opts.thumb_width, self.THUMB_LARGEST))
            opts.thumb_width = self.THUMB_LARGEST
        opts.thumb_width = "%.2f" % float(opts.thumb_width)
    except:
        log.error("coercing thumb_width from '%s' to '%s'" %
                  (opts.thumb_width, self.THUMB_SMALLEST))
        opts.thumb_width = "1.0"

    # eval prefix_rules if passed from command line
    # NOTE(review): eval() on a CLI-supplied string executes arbitrary code;
    # acceptable only because the string comes from the local user's own
    # command line — do not reuse this pattern for untrusted input.
    if type(opts.prefix_rules) is not tuple:
        try:
            opts.prefix_rules = eval(opts.prefix_rules)
        except:
            log.error("malformed --prefix-rules: %s" % opts.prefix_rules)
            raise
        for rule in opts.prefix_rules:
            if len(rule) != 4:
                log.error("incorrect number of args for --prefix-rules: %s" % repr(rule))

    # eval exclusion_rules if passed from command line
    if type(opts.exclusion_rules) is not tuple:
        try:
            opts.exclusion_rules = eval(opts.exclusion_rules)
        except:
            log.error("malformed --exclusion-rules: %s" % opts.exclusion_rules)
            raise
        for rule in opts.exclusion_rules:
            if len(rule) != 3:
                log.error("incorrect number of args for --exclusion-rules: %s" % repr(rule))

    # Display opts
    keys = sorted(opts_dict.keys())
    build_log.append(" opts:")
    for key in keys:
        # Only whitelisted, user-meaningful options are echoed to the log.
        if key in ['catalog_title', 'author_clip', 'connected_kindle', 'creator',
                   'cross_reference_authors', 'description_clip', 'exclude_book_marker',
                   'exclude_genre', 'exclude_tags', 'exclusion_rules', 'fmt',
                   'genre_source_field', 'header_note_source_field', 'merge_comments_rule',
                   'output_profile', 'prefix_rules', 'preset', 'read_book_marker',
                   'search_text', 'sort_by', 'sort_descriptions_by_author', 'sync',
                   'thumb_width', 'use_existing_cover', 'wishlist_tag']:
            build_log.append(" %s: %s" % (key, repr(opts_dict[key])))
    if opts.verbose:
        log('\n'.join(line for line in build_log))

    # Capture start_time
    opts.start_time = time.time()

    self.opts = opts

    if opts.verbose:
        log.info(" Begin catalog source generation (%s)" % str(
            datetime.timedelta(seconds=int(time.time() - opts.start_time))))

    # Launch the Catalog builder
    catalog = CatalogBuilder(db, opts, self, report_progress=notification)

    try:
        catalog.build_sources()
        if opts.verbose:
            log.info(" Completed catalog source generation (%s)\n" % str(
                datetime.timedelta(seconds=int(time.time() - opts.start_time))))
    except (AuthorSortMismatchException, EmptyCatalogException) as e:
        # Known, user-correctable failures: report and fall through to
        # `return catalog.error` without converting.
        log.error(" *** Terminated catalog generation: %s ***" % e)
    except:
        log.error(" unhandled exception in catalog generator")
        raise
    else:
        # Source generation succeeded: convert the sources to the target format.
        recommendations = []
        recommendations.append(('remove_fake_margins', False, OptionRecommendation.HIGH))
        recommendations.append(('comments', '', OptionRecommendation.HIGH))

        """ >>> Use to debug generated catalog code before pipeline conversion <<< """
        GENERATE_DEBUG_EPUB = False
        if GENERATE_DEBUG_EPUB:
            catalog_debug_path = os.path.join(os.path.expanduser('~'), 'Desktop', 'Catalog debug')
            setattr(opts, 'debug_pipeline', os.path.expanduser(catalog_debug_path))

        dp = getattr(opts, 'debug_pipeline', None)
        if dp is not None:
            recommendations.append(('debug_pipeline', dp, OptionRecommendation.HIGH))

        if opts.output_profile and opts.output_profile.startswith("kindle"):
            recommendations.append(('output_profile', opts.output_profile,
                                    OptionRecommendation.HIGH))
            recommendations.append(('book_producer', opts.output_profile,
                                    OptionRecommendation.HIGH))
            if opts.fmt == 'mobi':
                recommendations.append(('no_inline_toc', True, OptionRecommendation.HIGH))

        recommendations.append(('verbose', 2, OptionRecommendation.HIGH))

        # Use existing cover or generate new cover
        cpath = None
        existing_cover = False
        try:
            search_text = 'title:"%s" author:%s' % (
                opts.catalog_title.replace('"', '\\"'), 'calibre')
            matches = db.search(search_text, return_matches=True, sort_results=False)
            if matches:
                cpath = db.cover(matches[0], index_is_id=True, as_path=True)
                if cpath and os.path.exists(cpath):
                    existing_cover = True
        except:
            pass

        if self.opts.use_existing_cover and not existing_cover:
            log.warning("no existing catalog cover found")

        if self.opts.use_existing_cover and existing_cover:
            recommendations.append(('cover', cpath, OptionRecommendation.HIGH))
            log.info("using existing catalog cover")
        else:
            from calibre.ebooks.covers import calibre_cover2
            log.info("replacing catalog cover")
            new_cover_path = PersistentTemporaryFile(suffix='.jpg')
            new_cover = calibre_cover2(opts.catalog_title, 'calibre')
            new_cover_path.write(new_cover)
            new_cover_path.close()
            recommendations.append(('cover', new_cover_path.name, OptionRecommendation.HIGH))

        # Run ebook-convert
        from calibre.ebooks.conversion.plumber import Plumber
        plumber = Plumber(os.path.join(catalog.catalog_path, opts.basename + '.opf'),
                          path_to_output, log, report_progress=notification,
                          abort_after_input_dump=False)
        plumber.merge_ui_recommendations(recommendations)
        plumber.run()

        # Best-effort cleanup of the temporary cover file.
        try:
            os.remove(cpath)
        except:
            pass

        if GENERATE_DEBUG_EPUB:
            from calibre.ebooks.epub import initialize_container
            from calibre.ebooks.tweak import zip_rebuilder
            from calibre.utils.zipfile import ZipFile
            input_path = os.path.join(catalog_debug_path, 'input')
            epub_shell = os.path.join(catalog_debug_path, 'epub_shell.zip')
            initialize_container(epub_shell, opf_name='content.opf')
            with ZipFile(epub_shell, 'r') as zf:
                zf.extractall(path=input_path)
            os.remove(epub_shell)
            zip_rebuilder(input_path, os.path.join(catalog_debug_path, 'input.epub'))

        if opts.verbose:
            log.info(" Catalog creation complete (%s)\n" % str(
                datetime.timedelta(seconds=int(time.time() - opts.start_time))))

    # returns to gui2.actions.catalog:catalog_generated()
    return catalog.error
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a KePub file into a structure calibre can process.

    Extracts the (zip-based) KePub into the current working directory,
    locates and normalises its OPF, prunes invalid spine entries, and
    writes the result as content.opf, returning its absolute path.
    """
    log("KEPUBInput::convert - start")
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF

    try:
        zf = ZipFile(stream)
        zf.extractall(unicode(os.getcwd()))
    except Exception:
        # Fall back to the more forgiving non-standard unzipper for
        # archives the strict ZipFile rejects.
        log.exception(
            "KEPUB appears to be invalid ZIP file, trying a "
            "more forgiving ZIP parser"
        )
        from calibre.utils.localunzip import extractall

        stream.seek(0)
        extractall(stream)
    opf = self.find_opf()
    if opf is None:
        # No OPF at the expected location: scan the extracted tree,
        # skipping macOS resource-fork junk and hidden files.
        for f in walk("."):
            if (
                f.lower().endswith(".opf")
                and "__MACOSX" not in f
                and not os.path.basename(f).startswith(".")
            ):
                opf = os.path.abspath(f)
                break
    path = getattr(stream, "name", "stream")
    if opf is None:
        raise ValueError(
            _(  # noqa: F821
                "{0} is not a valid KEPUB file (could not find opf)"
            ).format(path)
        )
    # A rights.xml in the archive indicates DRM we cannot process.
    encfile = os.path.abspath("rights.xml")
    if os.path.exists(encfile):
        raise DRMError(os.path.basename(path))

    opf = os.path.relpath(opf, unicode(os.getcwd()))
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self.encrypted_fonts = []

    if len(parts) > 1 and parts[0]:
        # The OPF lives in a subdirectory: rebase all manifest and
        # guide hrefs onto that directory.
        delta = "/".join(parts[:-1]) + "/"
        for elem in opf.itermanifest():
            elem.set("href", delta + elem.get("href"))
        for elem in opf.iterguide():
            elem.set("href", delta + elem.get("href"))

    # Choose the EPUB2 or EPUB3 cover-normalisation routine by package version.
    f = (
        self.rationalize_cover3
        if opf.package_version >= 3.0
        else self.rationalize_cover2
    )
    self.removed_cover = f(opf, log)

    self.optimize_opf_parsing = opf
    for x in opf.itermanifest():
        if x.get("media-type", "") == "application/x-dtbook+xml":
            raise ValueError(
                _("EPUB files with DTBook markup are not supported")  # noqa: F821
            )

    # Collect manifest ids whose media types must not appear in the spine
    # (Adobe page templates and plain text entries).
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get("id", None)
        if id_ and y.get("media-type", None) in {
            "application/vnd.adobe-page-template+xml",
            "application/vnd.adobe.page-template+xml",
            "application/adobe-page-template+xml",
            "application/adobe.page-template+xml",
            "application/text",
        }:
            not_for_spine.add(id_)

    # Drop empty, excluded, or duplicate spine itemrefs in place.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get("idref", None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if len(list(opf.iterspine())) == 0:
        raise ValueError(
            _("No valid entries in the spine of this EPUB")  # noqa: F821
        )

    with open("content.opf", "wb") as nopf:
        nopf.write(opf.render())

    return os.path.abspath("content.opf")
from calibre.ebooks.conversion.plumber import Plumber plumber = Plumber(os.path.join(catalog.catalog_path, opts.basename + '.opf'), path_to_output, log, report_progress=notification, abort_after_input_dump=False) plumber.merge_ui_recommendations(recommendations) plumber.run() try: os.remove(cpath) except: pass if GENERATE_DEBUG_EPUB: from calibre.ebooks.epub import initialize_container from calibre.ebooks.tweak import zip_rebuilder from calibre.utils.zipfile import ZipFile input_path = os.path.join(catalog_debug_path, 'input') epub_shell = os.path.join(catalog_debug_path, 'epub_shell.zip') initialize_container(epub_shell, opf_name='content.opf') with ZipFile(epub_shell, 'r') as zf: zf.extractall(path=input_path) os.remove(epub_shell) zip_rebuilder(input_path, os.path.join(catalog_debug_path, 'input.epub')) if opts.verbose: log.info(" Catalog creation complete (%s)\n" % str(datetime.timedelta(seconds=int(time.time() - opts.start_time)))) # returns to gui2.actions.catalog:catalog_generated() return catalog.error
def _generate_thumbnail(self, book, cover_path):
    '''
    Fetch the cover image, generate a thumbnail, cache
    Specific implementation for iBooks

    :param book: book object providing .title and .path
    :param cover_path: device path of the cover image to fetch
    :return: JPEG thumbnail bytes, or None when the cache holds the
             'None' marker / generation failed
    '''
    self._log_location(book.title)
    self._log_diagnostic(" book_path: %s" % book.path)
    self._log_diagnostic("cover_path: %s" % repr(cover_path))

    thumb_data = None
    thumb_path = book.path.rpartition('.')[0] + '.jpg'

    # Try getting the cover from the cache
    try:
        zfr = ZipFile(self.archive_path)
        thumb_data = zfr.read(thumb_path)
        if thumb_data == 'None':
            self._log_diagnostic("returning None from cover cache")
            zfr.close()
            return None
    except:
        # Cache miss (or unreadable archive): reopen for appending so we
        # can write the thumb (or an empty marker) below.
        self._log_diagnostic("opening cover cache for appending")
        zfw = ZipFile(self.archive_path, mode='a')
    else:
        self._log_diagnostic("returning thumb from cover cache")
        # FIX: the read handle was leaked on this (cache-hit) path; close it
        # before returning, matching the 'None'-marker branch above.
        zfr.close()
        return thumb_data

    '''
    # Is book.path a directory (iBooks) or an epub?
    stats = self.ios.stat(book.path)
    if stats['st_ifmt'] == 'S_IFDIR':
        # *** This needs to fetch the cover data from the directory ***
        self._log_diagnostic("returning None, can't read iBooks covers yet")
        return thumb_data

    # Get the cover from the book
    try:
        stream = cStringIO.StringIO(self.ios.read(book.path, mode='rb'))
        mi = get_metadata(stream)
        if mi.cover_data is not None:
            img_data = cStringIO.StringIO(mi.cover_data[1])
    except:
        if self.verbose:
            self._log_diagnostic("ERROR: unable to get cover from '%s'" % book.title)
            import traceback
            #traceback.print_exc()
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self._log_diagnostic(traceback.format_exception_only(exc_type, exc_value)[0].strip())
        return thumb_data
    '''

    # Fetch the raw cover bytes from the device.
    try:
        img_data = cStringIO.StringIO(self.ios.read(cover_path, mode='rb'))
    except:
        if self.verbose:
            self._log_diagnostic("ERROR fetching cover data for '%s', caching empty marker" % book.title)
            import traceback
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self._log_diagnostic(traceback.format_exception_only(exc_type, exc_value)[0].strip())
        # Cache the empty cover
        zfw.writestr(thumb_path, 'None')
        # FIX: close the append handle on this early return — the finally
        # clause below does not cover this path, so it previously leaked.
        zfw.close()
        return thumb_data

    # Generate a thumb
    try:
        im = PILImage.open(img_data)
        scaled, width, height = fit_image(im.size[0], im.size[1], 60, 80)
        im = im.resize((int(width), int(height)), PILImage.ANTIALIAS)
        thumb = cStringIO.StringIO()
        im.convert('RGB').save(thumb, 'JPEG')
        thumb_data = thumb.getvalue()
        thumb.close()
        self._log_diagnostic("SUCCESS: generated thumb for '%s', caching" % book.title)
        # Cache the tagged thumb
        zfw.writestr(thumb_path, thumb_data)
    except:
        if self.verbose:
            self._log_diagnostic("ERROR generating thumb for '%s', caching empty marker" % book.title)
            import traceback
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self._log_diagnostic(traceback.format_exception_only(exc_type, exc_value)[0].strip())
        # Cache the empty cover
        zfw.writestr(thumb_path, 'None')
    finally:
        # img_data.close()
        zfw.close()
    return thumb_data
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert an HTMLZ archive to an OEB book.

    Extracts the archive into the current working directory, locates the
    single top-level HTML file, runs it through the HTML input plugin,
    then applies metadata and cover from the archive's OPF if present.
    """
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.utils.zipfile import ZipFile

    self.log = log
    html = u''
    top_levels = []

    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()

    # Find the HTML file in the archive. It needs to be
    # top level.
    index = u''
    multiple_html = False
    # Get a list of all top level files in the archive.
    for x in os.listdir(u'.'):
        if os.path.isfile(x):
            top_levels.append(x)
    # Try to find an index. file.
    for x in top_levels:
        if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
            index = x
            break
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = x
            else:
                multiple_html = True
    # Warn the user if there multiple HTML file in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # Multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)

    if index:
        with open(index, 'rb') as tf:
            html = tf.read()
    else:
        raise Exception(_('No top level HTML file found.'))

    if not html:
        raise Exception(_('Top level HTML file %s is empty') % index)

    # Encoding
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        # Sniff the encoding from the first 4KB.
        ienc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(ienc, 'replace')

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = getcwd()
    # Write the decoded HTML to a collision-free temp file name.
    htmlfile = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(htmlfile):
        c += 1
        # NOTE(review): the fallback name is not re-joined with `base`;
        # this only works because `base` is the cwd — confirm intent.
        htmlfile = u'index%d.html'%c
    with open(htmlfile, 'wb') as f:
        f.write(html.encode('utf-8'))
    # Temporarily disable pipeline debugging for the nested conversion.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    # Generate oeb from html conversion.
    with open(htmlfile, 'rb') as f:
        oeb = html_input.convert(f, options, 'html', log, {})
    options.debug_pipeline = odi
    os.remove(htmlfile)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=getcwd())
        cover_path = opf.raster_cover or opf.cover
    # Set the cover.
    if cover_path:
        cdata = None
        with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
        oeb.guide.add('cover', 'Cover', href)

    return oeb
def convert(self, oeb, output_path, input_plugin, opts, log):
    """Serialize an OEB book to an EPUB file at output_path.

    Applies inline-TOC, filename, image, split and cover transforms, ensures
    a UUID identifier (required for font obfuscation), then packages the
    result into an EPUB container, optionally extracting it for debugging.
    """
    self.log, self.opts, self.oeb = log, opts, oeb

    if self.opts.epub_inline_toc:
        from calibre.ebooks.mobi.writer8.toc import TOCAdder
        opts.mobi_toc_at_start = not opts.epub_toc_at_end
        opts.mobi_passthrough = False
        opts.no_inline_toc = False
        TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

    if self.opts.epub_flatten:
        from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
        FlatFilenames()(oeb, opts)
    else:
        from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
        UniqueFilenames()(oeb, opts)

    self.workaround_ade_quirks()
    self.workaround_webkit_quirks()
    self.upshift_markup()
    from calibre.ebooks.oeb.transforms.rescale import RescaleImages
    RescaleImages(check_colorspaces=True)(oeb, opts)

    # Split over-large flows; flow_size option is in KB.
    from calibre.ebooks.oeb.transforms.split import Split
    split = Split(not self.opts.dont_split_on_page_breaks,
                  max_flow_size=self.opts.flow_size * 1024)
    split(self.oeb, self.opts)

    from calibre.ebooks.oeb.transforms.cover import CoverManager
    cm = CoverManager(
        no_default_cover=self.opts.no_default_epub_cover,
        no_svg_cover=self.opts.no_svg_cover,
        preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
    cm(self.oeb, self.opts, self.log)

    self.workaround_sony_quirks()

    if self.oeb.toc.count() == 0:
        # A TOC is mandatory; synthesize a single entry pointing at the
        # first spine item.
        self.log.warn('This EPUB file has no Table of Contents. '
                      'Creating a default TOC')
        first = iter(self.oeb.spine).next()
        self.oeb.toc.add(_('Start'), first.href)

    from calibre.ebooks.oeb.base import OPF
    identifiers = oeb.metadata['identifier']
    uuid = None
    for x in identifiers:
        # NOTE(review): x.get(OPF('scheme'), None).lower() raises
        # AttributeError when the identifier has no scheme attribute —
        # confirm whether upstream guarantees a scheme here.
        if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode(
                x).startswith('urn:uuid:'):
            uuid = unicode(x).split(':')[-1]
            break
    encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

    if uuid is None:
        self.log.warn('No UUID identifier found')
        from uuid import uuid4
        uuid = str(uuid4())
        oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

    if encrypted_fonts and not uuid.startswith('urn:uuid:'):
        # Apparently ADE requires this value to start with urn:uuid:
        # for some absurd reason, or it will throw a hissy fit and refuse
        # to use the obfuscated fonts.
        for x in identifiers:
            if unicode(x) == uuid:
                x.content = 'urn:uuid:' + uuid

    with TemporaryDirectory(u'_epub_output') as tdir:
        from calibre.customize.ui import plugin_for_output_format
        metadata_xml = None
        extra_entries = []
        if self.is_periodical:
            if self.opts.output_profile.epub_periodical_format == 'sony':
                # Sony periodicals need extra atom.xml and metadata.xml entries.
                from calibre.ebooks.epub.periodical import sony_metadata
                metadata_xml, atom_xml = sony_metadata(oeb)
                extra_entries = [(u'atom.xml', 'application/atom+xml', atom_xml)]
        oeb_output = plugin_for_output_format('oeb')
        oeb_output.convert(oeb, tdir, input_plugin, opts, log)
        opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
        self.condense_ncx([
            os.path.join(tdir, x) for x in os.listdir(tdir)
            if x.endswith('.ncx')
        ][0])
        encryption = None
        if encrypted_fonts:
            encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

        from calibre.ebooks.epub import initialize_container
        with initialize_container(output_path, os.path.basename(opf),
                                  extra_entries=extra_entries) as epub:
            epub.add_dir(tdir)
            if encryption is not None:
                epub.writestr('META-INF/encryption.xml', encryption)
            if metadata_xml is not None:
                epub.writestr('META-INF/metadata.xml',
                              metadata_xml.encode('utf-8'))
        if opts.extract_to is not None:
            # Debug aid: unpack the finished EPUB into extract_to.
            from calibre.utils.zipfile import ZipFile
            if os.path.exists(opts.extract_to):
                if os.path.isdir(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                else:
                    os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
            self.log.info('EPUB extracted to', opts.extract_to)
def create_book(mi, path, fmt='epub', opf_name='metadata.opf', html_name='start.xhtml', toc_name='toc.ncx'):
    '''
    Create an empty book in the specified format at the specified location.

    :param mi: metadata object (title, authors, language) seeding the book
    :param path: destination file path
    :param fmt: one of valid_empty_formats ('txt', 'md', 'docx', 'epub', 'azw3', ...)
    :param opf_name: name of the OPF package inside epub/azw3 output
    :param html_name: name of the single content document
    :param toc_name: name of the NCX table of contents
    :raises ValueError: if fmt is not in valid_empty_formats
    '''
    if fmt not in valid_empty_formats:
        raise ValueError('Cannot create empty book in the %s format' % fmt)
    # Trivial formats are handled inline and return early.
    if fmt == 'txt':
        with open(path, 'wb') as f:
            if not mi.is_null('title'):
                f.write(as_bytes(mi.title))
        return
    if fmt == 'md':
        with open(path, 'w', encoding='utf-8') as f:
            if not mi.is_null('title'):
                print('#', mi.title, file=f)
        return
    if fmt == 'docx':
        from calibre.ebooks.conversion.plumber import Plumber
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.utils.logging import default_log
        p = Plumber('a.docx', 'b.docx', default_log)
        p.setup_options()
        # Use the word default of one inch page margins
        for x in 'left right top bottom'.split():
            setattr(p.opts, 'margin_' + x, 72)
        DOCX(p.opts, default_log).write(path, mi, create_empty_document=True)
        return

    path = os.path.abspath(path)
    # Pick the first non-empty <dc:language>; fall back to 'und' (undetermined).
    lang = 'und'
    opf = metadata_to_opf(mi, as_string=False)
    for l in opf.xpath('//*[local-name()="language"]'):
        if l.text:
            lang = l.text
            break
    lang = lang_as_iso639_1(lang) or lang

    # Build a minimal manifest (content doc + NCX) and one-item spine;
    # insert positions 1 and 2 keep metadata first in the package.
    opfns = OPF_NAMESPACES['opf']
    m = opf.makeelement('{%s}manifest' % opfns)
    opf.insert(1, m)
    i = m.makeelement('{%s}item' % opfns, href=html_name, id='start')
    i.set('media-type', guess_type('a.xhtml'))
    m.append(i)
    i = m.makeelement('{%s}item' % opfns, href=toc_name, id='ncx')
    i.set('media-type', guess_type(toc_name))
    m.append(i)
    s = opf.makeelement('{%s}spine' % opfns, toc="ncx")
    opf.insert(2, s)
    i = s.makeelement('{%s}itemref' % opfns, idref='start')
    s.append(i)

    CONTAINER = '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
   <rootfiles>
      <rootfile full-path="{}" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
    '''.format(prepare_string_for_xml(opf_name, True)).encode('utf-8')

    # Fill the bundled template with escaped metadata, then pretty-print it.
    HTML = P('templates/new_book.html', data=True).decode('utf-8').replace(
        '_LANGUAGE_', prepare_string_for_xml(lang, True)).replace(
        '_TITLE_', prepare_string_for_xml(mi.title)).replace(
        '_AUTHORS_', prepare_string_for_xml(authors_to_string(mi.authors))).encode('utf-8')
    h = parse(HTML)
    pretty_html_tree(None, h)
    HTML = serialize(h, 'text/html')
    ncx = etree.tostring(create_toc(mi, opf, html_name, lang), encoding='utf-8',
                         xml_declaration=True, pretty_print=True)
    pretty_xml_tree(opf)
    opf = etree.tostring(opf, encoding='utf-8', xml_declaration=True, pretty_print=True)

    if fmt == 'azw3':
        with TemporaryDirectory('create-azw3') as tdir, CurrentDir(tdir):
            for name, data in ((opf_name, opf), (html_name, HTML), (toc_name, ncx)):
                with open(name, 'wb') as f:
                    f.write(data)
            c = Container(os.path.dirname(os.path.abspath(opf_name)), opf_name, DevNull())
            opf_to_azw3(opf_name, path, c)
    else:
        with ZipFile(path, 'w', compression=ZIP_STORED) as zf:
            # The EPUB spec requires the mimetype entry first and uncompressed.
            zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
            zf.writestr('META-INF/', b'', 0o755)
            zf.writestr('META-INF/container.xml', CONTAINER)
            zf.writestr(opf_name, opf)
            zf.writestr(html_name, HTML)
            zf.writestr(toc_name, ncx)
def create_themeball(report, progress=None, abort=None): pool = ThreadPool(processes=cpu_count()) buf = BytesIO() num = count() error_occurred = Event() def optimize(name): if abort is not None and abort.is_set(): return if error_occurred.is_set(): return try: i = next(num) if progress is not None: progress(i, _('Optimizing %s') % name) srcpath = os.path.join(report.path, name) ext = srcpath.rpartition('.')[-1].lower() if ext == 'png': optimize_png(srcpath) elif ext in ('jpg', 'jpeg'): optimize_jpeg(srcpath) except Exception: return sys.exc_info() errors = tuple( filter(None, pool.map(optimize, tuple(report.name_map.iterkeys())))) pool.close(), pool.join() if abort is not None and abort.is_set(): return if errors: e = errors[0] reraise(*e) if progress is not None: progress(next(num), _('Creating theme file')) with ZipFile(buf, 'w') as zf: for name in report.name_map: srcpath = os.path.join(report.path, name) with lopen(srcpath, 'rb') as f: zf.writestr(name, f.read(), compression=ZIP_STORED) buf.seek(0) out = BytesIO() if abort is not None and abort.is_set(): return None, None if progress is not None: progress(next(num), _('Compressing theme file')) compress(buf, out, level=9) buf = BytesIO() prefix = report.name if abort is not None and abort.is_set(): return None, None with ZipFile(buf, 'w') as zf: with lopen(os.path.join(report.path, THEME_METADATA), 'rb') as f: zf.writestr(prefix + '/' + THEME_METADATA, f.read()) zf.writestr(prefix + '/' + THEME_COVER, create_cover(report)) zf.writestr(prefix + '/' + 'icons.zip.xz', out.getvalue(), compression=ZIP_STORED) if progress is not None: progress(next(num), _('Finished')) return buf.getvalue(), prefix
def do_dump(path, dest): if os.path.exists(dest): shutil.rmtree(dest) with ZipFile(path) as zf: zf.extractall(dest) pretty_all_xml_in_dir(dest)
def _get_epub_toc(self, cid=None, path=None, prepend_title=None):
    '''
    Given a calibre id, return the epub TOC indexed by section
    If cid, use copy in library, else use path to copy on device

    :param cid: calibre book id (currently unused — see commented block below)
    :param path: device path of the ePub (zipped file or unzipped directory)
    :param prepend_title: optional title prefixed to every TOC entry
    :return: OrderedDict mapping spine index (as str) -> TOC entry text,
             or None when the book cannot be read/parsed
    '''
    toc = None
    # if cid is not None:
    #     mi = self.opts.gui.current_db.get_metadata(cid, index_is_id=True)
    #     toc = None
    #     if 'EPUB' in mi.formats:
    #         fpath = self.opts.gui.current_db.format(cid, 'EPUB', index_is_id=True, as_path=True)
    #     else:
    #         return toc
    # elif path is not None:
    #     fpath = path
    # else:
    #     return toc
    fpath = path

    # iBooks stores books unzipped
    # Marvin stores books zipped
    # Need spine, ncx_tree to construct toc
    if self.ios.stat(fpath) and self.ios.stat(fpath)['st_ifmt'] == 'S_IFDIR':
        # Find the OPF in the unzipped ePub
        fp = '/'.join([fpath, 'META-INF', 'container.xml'])
        cf = cStringIO.StringIO(self.ios.read(fp))
        container = etree.parse(cf)
        opf_file = container.xpath('.//*[local-name()="rootfile"]')[0].get('full-path')
        oebps = opf_file.rpartition('/')[0]
        fp = '/'.join([fpath, opf_file])
        opf = cStringIO.StringIO(self.ios.read(fp))
        opf_tree = etree.parse(opf)
        spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
        ncx_fs = spine.get('toc')
        manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
        # Resolve the spine's toc idref to the NCX href in the manifest.
        ncx_file = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')
        fp = '/'.join([fpath, oebps, ncx_file])
        ncxf = cStringIO.StringIO(self.ios.read(fp))
        ncx_tree = etree.parse(ncxf)
        # self._log(etree.tostring(ncx_tree, pretty_print=True))
    else:
        # Find the OPF file in the zipped ePub
        zfo = cStringIO.StringIO(self.ios.read(fpath, mode='rb'))
        try:
            zf = ZipFile(zfo, 'r')
            container = etree.fromstring(zf.read('META-INF/container.xml'))
            opf_tree = etree.fromstring(
                zf.read(container.xpath('.//*[local-name()="rootfile"]')[0].get('full-path')))
            spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
            ncx_fs = spine.get('toc')
            manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
            ncx = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')

            # Find the ncx file
            # NOTE(review): substring match picks the first archive member
            # whose name contains the href — presumably to tolerate an
            # OEBPS/ prefix; confirm it cannot match the wrong file.
            fnames = zf.namelist()
            _ncx = [x for x in fnames if ncx in x][0]
            ncx_tree = etree.fromstring(zf.read(_ncx))
        except:
            import traceback
            self._log_location()
            self._log(" unable to unzip '%s'" % fpath)
            self._log(traceback.format_exc())
            return toc

    # fpath points to epub (zipped or unzipped dir)
    # spine, ncx_tree populated
    try:
        toc = OrderedDict()
        # 1. capture idrefs from spine
        for i, el in enumerate(spine):
            toc[str(i)] = el.get('idref')

        # 2. Resolve <spine> idrefs to <manifest> hrefs
        for el in toc:
            toc[el] = manifest.find('.//*[@id="%s"]' % toc[el]).get('href')

        # 3. Build a dict of src:toc_entry
        src_map = OrderedDict()
        navMap = ncx_tree.xpath('.//*[local-name()="navMap"]')[0]
        for navPoint in navMap:
            # Get the first-level entry
            # Strip any #fragment so srcs compare equal to manifest hrefs.
            src = re.sub(r'#.*$', '', navPoint.xpath('.//*[local-name()="content"]')[0].get('src'))
            toc_entry = navPoint.xpath('.//*[local-name()="text"]')[0].text
            src_map[src] = toc_entry

            # Get any nested navPoints
            nested_navPts = navPoint.xpath('.//*[local-name()="navPoint"]')
            for nnp in nested_navPts:
                src = re.sub(r'#.*$', '', nnp.xpath('.//*[local-name()="content"]')[0].get('src'))
                toc_entry = nnp.xpath('.//*[local-name()="text"]')[0].text
                src_map[src] = toc_entry

        # Resolve src paths to toc_entry
        for section in toc:
            if toc[section] in src_map:
                if prepend_title:
                    toc[section] = "%s · %s" % (prepend_title, src_map[toc[section]])
                else:
                    toc[section] = src_map[toc[section]]
            else:
                toc[section] = None

        # 5. Fill in the gaps
        # Spine items without their own navPoint inherit the entry of the
        # nearest preceding section that had one.
        current_toc_entry = None
        for section in toc:
            if toc[section] is None:
                toc[section] = current_toc_entry
            else:
                current_toc_entry = toc[section]
    except:
        import traceback
        self._log_location()
        self._log("{:~^80}".format(" error parsing '%s' " % fpath))
        self._log(traceback.format_exc())
        self._log("{:~^80}".format(" end traceback "))

    return toc