def read_embedded_metadata(self, root, elem, guide):
    '''
    Parse the metadata embedded in ``elem`` and look for a cover image.

    ``elem`` is serialized, wrapped in a ``<package>`` element and handed to
    ``OPF``; the resulting metadata is stored in ``self.embedded_mi``. When
    ``guide`` is given, the first reference whose type contains ``cover`` is
    resolved to an element id in ``root``. The first ``img`` element found at
    or after that element (in ``root.iter()`` order) supplies the cover
    ``src`` and is removed from the tree.
    '''
    header = '<?xml version="1.0" encoding="utf-8" ?>\n<package>'
    raw = header + html.tostring(elem, encoding='utf-8') + '</package>'
    self.embedded_mi = OPF(cStringIO.StringIO(raw)).to_book_metadata()
    if guide is None:
        return
    for ref in guide.xpath('descendant::reference'):
        if 'cover' not in ref.get('type', '').lower():
            continue
        target = ref.get('href', '')
        if target.startswith('#'):
            target = target[1:]
        anchors = root.xpath('//*[@id="%s"]' % target)
        if anchors:
            anchor = anchors[0]
            past_anchor = False
            for node in root.iter():
                if node is anchor:
                    past_anchor = True
                if past_anchor and node.tag == 'img':
                    self.embedded_mi.cover = node.get('src', None)
                    node.getparent().remove(node)
                    break
        # Only the first cover reference of the guide is ever examined.
        break
def handle_zip_of_opf_files(self, stream):
    '''
    Given a zip up of a bunch of opf files, either merge them or add them to
    library.

    Every ``.opf`` member is parsed; when ``self.extract_id`` yields an id
    and ``self.get_casanova_metadata`` knows it, the existing book is
    updated, otherwise the book is imported as a new entry. Image members
    (jpg/png/gif) whose base name is a key of ``self.book_map`` are set as
    the cover of the mapped book.

    :param stream: the zip archive, as accepted by ``ZipFile``
    :return: dict with the counters ``'updated'`` and ``'added'``
    '''
    result = {'updated': 0, 'added': 0}
    with ZipFile(stream, 'r') as zf:
        self.start_applying_updates()
        for zi in zf.infolist():
            ext = zi.filename.rpartition('.')[-1].lower()
            if ext == 'opf':
                try:
                    mi = OPF(zf.open(zi)).to_book_metadata()
                    casanova_id = self.extract_id(mi)
                    if casanova_id:
                        book_mi = self.get_casanova_metadata(casanova_id['id'])
                        if book_mi:
                            # Update an existing book's metadata!
                            result['updated'] += 1
                            self.apply_metadata_update(casanova_id['id'], book_mi, mi)
                        else:
                            # Create a new book entry
                            result['added'] += 1
                            self.model.db.import_book(mi, [])
                except Exception:
                    # Best effort: one unreadable OPF must not abort the
                    # whole archive. Only Exception is caught so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    pass
            elif ext in ('jpg', 'png', 'gif'):
                # try and handle the cover
                casanova_id = zi.filename.partition('.')[0].lower()
                if casanova_id in self.book_map:
                    book_id = self.book_map[casanova_id]
                    self.db.set_cover(book_id, zf.open(zi))
        self.finish_applying_updates()
    return result
def get_metadata(stream):
    '''
    Parse OPF data and return ``(metadata, version, raster_cover,
    first_spine_item)``.

    ``stream`` is either something ``parse_opf`` accepts directly or raw
    ``bytes``, which are wrapped in a ``DummyFile`` first.
    '''
    source = DummyFile(stream) if isinstance(stream, bytes) else stream
    root = parse_opf(source)
    version = parse_opf_version(root.get('version'))
    opf = OPF(None, preparsed_opf=root, read_toc=False)
    mi = opf.to_book_metadata()
    raster_cover = opf.raster_cover
    first_item = opf.first_spine_item()
    return mi, version, raster_cover, first_item
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object

    The first OPF found in the zip archive ``stream`` is parsed. With
    ``extract_cover`` set, the cover is taken from ``opf.raster_cover`` or,
    failing that, from the first ``<meta name="cover">`` whose content ends
    in a jpeg/jpg/png extension. Any error leaves the metadata collected so
    far untouched and it is returned as is.
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)
    try:
        with ZipFile(stream) as zf:
            opf = OPF(StringIO(zf.read(get_first_opf_name(zf))))
            mi = opf.to_book_metadata()
            if extract_cover:
                cover_href = opf.raster_cover
                if not cover_href:
                    cover_metas = opf.metadata.xpath('//*[local-name()="meta" and @name="cover"]')
                    for meta in cover_metas:
                        candidate = meta.get('content')
                        extension = candidate.rpartition('.')[2].lower()
                        if extension in {'jpeg', 'jpg', 'png'}:
                            cover_href = candidate
                            break
                if cover_href:
                    try:
                        fmt = os.path.splitext(cover_href)[1]
                        mi.cover_data = (fmt, zf.read(cover_href))
                    except Exception:
                        # A missing/unreadable cover is not an error
                        pass
    except Exception:
        # Fall through and return whatever metadata we have
        pass
    return mi
def get_metadata(stream):
    '''
    Read metadata (and, if possible, a cover) from a LIT file.

    The OPF text returned by ``LitContainer.get_metadata()`` is parsed with
    ``OPF``. Every guide entry whose type contains ``cover`` is matched
    against the manifest (by path, also trying ``&`` encoded as ``%26``) and
    the matching file data is collected. The largest candidate is used as
    the cover, unless the second largest has the same type with a
    ``-standard`` suffix, in which case that one is used.

    :param stream: file-like object containing the LIT book
    :return: the metadata object produced by ``OPF.to_book_metadata()``
    '''
    from calibre.ebooks.lit.reader import LitContainer
    from calibre.utils.logging import Log
    container = LitContainer(stream, Log())
    src = container.get_metadata().encode('utf-8')
    litfile = container._litfile
    opf = OPF(cStringIO.StringIO(src), os.getcwdu())
    mi = opf.to_book_metadata()
    covers = []
    for ref in opf.iterguide():
        ctype = ref.get('type', '')
        if 'cover' not in ctype.lower():
            continue
        href = ref.get('href', '')
        candidates = (href, href.replace('&', '%26'))
        for entry in litfile.manifest.values():
            if entry.path in candidates:
                try:
                    covers.append((litfile.get_file('/data/' + entry.internal), ctype))
                except Exception:
                    # Best effort: an unreadable cover entry is skipped
                    pass
                break
    # Guard on a non-empty list so that a single cover is used as well and
    # an empty list can never be indexed.
    if covers:
        covers.sort(key=lambda c: len(c[0]), reverse=True)
        idx = 0
        if len(covers) > 1 and covers[1][1] == covers[0][1] + '-standard':
            idx = 1
        mi.cover_data = ('jpg', covers[idx][0])
    return mi
def save_serialized_to_disk(ids, data, plugboards, root, opts, callback):
    '''
    Save the books in ``ids`` to disk from their serialized form.

    ``data[x]`` is unpacked as ``(opf, cover_path, format_map,
    last_modified)``. For every book ``callback(id, title, failed, tb)`` is
    invoked when ``callback`` is callable; a false return value stops the
    run.

    :return: list of ``(id, title, traceback_or_message)`` for failed books
    '''
    from calibre.ebooks.metadata.opf2 import OPF
    root, opts, length = _sanitize_args(root, opts)
    failures = []
    for book_key in ids:
        raw_opf, cover_path, format_map, last_modified = data[book_key]
        if isinstance(raw_opf, unicode):
            raw_opf = raw_opf.encode('utf-8')
        mi = OPF(cStringIO.StringIO(raw_opf)).to_book_metadata()
        try:
            mi.last_modified = parse_date(last_modified)
        except:
            pass
        try:
            with open(cover_path, 'rb') as f:
                cover_data = f.read()
        except:
            cover_data = None
        try:
            failed, saved_id, title = do_save_book_to_disk(
                book_key, mi, cover_data, plugboards, format_map, root, opts, length)
            tb = _('Requested formats not available')
        except:
            failed, saved_id, title = True, book_key, mi.title
            tb = traceback.format_exc()
        if failed:
            failures.append((saved_id, title, tb))
        if callable(callback) and not callback(int(saved_id), title, failed, tb):
            break
    return failures
def process_dir(self, dirpath, filenames, book_id):
    '''
    Record the book stored in ``dirpath``.

    The directory's ``metadata.opf`` is parsed. If its application id
    matches ``book_id`` an entry is appended to ``self.books``, otherwise
    ``dirpath`` is appended to ``self.mismatched_dirs``. In both cases the
    author links of the book are merged into ``self.authors_links``.
    '''
    book_id = int(book_id)
    ebook_files = [name for name in filenames if self.is_ebook_file(name)]
    # One (FORMAT, size, base name) tuple per ebook file
    formats = [
        (
            os.path.splitext(name)[1][1:].upper(),
            os.path.getsize(os.path.join(dirpath, name)),
            os.path.splitext(name)[0],
        )
        for name in ebook_files
    ]
    opf_path = os.path.join(dirpath, 'metadata.opf')
    mi = OPF(opf_path, basedir=dirpath).to_book_metadata()
    mtime = os.path.getmtime(opf_path)
    rel_path = os.path.relpath(dirpath, self.src_library_path).replace(os.sep, '/')

    if int(mi.application_id) != book_id:
        self.mismatched_dirs.append(dirpath)
    else:
        self.books.append({
            'mi': mi,
            'timestamp': mtime,
            'formats': formats,
            'id': book_id,
            'dirpath': dirpath,
            'path': rel_path,
        })

    for author, link in mi.get('author_link_map', {}).iteritems():
        existing_link, existing_ts = self.authors_links.get(author, (None, None))
        # Take the link when none is known yet, or when it differs and the
        # stored timestamp is older than this book's timestamp.
        if existing_link is None or (existing_link != link and existing_ts < mi.timestamp):
            self.authors_links[author] = (link, mi.timestamp)
def process_result(self, group_id, result):
    '''
    Handle the metadata-reading result for one group of files.

    Builds the metadata object (falling back to
    ``self.report_metadata_failure`` on errors), applies the title
    fallback, the tag mapping rules and the author mapping rules, and then
    either queues the book (no database), merges it into identical books,
    records it as a duplicate, or adds it.
    '''
    if not result.err:
        paths, opf, has_cover, duplicate_info = result.value
        try:
            mi = OPF(
                BytesIO(opf), basedir=self.tdir, populate_spine=False,
                try_to_guess_cover=False).to_book_metadata()
            mi.read_metadata_failed = False
        except Exception:
            mi = self.report_metadata_failure(group_id, traceback.format_exc())
    else:
        mi = self.report_metadata_failure(group_id, result.traceback)
        paths = self.file_groups[group_id]
        has_cover = False
        duplicate_info = set() if self.add_formats_to_existing else False

    if mi.is_null('title'):
        # Use the base name of the first path (if any) as the title
        for path in paths:
            mi.title = os.path.splitext(os.path.basename(path))[0]
            break
    if mi.application_id == '__calibre_dummy__':
        mi.application_id = None
    if gprefs.get('tag_map_on_add_rules'):
        from calibre.ebooks.metadata.tag_mapper import map_tags
        mi.tags = map_tags(mi.tags, gprefs['tag_map_on_add_rules'])
    if self.author_map_rules:
        from calibre.ebooks.metadata.author_mapper import map_authors
        mapped_authors = map_authors(mi.authors, self.author_map_rules)
        if mapped_authors != mi.authors:
            mi.authors = mapped_authors
            if self.db is not None:
                mi.author_sort = self.db.author_sort_from_authors(mi.authors)
            else:
                mi.author_sort = authors_to_sort_string(mi.authors)

    self.pd.msg = mi.title

    cover_path = None
    if has_cover:
        cover_path = os.path.join(self.tdir, '%s.cdata' % group_id)

    if self.db is None:
        if paths:
            self.items.append((mi, cover_path, paths))
        return

    if self.add_formats_to_existing:
        identical_book_ids = find_identical_books(mi, self.find_identical_books_data)
        if not identical_book_ids:
            self.add_book(mi, cover_path, paths)
            return
        try:
            self.merge_books(mi, cover_path, paths, identical_book_ids)
        except Exception:
            report = self.report.append
            report('')
            report('-' * 70)
            report(_('Failed to merge the book: ') + mi.title)
            for f in paths:
                report('\t' + f)
            report(_('With error:'))
            report(traceback.format_exc())
        return

    if duplicate_info or icu_lower(mi.title or _('Unknown')) in self.added_duplicate_info:
        self.duplicates.append((mi, cover_path, paths))
    else:
        self.add_book(mi, cover_path, paths)
def update_metadata(ebook, new_opf):
    '''
    Overwrite the metadata in the book's OPF with the metadata read from
    the OPF file at path ``new_opf``.

    The cover fields of the new metadata are cleared before it is applied
    (with ``apply_null=True`` and ``update_timestamp=True``); the book's OPF
    is then rewritten in place.
    '''
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.ebooks.metadata.epub import update_metadata as apply_to_opf
    opf_dir = os.path.dirname(ebook.name_to_abspath(ebook.opf_name))
    with ebook.open(ebook.opf_name, "r+b") as stream, open(new_opf, "rb") as ns:
        opf = OPF(stream, basedir=opf_dir, populate_spine=False, unquote_urls=False)
        mi = OPF(ns, unquote_urls=False, populate_spine=False).to_book_metadata()
        mi.cover = None
        mi.cover_data = (None, None)
        apply_to_opf(opf, mi, apply_null=True, update_timestamp=True)
        # Replace the old contents of the OPF with the re-rendered document
        stream.seek(0)
        stream.truncate()
        stream.write(opf.render())
def read_serialized_metadata(book_id, data):
    '''
    Load metadata and cover bytes described by ``data``.

    ``data['opf']`` is passed to ``OPF``; ``data['last_modified']`` is
    applied when it can be parsed; ``data['cover']`` (optional) is the path
    of the cover file. ``book_id`` is not used by this function.

    :return: ``(mi, cdata)`` where ``cdata`` is ``None`` without a cover
    '''
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.utils.date import parse_date
    opf_path = data['opf']
    opf = OPF(
        opf_path, try_to_guess_cover=False, populate_spine=False,
        basedir=os.path.dirname(opf_path))
    mi = opf.to_book_metadata()
    try:
        mi.last_modified = parse_date(data['last_modified'])
    except:
        pass
    mi.cover = None
    mi.cover_data = (None, None)
    if 'cover' not in data:
        return mi, None
    with lopen(data['cover'], 'rb') as f:
        cdata = f.read()
    return mi, cdata
def main(do_identify, covers, metadata, ensure_fields, tdir):
    '''
    Download metadata and/or covers for the books in ``metadata``.

    ``metadata`` maps book ids to serialized OPF data. Results are written
    into ``tdir`` as ``<id>.mi`` (merged metadata), ``<id>.cover`` (cover
    data) and ``<id>.log`` (log text for that book).

    :return: ``(failed_ids, failed_covers, all_failed)``
    '''
    failed_ids, failed_covers = set(), set()
    all_failed = True
    log = GUILog()
    patch_plugins()

    for book_id, raw_opf in metadata.iteritems():
        mi = OPF(BytesIO(raw_opf), basedir=tdir, populate_spine=False).to_book_metadata()
        title, authors, identifiers = mi.title, mi.authors, mi.identifiers
        log.clear()

        if do_identify:
            try:
                results = identify(log, Event(), title=title, authors=authors,
                        identifiers=identifiers)
            except:
                results = []
            if not results:
                log.error('Failed to download metadata for', title)
                failed_ids.add(book_id)
            else:
                all_failed = False
                mi = merge_result(mi, results[0], ensure_fields=ensure_fields)
                identifiers = mi.identifiers
                if not mi.is_null('rating'):
                    # set_metadata expects a rating out of 10
                    mi.rating *= 2
                with open(os.path.join(tdir, '%d.mi'%book_id), 'wb') as f:
                    f.write(metadata_to_opf(mi, default_lang='und'))

        if covers:
            cdata = download_cover(log, title=title, authors=authors, identifiers=identifiers)
            if cdata is not None:
                with open(os.path.join(tdir, '%d.cover'%book_id), 'wb') as f:
                    f.write(cdata[-1])
                all_failed = False
            else:
                failed_covers.add(book_id)

        with open(os.path.join(tdir, '%d.log'%book_id), 'wb') as f:
            f.write(log.plain_text.encode('utf-8'))

    return failed_ids, failed_covers, all_failed
def get_metadata(stream, extract_cover=True):
    '''
    Return metadata as a L{MetaInfo} object

    The first OPF in the zip archive ``stream`` is parsed; with
    ``extract_cover`` set, the file named by ``opf.raster_cover`` is read
    into ``mi.cover_data``. On any error the metadata collected so far is
    returned.
    '''
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)
    try:
        with ZipFile(stream) as zf:
            opf = OPF(StringIO(zf.read(get_first_opf_name(zf))))
            mi = opf.to_book_metadata()
            cover_href = opf.raster_cover if extract_cover else None
            if cover_href:
                fmt = os.path.splitext(cover_href)[1]
                mi.cover_data = (fmt, zf.read(cover_href))
    except:
        # Fall through and return whatever metadata we have
        pass
    return mi
def opf_metadata(opfpath):
    '''
    Read metadata from an OPF file that carries an application id.

    :param opfpath: path to the OPF file, or an open file-like object (its
        ``name`` attribute, if present, is used to locate the cover)
    :return: the metadata object, with ``cover_data`` set to
        ``(extension, bytes)`` when the referenced cover file is readable;
        ``None`` when the OPF has no application id or reading fails (the
        traceback is printed in that case)
    '''
    opened_here = not hasattr(opfpath, 'read')
    if opened_here:
        f = open(opfpath, 'rb')
    else:
        f = opfpath
        opfpath = getattr(f, 'name', os.getcwdu())
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        if opf.application_id is not None:
            mi = opf.to_book_metadata()
            if hasattr(opf, 'cover') and opf.cover:
                cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
                if os.access(cpath, os.R_OK):
                    fmt = cpath.rpartition('.')[-1]
                    with open(cpath, 'rb') as cover_file:
                        mi.cover_data = (fmt, cover_file.read())
            return mi
    except Exception:
        # Best effort: report the problem and return None
        import traceback
        traceback.print_exc()
    finally:
        # Only close what this function opened; a caller-supplied file
        # object stays under the caller's control.
        if opened_here:
            f.close()
def set_metadata(stream, mi):
    '''
    Write the metadata ``mi`` (and its cover, if any) into the zip archive
    ``stream``.

    The first OPF in the archive is updated with ``mi`` and written back
    via ``safe_replace``. Cover bytes come from ``mi.cover_data[1]`` or,
    failing that, from the file at ``mi.cover``; they replace the OPF's
    raster cover, or are added as ``cover.jpg`` when the OPF names none.
    ``mi.cover`` is set to the cover's path inside the archive.
    '''
    replacements = {}
    # Bound up front so the cleanup below never sees an undefined name
    cpath = None

    # Get the OPF in the archive.
    with ZipFile(stream) as zf:
        opf_path = get_first_opf_name(zf)
        opf_stream = StringIO(zf.read(opf_path))
    opf = OPF(opf_stream)

    # Cover: prefer in-memory data, fall back to the cover file.
    try:
        new_cdata = mi.cover_data[1]
    except Exception:
        new_cdata = None
    if not new_cdata:
        try:
            with open(mi.cover, 'rb') as f:
                new_cdata = f.read()
        except Exception:
            # No usable cover file either; carry on without a cover
            pass

    try:
        if new_cdata:
            cpath = opf.raster_cover or 'cover.jpg'
            new_cover = _write_new_cover(new_cdata, cpath)
            replacements[cpath] = open(new_cover.name, 'rb')
            mi.cover = cpath

        # Update the metadata.
        opf.smart_update(mi, replace_metadata=True)
        newopf = StringIO(opf.render())
        safe_replace(stream, opf_path, newopf, extra_replacements=replacements, add_missing=True)
    finally:
        # Cleanup temporary files, also when the update itself failed.
        tmp = replacements.get(cpath) if cpath is not None else None
        if tmp is not None:
            try:
                tmp.close()
                os.remove(tmp.name)
            except Exception:
                pass
def zip_opf_metadata(opfpath, zf):
    '''
    Read metadata from an OPF file whose cover lives in the zip file ``zf``.

    :param opfpath: path to the OPF file, or an open file-like object
    :param zf: zip file object providing ``namelist()`` and ``read()``
    :return: the metadata object; ``cover`` is reset to ``None`` and
        ``cover_data`` is set to ``(extension, bytes)`` when the cover's
        base name is a member of ``zf``
    '''
    from calibre.ebooks.metadata.opf2 import OPF
    opened_here = not hasattr(opfpath, 'read')
    if opened_here:
        f = open(opfpath, 'rb')
    else:
        f = opfpath
        opfpath = getattr(f, 'name', getcwd())
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        mi = opf.to_book_metadata()
    finally:
        # Only close what this function opened; a caller-supplied file
        # object stays under the caller's control.
        if opened_here:
            f.close()
    # This is broken, in that it only works for
    # when both the OPF file and the cover file are in the root of the
    # zip file and the cover is an actual raster image, but I don't care
    # enough to make it more robust
    if getattr(mi, 'cover', None):
        covername = os.path.basename(mi.cover)
        mi.cover = None
        if covername in zf.namelist():
            fmt = covername.rpartition('.')[-1]
            mi.cover_data = (fmt, zf.read(covername))
    return mi
def paste_metadata(self):
    '''
    Paste the book metadata held on the clipboard onto the selected books.

    Fields listed in ``tweaks['exclude_fields_on_paste']`` are nulled in
    the pasted metadata; excluded title/authors are re-read from each
    target book. The clipboard image is applied as the cover unless
    ``cover`` is excluded.
    '''
    rows = self.gui.library_view.selectionModel().selectedRows()
    if not rows or len(rows) == 0:
        return error_dialog(self.gui, _('Cannot paste metadata'),
                            _('No books selected'), show=True)
    md = QApplication.clipboard().mimeData()
    if not md.hasFormat('application/calibre-book-metadata'):
        return error_dialog(self.gui, _('Cannot paste metadata'),
                            _('No copied metadata available'), show=True)
    if len(rows) > 1:
        question = _(
            'You are pasting metadata onto <b>multiple books</b> ({num_of_books}). Are you'
            ' sure you want to do that?').format(num_of_books=len(rows))
        if not confirm(question, 'paste-onto-multiple', parent=self.gui):
            return
    data = bytes(md.data('application/calibre-book-metadata'))
    mi = OPF(BytesIO(data), populate_spine=False, read_toc=False,
             try_to_guess_cover=False).to_book_metadata()
    mi.application_id = mi.uuid_id = None

    exclude = set(tweaks['exclude_fields_on_paste'])
    # 'cover' is not a metadata field; it only controls the image paste
    cover = None
    if 'cover' not in exclude:
        cover = md.imageData()
    exclude.discard('cover')
    for field in exclude:
        mi.set_null(field)

    db = self.gui.current_db
    book_ids = {db.id(r.row()) for r in rows}
    keep_title = 'title' in exclude
    keep_authors = 'authors' in exclude
    for book_id in book_ids:
        if keep_title:
            mi.title = db.new_api.field_for('title', book_id)
        if keep_authors:
            mi.authors = db.new_api.field_for('authors', book_id)
        db.new_api.set_metadata(book_id, mi, ignore_errors=True)
    if cover:
        db.new_api.set_cover(dict.fromkeys(book_ids, cover))
    self.refresh_books_after_metadata_edit(book_ids)
def read_user_metadata(self):
    '''
    Read all metadata specified by the user. Command line options
    override metadata from a specified OPF file.

    The result is stored in ``self.user_metadata``. A cover given as an
    http(s) URL is downloaded first; the cover file is then read into
    ``cover_data`` as ``(extension, bytes)`` (extension defaults to
    ``jpg`` when it is not png/jpg/jpeg/gif) and ``cover`` is cleared.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ebooks.metadata.opf2 import OPF
    mi = MetaInformation(None, [])
    if self.opts.read_metadata_from_opf is not None:
        opf_path = os.path.abspath(self.opts.read_metadata_from_opf)
        self.opts.read_metadata_from_opf = opf_path
        # Context manager so the OPF file handle is always released
        with open(opf_path, 'rb') as f:
            mi = OPF(f, os.path.dirname(opf_path)).to_book_metadata()
    self.opts_to_mi(mi)
    if mi.cover:
        if mi.cover.startswith(('http:', 'https:')):
            mi.cover = self.download_cover(mi.cover)
        ext = mi.cover.rpartition('.')[-1].lower().strip()
        if ext not in ('png', 'jpg', 'jpeg', 'gif'):
            ext = 'jpg'
        with open(mi.cover, 'rb') as f:
            mi.cover_data = (ext, f.read())
        mi.cover = None
    self.user_metadata = mi
def process_result(self, group_id, result):
    """
    Handle the metadata-reading result for one group of files.

    Builds the metadata object (falling back to
    ``self.report_metadata_failure`` on errors), applies the title
    fallback and the tag mapping rules, and then either queues the book
    (no database), merges it into identical books, records it as a
    duplicate, or adds it.
    """
    if not result.err:
        paths, opf, has_cover, duplicate_info = result.value
        try:
            parsed = OPF(
                BytesIO(opf),
                basedir=self.tdir,
                populate_spine=False,
                try_to_guess_cover=False,
            )
            mi = parsed.to_book_metadata()
            mi.read_metadata_failed = False
        except Exception:
            mi = self.report_metadata_failure(group_id, traceback.format_exc())
    else:
        mi = self.report_metadata_failure(group_id, result.traceback)
        paths = self.file_groups[group_id]
        has_cover = False
        duplicate_info = set() if self.add_formats_to_existing else False

    if mi.is_null("title"):
        # Use the base name of the first path (if any) as the title
        for path in paths:
            mi.title = os.path.splitext(os.path.basename(path))[0]
            break
    if mi.application_id == "__calibre_dummy__":
        mi.application_id = None
    if gprefs.get("tag_map_on_add_rules"):
        from calibre.ebooks.metadata.tag_mapper import map_tags

        mi.tags = map_tags(mi.tags, gprefs["tag_map_on_add_rules"])

    self.pd.msg = mi.title

    cover_path = None
    if has_cover:
        cover_path = os.path.join(self.tdir, "%s.cdata" % group_id)

    if self.db is None:
        if paths:
            self.items.append((mi, cover_path, paths))
        return

    if self.add_formats_to_existing:
        identical_book_ids = find_identical_books(mi, self.find_identical_books_data)
        if not identical_book_ids:
            self.add_book(mi, cover_path, paths)
            return
        try:
            self.merge_books(mi, cover_path, paths, identical_book_ids)
        except Exception:
            report = self.report.append
            report("")
            report("-" * 70)
            report(_("Failed to merge the book: ") + mi.title)
            for f in paths:
                report("\t" + f)
            report(_("With error:"))
            report(traceback.format_exc())
        return

    if duplicate_info or icu_lower(mi.title or _("Unknown")) in self.added_duplicate_info:
        self.duplicates.append((mi, cover_path, paths))
    else:
        self.add_book(mi, cover_path, paths)
def get_metadata(book_id):
    '''
    Return ``(oldmi, newmi)`` for ``book_id``.

    ``oldmi`` is the metadata currently in ``db``; ``newmi`` is read from
    the OPF path stored in ``id_map[book_id]`` (or built from the old title
    and authors when there is no OPF). A cover path in ``id_map`` is read
    into ``newmi.cover_data``. Relies on ``db`` and ``id_map`` being
    available from the surrounding scope.
    '''
    oldmi = db.get_metadata(book_id, index_is_id=True, get_cover=True, cover_as_data=True)
    opf_path, cover_path = id_map[book_id]
    if opf_path is not None:
        with open(opf_path, 'rb') as f:
            newmi = OPF(f, basedir=os.path.dirname(opf_path), populate_spine=False).to_book_metadata()
        newmi.cover = None
        newmi.cover_data = (None, None)
        for field in ('title', 'authors'):
            if newmi.is_null(field):
                # Title and author are set to null if they are
                # the same as the originals as an optimization,
                # we undo that, as it is confusing.
                newmi.set(field, copy.copy(oldmi.get(field)))
    else:
        newmi = Metadata(oldmi.title, authors=tuple(oldmi.authors))
    if cover_path:
        with open(cover_path, 'rb') as f:
            newmi.cover_data = ('jpg', f.read())
    return oldmi, newmi
def test_backup(self):  # {{{
    'Test the automatic backup of changed metadata'
    cache = self.init_cache(self.cloned_library)
    ae, af, sf = self.assertEqual, self.assertFalse, cache.set_field
    # Start clean: write out everything that is currently marked dirty
    cache.dump_metadata()
    af(cache.dirtied_cache)
    from calibre.db.backup import MetadataBackup
    mb = MetadataBackup(cache, interval=0.01, scheduling_interval=0)
    mb.start()
    try:
        book_ids = (1, 2, 3)
        titles = {book_id: 'title%d' % book_id for book_id in book_ids}
        authors = dict.fromkeys(book_ids, 'author1 & author2')
        ae(sf('title', titles), {1, 2, 3})
        ae(sf('authors', authors), {1, 2, 3})
        # Give the backup thread up to six 2 second waits to drain the queue
        attempts = 6
        while cache.dirty_queue_length() and attempts > 0:
            mb.join(2)
            attempts -= 1
        af(cache.dirty_queue_length())
    finally:
        mb.stop()
    mb.join(2)
    af(mb.is_alive())
    from calibre.ebooks.metadata.opf2 import OPF
    # The backed up OPF of every book must carry the new field values
    for book_id in (1, 2, 3):
        opf = OPF(BytesIO(cache.read_backup(book_id)))
        ae(opf.title, 'title%d' % book_id)
        ae(opf.authors, ['author1', 'author2'])
def __init__(self): try: mimetype = self.open('mimetype').read().rstrip() if mimetype != OCF.MIMETYPE: print 'WARNING: Invalid mimetype declaration', mimetype except: print 'WARNING: Epub doesn\'t contain a mimetype declaration' try: with closing(self.open(OCF.CONTAINER_PATH)) as f: self.container = Container(f) except KeyError: raise EPubException("missing OCF container.xml file") self.opf_path = self.container[OPF.MIMETYPE] try: with closing(self.open(self.opf_path)) as f: self.opf = OPF(f, self.root, populate_spine=False) except KeyError: raise EPubException("missing OPF package file") try: with closing(self.open(self.ENCRYPTION_PATH)) as f: self.encryption_meta = Encryption(f.read()) except: self.encryption_meta = Encryption(None)
def paste_metadata(self):
    '''
    Paste the book metadata held on the clipboard onto the selected books.

    The same metadata object is applied to every selected book; when the
    clipboard also carries image data it is set as the cover of all of
    them.
    '''
    rows = self.gui.library_view.selectionModel().selectedRows()
    if not rows or len(rows) == 0:
        return error_dialog(self.gui, _('Cannot paste metadata'),
                            _('No books selected'), show=True)
    md = QApplication.clipboard().mimeData()
    if not md.hasFormat('application/calibre-book-metadata'):
        return error_dialog(self.gui, _('Cannot paste metadata'),
                            _('No copied metadata available'), show=True)
    if len(rows) > 1:
        question = _(
            'You are pasting metadata onto <b>multiple books</b> ({num_of_books}). Are you'
            ' sure you want to do that?').format(num_of_books=len(rows))
        if not confirm(question, 'paste-onto-multiple', parent=self.gui):
            return
    data = bytes(md.data('application/calibre-book-metadata'))
    mi = OPF(BytesIO(data), populate_spine=False, read_toc=False,
             try_to_guess_cover=False).to_book_metadata()
    mi.application_id = mi.uuid_id = None
    cover = md.imageData()
    db = self.gui.current_db
    book_ids = {db.id(r.row()) for r in rows}
    for book_id in book_ids:
        db.new_api.set_metadata(book_id, mi, ignore_errors=True)
    if cover:
        db.new_api.set_cover(dict.fromkeys(book_ids, cover))
    self.refresh_books_after_metadata_edit(book_ids)
def convert_text(self, oeb_book):
    '''
    Serialize ``oeb_book`` to a temporary directory with the ``oeb`` output
    plugin and pass the spine paths (plus the TOC, if the OPF has one) to
    ``self.write`` together with the selected ``PDFWriter`` class.
    '''
    from calibre.ebooks.metadata.opf2 import OPF
    if not self.opts.old_pdf_engine:
        from calibre.ebooks.pdf.render.from_html import PDFWriter
    else:
        from calibre.ebooks.pdf.writer import PDFWriter
        # Bare reference kept from the original; presumably it silences an
        # unused-import warning -- TODO confirm
        PDFWriter

    self.log.debug('Serializing oeb input to disk for processing...')
    self.get_cover_data()
    self.handle_embedded_fonts()

    with TemporaryDirectory('_pdf_out') as oeb_dir:
        from calibre.customize.ui import plugin_for_output_format
        plugin_for_output_format('oeb').convert(
            oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)

        opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
        opf = OPF(opfpath, os.path.dirname(opfpath))
        spine_paths = [s.path for s in opf.spine]
        toc = getattr(opf, 'toc', None)
        self.write(PDFWriter, spine_paths, toc)
def set_metadata_opf2(root, cover_prefix, mi, opf_version, cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False, add_missing_cover=True):
    '''
    Apply the metadata ``mi`` to the already parsed OPF tree ``root``.

    Identifiers replace the existing ones when ``apply_null`` or
    ``force_identifiers`` is set, otherwise they are merged into them. When
    the OPF has no raster cover, ``cover_data`` is given and
    ``add_missing_cover`` is true, the guide's raster cover is used or a
    new ``cover.jpg`` manifest item is created below ``cover_prefix``; the
    item is then marked as the cover in the way matching ``opf_version``.

    :return: ``(rendered_opf, raster_cover)``
    '''
    mi = MetaInformation(mi)
    for attr in ('guide', 'toc', 'manifest', 'spine'):
        setattr(mi, attr, None)
    opf = OPF(None, preparsed_opf=root, read_toc=False)
    if mi.languages:
        mi.languages = normalize_languages(list(opf.raw_languages) or [], mi.languages)

    opf.smart_update(mi, apply_null=apply_null)
    if getattr(mi, 'uuid', None):
        opf.application_id = mi.uuid
    if apply_null or force_identifiers:
        opf.set_identifiers(mi.get_identifiers())
    else:
        merged = opf.get_identifiers()
        merged.update(mi.get_identifiers())
        opf.set_identifiers({k: v for k, v in merged.iteritems() if k and v})
    if update_timestamp and mi.timestamp is not None:
        opf.timestamp = mi.timestamp

    raster_cover = opf.raster_cover
    needs_cover = raster_cover is None and cover_data is not None and add_missing_cover
    if needs_cover:
        item = opf.guide_raster_cover
        if item is not None:
            raster_cover = item.get('href')
        else:
            if cover_prefix and not cover_prefix.endswith('/'):
                cover_prefix += '/'
            name = cover_prefix + 'cover.jpg'
            item = create_manifest_item(opf.root, name, 'cover')
            if item is not None:
                raster_cover = name
        if item is not None:
            if opf_version.major < 3:
                # Replace any existing <meta name="cover"> with one that
                # points at the cover item
                for stale in opf.root.xpath('//*[local-name()="meta" and @name="cover"]'):
                    stale.getparent().remove(stale)
                m = opf.create_metadata_element('meta', is_dc=False)
                m.set('name', 'cover')
                m.set('content', item.get('id'))
            else:
                # Strip the cover-image property from every item carrying
                # it, then set it on the cover item
                for other in opf.root.xpath('//*[local-name()="item" and contains(@properties, "cover-image")]'):
                    other.set('properties', other.get('properties').replace('cover-image', '').strip())
                item.set('properties', 'cover-image')

    with pretty_print:
        return opf.render(), raster_cover
def test_against_opf2(self):  # {{{
    # Reads the same OPF 2 document once through OPF(...).to_book_metadata()
    # and once through read_metadata() (after setting the package version to
    # 3.0), checks that both give the same metadata, and then exercises
    # apply_metadata() with and without apply_null and with cover data.
    #
    # NOTE(review): several content="..." attribute values in the fixture
    # below contain literal double quotes, angle brackets and a bare '&'
    # inside double-quoted XML attributes. This looks as if XML entity
    # escapes were lost from the fixture -- confirm against the upstream
    # file before relying on etree.fromstring(raw).
    # opf2 {{{
    raw = '''<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0"> <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"> <dc:identifier opf:scheme="calibre" id="calibre_id">1698</dc:identifier> <dc:identifier opf:scheme="uuid" id="uuid_id">27106d11-0721-44bc-bcdd-2840f31aaec0</dc:identifier> <dc:title>DOCX Demo</dc:title> <dc:creator opf:file-as="Goyal, Kovid" opf:role="aut">Kovid Goyal</dc:creator> <dc:contributor opf:file-as="calibre" opf:role="bkp">calibre (2.57.1) [http://calibre-ebook.com]</dc:contributor> <dc:date>2016-02-17T10:53:08+00:00</dc:date> <dc:description>Demonstration of DOCX support in calibre</dc:description> <dc:publisher>Kovid Goyal</dc:publisher> <dc:identifier opf:scheme="K">xxx</dc:identifier> <dc:language>eng</dc:language> <dc:subject>calibre</dc:subject> <dc:subject>conversion</dc:subject> <dc:subject>docs</dc:subject> <dc:subject>ebook</dc:subject> <meta content="{"Kovid Goyal": ""}" name="calibre:author_link_map"/> <meta content="Demos" name="calibre:series"/> <meta content="1" name="calibre:series_index"/> <meta content="10" name="calibre:rating"/> <meta content="2015-12-11T16:28:36+00:00" name="calibre:timestamp"/> <meta content="DOCX Demo" name="calibre:title_sort"/> <meta content="{"crew.crow": [], "crew.moose": [], "crew": []}" name="calibre:user_categories"/> <meta name="calibre:user_metadata:#number" content="{"kind": "field", "column": "value", "is_csp": false, "name": "Number", "rec_index": 29, "#extra#": null, "colnum": 12, "is_multiple2": {}, "category_sort": "value", "display": {"number_format": null}, "search_terms": ["#number"], "is_editable": true, "datatype": "int", "link_column": "value", "#value#": 31, "is_custom": true, "label": "number", "table": "custom_column_12", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#genre" content="{"kind": "field", "column": "value", 
"is_csp": false, "name": "Genre", "rec_index": 26, "#extra#": null, "colnum": 9, "is_multiple2": {}, "category_sort": "value", "display": {"use_decorations": 0}, "search_terms": ["#genre"], "is_editable": true, "datatype": "text", "link_column": "value", "#value#": "Demos", "is_custom": true, "label": "genre", "table": "custom_column_9", "is_multiple": null, "is_category": true}"/> <meta name="calibre:user_metadata:#commetns" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Comments", "rec_index": 23, "#extra#": null, "colnum": 13, "is_multiple2": {}, "category_sort": "value", "display": {}, "search_terms": ["#commetns"], "is_editable": true, "datatype": "comments", "link_column": "value", "#value#": "<div><b><i>Testing</i></b> extra <font color=\"#aa0000\">comments</font></div>", "is_custom": true, "label": "commetns", "table": "custom_column_13", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#formats" content="{"kind": "field", "column": "value", "is_csp": false, "name": "Formats", "rec_index": 25, "#extra#": null, "colnum": 4, "is_multiple2": {}, "category_sort": "value", "display": {"composite_template": "{formats}", "contains_html": false, "use_decorations": 0, "composite_sort": "text", "make_category": false}, "search_terms": ["#formats"], "is_editable": true, "datatype": "composite", "link_column": "value", "#value#": "AZW3, DOCX, EPUB", "is_custom": true, "label": "formats", "table": "custom_column_4", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#rating" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Rating", "rec_index": 30, "#extra#": null, "colnum": 1, "is_multiple2": {}, "category_sort": "value", "display": {}, "search_terms": ["#rating"], "is_editable": true, "datatype": "rating", "link_column": "value", "#value#": 10, "is_custom": true, "label": "rating", "table": "custom_column_1", "is_multiple": null, "is_category": true}"/> 
<meta name="calibre:user_metadata:#series" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Series2", "rec_index": 31, "#extra#": 1.0, "colnum": 5, "is_multiple2": {}, "category_sort": "value", "display": {}, "search_terms": ["#series"], "is_editable": true, "datatype": "series", "link_column": "value", "#value#": "s", "is_custom": true, "label": "series", "table": "custom_column_5", "is_multiple": null, "is_category": true}"/> <meta name="calibre:user_metadata:#tags" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Tags", "rec_index": 33, "#extra#": null, "colnum": 11, "is_multiple2": {"ui_to_list": ",", "cache_to_list": "|", "list_to_ui": ", "}, "category_sort": "value", "display": {"is_names": false, "description": "A tag like column for me"}, "search_terms": ["#tags"], "is_editable": true, "datatype": "text", "link_column": "value", "#value#": ["t1", "t2"], "is_custom": true, "label": "tags", "table": "custom_column_11", "is_multiple": "|", "is_category": true}"/> <meta name="calibre:user_metadata:#yesno" content="{"kind": "field", "column": "value", "is_csp": false, "name": "Yes/No", "rec_index": 34, "#extra#": null, "colnum": 7, "is_multiple2": {}, "category_sort": "value", "display": {}, "search_terms": ["#yesno"], "is_editable": true, "datatype": "bool", "link_column": "value", "#value#": false, "is_custom": true, "label": "yesno", "table": "custom_column_7", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#myenum" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Enum", "rec_index": 28, "#extra#": null, "colnum": 6, "is_multiple2": {}, "category_sort": "value", "display": {"enum_colors": [], "enum_values": ["One", "Two", "Three"], "use_decorations": 0}, "search_terms": ["#myenum"], "is_editable": true, "datatype": "enumeration", "link_column": "value", "#value#": "Two", "is_custom": true, "label": "myenum", "table": "custom_column_6", "is_multiple": 
null, "is_category": true}"/> <meta name="calibre:user_metadata:#isbn" content="{"kind": "field", "column": "value", "is_csp": false, "name": "ISBN", "rec_index": 27, "#extra#": null, "colnum": 3, "is_multiple2": {}, "category_sort": "value", "display": {"composite_template": "{identifiers:select(isbn)}", "contains_html": false, "use_decorations": 0, "composite_sort": "text", "make_category": false}, "search_terms": ["#isbn"], "is_editable": true, "datatype": "composite", "link_column": "value", "#value#": "", "is_custom": true, "label": "isbn", "table": "custom_column_3", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#authors" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Authors", "rec_index": 22, "#extra#": null, "colnum": 10, "is_multiple2": {"ui_to_list": "&", "cache_to_list": "|", "list_to_ui": " & "}, "category_sort": "value", "display": {"is_names": true}, "search_terms": ["#authors"], "is_editable": true, "datatype": "text", "link_column": "value", "#value#": ["calibre, Kovid Goyal"], "is_custom": true, "label": "authors", "table": "custom_column_10", "is_multiple": "|", "is_category": true}"/> <meta name="calibre:user_metadata:#date" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Date", "rec_index": 24, "#extra#": null, "colnum": 2, "is_multiple2": {}, "category_sort": "value", "display": {"date_format": "dd-MM-yyyy", "description": ""}, "search_terms": ["#date"], "is_editable": true, "datatype": "datetime", "link_column": "value", "#value#": {"__value__": "2016-02-17T10:54:15+00:00", "__class__": "datetime.datetime"}, "is_custom": true, "label": "date", "table": "custom_column_2", "is_multiple": null, "is_category": false}"/> </metadata><manifest><item href="start.html" media-type="text/html" id="m1"/></manifest><spine><itemref idref="m1"/></spine> </package>'''  # }}}

    def compare_metadata(mi2, mi3):
        # Assert that both metadata objects carry the same user metadata
        # and the same value for every field in ALL_METADATA_FIELDS.
        self.ae(mi2.get_all_user_metadata(False), mi3.get_all_user_metadata(False))
        for field in ALL_METADATA_FIELDS:
            # Substring test against the literal 'manifest spine': skips
            # any field name that occurs inside that string
            if field not in 'manifest spine':
                v2, v3 = getattr(mi2, field, None), getattr(mi3, field, None)
                self.ae(v2, v3, '%s: %r != %r' % (field, v2, v3))

    # Reference metadata as read by the OPF class
    mi2 = OPF(BytesIO(raw.encode('utf-8'))).to_book_metadata()
    # The same document, read through read_metadata() as version 3.0
    root = etree.fromstring(raw)
    root.set('version', '3.0')
    mi3, _, raster_cover, first_spine_item = read_metadata(root, return_extra_data=True)
    self.assertIsNone(raster_cover)
    self.ae('start.html', first_spine_item)
    compare_metadata(mi2, mi3)
    # Applying the metadata and reading it back must not change it
    apply_metadata(root, mi3, force_identifiers=True)
    nmi = read_metadata(root)
    compare_metadata(mi3, nmi)
    # Empty values applied without apply_null: the assertions below expect
    # tags, #tags and #commetns to keep their old values, while the
    # explicit 0 for #number is expected to be stored
    mi3.tags = []
    mi3.set('#tags', [])
    mi3.set('#number', 0)
    mi3.set('#commetns', '')
    apply_metadata(root, mi3, update_timestamp=True)
    # No element in the tree may carry a name attribute at this point
    self.assertFalse(root.xpath('//*/@name'))
    nmi = read_metadata(root)
    self.assertEqual(mi2.tags, nmi.tags)
    self.assertEqual(mi2.get('#tags'), nmi.get('#tags'))
    self.assertEqual(mi2.get('#commetns'), nmi.get('#commetns'))
    self.assertEqual(0, nmi.get('#number'))
    # With apply_null=True the empty values are expected to take effect
    apply_metadata(root, mi3, apply_null=True)
    nmi = read_metadata(root)
    self.assertFalse(nmi.tags)
    self.assertFalse(nmi.get('#tags'))
    self.assertFalse(nmi.get('#commetns'))
    # apply_metadata() is expected to return None when add_missing_cover
    # is False, and the new cover's name otherwise
    self.assertIsNone(apply_metadata(root, mi3, cover_data=b'x', cover_prefix='xxx', add_missing_cover=False))
    self.ae('xxx/cover.jpg', apply_metadata(root, mi3, cover_data=b'x', cover_prefix='xxx'))
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` to ``output_path`` as an HTMLZ archive.

    The book is rendered into a temporary directory (one HTML file, an
    optional external stylesheet, the referenced images, an optional
    ``cover.jpg`` and a ``metadata.opf``) and that directory is then added
    to a zip file created at ``output_path``.

    :param oeb_book: The OEB book to serialise.
    :param output_path: Path of the HTMLZ file to create.
    :param input_plugin: Not used by this method.
    :param opts: Conversion options; ``htmlz_css_type``,
        ``htmlz_class_style`` and ``htmlz_title_filename`` are read.
    :param log: Logger handed to the HTML generator.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from calibre.utils.zipfile import ZipFile
    from calibre.utils.filenames import ascii_filename

    # HTML
    if opts.htmlz_css_type == 'inline':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == 'tag':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory(u'_htmlz_output') as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = u'index'
        if opts.htmlz_title_filename:
            from calibre.utils.filenames import shorten_components_to
            fname = shorten_components_to(100, (ascii_filename(
                unicode_type(oeb_book.metadata.title[0])), ))[0]
        with open(os.path.join(tdir, fname + u'.html'), 'wb') as tf:
            if isinstance(html, unicode_type):
                html = html.encode('utf-8')
            tf.write(html)

        # CSS
        if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
            with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                tf.write(htmlizer.get_css(oeb_book))

        # Images
        images = htmlizer.images
        if images:
            if not os.path.exists(os.path.join(tdir, u'images')):
                os.makedirs(os.path.join(tdir, u'images'))
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        data = unicode_type(
                            etree.tostring(item.data, encoding=unicode_type))
                    else:
                        data = item.data
                    if isinstance(data, unicode_type):
                        # The image file is opened in binary mode, so text
                        # (the serialised SVG) has to be encoded first.
                        data = data.encode('utf-8')
                    fname = os.path.join(tdir, u'images', images[item.href])
                    with open(fname, 'wb') as img:
                        img.write(data)

        # Cover
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from calibre.utils.img import save_cover_data_to
                cover_path = os.path.join(tdir, u'cover.jpg')
                # Create the file before handing its path to the image saver
                with lopen(cover_path, 'w') as cf:
                    cf.write('')
                save_cover_data_to(cover_data, cover_path)
        except Exception:
            # The cover is best-effort: report the problem and carry on, but
            # do not swallow KeyboardInterrupt/SystemExit.
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
            opf = OPF(
                io.BytesIO(
                    etree.tostring(oeb_book.metadata.to_opf1(),
                                   encoding='UTF-8')))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = u'cover.jpg'
            mdataf.write(metadata_to_opf(mi))

        htmlz = ZipFile(output_path, 'w')
        try:
            htmlz.add_dir(tdir)
        finally:
            # Close explicitly so the archive is finalised before the
            # temporary directory is removed, instead of relying on garbage
            # collection.
            htmlz.close()
def convert(self, stream, options, file_ext, log, accelerators):
    """Unpack a KePub file into the current directory and normalise its OPF.

    The archive is extracted, the OPF is located, manifest/guide hrefs are
    rebased when the OPF lives in a sub-directory, the cover is rationalised,
    unusable spine entries are dropped and the result is written to
    ``content.opf`` whose absolute path is returned.

    Raises ``ValueError`` when no OPF is found, when DTBook markup is present
    or when the spine ends up empty, and ``DRMError`` when ``rights.xml``
    exists.
    """
    log("KEPUBInput::convert - start")
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF

    def current_dir():
        # Python 2 needs getcwdu() to obtain a unicode path
        if sys.version_info.major == 2:
            return os.getcwdu()
        return os.getcwd()

    try:
        archive = ZipFile(stream)
        archive.extractall(current_dir())
    except Exception:
        log.exception("KEPUB appears to be invalid ZIP file, trying a "
                      "more forgiving ZIP parser")
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)

    opf = self.find_opf()
    if opf is None:
        # Fall back to the first plausible .opf file found on disk
        opf = next(
            (os.path.abspath(candidate) for candidate in walk(".")
             if candidate.lower().endswith(".opf")
             and "__MACOSX" not in candidate
             and not os.path.basename(candidate).startswith(".")),
            None)
    path = getattr(stream, "name", "stream")

    if opf is None:
        message = _(  # noqa: F821
            "{0} is not a valid KEPUB file (could not find opf)")
        raise ValueError(message.format(path))

    if os.path.exists(os.path.abspath("rights.xml")):
        raise DRMError(os.path.basename(path))

    opf = os.path.relpath(opf, current_dir())
    opf_dir, _opf_name = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self.encrypted_fonts = []

    if opf_dir:
        # The OPF is not at the top level: prefix every href with its folder
        delta = opf_dir + "/"
        for elem in opf.itermanifest():
            elem.set("href", delta + elem.get("href"))
        for elem in opf.iterguide():
            elem.set("href", delta + elem.get("href"))

    if opf.package_version >= 3.0:
        rationalize = self.rationalize_cover3
    else:
        rationalize = self.rationalize_cover2
    self.removed_cover = rationalize(opf, log)

    self.optimize_opf_parsing = opf
    if any(item.get("media-type", "") == "application/x-dtbook+xml"
           for item in opf.itermanifest()):
        raise ValueError(
            _("EPUB files with DTBook markup are not supported")  # noqa: F821
        )

    page_template_types = {
        "application/vnd.adobe-page-template+xml",
        "application/vnd.adobe.page-template+xml",
        "application/adobe-page-template+xml",
        "application/adobe.page-template+xml",
        "application/text",
    }
    not_for_spine = {
        item.get("id", None) for item in opf.itermanifest()
        if item.get("id", None)
        and item.get("media-type", None) in page_template_types
    }

    seen = set()
    for itemref in list(opf.iterspine()):
        ref = itemref.get("idref", None)
        if ref and ref not in not_for_spine and ref not in seen:
            seen.add(ref)
        else:
            # Missing, excluded or duplicate idref: drop the entry
            itemref.getparent().remove(itemref)

    if not list(opf.iterspine()):
        raise ValueError(
            _("No valid entries in the spine of this EPUB")  # noqa: F821
        )

    with open("content.opf", "wb") as nopf:
        nopf.write(opf.render())

    return os.path.abspath("content.opf")
def __enter__(self, processed=False, only_input_plugin=False,
              run_char_count=True, read_anchor_map=True, view_kepub=False,
              read_links=True):
    '''
    Explode the ebook into a temporary directory and build the reading
    state (OPF, metadata, language, spine, page numbers, ToC) needed by
    viewers and preprocessing code. Returns ``self``.
    '''
    self.delete_on_exit = []
    self._tdir = (tdir_in_cache('ev') if self.use_tdir_in_cache
                  else PersistentTemporaryDirectory('_ebook_iter'))
    self.base = os.path.realpath(self._tdir)
    self.book_format, self.pathtoopf, input_fmt = run_extract_book(
        self.pathtoebook, self.base, only_input_plugin=only_input_plugin,
        view_kepub=view_kepub, processed=processed)
    self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
    self.mi = self.opf.to_book_metadata()
    self.language = None
    if self.mi.languages:
        self.language = self.mi.languages[0].lower()

    # Linear spine entries come first, non-linear ones are appended after
    linear = [entry for entry in self.opf.spine if entry.is_linear]
    non_linear = [entry for entry in self.opf.spine if not entry.is_linear]
    ordered = linear + non_linear

    self.spine = []
    Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
                    read_links=read_links, run_char_count=run_char_count,
                    from_epub=self.book_format == 'EPUB')
    is_comic = input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
    for entry in ordered:
        spath = entry.path
        mime = None
        if entry.idref is not None:
            mime = self.opf.manifest.type_for_id(entry.idref)
        if mime is None:
            mime = guess_type(spath)[0]
        try:
            self.spine.append(Spiny(spath, mime_type=mime))
            if is_comic:
                self.spine[-1].is_single_page = True
        except:
            self.log.warn('Missing spine item:', repr(spath))

    cover = self.opf.cover
    needs_titlepage = {'lit', 'mobi', 'prc', 'opf', 'fb2', 'azw', 'azw3',
                       'docx', 'htmlz'}
    if cover and self.ebook_ext in needs_titlepage:
        # Synthesise an HTML title page that shows the cover image
        cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
        rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
        chtml = (TITLEPAGE % prepare_string_for_xml(rcpath, True)).encode('utf-8')
        with open(cfile, 'wb') as out:
            out.write(chtml)
        self.spine.insert(0, Spiny(cfile, mime_type='application/xhtml+xml'))
        self.delete_on_exit.append(cfile)

    html_toc = self.opf.path_to_html_toc
    if html_toc is not None and html_toc not in self.spine:
        try:
            self.spine.append(Spiny(self.opf.path_to_html_toc))
        except:
            import traceback
            traceback.print_exc()

    sizes = [item.character_count for item in self.spine]
    per_page = float(self.CHARACTERS_PER_PAGE)
    self.pages = [math.ceil(size / per_page) for size in sizes]
    for page_count, item in zip(self.pages, self.spine):
        item.pages = page_count

    # Assign running page ranges to every spine item
    next_page = 1
    for item in self.spine:
        item.start_page = next_page
        next_page += item.pages
        item.max_page = item.start_page + item.pages - 1

    self.toc = self.opf.toc
    if read_anchor_map:
        create_indexing_data(self.spine, self.toc)

    self.verify_links()
    self.read_bookmarks()
    return self
def test_annotations(self):  # {{{
    '''Test handling of annotations

    Covers storing annotations for a book, dirtied-cache bookkeeping,
    full text search over annotations (stemming, highlighting, snippets,
    invalid queries), updating/deleting annotations and restoring them
    from the OPF backup.
    '''
    from calibre.utils.date import utcnow, EPOCH
    cl = self.cloned_library
    cache = self.init_cache(cl)
    # First empty dirtied
    cache.dump_metadata()
    self.assertFalse(cache.dirtied_cache)

    def a(**kw):
        # Build an (annotation, seconds-since-epoch) pair. The same instant
        # is used for both values so the ISO timestamp stored in the
        # annotation always agrees with the numeric timestamp next to it.
        ts = utcnow()
        kw['timestamp'] = ts.isoformat()
        return kw, (ts - EPOCH).total_seconds()

    annot_list = [
        a(type='bookmark', title='bookmark1 changed', seq=1),
        a(type='highlight', highlighted_text='text1', uuid='1', seq=2),
        a(type='highlight', highlighted_text='text2', uuid='2', seq=3,
          notes='notes2 some word changed again'),
    ]

    def map_as_list(amap):
        # Flatten the per-type annotation map into one list ordered by seq
        ans = [annot for items in amap.values() for annot in items]
        ans.sort(key=lambda x: x['seq'])
        return ans

    cache.set_annotations_for_book(1, 'moo', annot_list)
    amap = cache.annotations_map_for_book(1, 'moo')
    self.assertEqual(3, len(cache.all_annotations_for_book(1)))
    self.assertEqual([x[0] for x in annot_list], map_as_list(amap))
    self.assertFalse(cache.dirtied_cache)
    cache.check_dirtied_annotations()
    self.assertEqual(set(cache.dirtied_cache), {1})
    cache.dump_metadata()
    cache.check_dirtied_annotations()
    self.assertFalse(cache.dirtied_cache)

    # Test searching
    results = cache.search_annotations('"changed"')
    self.assertEqual([1, 3], [x['id'] for x in results])
    results = cache.search_annotations('"changed"', annotation_type='bookmark')
    self.assertEqual([1], [x['id'] for x in results])
    results = cache.search_annotations('"Changed"')  # changed and change stem differently in english and other euro languages
    self.assertEqual([1, 3], [x['id'] for x in results])
    results = cache.search_annotations('"SOMe"')
    self.assertEqual([3], [x['id'] for x in results])
    results = cache.search_annotations('"change"', use_stemming=False)
    self.assertFalse(results)
    results = cache.search_annotations('"bookmark1"', highlight_start='[', highlight_end=']')
    self.assertEqual(results[0]['text'], '[bookmark1] changed')
    results = cache.search_annotations('"word"', highlight_start='[', highlight_end=']', snippet_size=3)
    self.assertEqual(results[0]['text'], '…some [word] changed…')
    self.assertRaises(FTSQueryError, cache.search_annotations, 'AND OR')
    fts_l = [a(type='bookmark', title='路坎坷走来', seq=1),]
    cache.set_annotations_for_book(1, 'moo', fts_l)
    results = cache.search_annotations('路', highlight_start='[', highlight_end=']')
    self.assertEqual(results[0]['text'], '[路]坎坷走来')

    # Updating an existing annotation
    annot_list[0][0]['title'] = 'changed title'
    cache.set_annotations_for_book(1, 'moo', annot_list)
    amap = cache.annotations_map_for_book(1, 'moo')
    self.assertEqual([x[0] for x in annot_list], map_as_list(amap))

    # Removing an annotation
    del annot_list[1]
    cache.set_annotations_for_book(1, 'moo', annot_list)
    amap = cache.annotations_map_for_book(1, 'moo')
    self.assertEqual([x[0] for x in annot_list], map_as_list(amap))
    cache.check_dirtied_annotations()
    cache.dump_metadata()

    # Round trip through the OPF backup
    from calibre.ebooks.metadata.opf2 import OPF
    raw = cache.read_backup(1)
    opf = OPF(BytesIO(raw))
    cache.restore_annotations(1, list(opf.read_annotations()))
    amap = cache.annotations_map_for_book(1, 'moo')
    self.assertEqual([x[0] for x in annot_list], map_as_list(amap))
def test_against_opf2(self): # {{{
    # Parse the same OPF 2 package with the OPF class and, after the root is
    # marked as version 3.0, with read_metadata(), and check that both give
    # the same metadata. Then check that apply_metadata() followed by
    # read_metadata() round-trips, with and without apply_null, and what
    # apply_metadata() returns when cover data is supplied.
    #
    # NOTE(review): as it appears here, the fixture below has raw double
    # quotes and angle brackets inside double-quoted XML attribute values
    # (e.g. content="{"kind": ...}"). Presumably the upstream fixture uses
    # XML entities there - confirm against the original file before relying
    # on this text.
    # opf2 {{{
    raw = '''<package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0"> <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"> <dc:identifier opf:scheme="calibre" id="calibre_id">1698</dc:identifier> <dc:identifier opf:scheme="uuid" id="uuid_id">27106d11-0721-44bc-bcdd-2840f31aaec0</dc:identifier> <dc:title>DOCX Demo</dc:title> <dc:creator opf:file-as="Goyal, Kovid" opf:role="aut">Kovid Goyal</dc:creator> <dc:contributor opf:file-as="calibre" opf:role="bkp">calibre (2.57.1) [http://calibre-ebook.com]</dc:contributor> <dc:date>2016-02-17T10:53:08+00:00</dc:date> <dc:description>Demonstration of DOCX support in calibre</dc:description> <dc:publisher>Kovid Goyal</dc:publisher> <dc:identifier opf:scheme="K">xxx</dc:identifier> <dc:language>eng</dc:language> <dc:subject>calibre</dc:subject> <dc:subject>conversion</dc:subject> <dc:subject>docs</dc:subject> <dc:subject>ebook</dc:subject> <meta content="{"Kovid Goyal": ""}" name="calibre:author_link_map"/> <meta content="Demos" name="calibre:series"/> <meta content="1" name="calibre:series_index"/> <meta content="10" name="calibre:rating"/> <meta content="2015-12-11T16:28:36+00:00" name="calibre:timestamp"/> <meta content="DOCX Demo" name="calibre:title_sort"/> <meta content="{"crew.crow": [], "crew.moose": [], "crew": []}" name="calibre:user_categories"/> <meta name="calibre:user_metadata:#number" content="{"kind": "field", "column": "value", "is_csp": false, "name": "Number", "rec_index": 29, "#extra#": null, "colnum": 12, "is_multiple2": {}, "category_sort": "value", "display": {"number_format": null}, "search_terms": ["#number"], "is_editable": true, "datatype": "int", "link_column": "value", "#value#": 31, "is_custom": true, "label": "number", "table": "custom_column_12", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#genre" content="{"kind": "field", "column": "value",
"is_csp": false, "name": "Genre", "rec_index": 26, "#extra#": null, "colnum": 9, "is_multiple2": {}, "category_sort": "value", "display": {"use_decorations": 0}, "search_terms": ["#genre"], "is_editable": true, "datatype": "text", "link_column": "value", "#value#": "Demos", "is_custom": true, "label": "genre", "table": "custom_column_9", "is_multiple": null, "is_category": true}"/> <meta name="calibre:user_metadata:#commetns" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Comments", "rec_index": 23, "#extra#": null, "colnum": 13, "is_multiple2": {}, "category_sort": "value", "display": {}, "search_terms": ["#commetns"], "is_editable": true, "datatype": "comments", "link_column": "value", "#value#": "<div><b><i>Testing</i></b> extra <font color=\"#aa0000\">comments</font></div>", "is_custom": true, "label": "commetns", "table": "custom_column_13", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#formats" content="{"kind": "field", "column": "value", "is_csp": false, "name": "Formats", "rec_index": 25, "#extra#": null, "colnum": 4, "is_multiple2": {}, "category_sort": "value", "display": {"composite_template": "{formats}", "contains_html": false, "use_decorations": 0, "composite_sort": "text", "make_category": false}, "search_terms": ["#formats"], "is_editable": true, "datatype": "composite", "link_column": "value", "#value#": "AZW3, DOCX, EPUB", "is_custom": true, "label": "formats", "table": "custom_column_4", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#rating" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Rating", "rec_index": 30, "#extra#": null, "colnum": 1, "is_multiple2": {}, "category_sort": "value", "display": {}, "search_terms": ["#rating"], "is_editable": true, "datatype": "rating", "link_column": "value", "#value#": 10, "is_custom": true, "label": "rating", "table": "custom_column_1", "is_multiple": null, "is_category": true}"/>
<meta name="calibre:user_metadata:#series" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Series2", "rec_index": 31, "#extra#": 1.0, "colnum": 5, "is_multiple2": {}, "category_sort": "value", "display": {}, "search_terms": ["#series"], "is_editable": true, "datatype": "series", "link_column": "value", "#value#": "s", "is_custom": true, "label": "series", "table": "custom_column_5", "is_multiple": null, "is_category": true}"/> <meta name="calibre:user_metadata:#tags" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Tags", "rec_index": 33, "#extra#": null, "colnum": 11, "is_multiple2": {"ui_to_list": ",", "cache_to_list": "|", "list_to_ui": ", "}, "category_sort": "value", "display": {"is_names": false, "description": "A tag like column for me"}, "search_terms": ["#tags"], "is_editable": true, "datatype": "text", "link_column": "value", "#value#": ["t1", "t2"], "is_custom": true, "label": "tags", "table": "custom_column_11", "is_multiple": "|", "is_category": true}"/> <meta name="calibre:user_metadata:#yesno" content="{"kind": "field", "column": "value", "is_csp": false, "name": "Yes/No", "rec_index": 34, "#extra#": null, "colnum": 7, "is_multiple2": {}, "category_sort": "value", "display": {}, "search_terms": ["#yesno"], "is_editable": true, "datatype": "bool", "link_column": "value", "#value#": false, "is_custom": true, "label": "yesno", "table": "custom_column_7", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#myenum" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Enum", "rec_index": 28, "#extra#": null, "colnum": 6, "is_multiple2": {}, "category_sort": "value", "display": {"enum_colors": [], "enum_values": ["One", "Two", "Three"], "use_decorations": 0}, "search_terms": ["#myenum"], "is_editable": true, "datatype": "enumeration", "link_column": "value", "#value#": "Two", "is_custom": true, "label": "myenum", "table": "custom_column_6", "is_multiple":
null, "is_category": true}"/> <meta name="calibre:user_metadata:#isbn" content="{"kind": "field", "column": "value", "is_csp": false, "name": "ISBN", "rec_index": 27, "#extra#": null, "colnum": 3, "is_multiple2": {}, "category_sort": "value", "display": {"composite_template": "{identifiers:select(isbn)}", "contains_html": false, "use_decorations": 0, "composite_sort": "text", "make_category": false}, "search_terms": ["#isbn"], "is_editable": true, "datatype": "composite", "link_column": "value", "#value#": "", "is_custom": true, "label": "isbn", "table": "custom_column_3", "is_multiple": null, "is_category": false}"/> <meta name="calibre:user_metadata:#authors" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Authors", "rec_index": 22, "#extra#": null, "colnum": 10, "is_multiple2": {"ui_to_list": "&", "cache_to_list": "|", "list_to_ui": " & "}, "category_sort": "value", "display": {"is_names": true}, "search_terms": ["#authors"], "is_editable": true, "datatype": "text", "link_column": "value", "#value#": ["calibre, Kovid Goyal"], "is_custom": true, "label": "authors", "table": "custom_column_10", "is_multiple": "|", "is_category": true}"/> <meta name="calibre:user_metadata:#date" content="{"kind": "field", "column": "value", "is_csp": false, "name": "My Date", "rec_index": 24, "#extra#": null, "colnum": 2, "is_multiple2": {}, "category_sort": "value", "display": {"date_format": "dd-MM-yyyy", "description": ""}, "search_terms": ["#date"], "is_editable": true, "datatype": "datetime", "link_column": "value", "#value#": {"__value__": "2016-02-17T10:54:15+00:00", "__class__": "datetime.datetime"}, "is_custom": true, "label": "date", "table": "custom_column_2", "is_multiple": null, "is_category": false}"/> </metadata><manifest><item href="start.html" media-type="text/html" id="m1"/></manifest><spine><itemref idref="m1"/></spine> </package>''' # }}}

    def compare_metadata(mi2, mi3):
        # Assert that two metadata objects agree on their user (custom
        # column) metadata and on every field in ALL_METADATA_FIELDS.
        self.ae(mi2.get_all_user_metadata(False),
                mi3.get_all_user_metadata(False))
        for field in ALL_METADATA_FIELDS:
            # NOTE(review): this is a substring test against the string
            # 'manifest spine', not membership in a collection of names;
            # it skips 'manifest' and 'spine' (and any other field name
            # that happens to be a substring of that string).
            if field not in 'manifest spine':
                v2, v3 = getattr(mi2, field, None), getattr(mi3, field, None)
                self.ae(v2, v3, '%s: %r != %r' % (field, v2, v3))

    # Metadata as read by the OPF class from the unmodified package
    mi2 = OPF(BytesIO(raw.encode('utf-8'))).to_book_metadata()
    root = etree.fromstring(raw)
    # Mark the package as version 3.0 before calling read_metadata()
    root.set('version', '3.0')
    mi3, _, raster_cover, first_spine_item = read_metadata(
        root, return_extra_data=True)
    self.assertIsNone(raster_cover)
    self.ae('start.html', first_spine_item)
    compare_metadata(mi2, mi3)
    # Writing the metadata back and re-reading it must be lossless
    apply_metadata(root, mi3, force_identifiers=True)
    nmi = read_metadata(root)
    compare_metadata(mi3, nmi)
    # Set some fields to empty values on mi3
    mi3.tags = []
    mi3.set('#tags', [])
    mi3.set('#number', 0)
    mi3.set('#commetns', '')
    apply_metadata(root, mi3, update_timestamp=True)
    # No element in the tree may carry a name attribute any more
    self.assertFalse(root.xpath('//*/@name'))
    nmi = read_metadata(root)
    # Without apply_null the empty tags/#tags/#commetns did not replace the
    # values from the original package, while #number became 0
    self.assertEqual(mi2.tags, nmi.tags)
    self.assertEqual(mi2.get('#tags'), nmi.get('#tags'))
    self.assertEqual(mi2.get('#commetns'), nmi.get('#commetns'))
    self.assertEqual(0, nmi.get('#number'))
    # With apply_null=True the empty values are written
    apply_metadata(root, mi3, apply_null=True)
    nmi = read_metadata(root)
    self.assertFalse(nmi.tags)
    self.assertFalse(nmi.get('#tags'))
    self.assertFalse(nmi.get('#commetns'))
    # Cover data: None is returned with add_missing_cover=False, otherwise
    # the returned value is 'xxx/cover.jpg' for cover_prefix='xxx'
    self.assertIsNone(
        apply_metadata(root,
                       mi3,
                       cover_data=b'x',
                       cover_prefix='xxx',
                       add_missing_cover=False))
    self.ae('xxx/cover.jpg',
            apply_metadata(root, mi3, cover_data=b'x', cover_prefix='xxx'))
def test_get(self):  # {{{
    '''Test /get

    Exercises the /get endpoint of the content server: invalid requests,
    format downloads with and without metadata update, plugboards, covers
    and thumbnails (including cache behaviour and sharing of cached files
    that are still open), and metadata served as OPF and as JSON.
    '''
    with self.create_server() as server:
        db = server.handler.router.ctx.library_broker.get(None)
        conn = server.connect()

        def get(what, book_id, library_id=None, q=''):
            # Issue GET /get/<what>/<book_id>[/<library_id>][?<q>] and
            # return the response object together with its body.
            q = ('?' + q) if q else q
            conn.request(
                'GET', '/get/%s/%s' % (what, book_id) +
                (('/' + library_id) if library_id else '') + q)
            r = conn.getresponse()
            return r, r.read()

        # Test various invalid parameters
        def bad(*args):
            r, data = get(*args)
            self.ae(r.status, http_client.NOT_FOUND)

        bad('xxx', 1)
        bad('fmt1', 10)
        bad('fmt1', 1, 'zzzz')
        bad('fmt1', 'xx')

        # Test simple fetching of format without metadata update
        r, data = get('fmt1', 1, db.server_library_id)
        self.ae(data, db.format(1, 'fmt1'))
        self.assertIsNotNone(r.getheader('Content-Disposition'))
        self.ae(r.getheader('Used-Cache'), 'no')
        r, data = get('fmt1', 1)
        self.ae(data, db.format(1, 'fmt1'))
        self.ae(r.getheader('Used-Cache'), 'yes')

        # Test fetching of format with metadata update
        raw = P('quick_start/eng.epub', data=True)
        r, data = get('epub', 1)
        self.ae(r.status, http_client.OK)
        etag = r.getheader('ETag')
        self.assertIsNotNone(etag)
        self.ae(r.getheader('Used-Cache'), 'no')
        self.assertTrue(data.startswith(b'PK'))
        self.assertGreaterEqual(len(data), len(raw))
        db.set_field('title', {1: 'changed'})
        r, data = get('epub', 1)
        self.assertNotEqual(r.getheader('ETag'), etag)
        etag = r.getheader('ETag')
        self.ae(r.getheader('Used-Cache'), 'no')
        mi = get_metadata(BytesIO(data), extract_cover=False)
        self.ae(mi.title, 'changed')
        r, data = get('epub', 1)
        self.ae(r.getheader('Used-Cache'), 'yes')

        # Test plugboards
        import calibre.library.save_to_disk as c
        orig, c.DEBUG = c.DEBUG, False
        try:
            db.set_pref(
                'plugboards', {
                    u'epub': {
                        u'content_server': [[u'changed, {title}', u'title']]
                    }
                })
            # this is needed as the cache is not invalidated for plugboard changes
            db.set_field('title', {1: 'again'})
            r, data = get('epub', 1)
            self.assertNotEqual(r.getheader('ETag'), etag)
            etag = r.getheader('ETag')
            self.ae(r.getheader('Used-Cache'), 'no')
            mi = get_metadata(BytesIO(data), extract_cover=False)
            self.ae(mi.title, 'changed, again')
        finally:
            c.DEBUG = orig

        # Test the serving of covers
        def change_cover(count, book_id=2):
            cpath = db.format_abspath(book_id, '__COVER_INTERNAL__')
            # NOTE(review): the cover is always set on book 2, even when
            # book_id is something else (change_cover(1, 1) below) - confirm
            # whether {book_id: ...} was intended.
            db.set_cover({2: I('lt.png', data=True)})
            t = time.time() + 1 + count
            # Ensure mtime changes, needed on OS X where HFS+ has a 1s
            # mtime resolution
            os.utime(cpath, (t, t))

        r, data = get('cover', 1)
        self.ae(r.status, http_client.OK)
        self.ae(data, db.cover(1))
        self.ae(r.getheader('Used-Cache'), 'no')
        self.ae(r.getheader('Content-Type'), 'image/jpeg')
        r, data = get('cover', 1)
        self.ae(r.status, http_client.OK)
        self.ae(data, db.cover(1))
        self.ae(r.getheader('Used-Cache'), 'yes')
        r, data = get('cover', 3)
        self.ae(r.status, http_client.OK)  # Auto generated cover
        r, data = get('thumb', 1)
        self.ae(r.status, http_client.OK)
        self.ae(identify(data), ('jpeg', 60, 60))
        self.ae(r.getheader('Used-Cache'), 'no')
        r, data = get('thumb', 1)
        self.ae(r.status, http_client.OK)
        self.ae(r.getheader('Used-Cache'), 'yes')
        r, data = get('thumb', 1, q='sz=100')
        self.ae(r.status, http_client.OK)
        self.ae(identify(data), ('jpeg', 100, 100))
        self.ae(r.getheader('Used-Cache'), 'no')
        r, data = get('thumb', 1, q='sz=100x100')
        self.ae(r.status, http_client.OK)
        self.ae(r.getheader('Used-Cache'), 'yes')
        change_cover(1, 1)
        r, data = get('thumb', 1, q='sz=100')
        self.ae(r.status, http_client.OK)
        self.ae(identify(data), ('jpeg', 100, 100))
        self.ae(r.getheader('Used-Cache'), 'no')

        # Test file sharing in cache
        r, data = get('cover', 2)
        self.ae(r.status, http_client.OK)
        self.ae(data, db.cover(2))
        self.ae(r.getheader('Used-Cache'), 'no')
        path = from_hex_unicode(r.getheader('Tempfile'))
        f, fdata = share_open(path, 'rb'), data
        try:
            # Now force an update
            change_cover(1)
            r, data = get('cover', 2)
            self.ae(r.status, http_client.OK)
            self.ae(data, db.cover(2))
            self.ae(r.getheader('Used-Cache'), 'no')
            path = from_hex_unicode(r.getheader('Tempfile'))
            f2, f2data = share_open(path, 'rb'), data
            try:
                # Do it again
                change_cover(2)
                r, data = get('cover', 2)
                self.ae(r.status, http_client.OK)
                self.ae(data, db.cover(2))
                self.ae(r.getheader('Used-Cache'), 'no')
                # The files opened earlier must still hold the data that
                # was served when they were opened
                self.ae(f.read(), fdata)
                self.ae(f2.read(), f2data)
            finally:
                f2.close()
        finally:
            # Release the handles even when an assertion above fails
            f.close()

        # Test serving of metadata as opf
        r, data = get('opf', 1)
        self.ae(r.status, http_client.OK)
        self.ae(r.getheader('Content-Type'),
                'application/oebps-package+xml; charset=UTF-8')
        self.assertIsNotNone(r.getheader('Last-Modified'))
        opf = OPF(BytesIO(data),
                  populate_spine=False,
                  try_to_guess_cover=False)
        self.ae(db.field_for('title', 1), opf.title)
        self.ae(db.field_for('authors', 1), tuple(opf.authors))
        conn.request('GET',
                     '/get/opf/1',
                     headers={'Accept-Encoding': 'gzip'})
        r = conn.getresponse()
        self.ae(r.status, http_client.OK)
        self.ae(r.getheader('Content-Encoding'), 'gzip')
        raw = r.read()
        # 16 + MAX_WBITS makes zlib expect a gzip header and trailer
        self.ae(zlib.decompress(raw, 16 + zlib.MAX_WBITS), data)

        # Test serving metadata as json
        r, data = get('json', 1)
        self.ae(r.status, http_client.OK)
        self.ae(db.field_for('title', 1), json.loads(data)['title'])
        conn.request('GET',
                     '/get/json/1',
                     headers={'Accept-Encoding': 'gzip'})
        r = conn.getresponse()
        self.ae(r.status, http_client.OK)
        self.ae(r.getheader('Content-Encoding'), 'gzip')
        raw = r.read()
        self.ae(zlib.decompress(raw, 16 + zlib.MAX_WBITS), data)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` to ``output_path`` as an HTMLZ archive.

    The book is rendered into a temporary directory (one HTML file, an
    optional external stylesheet, the referenced images, an optional
    ``cover.jpg`` and a ``metadata.opf``) and that directory is then added
    to a zip file created at ``output_path``.

    :param oeb_book: The OEB book to serialise.
    :param output_path: Path of the HTMLZ file to create.
    :param input_plugin: Not used by this method.
    :param opts: Conversion options; ``htmlz_css_type``,
        ``htmlz_class_style`` and ``htmlz_title_filename`` are read.
    :param log: Logger handed to the HTML generator.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from calibre.utils.zipfile import ZipFile
    from calibre.utils.filenames import ascii_filename

    # HTML
    if opts.htmlz_css_type == "inline":
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == "tag":
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory("_htmlz_output") as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = "index"
        if opts.htmlz_title_filename:
            from calibre.utils.filenames import shorten_components_to
            fname = shorten_components_to(
                100, (ascii_filename(unicode(oeb_book.metadata.title[0])),))[0]
        with open(os.path.join(tdir, fname + ".html"), "wb") as tf:
            if isinstance(html, unicode):
                html = html.encode("utf-8")
            tf.write(html)

        # CSS
        if opts.htmlz_css_type == "class" and opts.htmlz_class_style == "external":
            with open(os.path.join(tdir, "style.css"), "wb") as tf:
                tf.write(htmlizer.get_css(oeb_book))

        # Images
        images = htmlizer.images
        if images:
            if not os.path.exists(os.path.join(tdir, "images")):
                os.makedirs(os.path.join(tdir, "images"))
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        data = unicode(etree.tostring(item.data, encoding=unicode))
                    else:
                        data = item.data
                    if isinstance(data, unicode):
                        # The image file is opened in binary mode; encode the
                        # serialised SVG explicitly instead of relying on an
                        # implicit conversion of the text.
                        data = data.encode("utf-8")
                    fname = os.path.join(tdir, "images", images[item.href])
                    with open(fname, "wb") as img:
                        img.write(data)

        # Cover
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from calibre.utils.magick.draw import save_cover_data_to
                cover_path = os.path.join(tdir, "cover.jpg")
                # Create the file before handing its path to the image saver
                with open(cover_path, "w") as cf:
                    cf.write("")
                save_cover_data_to(cover_data, cover_path)
        except Exception:
            # The cover is best-effort: report the problem and carry on, but
            # do not swallow KeyboardInterrupt/SystemExit.
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, "metadata.opf"), "wb") as mdataf:
            opf = OPF(StringIO(etree.tostring(oeb_book.metadata.to_opf1())))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = "cover.jpg"
            mdataf.write(metadata_to_opf(mi))

        htmlz = ZipFile(output_path, "w")
        try:
            htmlz.add_dir(tdir)
        finally:
            # Close explicitly so the archive is finalised before the
            # temporary directory is removed, instead of relying on garbage
            # collection.
            htmlz.close()
def do_set_metadata(db, id, stream):
    '''
    Parse the OPF data in ``stream`` and store the resulting book metadata
    on ``db`` for the book ``id`` via ``db.set_metadata``.
    '''
    opf = OPF(stream)
    book_metadata = opf.to_book_metadata()
    db.set_metadata(id, book_metadata)
def __enter__(self,
              processed=False,
              only_input_plugin=False,
              run_char_count=True,
              read_anchor_map=True,
              view_kepub=False,
              read_links=True):
    '''
    Convert an ebook file into an exploded OEB book suitable for
    display in viewers/preprocessing etc.

    The book is extracted into a temporary directory, its OPF is parsed
    and ``self.opf``, ``self.mi``, ``self.language``, ``self.spine``,
    ``self.pages`` and ``self.toc`` are populated. For HTMLZ input the
    spine is the single ``index.html`` next to the OPF; otherwise it is
    built from the OPF spine with linear items first.

    :param processed: Passed through to ``run_extract_book``.
    :param only_input_plugin: Passed through to ``run_extract_book``.
    :param run_char_count: Passed to every ``SpineItem`` that is created.
    :param read_anchor_map: Passed to every ``SpineItem``; when true the
        indexing data for the spine and ToC is also created.
    :param view_kepub: Passed through to ``run_extract_book``.
    :param read_links: Passed to every ``SpineItem`` that is created.
    :return: ``self``
    '''
    self.delete_on_exit = []
    if self.use_tdir_in_cache:
        self._tdir = tdir_in_cache('ev')
    else:
        self._tdir = PersistentTemporaryDirectory('_ebook_iter')
    self.base = os.path.realpath(self._tdir)
    self.book_format, self.pathtoopf, input_fmt = run_extract_book(
        self.pathtoebook,
        self.base,
        only_input_plugin=only_input_plugin,
        view_kepub=view_kepub,
        processed=processed)
    self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
    self.mi = self.opf.to_book_metadata()
    self.language = None
    if self.mi.languages:
        self.language = self.mi.languages[0].lower()

    self.spine = []
    Spiny = partial(SpineItem,
                    read_anchor_map=read_anchor_map,
                    read_links=read_links,
                    run_char_count=run_char_count,
                    from_epub=self.book_format == 'EPUB')
    if input_fmt.lower() == 'htmlz':
        self.spine.append(
            Spiny(os.path.join(os.path.dirname(self.pathtoopf),
                               'index.html'),
                  mime_type='text/html'))
    else:
        # Linear spine entries first, non-linear ones after them
        ordered = [i for i in self.opf.spine if i.is_linear] + \
                [i for i in self.opf.spine if not i.is_linear]
        is_comic = input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
        for i in ordered:
            spath = i.path
            mt = None
            if i.idref is not None:
                mt = self.opf.manifest.type_for_id(i.idref)
            if mt is None:
                mt = guess_type(spath)[0]
            try:
                self.spine.append(Spiny(spath, mime_type=mt))
                if is_comic:
                    self.spine[-1].is_single_page = True
            except Exception:
                # A spine item that cannot be loaded is skipped, but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                self.log.warn('Missing spine item:', repr(spath))

    cover = self.opf.cover
    if cover and self.ebook_ext in {
            'lit', 'mobi', 'prc', 'opf', 'fb2', 'azw', 'azw3', 'docx',
            'htmlz'
    }:
        # Synthesise an HTML title page that shows the cover and put it at
        # the front of the spine; it is removed again on exit.
        cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
        rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
        chtml = (TITLEPAGE %
                 prepare_string_for_xml(rcpath, True)).encode('utf-8')
        with open(cfile, 'wb') as f:
            f.write(chtml)
        self.spine[0:0] = [Spiny(cfile, mime_type='application/xhtml+xml')]
        self.delete_on_exit.append(cfile)

    if self.opf.path_to_html_toc is not None and \
       self.opf.path_to_html_toc not in self.spine:
        try:
            self.spine.append(Spiny(self.opf.path_to_html_toc))
        except Exception:
            # Best effort: a broken HTML ToC must not prevent opening
            import traceback
            traceback.print_exc()

    sizes = [i.character_count for i in self.spine]
    self.pages = [
        math.ceil(i / float(self.CHARACTERS_PER_PAGE)) for i in sizes
    ]
    for p, s in zip(self.pages, self.spine):
        s.pages = p
    start = 1

    # Assign consecutive page ranges to the spine items
    for s in self.spine:
        s.start_page = start
        start += s.pages
        s.max_page = s.start_page + s.pages - 1
    self.toc = self.opf.toc
    if read_anchor_map:
        create_indexing_data(self.spine, self.toc)

    self.verify_links()

    self.read_bookmarks()

    return self
class EbookIterator(BookmarksMixin):
    '''
    Context manager that explodes an ebook into a temporary directory and
    exposes its spine items, table of contents and page estimates.

    Entering the context populates ``self.spine``, ``self.pages``,
    ``self.toc``, ``self.mi`` and ``self.language``; leaving it removes the
    temporary directory.
    '''

    # Number of characters treated as one "page" when estimating page counts.
    CHARACTERS_PER_PAGE = 1000

    def __init__(self, pathtoebook, log=None, copy_bookmarks_to_file=True,
                 use_tdir_in_cache=False):
        '''
        :param pathtoebook: Path to the ebook; surrounding whitespace is
            stripped and the path is made absolute.
        :param log: Logger to use; falls back to ``default_log``.
        :param copy_bookmarks_to_file: Forwarded to ``BookmarksMixin``.
        :param use_tdir_in_cache: Selects ``tdir_in_cache`` instead of a
            ``PersistentTemporaryDirectory`` for the working directory.
        '''
        BookmarksMixin.__init__(self, copy_bookmarks_to_file=copy_bookmarks_to_file)
        self.use_tdir_in_cache = use_tdir_in_cache
        self.log = log or default_log
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        # Normalize htm/html/xhtm/xhtml to a single 'html' extension.
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext.replace('original_', '')

    def search(self, text, index, backwards=False):
        '''
        Find the next spine item whose body text contains ``text``.

        The comparison is case-insensitive (both sides are lower-cased).
        Text inside ``script``, ``style`` and ``del`` elements is ignored.

        :param text: The text to look for.
        :param index: Spine index to start from (exclusive).
        :param backwards: Search towards the start of the spine instead.
        :return: Index of the first matching spine item, or ``None``.
        '''
        from calibre.ebooks.oeb.polish.parsing import parse
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        q = text.lower()
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                root = parse(raw)
                fragments = []

                def serialize(elem):
                    # Emit text in document order: leading text, then the
                    # children, and only then the tail. The tail follows the
                    # element's closing tag, so emitting it before the
                    # children would scramble the text and hide matches that
                    # span inline markup.
                    if elem.text:
                        fragments.append(elem.text.lower())
                    for child in elem.iterchildren():
                        tag = getattr(child, 'tag', None)
                        if hasattr(tag, 'rpartition') and \
                                tag.rpartition('}')[-1] not in {'script', 'style', 'del'}:
                            serialize(child)
                        elif getattr(child, 'tail', None):
                            # Skipped node (comment, script, ...): its tail
                            # is still part of the visible text.
                            fragments.append(child.tail.lower())
                    if elem.tail:
                        fragments.append(elem.tail.lower())

                for body in root.xpath('//*[local-name() = "body"]'):
                    body.tail = None
                    serialize(body)

                if q in ''.join(fragments):
                    return i

    def __enter__(self, processed=False, only_input_plugin=False,
                  run_char_count=True, read_anchor_map=True, view_kepub=False,
                  read_links=True):
        '''
        Convert an ebook file into an exploded OEB book suitable for
        display in viewers/preprocessing etc.

        :param processed: Forwarded to ``run_extract_book``.
        :param only_input_plugin: Forwarded to ``run_extract_book``.
        :param run_char_count: Forwarded to every ``SpineItem``.
        :param read_anchor_map: Forwarded to every ``SpineItem``; when true,
            indexing data is also created for the spine and TOC.
        :param view_kepub: Forwarded to ``run_extract_book``.
        :param read_links: Forwarded to every ``SpineItem``.
        :return: ``self``
        '''
        self.delete_on_exit = []
        if self.use_tdir_in_cache:
            self._tdir = tdir_in_cache('ev')
        else:
            self._tdir = PersistentTemporaryDirectory('_ebook_iter')
        self.base = os.path.realpath(self._tdir)
        self.book_format, self.pathtoopf, input_fmt = run_extract_book(
            self.pathtoebook, self.base, only_input_plugin=only_input_plugin,
            view_kepub=view_kepub, processed=processed)
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.mi = self.opf.to_book_metadata()
        self.language = None
        if self.mi.languages:
            self.language = self.mi.languages[0].lower()

        self.spine = []
        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
                        read_links=read_links, run_char_count=run_char_count,
                        from_epub=self.book_format == 'EPUB')
        if input_fmt.lower() == 'htmlz':
            self.spine.append(
                Spiny(os.path.join(os.path.dirname(self.pathtoopf), 'index.html'),
                      mime_type='text/html'))
        else:
            # Linear items first, then the non-linear ones.
            ordered = [i for i in self.opf.spine if i.is_linear] + \
                      [i for i in self.opf.spine if not i.is_linear]
            is_comic = input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
            for i in ordered:
                spath = i.path
                mt = None
                if i.idref is not None:
                    mt = self.opf.manifest.type_for_id(i.idref)
                if mt is None:
                    mt = guess_type(spath)[0]
                try:
                    self.spine.append(Spiny(spath, mime_type=mt))
                    if is_comic:
                        self.spine[-1].is_single_page = True
                except Exception:
                    # Best effort, without swallowing KeyboardInterrupt.
                    self.log.warn('Missing spine item:', repr(spath))

        cover = self.opf.cover
        if cover and self.ebook_ext in {
                'lit', 'mobi', 'prc', 'opf', 'fb2', 'azw', 'azw3', 'docx',
                'htmlz'}:
            # Synthesize a title page showing the cover and put it first.
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE % prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [Spiny(cfile, mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
                self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(Spiny(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()

        # Estimate page counts from the character count of each spine item.
        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i / float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        start = 1
        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc
        if read_anchor_map:
            create_indexing_data(self.spine, self.toc)

        self.verify_links()

        self.read_bookmarks()

        return self

    def verify_links(self):
        '''
        Record, for every spine item, which of its links resolve to a spine
        item (and, if a fragment is given, to a known anchor in it).

        Verified ``(target_item, fragment)`` pairs are added to
        ``item.verified_links``. Links with a scheme or network location,
        unparseable links, and links to files outside the spine are skipped.
        '''
        spine_paths = {s: s for s in self.spine}
        for item in self.spine:
            base = os.path.dirname(item)
            for link in item.all_links:
                try:
                    p = urlparse(urlunquote(link))
                except Exception:
                    continue
                if not p.scheme and not p.netloc:
                    # An empty path means a link into the item itself.
                    path = os.path.abspath(os.path.join(base, p.path)) if p.path else item
                    try:
                        path = spine_paths[path]
                    except Exception:
                        continue
                    if not p.fragment or p.fragment in path.anchor_map:
                        item.verified_links.add((path, p.fragment))

    def __exit__(self, *args):
        '''Remove the temporary directory and any extra generated files.'''
        remove_dir(self._tdir)
        for x in self.delete_on_exit:
            try:
                os.remove(x)
            except Exception:
                # Best effort cleanup; the file may already be gone along
                # with the temporary directory.
                pass
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert a KePub file into a structure calibre can process.

    The archive is unpacked into the current working directory, its OPF is
    located and normalised (hrefs rebased, cover rationalised, unusable
    spine entries dropped), and the result is written to ``content.opf``.

    :return: Absolute path of the written ``content.opf``.
    :raises ValueError: If no OPF is found, the book uses DTBook markup, or
        the spine ends up empty.
    :raises DRMError: If a ``rights.xml`` file is present.
    """
    log("KEPUBInput::convert - start")
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF

    try:
        archive = ZipFile(stream)
        archive.extractall(os.getcwdu())
    except Exception:
        log.exception(
            "KEPUB appears to be invalid ZIP file, trying a "
            "more forgiving ZIP parser"
        )
        from calibre.utils.localunzip import extractall

        stream.seek(0)
        extractall(stream)

    opf_path = self.find_opf()
    if opf_path is None:
        # Fall back to the first plausible .opf file anywhere in the tree.
        for candidate in walk(u"."):
            is_opf = candidate.lower().endswith(".opf")
            if not is_opf or "__MACOSX" in candidate:
                continue
            if os.path.basename(candidate).startswith("."):
                continue
            opf_path = os.path.abspath(candidate)
            break

    source_name = getattr(stream, "name", "stream")
    if opf_path is None:
        message = _(  # noqa: F821
            "{0} is not a valid KEPUB file (could not find opf)"
        )
        raise ValueError(message.format(source_name))

    if os.path.exists(os.path.abspath("rights.xml")):
        raise DRMError(os.path.basename(source_name))

    opf_path = os.path.relpath(opf_path, os.getcwdu())
    head_and_tail = os.path.split(opf_path)
    opf = OPF(opf_path, os.path.dirname(os.path.abspath(opf_path)))

    self.encrypted_fonts = []

    if len(head_and_tail) > 1 and head_and_tail[0]:
        # The OPF is in a sub-directory: prefix hrefs with that directory so
        # they stay valid relative to the content.opf written below.
        prefix = "/".join(head_and_tail[:-1]) + "/"
        for node in opf.itermanifest():
            node.set("href", prefix + node.get("href"))
        for node in opf.iterguide():
            node.set("href", prefix + node.get("href"))

    if opf.package_version >= 3.0:
        rationalize = self.rationalize_cover3
    else:
        rationalize = self.rationalize_cover2
    self.removed_cover = rationalize(opf, log)

    self.optimize_opf_parsing = opf
    for manifest_item in opf.itermanifest():
        if manifest_item.get("media-type", "") == "application/x-dtbook+xml":
            raise ValueError(
                _("EPUB files with DTBook markup are not supported")  # noqa: F821
            )

    # Manifest ids whose media types must never appear in the spine.
    excluded_types = {
        "application/vnd.adobe-page-template+xml",
        "application/vnd.adobe.page-template+xml",
        "application/adobe-page-template+xml",
        "application/adobe.page-template+xml",
        "application/text",
    }
    not_for_spine = set()
    for manifest_item in opf.itermanifest():
        item_id = manifest_item.get("id", None)
        if item_id and manifest_item.get("media-type", None) in excluded_types:
            not_for_spine.add(item_id)

    # Drop spine entries with no idref, an excluded idref, or a duplicate.
    seen = set()
    for spine_entry in list(opf.iterspine()):
        idref = spine_entry.get("idref", None)
        if idref and idref not in not_for_spine and idref not in seen:
            seen.add(idref)
        else:
            spine_entry.getparent().remove(spine_entry)

    if not list(opf.iterspine()):
        raise ValueError(
            _("No valid entries in the spine of this EPUB")  # noqa: F821
        )

    with open("content.opf", "wb") as out:
        out.write(opf.render())

    return os.path.abspath(u"content.opf")
def add_to_db(self, data):
    '''
    Add automatically discovered books to the library.

    :param data: Mapping of file name (relative to ``self.worker.path``) to
        the temporary directory holding that file's ``size.txt`` and
        ``metadata.opf``.

    Files whose size no longer matches ``size.txt`` are skipped and a rescan
    is scheduled. Duplicates are collected and offered to the user through
    ``DuplicatesQuestion``. All temporary directories are removed at the end.
    '''
    from calibre.ebooks.metadata.opf2 import OPF

    gui = self.parent()
    if gui is None:
        return
    m = gui.library_view.model()
    count = 0

    needs_rescan = False
    duplicates = []
    added_ids = set()

    for fname, tdir in data.iteritems():
        paths = [os.path.join(self.worker.path, fname)]
        sz = os.path.join(tdir, 'size.txt')
        try:
            with open(sz, 'rb') as f:
                sz = int(f.read())
            if sz != os.stat(paths[0]).st_size:
                raise Exception('Looks like the file was written to after'
                                ' we tried to read metadata')
        except Exception:
            # Unreadable size or a changed file: retry on a later scan.
            needs_rescan = True
            try:
                self.worker.staging.remove(fname)
            except KeyError:
                pass
            continue

        mi = os.path.join(tdir, 'metadata.opf')
        if not os.access(mi, os.R_OK):
            continue
        # Close the OPF file promptly; a lingering handle can make the
        # rmtree() of tdir below fail (notably on Windows).
        with open(mi, 'rb') as opf_file:
            mi = [OPF(opf_file, tdir, populate_spine=False).to_book_metadata()]
        dups, ids = m.add_books(
            paths, [os.path.splitext(fname)[1][1:].upper()], mi,
            add_duplicates=not gprefs['auto_add_check_for_duplicates'],
            return_ids=True)
        added_ids |= set(ids)
        num = len(ids)
        if dups:
            # Keep a copy of the duplicate, since the source file is removed
            # below.
            path = dups[0][0]
            with open(os.path.join(tdir, 'dup_cache.' + dups[1][0].lower()),
                      'wb') as dest, open(path, 'rb') as src:
                shutil.copyfileobj(src, dest)
                dups[0][0] = dest.name
            duplicates.append(dups)

        try:
            os.remove(paths[0])
            self.worker.staging.remove(fname)
        except Exception:
            import traceback
            traceback.print_exc()
        count += num

    if duplicates:
        paths, formats, metadata = [], [], []
        for p, f, mis in duplicates:
            paths.extend(p)
            formats.extend(f)
            metadata.extend(mis)

        dups = [(mic, mic.cover, [p]) for mic, p in zip(metadata, paths)]
        d = DuplicatesQuestion(m.db, dups, parent=gui)
        dups = tuple(d.duplicates)
        if dups:
            paths, formats, metadata = [], [], []
            for mi, cover, book_paths in dups:
                paths.extend(book_paths)
                formats.extend([p.rpartition('.')[-1] for p in book_paths])
                metadata.extend([mi for i in book_paths])
            ids = m.add_books(paths, formats, metadata,
                              add_duplicates=True, return_ids=True)[1]
            added_ids |= set(ids)
            num = len(ids)
            count += num

    for tdir in data.itervalues():
        try:
            shutil.rmtree(tdir)
        except Exception:
            # Best effort cleanup of the temporary directories.
            pass

    if added_ids and gprefs['auto_add_auto_convert']:
        self.auto_convert.emit(added_ids)

    if count > 0:
        m.books_added(count)
        gui.status_bar.show_message(_(
            'Added %(num)d book(s) automatically from %(src)s') %
            dict(num=count, src=self.worker.path), 2000)
        if hasattr(gui, 'db_images'):
            gui.db_images.beginResetModel()
            gui.db_images.endResetModel()

    if needs_rescan:
        QTimer.singleShot(2000, self.dir_changed)
def convert(self, oeb_book, output, input_plugin, opts, log):
    '''
    Convert ``oeb_book`` to EPUB with the EPUB output plugin, then hand the
    EPUB to ``convert_using_previewer`` to produce ``output``.

    An ASIN is taken from the book's identifiers unless the
    ``kfx_output_ignore_asin_metadata`` tweak is set. When
    ``opts.approximate_pages`` is set, a page count may be read from a field
    of the OPF named by ``opts.read_metadata_from_opf``; otherwise the page
    count passed on is ``-1``.
    '''
    self.report_version(log)

    try:
        title = str(oeb_book.metadata.title[0])
    except Exception:
        title = ""

    found_asin = None
    if not tweaks.get("kfx_output_ignore_asin_metadata", False):
        # Scheme patterns are tried in order; the first identifier whose
        # scheme and value both match wins.
        for scheme_pattern in ["^mobi-asin$", "^amazon.*$", "^asin$"]:
            for identifier in oeb_book.metadata["identifier"]:
                scheme = identifier.get(OPFNS("scheme"), "").lower()
                if re.match(scheme_pattern, scheme) and re.match(ASIN_RE, identifier.value):
                    found_asin = identifier.value
                    log.info("Found ASIN metadata %s: %s" % (scheme, found_asin))
                    break

            if found_asin:
                break

    if not opts.approximate_pages:
        pages = -1
    else:
        pages = 0
        field = opts.number_of_pages_field
        if field and field != AUTO_PAGES and opts.read_metadata_from_opf:
            # This OPF contains custom column metadata not present in the oeb_book OPF
            external_opf = OPF(opts.read_metadata_from_opf, populate_spine=False,
                               try_to_guess_cover=False, read_toc=False)
            raw_pages = external_opf.to_book_metadata().get(field, None)
            if raw_pages is None:
                log.warning("Book has no page count field %s" % field)
            else:
                try:
                    pages = int(raw_pages)
                except Exception:
                    pass

                log.info("Page count value from field %s: %d ('%s')" % (field, pages, raw_pages))

    # set default values for options expected by the EPUB Output plugin
    for option_record in EPUBOutput.options:
        setattr(opts, option_record.option.name, option_record.recommended_value)

    # override currently known EPUB Output plugin options
    overrides = (
        ("extract_to", None),
        ("dont_split_on_page_breaks", False),
        ("flow_size", 0),
        ("no_default_epub_cover", False),
        ("no_svg_cover", False),
        ("preserve_cover_aspect_ratio", True),
        ("epub_flatten", False),
        ("epub_inline_toc", False),
        ("epub_toc_at_end", False),
        ("toc_title", None),
    )
    for option_name, option_value in overrides:
        setattr(opts, option_name, option_value)

    epub_path = self.temporary_file(".epub").name
    # convert input format to EPUB
    self.epub_output_plugin.convert(oeb_book, epub_path, input_plugin, opts, log)
    log.info("Successfully converted input format to EPUB")

    if PREPARED_FILE_SAVE_DIR:
        if not os.path.exists(PREPARED_FILE_SAVE_DIR):
            os.makedirs(PREPARED_FILE_SAVE_DIR)

        saved_copy = os.path.join(PREPARED_FILE_SAVE_DIR, os.path.basename(epub_path))
        shutil.copyfile(epub_path, saved_copy)
        log.warning("Saved conversion input file: %s" % saved_copy)

    self.convert_using_previewer(
        JobLog(log), title, epub_path, found_asin, opts.cde_type_pdoc, pages,
        opts.show_kpr_logs, False, TIMEOUT if opts.enable_timeout else None,
        output)
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert a KePub file into a structure calibre can process.

    The archive is unpacked into the current working directory, its OPF is
    located and normalised (hrefs rebased, cover rationalised, unusable
    spine entries dropped), and the result is written to ``content.opf``.

    :return: Absolute path of the written ``content.opf``.
    :raises ValueError: If no OPF is found, the book uses DTBook markup, or
        the spine ends up empty.
    :raises DRMError: If a ``rights.xml`` file is present.
    '''
    log("KEPUBInput::convert - start")
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF
    try:
        zf = ZipFile(stream)
        zf.extractall(os.getcwdu())
    except Exception:
        # Only real errors trigger the fallback parser; KeyboardInterrupt
        # and SystemExit must propagate.
        log.exception('KEPUB appears to be invalid ZIP file, trying a '
                      'more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)
    opf = self.find_opf()
    if opf is None:
        # Fall back to the first plausible .opf file anywhere in the tree.
        for f in walk(u'.'):
            if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                    not os.path.basename(f).startswith('.'):
                opf = os.path.abspath(f)
                break
    path = getattr(stream, 'name', 'stream')

    if opf is None:
        raise ValueError(
            _('%s is not a valid KEPUB file (could not find opf)') % path)

    encfile = os.path.abspath('rights.xml')
    if os.path.exists(encfile):
        raise DRMError(os.path.basename(path))

    opf = os.path.relpath(opf, os.getcwdu())
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self.encrypted_fonts = []

    if len(parts) > 1 and parts[0]:
        # The OPF is in a sub-directory: prefix hrefs with that directory so
        # they stay valid relative to the content.opf written below.
        delta = '/'.join(parts[:-1]) + '/'
        for elem in opf.itermanifest():
            elem.set('href', delta + elem.get('href'))
        for elem in opf.iterguide():
            elem.set('href', delta + elem.get('href'))

    f = self.rationalize_cover3 if opf.package_version >= 3.0 else \
        self.rationalize_cover2
    self.removed_cover = f(opf, log)

    self.optimize_opf_parsing = opf
    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                _('EPUB files with DTBook markup are not supported'))

    # Manifest ids whose media types must never appear in the spine.
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_ and y.get('media-type', None) in {
                'application/vnd.adobe-page-template+xml',
                'application/vnd.adobe.page-template+xml',
                'application/adobe-page-template+xml',
                'application/adobe.page-template+xml',
                'application/text'}:
            not_for_spine.add(id_)

    # Drop spine entries with no idref, an excluded idref, or a duplicate.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if len(list(opf.iterspine())) == 0:
        raise ValueError(_('No valid entries in the spine of this EPUB'))

    with open('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath(u'content.opf')
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert a KePub file into a structure calibre can process.

    The archive is unpacked into the current working directory, its OPF is
    located and normalised (hrefs rebased, cover rationalised, unusable
    spine entries dropped), and the result is written to ``content.opf``.

    :return: Absolute path of the written ``content.opf``.
    :raises ValueError: If no OPF is found, the book uses DTBook markup, or
        the spine ends up empty.
    :raises DRMError: If a ``rights.xml`` file is present.
    '''
    log("KEPUBInput::convert - start")
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF
    try:
        zf = ZipFile(stream)
        zf.extractall(os.getcwdu())
    except Exception:
        # Only real errors trigger the fallback parser; KeyboardInterrupt
        # and SystemExit must propagate.
        log.exception('KEPUB appears to be invalid ZIP file, trying a '
                      'more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)
    opf = self.find_opf()
    if opf is None:
        # Fall back to the first plausible .opf file anywhere in the tree.
        for f in walk(u'.'):
            if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                    not os.path.basename(f).startswith('.'):
                opf = os.path.abspath(f)
                break
    path = getattr(stream, 'name', 'stream')

    if opf is None:
        raise ValueError(
            _('%s is not a valid KEPUB file (could not find opf)') % path)

    encfile = os.path.abspath('rights.xml')
    if os.path.exists(encfile):
        raise DRMError(os.path.basename(path))

    opf = os.path.relpath(opf, os.getcwdu())
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self.encrypted_fonts = []

    if len(parts) > 1 and parts[0]:
        # The OPF is in a sub-directory: prefix hrefs with that directory so
        # they stay valid relative to the content.opf written below.
        delta = '/'.join(parts[:-1]) + '/'
        for elem in opf.itermanifest():
            elem.set('href', delta + elem.get('href'))
        for elem in opf.iterguide():
            elem.set('href', delta + elem.get('href'))

    f = self.rationalize_cover3 if opf.package_version >= 3.0 else \
        self.rationalize_cover2
    self.removed_cover = f(opf, log)

    self.optimize_opf_parsing = opf
    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(_(
                'EPUB files with DTBook markup are not supported'))

    # Manifest ids whose media types must never appear in the spine.
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_ and y.get('media-type', None) in {
                'application/vnd.adobe-page-template+xml',
                'application/vnd.adobe.page-template+xml',
                'application/adobe-page-template+xml',
                'application/adobe.page-template+xml',
                'application/text'}:
            not_for_spine.add(id_)

    # Drop spine entries with no idref, an excluded idref, or a duplicate.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if len(list(opf.iterspine())) == 0:
        raise ValueError(_('No valid entries in the spine of this EPUB'))

    with open('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath(u'content.opf')
def add_to_db(self, data):
    '''
    Add automatically discovered books to the library.

    :param data: Mapping of file name (relative to ``self.worker.path``) to
        the temporary directory holding that file's ``size.txt`` and
        ``metadata.opf``.

    Files whose size no longer matches ``size.txt`` are skipped and a rescan
    is scheduled. Duplicates are collected and the user is asked whether to
    add them anyway. All temporary directories are removed at the end.
    '''
    from calibre.ebooks.metadata.opf2 import OPF

    gui = self.parent()
    if gui is None:
        return
    m = gui.library_view.model()
    count = 0

    needs_rescan = False
    duplicates = []
    added_ids = set()

    for fname, tdir in data.iteritems():
        paths = [os.path.join(self.worker.path, fname)]
        sz = os.path.join(tdir, 'size.txt')
        try:
            with open(sz, 'rb') as f:
                sz = int(f.read())
            if sz != os.stat(paths[0]).st_size:
                raise Exception('Looks like the file was written to after'
                                ' we tried to read metadata')
        except Exception:
            # Unreadable size or a changed file: retry on a later scan.
            needs_rescan = True
            try:
                self.worker.staging.remove(fname)
            except KeyError:
                pass
            continue

        mi = os.path.join(tdir, 'metadata.opf')
        if not os.access(mi, os.R_OK):
            continue
        # Close the OPF file promptly; a lingering handle can make the
        # rmtree() of tdir below fail (notably on Windows).
        with open(mi, 'rb') as opf_file:
            mi = [OPF(opf_file, tdir, populate_spine=False).to_book_metadata()]
        dups, ids = m.add_books(
            paths, [os.path.splitext(fname)[1][1:].upper()], mi,
            add_duplicates=not gprefs['auto_add_check_for_duplicates'],
            return_ids=True)
        added_ids |= set(ids)
        num = len(ids)
        if dups:
            # Keep a copy of the duplicate, since the source file is removed
            # below.
            path = dups[0][0]
            with open(os.path.join(tdir, 'dup_cache.' + dups[1][0].lower()),
                      'wb') as dest, open(path, 'rb') as src:
                shutil.copyfileobj(src, dest)
                dups[0][0] = dest.name
            duplicates.append(dups)

        try:
            os.remove(paths[0])
            self.worker.staging.remove(fname)
        except Exception:
            # Still best effort, but leave a trace instead of hiding it.
            import traceback
            traceback.print_exc()
        count += num

    if duplicates:
        paths, formats, metadata = [], [], []
        for p, f, mis in duplicates:
            paths.extend(p)
            formats.extend(f)
            metadata.extend(mis)
        files = [_('%(title)s by %(author)s') %
                 dict(title=mi.title, author=mi.format_field('authors')[1])
                 for mi in metadata]
        if question_dialog(
                self.parent(), _('Duplicates found!'),
                _('Books with the same title as the following already '
                  'exist in the database. Add them anyway?'),
                '\n'.join(files)):
            dups, ids = m.add_books(paths, formats, metadata,
                                    add_duplicates=True, return_ids=True)
            added_ids |= set(ids)
            num = len(ids)
            count += num

    for tdir in data.itervalues():
        try:
            shutil.rmtree(tdir)
        except Exception:
            # Best effort cleanup of the temporary directories.
            pass

    if added_ids and gprefs['auto_add_auto_convert']:
        self.auto_convert.emit(added_ids)

    if count > 0:
        m.books_added(count)
        gui.status_bar.show_message(
            _('Added %(num)d book(s) automatically from %(src)s') %
            dict(num=count, src=self.worker.path), 2000)
        if hasattr(gui, 'db_images'):
            gui.db_images.reset()

    if needs_rescan:
        QTimer.singleShot(2000, self.dir_changed)
class EbookIterator(BookmarksMixin):
    '''
    Context manager that explodes an ebook into a temporary directory and
    exposes its spine items, table of contents and page estimates.

    Entering the context populates ``self.spine``, ``self.pages``,
    ``self.toc``, ``self.mi`` and ``self.language``; leaving it removes the
    temporary directory.
    '''

    # Number of characters treated as one "page" when estimating page counts.
    CHARACTERS_PER_PAGE = 1000

    def __init__(self, pathtoebook, log=None, copy_bookmarks_to_file=True,
                 use_tdir_in_cache=False):
        '''
        :param pathtoebook: Path to the ebook; surrounding whitespace is
            stripped and the path is made absolute.
        :param log: Logger to use; falls back to ``default_log``.
        :param copy_bookmarks_to_file: Forwarded to ``BookmarksMixin``.
        :param use_tdir_in_cache: Selects ``tdir_in_cache`` instead of a
            ``PersistentTemporaryDirectory`` for the working directory.
        '''
        BookmarksMixin.__init__(self, copy_bookmarks_to_file=copy_bookmarks_to_file)
        self.use_tdir_in_cache = use_tdir_in_cache
        self.log = log or default_log
        pathtoebook = pathtoebook.strip()
        self.pathtoebook = os.path.abspath(pathtoebook)
        self.config = DynamicConfig(name='iterator')
        ext = os.path.splitext(pathtoebook)[1].replace('.', '').lower()
        # Normalize htm/html/xhtm/xhtml to a single 'html' extension.
        ext = re.sub(r'(x{0,1})htm(l{0,1})', 'html', ext)
        self.ebook_ext = ext.replace('original_', '')

    def search(self, text, index, backwards=False):
        '''
        Find the next spine item whose body text contains ``text``.

        The comparison is case-insensitive (both sides are lower-cased).
        Text inside ``script``, ``style`` and ``del`` elements is ignored.

        :param text: The text to look for.
        :param index: Spine index to start from (exclusive).
        :param backwards: Search towards the start of the spine instead.
        :return: Index of the first matching spine item, or ``None``.
        '''
        from calibre.ebooks.oeb.polish.parsing import parse
        pmap = list(enumerate(self.spine))
        if backwards:
            pmap.reverse()
        q = text.lower()
        for i, path in pmap:
            if (backwards and i < index) or (not backwards and i > index):
                with open(path, 'rb') as f:
                    raw = f.read().decode(path.encoding)
                root = parse(raw)
                fragments = []

                def serialize(elem):
                    # Emit text in document order: leading text, then the
                    # children, and only then the tail. The tail follows the
                    # element's closing tag, so emitting it before the
                    # children would scramble the text and hide matches that
                    # span inline markup.
                    if elem.text:
                        fragments.append(elem.text.lower())
                    for child in elem.iterchildren():
                        tag = getattr(child, 'tag', None)
                        if hasattr(tag, 'rpartition') and \
                                tag.rpartition('}')[-1] not in {'script', 'style', 'del'}:
                            serialize(child)
                        elif getattr(child, 'tail', None):
                            # Skipped node (comment, script, ...): its tail
                            # is still part of the visible text.
                            fragments.append(child.tail.lower())
                    if elem.tail:
                        fragments.append(elem.tail.lower())

                for body in root.xpath('//*[local-name() = "body"]'):
                    body.tail = None
                    serialize(body)

                if q in ''.join(fragments):
                    return i

    def __enter__(self, processed=False, only_input_plugin=False,
                  run_char_count=True, read_anchor_map=True, view_kepub=False,
                  read_links=True):
        '''
        Convert an ebook file into an exploded OEB book suitable for
        display in viewers/preprocessing etc.

        :param processed: Forwarded to ``run_extract_book``.
        :param only_input_plugin: Forwarded to ``run_extract_book``.
        :param run_char_count: Forwarded to every ``SpineItem``.
        :param read_anchor_map: Forwarded to every ``SpineItem``; when true,
            indexing data is also created for the spine and TOC.
        :param view_kepub: Forwarded to ``run_extract_book``.
        :param read_links: Forwarded to every ``SpineItem``.
        :return: ``self``
        '''
        self.delete_on_exit = []
        if self.use_tdir_in_cache:
            self._tdir = tdir_in_cache('ev')
        else:
            self._tdir = PersistentTemporaryDirectory('_ebook_iter')
        self.base = os.path.realpath(self._tdir)
        self.book_format, self.pathtoopf, input_fmt = run_extract_book(
            self.pathtoebook, self.base, only_input_plugin=only_input_plugin,
            view_kepub=view_kepub, processed=processed)
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
        self.mi = self.opf.to_book_metadata()
        self.language = None
        if self.mi.languages:
            self.language = self.mi.languages[0].lower()
        # Linear items first, then the non-linear ones.
        ordered = [i for i in self.opf.spine if i.is_linear] + \
                  [i for i in self.opf.spine if not i.is_linear]
        self.spine = []
        Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
                        read_links=read_links, run_char_count=run_char_count,
                        from_epub=self.book_format == 'EPUB')
        is_comic = input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
        for i in ordered:
            spath = i.path
            mt = None
            if i.idref is not None:
                mt = self.opf.manifest.type_for_id(i.idref)
            if mt is None:
                mt = guess_type(spath)[0]
            try:
                self.spine.append(Spiny(spath, mime_type=mt))
                if is_comic:
                    self.spine[-1].is_single_page = True
            except Exception:
                # Best effort, without swallowing KeyboardInterrupt.
                self.log.warn('Missing spine item:', repr(spath))

        cover = self.opf.cover
        if cover and self.ebook_ext in {
                'lit', 'mobi', 'prc', 'opf', 'fb2', 'azw', 'azw3', 'docx',
                'htmlz'}:
            # Synthesize a title page showing the cover and put it first.
            cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
            rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
            chtml = (TITLEPAGE % prepare_string_for_xml(rcpath, True)).encode('utf-8')
            with open(cfile, 'wb') as f:
                f.write(chtml)
            self.spine[0:0] = [Spiny(cfile, mime_type='application/xhtml+xml')]
            self.delete_on_exit.append(cfile)

        if self.opf.path_to_html_toc is not None and \
                self.opf.path_to_html_toc not in self.spine:
            try:
                self.spine.append(Spiny(self.opf.path_to_html_toc))
            except Exception:
                import traceback
                traceback.print_exc()

        # Estimate page counts from the character count of each spine item.
        sizes = [i.character_count for i in self.spine]
        self.pages = [math.ceil(i / float(self.CHARACTERS_PER_PAGE)) for i in sizes]
        for p, s in zip(self.pages, self.spine):
            s.pages = p
        start = 1
        for s in self.spine:
            s.start_page = start
            start += s.pages
            s.max_page = s.start_page + s.pages - 1
        self.toc = self.opf.toc
        if read_anchor_map:
            create_indexing_data(self.spine, self.toc)

        self.verify_links()

        self.read_bookmarks()

        return self

    def verify_links(self):
        '''
        Record, for every spine item, which of its links resolve to a spine
        item (and, if a fragment is given, to a known anchor in it).

        Verified ``(target_item, fragment)`` pairs are added to
        ``item.verified_links``. Links with a scheme or network location,
        unparseable links, and links to files outside the spine are skipped.
        '''
        spine_paths = {s: s for s in self.spine}
        for item in self.spine:
            base = os.path.dirname(item)
            for link in item.all_links:
                try:
                    p = urlparse(urlunquote(link))
                except Exception:
                    continue
                if not p.scheme and not p.netloc:
                    # An empty path means a link into the item itself.
                    path = os.path.abspath(os.path.join(base, p.path)) if p.path else item
                    try:
                        path = spine_paths[path]
                    except Exception:
                        continue
                    if not p.fragment or p.fragment in path.anchor_map:
                        item.verified_links.add((path, p.fragment))

    def __exit__(self, *args):
        '''Remove the temporary directory and any extra generated files.'''
        remove_dir(self._tdir)
        for x in self.delete_on_exit:
            try:
                os.remove(x)
            except Exception:
                # Best effort cleanup; the file may already be gone along
                # with the temporary directory.
                pass
def add_to_db(self, data):
    '''
    Add automatically discovered books to the library.

    :param data: Mapping of file name (relative to ``self.worker.path``) to
        the temporary directory holding that file's ``size.txt`` and
        ``metadata.opf``.

    Files whose size no longer matches ``size.txt`` are skipped and a rescan
    is scheduled. Tags are remapped when ``tag_map_on_add_rules`` is set in
    ``gprefs``. Duplicates are collected and offered to the user through
    ``DuplicatesQuestion``. All temporary directories are removed at the end.
    '''
    from calibre.ebooks.metadata.opf2 import OPF

    gui = self.parent()
    if gui is None:
        return
    m = gui.library_view.model()
    count = 0

    needs_rescan = False
    duplicates = []
    added_ids = set()

    for fname, tdir in data.iteritems():
        paths = [os.path.join(self.worker.path, fname)]
        sz = os.path.join(tdir, 'size.txt')
        try:
            with open(sz, 'rb') as f:
                sz = int(f.read())
            if sz != os.stat(paths[0]).st_size:
                raise Exception('Looks like the file was written to after'
                                ' we tried to read metadata')
        except Exception:
            # Unreadable size or a changed file: retry on a later scan.
            needs_rescan = True
            try:
                self.worker.staging.remove(fname)
            except KeyError:
                pass
            continue

        mi = os.path.join(tdir, 'metadata.opf')
        if not os.access(mi, os.R_OK):
            continue
        # Close the OPF file promptly; a lingering handle can make the
        # rmtree() of tdir below fail (notably on Windows).
        with open(mi, 'rb') as opf_file:
            mi = OPF(opf_file, tdir, populate_spine=False).to_book_metadata()
        if gprefs.get('tag_map_on_add_rules'):
            from calibre.ebooks.metadata.tag_mapper import map_tags
            mi.tags = map_tags(mi.tags, gprefs['tag_map_on_add_rules'])
        mi = [mi]
        dups, ids = m.add_books(
            paths, [os.path.splitext(fname)[1][1:].upper()], mi,
            add_duplicates=not gprefs['auto_add_check_for_duplicates'],
            return_ids=True)
        added_ids |= set(ids)
        num = len(ids)
        if dups:
            # Keep a copy of the duplicate, since the source file is removed
            # below.
            path = dups[0][0]
            with open(os.path.join(tdir, 'dup_cache.' + dups[1][0].lower()),
                      'wb') as dest, open(path, 'rb') as src:
                shutil.copyfileobj(src, dest)
                dups[0][0] = dest.name
            duplicates.append(dups)

        try:
            os.remove(paths[0])
            self.worker.staging.remove(fname)
        except Exception:
            import traceback
            traceback.print_exc()
        count += num

    if duplicates:
        paths, formats, metadata = [], [], []
        for p, f, mis in duplicates:
            paths.extend(p)
            formats.extend(f)
            metadata.extend(mis)

        dups = [(mic, mic.cover, [p]) for mic, p in zip(metadata, paths)]
        d = DuplicatesQuestion(m.db, dups, parent=gui)
        dups = tuple(d.duplicates)
        if dups:
            paths, formats, metadata = [], [], []
            for mi, cover, book_paths in dups:
                paths.extend(book_paths)
                formats.extend([p.rpartition('.')[-1] for p in book_paths])
                metadata.extend([mi for i in book_paths])
            ids = m.add_books(paths, formats, metadata,
                              add_duplicates=True, return_ids=True)[1]
            added_ids |= set(ids)
            num = len(ids)
            count += num

    for tdir in data.itervalues():
        try:
            shutil.rmtree(tdir)
        except Exception:
            # Best effort cleanup of the temporary directories.
            pass

    if added_ids and gprefs['auto_add_auto_convert']:
        self.auto_convert.emit(added_ids)

    if count > 0:
        m.books_added(count)
        gui.status_bar.show_message(_(
            'Added %(num)d book(s) automatically from %(src)s') %
            dict(num=count, src=self.worker.path), 2000)
        gui.refresh_cover_browser()

    if needs_rescan:
        QTimer.singleShot(2000, self.dir_changed)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` as an HTMLZ archive at ``output_path``.

    The archive is assembled in a temporary directory: the HTML file, an
    optional external ``style.css``, referenced images under ``images/``,
    an optional ``cover.jpg`` and a ``metadata.opf``. The directory is then
    zipped.
    '''
    from lxml import etree
    from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from calibre.utils.zipfile import ZipFile
    from calibre.utils.filenames import ascii_filename

    # HTML
    if opts.htmlz_css_type == 'inline':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == 'tag':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory(u'_htmlz_output') as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = u'index'
        if opts.htmlz_title_filename:
            from calibre.utils.filenames import shorten_components_to
            fname = shorten_components_to(
                100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
        with open(os.path.join(tdir, fname + u'.html'), 'wb') as tf:
            if isinstance(html, unicode_type):
                html = html.encode('utf-8')
            tf.write(html)

        # CSS
        if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
            with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                tf.write(htmlizer.get_css(oeb_book))

        # Images
        images = htmlizer.images
        if images:
            images_dir = os.path.join(tdir, u'images')
            if not os.path.exists(images_dir):
                os.makedirs(images_dir)
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        data = etree.tostring(item.data, encoding='unicode')
                    else:
                        data = item.data
                    # The file is opened in binary mode, so serialised SVG
                    # text must be encoded before it is written.
                    if isinstance(data, unicode_type):
                        data = data.encode('utf-8')
                    img_path = os.path.join(images_dir, images[item.href])
                    with open(img_path, 'wb') as img:
                        img.write(data)

        # Cover
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from calibre.utils.img import save_cover_data_to
                cover_path = os.path.join(tdir, u'cover.jpg')
                # Create the (empty) file first, then let the image helper
                # write the real data to it.
                with lopen(cover_path, 'w') as cf:
                    cf.write('')
                save_cover_data_to(cover_data, cover_path)
        except Exception:
            # A missing/broken cover must not abort the conversion.
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
            opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = u'cover.jpg'
            mdataf.write(metadata_to_opf(mi))

        # Close the archive explicitly instead of relying on garbage
        # collection to finish writing it.
        htmlz = ZipFile(output_path, 'w')
        try:
            htmlz.add_dir(tdir)
        finally:
            htmlz.close()
def __enter__(self, processed=False, only_input_plugin=False,
              run_char_count=True, read_anchor_map=True,
              extract_embedded_fonts_for_qt=False):
    '''
    Convert an ebook file into an exploded OEB book suitable for
    display in viewers/preprocessing etc.

    :param processed: Also run the conversion pipeline's HTML
        preprocessing/parsing on the input plugin's output.
    :param only_input_plugin: Run only the input plugin.
    :param run_char_count: Forwarded to every ``SpineItem``.
    :param read_anchor_map: Forwarded to every ``SpineItem``; when true,
        indexing data is also created for the spine and TOC.
    :param extract_embedded_fonts_for_qt: Also call ``extract_fonts`` on the
        OPF; failures are logged, not raised.
    :return: ``self``
    '''
    from calibre.ebooks.conversion.plumber import Plumber, create_oebbook

    self.delete_on_exit = []
    self._tdir = TemporaryDirectory('_ebook_iter')
    self.base = self._tdir.__enter__()
    plumber = Plumber(self.pathtoebook, self.base, self.log)
    plumber.setup_options()
    if self.pathtoebook.lower().endswith('.opf'):
        plumber.opts.dont_package = True
    if hasattr(plumber.opts, 'no_process'):
        plumber.opts.no_process = True

    plumber.input_plugin.for_viewer = True
    with plumber.input_plugin, open(plumber.input, 'rb') as inf:
        self.pathtoopf = plumber.input_plugin(
            inf, plumber.opts, plumber.input_fmt, self.log, {}, self.base)

        if not only_input_plugin:
            # Run the HTML preprocess/parsing from the conversion pipeline as
            # well
            if (processed or plumber.input_fmt.lower() in {'pdb', 'pdf', 'rb'} and
                    not hasattr(self.pathtoopf, 'manifest')):
                if hasattr(self.pathtoopf, 'manifest'):
                    self.pathtoopf = write_oebbook(self.pathtoopf, self.base)
                self.pathtoopf = create_oebbook(self.log, self.pathtoopf,
                                                plumber.opts)

        # Having a 'manifest' attribute is treated as meaning the plugin
        # returned a book object rather than a path; write it to disk.
        if hasattr(self.pathtoopf, 'manifest'):
            self.pathtoopf = write_oebbook(self.pathtoopf, self.base)

    self.book_format = os.path.splitext(self.pathtoebook)[1][1:].upper()
    if getattr(plumber.input_plugin, 'is_kf8', False):
        self.book_format = 'KF8'

    # Reuse an OPF already parsed by the input plugin when it offers one.
    self.opf = getattr(plumber.input_plugin, 'optimize_opf_parsing', None)
    if self.opf is None:
        self.opf = OPF(self.pathtoopf, os.path.dirname(self.pathtoopf))
    self.language = self.opf.language
    if self.language:
        self.language = self.language.lower()
    # Linear items first, then the non-linear ones.
    ordered = [i for i in self.opf.spine if i.is_linear] + \
              [i for i in self.opf.spine if not i.is_linear]
    self.spine = []
    Spiny = partial(SpineItem, read_anchor_map=read_anchor_map,
                    run_char_count=run_char_count)
    is_comic = plumber.input_fmt.lower() in {'cbc', 'cbz', 'cbr', 'cb7'}
    for i in ordered:
        spath = i.path
        mt = None
        if i.idref is not None:
            mt = self.opf.manifest.type_for_id(i.idref)
        if mt is None:
            mt = guess_type(spath)[0]
        try:
            self.spine.append(Spiny(spath, mime_type=mt))
            if is_comic:
                self.spine[-1].is_single_page = True
        except Exception:
            # Best effort, without swallowing KeyboardInterrupt/SystemExit.
            self.log.warn('Missing spine item:', repr(spath))

    cover = self.opf.cover
    if cover and self.ebook_ext in {
            'lit', 'mobi', 'prc', 'opf', 'fb2', 'azw', 'azw3'}:
        # Synthesize a title page showing the cover and put it first.
        cfile = os.path.join(self.base, 'calibre_iterator_cover.html')
        rcpath = os.path.relpath(cover, self.base).replace(os.sep, '/')
        chtml = (TITLEPAGE % prepare_string_for_xml(rcpath, True)).encode('utf-8')
        with open(cfile, 'wb') as f:
            f.write(chtml)
        self.spine[0:0] = [Spiny(cfile, mime_type='application/xhtml+xml')]
        self.delete_on_exit.append(cfile)

    if self.opf.path_to_html_toc is not None and \
            self.opf.path_to_html_toc not in self.spine:
        try:
            self.spine.append(Spiny(self.opf.path_to_html_toc))
        except Exception:
            import traceback
            traceback.print_exc()

    # Estimate page counts from the character count of each spine item.
    sizes = [i.character_count for i in self.spine]
    self.pages = [math.ceil(i / float(self.CHARACTERS_PER_PAGE)) for i in sizes]
    for p, s in zip(self.pages, self.spine):
        s.pages = p
    start = 1
    for s in self.spine:
        s.start_page = start
        start += s.pages
        s.max_page = s.start_page + s.pages - 1
    self.toc = self.opf.toc
    if read_anchor_map:
        create_indexing_data(self.spine, self.toc)

    self.read_bookmarks()

    if extract_embedded_fonts_for_qt:
        from calibre.ebooks.oeb.iterator.extract_fonts import extract_fonts
        try:
            extract_fonts(self.opf, self.log)
        except Exception:
            # Temporarily lower the filter level to DEBUG while logging the
            # failure, then restore it.
            ol = self.log.filter_level
            self.log.filter_level = self.log.DEBUG
            self.log.exception('Failed to extract fonts')
            self.log.filter_level = ol

    return self
def get_metadata2(root, ver):
    '''
    Wrap an already-parsed OPF tree in an ``OPF`` object (without reading
    the ToC) and return a 4-tuple of: the book metadata, ``ver`` passed
    through unchanged, the raster cover and the first spine item.
    '''
    opf = OPF(None, preparsed_opf=root, read_toc=False)
    # Evaluated in the same order as the items of the returned tuple.
    metadata = opf.to_book_metadata()
    raster_cover = opf.raster_cover
    first_spine_item = opf.first_spine_item()
    return metadata, ver, raster_cover, first_spine_item
def do_add(self, data):
    '''
    Add automatically detected books to the library.

    :param data: Iterable of ``(fname, tdir)`` pairs. ``fname`` is a file
        name relative to ``self.worker.path``; ``tdir`` is a directory
        that is read for ``metadata.opf``, ``size.txt`` and, optionally,
        ``file_changed_by_plugins``.

    Files whose size check fails are dropped from the staging set and a
    rescan is scheduled. Duplicates reported by the model are collected
    and presented in a single ``DuplicatesQuestion`` at the end. Every
    ``tdir`` is deleted before returning.
    '''
    from calibre.ebooks.metadata.opf2 import OPF

    gui = self.parent()
    if gui is None:
        return
    m = gui.library_view.model()
    count = 0

    needs_rescan = False
    duplicates = []
    added_ids = set()

    for fname, tdir in data:
        path_to_remove = os.path.join(self.worker.path, fname)
        paths = [path_to_remove]
        fpath = os.path.join(tdir, 'file_changed_by_plugins')
        if os.path.exists(fpath):
            # This file holds the path that should be added instead.
            with open(fpath) as f:
                paths[0] = f.read()
        size_path = os.path.join(tdir, 'size.txt')
        try:
            with open(size_path, 'rb') as f:
                expected_size = int(f.read())
            if expected_size != os.stat(paths[0]).st_size:
                raise Exception('Looks like the file was written to after'
                                ' we tried to read metadata')
        except Exception:
            # Any failure of the size check (unreadable size file, missing
            # book file, mismatch) means: forget the file and rescan later.
            needs_rescan = True
            try:
                self.worker.staging.remove(fname)
            except KeyError:
                pass
            continue

        opf_path = os.path.join(tdir, 'metadata.opf')
        if not os.access(opf_path, os.R_OK):
            continue
        # Close the OPF file deterministically instead of leaving the
        # handle open until it is garbage collected.
        with open(opf_path, 'rb') as opf_file:
            mi = OPF(opf_file, tdir, populate_spine=False).to_book_metadata()
        if gprefs.get('tag_map_on_add_rules'):
            from calibre.ebooks.metadata.tag_mapper import map_tags
            mi.tags = map_tags(mi.tags, gprefs['tag_map_on_add_rules'])
        if gprefs.get('author_map_on_add_rules'):
            from calibre.ebooks.metadata.author_mapper import (
                compile_rules, map_authors
            )
            new_authors = map_authors(
                mi.authors, compile_rules(gprefs['author_map_on_add_rules']))
            if new_authors != mi.authors:
                mi.authors = new_authors
                mi.author_sort = gui.current_db.new_api.author_sort_from_authors(mi.authors)

        dups, ids = m.add_books(
            paths, [os.path.splitext(fname)[1][1:].upper()], [mi],
            add_duplicates=not gprefs['auto_add_check_for_duplicates'],
            return_ids=True)
        added_ids |= set(ids)
        num = len(ids)
        if dups:
            # Copy the duplicate into tdir and point the duplicate record
            # at that copy; the original is removed just below.
            path = dups[0][0]
            cache_path = os.path.join(tdir, 'dup_cache.'+dups[1][0].lower())
            with open(cache_path, 'wb') as dest, open(path, 'rb') as src:
                shutil.copyfileobj(src, dest)
                dups[0][0] = dest.name
            duplicates.append(dups)

        try:
            os.remove(path_to_remove)
            self.worker.staging.remove(fname)
        except Exception:
            import traceback
            traceback.print_exc()
        count += num

    if duplicates:
        paths, formats, metadata = [], [], []
        for p, f, mis in duplicates:
            paths.extend(p)
            formats.extend(f)
            metadata.extend(mis)
        dups = [(mic, mic.cover, [p]) for mic, p in zip(metadata, paths)]
        d = DuplicatesQuestion(m.db, dups, parent=gui)
        dups = tuple(d.duplicates)
        if dups:
            # Add whatever is left in the question's duplicates list.
            paths, formats, metadata = [], [], []
            for mi, cover, book_paths in dups:
                paths.extend(book_paths)
                formats.extend([p.rpartition('.')[-1] for p in book_paths])
                metadata.extend([mi] * len(book_paths))
            ids = m.add_books(paths, formats, metadata,
                              add_duplicates=True, return_ids=True)[1]
            added_ids |= set(ids)
            num = len(ids)
            count += num

    for fname, tdir in data:
        try:
            shutil.rmtree(tdir)
        except Exception:
            # Best-effort cleanup of the per-file directories.
            pass

    if added_ids and gprefs['auto_add_auto_convert']:
        self.auto_convert.emit(added_ids)

    if count > 0:
        m.books_added(count)
        gui.status_bar.show_message(
            (_('Added a book automatically from {src}') if count == 1 else
             _('Added {num} books automatically from {src}')).format(
                 num=count, src=self.worker.path), 2000)
        gui.refresh_cover_browser()

    if needs_rescan:
        QTimer.singleShot(2000, self.dir_changed)
def set_metadata_opf2(root, cover_prefix, mi, opf_version, cover_data=None,
                      apply_null=False, update_timestamp=False,
                      force_identifiers=False, add_missing_cover=True):
    '''
    Merge the metadata in ``mi`` into the parsed OPF tree ``root`` and
    render the result.

    :param root: Pre-parsed OPF tree, handed to ``OPF`` as ``preparsed_opf``.
    :param cover_prefix: Prefix prepended to ``cover.jpg`` when a new cover
        manifest item has to be created; a trailing ``/`` is added if the
        prefix is non-empty and lacks one.
    :param mi: Source metadata; a ``MetaInformation`` copy of it is used.
    :param opf_version: Object whose ``major`` attribute selects between
        the ``<meta name="cover">`` form (``major < 3``) and the
        ``cover-image`` property form.
    :param cover_data: A cover entry is only added when this is not None.
    :param apply_null: Passed to ``smart_update``; also makes the
        identifiers of ``mi`` replace the existing ones outright.
    :param update_timestamp: Copy ``mi.timestamp`` if it is not None.
    :param force_identifiers: Replace identifiers instead of merging them.
    :param add_missing_cover: Allow adding a cover entry when the OPF has
        no raster cover.
    :return: ``(rendered_opf, raster_cover)``
    '''
    mi = MetaInformation(mi)
    for field in ('guide', 'toc', 'manifest', 'spine'):
        setattr(mi, field, None)
    opf = OPF(None, preparsed_opf=root, read_toc=False)
    if mi.languages:
        mi.languages = normalize_languages(
            list(opf.raw_languages) or [], mi.languages)

    opf.smart_update(mi, apply_null=apply_null)
    if getattr(mi, 'uuid', None):
        opf.application_id = mi.uuid

    if apply_null or force_identifiers:
        opf.set_identifiers(mi.get_identifiers())
    else:
        # Merge: identifiers from mi win, entries with an empty key or
        # value are dropped.
        merged = opf.get_identifiers()
        merged.update(mi.get_identifiers())
        opf.set_identifiers({k: v for k, v in merged.iteritems() if k and v})

    if update_timestamp and mi.timestamp is not None:
        opf.timestamp = mi.timestamp

    raster_cover = opf.raster_cover
    if raster_cover is None and cover_data is not None and add_missing_cover:
        guide_raster_cover = opf.guide_raster_cover
        cover_item = None
        if guide_raster_cover is not None:
            cover_item = guide_raster_cover
            raster_cover = cover_item.get('href')
        else:
            if cover_prefix and not cover_prefix.endswith('/'):
                cover_prefix += '/'
            name = cover_prefix + 'cover.jpg'
            cover_item = create_manifest_item(opf.root, name, 'cover')
            if cover_item is not None:
                raster_cover = name

        if cover_item is not None:
            if opf_version.major < 3:
                # Drop existing cover <meta> elements, then add a new one
                # that points at the cover item's id.
                stale_metas = opf.root.xpath(
                    '//*[local-name()="meta" and @name="cover"]')
                for meta in stale_metas:
                    meta.getparent().remove(meta)
                m = opf.create_metadata_element('meta', is_dc=False)
                m.set('name', 'cover')
                m.set('content', cover_item.get('id'))
            else:
                # Remove "cover-image" from the properties of every item
                # that has it, then set it on the cover item.
                flagged = opf.root.xpath(
                    '//*[local-name()="item" and contains(@properties, "cover-image")]'
                )
                for item in flagged:
                    cleaned = item.get('properties').replace('cover-image', '').strip()
                    item.set('properties', cleaned)
                cover_item.set('properties', 'cover-image')

    with pretty_print:
        return opf.render(), raster_cover
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Convert an HTMLZ archive into an OEB book.

    The archive is extracted into the current working directory, a single
    top-level HTML file is picked, re-encoded as UTF-8 and run through the
    ``html`` input plugin. Metadata is then read from the archive and, if
    a top-level OPF file names a cover, the cover is added to the book.

    :param stream: The HTMLZ archive; also used to read the metadata.
    :param options: Conversion options. ``input_encoding`` is overwritten
        with ``'utf-8'`` and the recommended values of the html input
        plugin's options are applied to it.
    :param file_ext: Passed to ``get_file_type_metadata``.
    :param log: Logger; also stored as ``self.log``.
    :param accelerators: Not used by this method.
    :return: The OEB book produced by the html input plugin.
    :raises Exception: If there is no top-level HTML file or it is empty.
    '''
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.utils.zipfile import ZipFile

    self.log = log
    html = u''

    # Extract content from zip archive.
    zf = ZipFile(stream)
    zf.extractall()

    # Find the HTML file in the archive. It needs to be
    # top level.
    index = u''
    multiple_html = False
    # Get a list of all top level files in the archive.
    top_levels = [x for x in os.listdir(u'.') if os.path.isfile(x)]
    # Try to find an index file.
    for x in top_levels:
        if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
            index = x
            break
    # Look for multiple HTML files in the archive. We look at the
    # top level files only as only they matter in HTMLZ.
    for x in top_levels:
        if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
            # Set index to the first HTML file found if it's not
            # called index.
            if not index:
                index = x
            elif x != index:
                # Only a file other than the chosen index counts; comparing
                # the index with itself used to raise a spurious warning
                # for archives containing a single index.html.
                multiple_html = True
    # Warn the user if there are multiple HTML files in the archive. HTMLZ
    # supports a single HTML file. A conversion with a multiple HTML file
    # HTMLZ archive probably won't turn out as the user expects. With
    # multiple HTML files ZIP input should be used in place of HTMLZ.
    if multiple_html:
        log.warn(
            _('Multiple HTML files found in the archive. Only %s will be used.'
              ) % index)

    if not index:
        raise Exception(_('No top level HTML file found.'))
    with open(index, 'rb') as tf:
        html = tf.read()

    if not html:
        raise Exception(_('Top level HTML file %s is empty') % index)

    # Encoding
    if options.input_encoding:
        ienc = options.input_encoding
    else:
        ienc = xml_to_unicode(html[:4096])[-1]
    html = html.decode(ienc, 'replace')

    # Run the HTML through the html processing plugin.
    from calibre.customize.ui import plugin_for_input_format
    html_input = plugin_for_input_format('html')
    for opt in html_input.options:
        setattr(options, opt.option.name, opt.recommended_value)
    options.input_encoding = 'utf-8'
    base = getcwd()
    # Pick a file name in the base directory that does not exist yet.
    fname = os.path.join(base, u'index.html')
    c = 0
    while os.path.exists(fname):
        c += 1
        fname = os.path.join(base, u'index%d.html' % c)
    with open(fname, 'wb') as htmlfile:
        htmlfile.write(html.encode('utf-8'))

    # The html plugin runs with debug_pipeline switched off; restore the
    # caller's setting even if the conversion fails.
    odi = options.debug_pipeline
    options.debug_pipeline = None
    try:
        # Generate oeb from html conversion. The input file is closed
        # before it is removed below (removing an open file fails on
        # Windows).
        with open(fname, 'rb') as html_stream:
            oeb = html_input.convert(html_stream, options, 'html', log, {})
    finally:
        options.debug_pipeline = odi
    os.remove(fname)

    # Set metadata from file.
    from calibre.customize.ui import get_file_type_metadata
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    mi = get_file_type_metadata(stream, file_ext)
    meta_info_to_oeb_metadata(mi, oeb.metadata, log)

    # Get the cover path from the OPF.
    cover_path = None
    opf = None
    for x in top_levels:
        if os.path.splitext(x)[1].lower() == u'.opf':
            opf = x
            break
    if opf:
        opf = OPF(opf, basedir=getcwd())
        cover_path = opf.raster_cover or opf.cover
    # Set the cover.
    if cover_path:
        with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
            cdata = cf.read()
        cover_name = os.path.basename(cover_path)
        item_id, href = oeb.manifest.generate('cover', cover_name)
        oeb.manifest.add(item_id, href, guess_type(cover_name)[0], data=cdata)
        oeb.guide.add('cover', 'Cover', href)

    return oeb
def get_metadata2(root, ver):
    '''
    Return ``(metadata, ver, raster_cover, first_spine_item)`` for the
    pre-parsed OPF tree ``root``. ``ver`` is returned as given; the other
    three values come from an ``OPF`` object built on ``root`` with ToC
    reading disabled.
    '''
    parsed = OPF(None, preparsed_opf=root, read_toc=False)
    # Same evaluation order as the returned tuple.
    book_metadata = parsed.to_book_metadata()
    cover = parsed.raster_cover
    first_item = parsed.first_spine_item()
    return book_metadata, ver, cover, first_item
def process_result(self, group_id, result):
    '''
    Handle the metadata-reading result for one group of files.

    :param group_id: Key of the file group; used to look up the group's
        paths on failure and to build the cover file name.
    :param result: Object with ``err``, ``traceback`` and ``value``
        attributes. On success ``value`` unpacks into
        ``(paths, opf, has_cover, duplicate_info)``.

    The metadata is parsed and normalised (fallback title, tag and author
    mapping). Without a database the book is only queued in
    ``self.items``. Otherwise it is merged into an identical book, added,
    or recorded as a duplicate.
    '''
    if not result.err:
        paths, opf, has_cover, duplicate_info = result.value
        try:
            parsed = OPF(BytesIO(opf), basedir=self.tdir,
                         populate_spine=False, try_to_guess_cover=False)
            mi = parsed.to_book_metadata()
            mi.read_metadata_failed = False
        except Exception:
            mi = self.report_metadata_failure(group_id, traceback.format_exc())
    else:
        mi = self.report_metadata_failure(group_id, result.traceback)
        paths = self.file_groups[group_id]
        has_cover = False
        duplicate_info = set() if self.add_formats_to_existing else False

    if mi.is_null('title'):
        # Fall back to the base name (without extension) of the first path.
        for first_path in paths:
            base_name = os.path.basename(first_path)
            mi.title = os.path.splitext(base_name)[0]
            break
    if mi.application_id == '__calibre_dummy__':
        mi.application_id = None

    if gprefs.get('tag_map_on_add_rules'):
        from calibre.ebooks.metadata.tag_mapper import map_tags
        mi.tags = map_tags(mi.tags, gprefs['tag_map_on_add_rules'])
    if self.author_map_rules:
        from calibre.ebooks.metadata.author_mapper import map_authors
        mapped = map_authors(mi.authors, self.author_map_rules)
        if mapped != mi.authors:
            mi.authors = mapped
            # The author sort string is recomputed only when the mapping
            # actually changed the authors.
            if self.db is not None:
                mi.author_sort = self.db.author_sort_from_authors(
                    mi.authors)
            else:
                mi.author_sort = authors_to_sort_string(mi.authors)

    self.pd.msg = mi.title

    if has_cover:
        cover_path = os.path.join(self.tdir, '%s.cdata' % group_id)
    else:
        cover_path = None

    if self.db is None:
        # No database: just collect the book for the caller.
        if paths:
            self.items.append((mi, cover_path, paths))
        return

    if not self.add_formats_to_existing:
        if duplicate_info or icu_lower(
                mi.title or _('Unknown')) in self.added_duplicate_info:
            self.duplicates.append((mi, cover_path, paths))
        else:
            self.add_book(mi, cover_path, paths)
        return

    identical_book_ids = find_identical_books(
        mi, self.find_identical_books_data)
    if not identical_book_ids:
        self.add_book(mi, cover_path, paths)
        return

    try:
        self.merge_books(mi, cover_path, paths, identical_book_ids)
    except Exception:
        # A failed merge is written to the report instead of being raised.
        report = self.report.append
        report('')
        report('-' * 70)
        report(_('Failed to merge the book: ') + mi.title)
        for f in paths:
            report('\t' + f)
        report(_('With error:'))
        report(traceback.format_exc())
def add(self, id, opf, cover, name):
    '''
    Add one book, described by an OPF file, to the database or, when
    there is no database, to the in-memory result lists.

    :param id: Key into ``self.ids``; the entry is popped to obtain the
        book's format paths.
    :param opf: Path to the OPF file. A path ending in ``.error`` instead
        holds an error report, which is stored in ``self.critical``.
    :param cover: Path to a cover file (or a false value for no cover).
    :param name: File name of the book; used as the key in
        ``self.critical`` and as the fallback title (without extension).
    :return: The title of the book.
    '''
    formats = self.ids.pop(id)
    if opf.endswith('.error'):
        mi = MetaInformation('', [_('Unknown')])
        # Read the error report through a context manager so the file
        # handle is closed right away.
        with open(opf, 'rb') as f:
            self.critical[name] = f.read().decode('utf-8', 'replace')
    else:
        try:
            mi = OPF(opf).to_book_metadata()
        except Exception:
            import traceback
            mi = MetaInformation('', [_('Unknown')])
            self.critical[name] = traceback.format_exc()
    formats = self.process_formats(opf, formats)
    if not mi.title:
        mi.title = os.path.splitext(name)[0]
    if not isinstance(mi.title, unicode):
        mi.title = mi.title.decode(preferred_encoding, 'replace')
    if mi.application_id == '__calibre_dummy__':
        mi.application_id = None

    if self.db is None:
        # No database: only record what would have been added.
        self.names.append(name)
        self.paths.append(formats[0])
        self.infos.append(mi)
        return mi.title

    if cover:
        with open(cover, 'rb') as f:
            cover = f.read()
    orig_formats = formats
    formats = [f for f in formats if not f.lower().endswith('.opf')]

    if prefs['add_formats_to_existing']:  # automerge is on
        identical_book_list = self.db.find_identical_books(mi)
        if identical_book_list:
            # books with same author and nearly same title exist in db
            self.merged_books.add(mi.title)
            automerge = gprefs['automerge']
            seen_fmts = set()

            for identical_book in identical_book_list:
                ib_fmts = self.db.formats(identical_book, index_is_id=True)
                if ib_fmts:
                    seen_fmts |= set(ib_fmts.split(','))
                self.add_formats(identical_book, formats,
                                 replace=(automerge == 'overwrite'))

            if automerge == 'new record':
                incoming_fmts = set(
                    os.path.splitext(path)[-1].replace('.', '').upper()
                    for path in formats)
                if incoming_fmts.intersection(seen_fmts):
                    # There was at least one duplicate format
                    # so create a new record and put the
                    # incoming formats into it
                    # We should arguably put only the duplicate
                    # formats, but no real harm is done by having
                    # all formats
                    id_ = self.db.create_book_entry(
                        mi, cover=cover, add_duplicates=True)
                    self.number_of_books_added += 1
                    self.add_formats(id_, formats)
        else:
            # books with same author and nearly same title do not exist in db
            id_ = self.db.create_book_entry(mi, cover=cover,
                                            add_duplicates=True)
            self.number_of_books_added += 1
            self.add_formats(id_, formats)
    else:  # automerge is off
        id_ = self.db.create_book_entry(mi, cover=cover,
                                        add_duplicates=False)
        if id_ is None:
            # No entry was created; record the book as a duplicate.
            self.duplicates.append((mi, cover, orig_formats))
        else:
            self.add_formats(id_, formats)
            self.number_of_books_added += 1

    return mi.title