def update_cover(self):
    """Embed the calibre cover of ``self.book_id`` into the comic archive.

    The cover is stored under the member name ``00000000_cover.<ext>``. If a
    member with that base name already exists, its content is replaced via
    ``safe_replace``; otherwise the cover is appended as a new member.
    """
    # get the calibre cover
    # NOTE(review): presumably a temporary copy, since it is deleted below
    cover_path = self.db.cover(self.book_id, as_path=True)
    fmt = cover_path.rpartition('.')[-1]
    new_cover_name = "00000000_cover." + fmt

    self.make_temp_cbz_file()

    # search for a previously embedded cover
    cover_info = ""
    zf = ZipFile(self.file)
    try:
        for name in zf.namelist():
            if name.rsplit(".", 1)[0] == "00000000_cover":
                cover_info = name
                break
    finally:
        # Release the read handle before the same file is reopened for
        # writing below.
        zf.close()

    if cover_info != "":
        # replace the previous cover in place
        with open(self.file, 'r+b') as zf, open(cover_path, 'r+b') as cp:
            safe_replace(zf, cover_info, cp)
    else:
        # save the cover in the file
        zf = ZipFile(self.file, "a")
        try:
            zf.write(cover_path, new_cover_name)
        finally:
            zf.close()

    delete_temp_file(cover_path)
def get_metadata(stream):
    """Read metadata from the first recognised ebook inside a ZIP archive.

    If the archive's member names look like a comic (per ``is_comic``) the
    stream itself is passed to the generic reader as ``'cbz'``. Otherwise
    members are scanned in archive order and the first one with a supported
    ebook extension is extracted into a temporary directory and parsed; the
    returned metadata has ``timestamp`` set to ``None``.

    :param stream: file-like object containing a ZIP archive.
    :return: metadata object produced by the format-specific reader.
    :raises ValueError: if no member has a supported ebook extension.
    """
    # NOTE: this local import intentionally shadows the enclosing function.
    from calibre.ebooks.metadata.meta import get_metadata
    from calibre.ebooks.metadata.archive import is_comic
    supported = ('lit', 'opf', 'prc', 'mobi', 'fb2', 'epub', 'rb', 'imp',
                 'pdf', 'lrf', 'azw', 'azw1', 'azw3')
    zf = ZipFile(stream, 'r')
    names = zf.namelist()
    if is_comic(names):
        # Is probably a comic
        return get_metadata(stream, 'cbz')

    for f in names:
        # Extension without the leading dot ('' when there is none)
        stream_type = os.path.splitext(f)[1].lower()[1:]
        if stream_type not in supported:
            continue
        with TemporaryDirectory() as tdir:
            with CurrentDir(tdir):
                path = zf.extract(f)
                # Close the extracted file promptly so the temporary
                # directory can be removed afterwards.
                with open(path, 'rb') as extracted:
                    mi = get_metadata(extracted, stream_type)
                if stream_type == 'opf' and mi.application_id is None:
                    try:
                        # zip archive opf files without an application_id were assumed not to have a cover
                        # reparse the opf and if cover exists read its data from zip archive for the metadata
                        nmi = zip_opf_metadata(path, zf)
                        nmi.timestamp = None
                        return nmi
                    except Exception:
                        # Best effort only: fall back to the plain OPF metadata
                        pass
                mi.timestamp = None
                return mi
    raise ValueError('No ebook found in ZIP archive (%s)' % os.path.basename(getattr(stream, 'name', '') or '<stream>'))
def get_picture_size(self):
    """Return the product of an image's two dimensions divided by 1,000,000.

    Archive entries at positions 1 to 9 are probed (the first entry is
    skipped). Probing stops at the first image whose first dimension is
    smaller than its second; otherwise the last successfully measured image
    is used. The result is rounded to two decimals and is 0 when no image
    could be measured.
    """
    from calibre.utils.magick import Image

    self.make_temp_cbz_file()
    size_x, size_y = 0, 0
    zf = ZipFile(self.file)
    try:
        for fname in zf.namelist()[1:10]:
            if fname.lower().rpartition('.')[-1] not in IMG_EXTENSIONS:
                continue
            with zf.open(fname) as ffile:
                img = Image()
                try:
                    img.open(ffile)
                    # NOTE(review): presumably (width, height) -- confirm
                    size_x, size_y = img.size
                except Exception:
                    # Unreadable image: keep the previous measurement and
                    # move on to the next candidate.
                    pass
            if size_x < size_y:
                break
    finally:
        zf.close()
    return round(size_x * size_y / 1000000, 2)
def update_cover(self):
    """Write the calibre cover of ``self.book_id`` into the comic archive.

    The cover lives in the archive as ``00000000_cover.<ext>``. An existing
    member with that base name has its content replaced through
    ``safe_replace``; when there is none, the cover is appended.
    """
    source_cover = self.db.cover(self.book_id, as_path=True)
    extension = source_cover.rpartition('.')[-1]
    embedded_name = "00000000_cover." + extension

    self.make_temp_cbz_file()

    # Look for a cover embedded by an earlier run
    archive = ZipFile(self.file)
    existing = next(
        (entry for entry in archive.namelist()
         if entry.rsplit(".", 1)[0] == "00000000_cover"),
        "")
    archive.close()

    if existing == "":
        # Nothing embedded yet: append the cover as a new member
        archive = ZipFile(self.file, "a")
        archive.write(source_cover, embedded_name)
        archive.close()
    else:
        # Overwrite the content of the previously embedded cover
        with open(self.file, 'r+b') as cbz, open(source_cover, 'r+b') as cover_file:
            safe_replace(cbz, existing, cover_file)

    delete_temp_file(source_cover)
def __enter__(self, *args):
    '''
    Add this plugin to the python path so that its contents become directly
    importable. Useful when bundling large python libraries into the plugin.
    Use it like this::

        with plugin:
            import something

    If the plugin zip contains native extension files (``pyd``, ``so``,
    ``dll``, ``dylib``) it is extracted to a temporary directory and that
    directory is put on ``sys.path``; otherwise the zip file itself is.
    Does nothing when ``self.plugin_path`` is ``None``.
    '''
    if self.plugin_path is None:
        return
    from calibre.utils.zipfile import ZipFile
    zf = ZipFile(self.plugin_path)
    try:
        extensions = {x.rpartition('.')[-1].lower() for x in zf.namelist()}
        zip_safe = extensions.isdisjoint(('pyd', 'so', 'dll', 'dylib'))
        if zip_safe:
            sys.path.insert(0, self.plugin_path)
            self.sys_insertion_path = self.plugin_path
        else:
            from calibre.ptempfile import TemporaryDirectory
            self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
            self.sys_insertion_path = self._sys_insertion_tdir.__enter__(
                *args)
            zf.extractall(self.sys_insertion_path)
            sys.path.insert(0, self.sys_insertion_path)
    finally:
        # Close the archive even when extraction fails
        zf.close()
def get_comic_metadata_from_cbz(self):
    '''
    Reads the comic metadata from the comic cbz file as comictagger metadata.

    Stores the parsed ``comicinfo.xml`` in ``self.cix_metadata`` (and its
    member name in ``self.zipinfo``), the metadata found in the zip comment
    in ``self.cbi_metadata``, and, when the ``auto_count_pages`` preference
    is set, adds the number of image members to ``self.pages``. Finally the
    combined metadata is computed.
    '''
    self.make_temp_cbz_file()
    count_pages = prefs['auto_count_pages']

    # open the zipfile
    zf = ZipFile(self.file)
    try:
        # get cix metadata
        for name in zf.namelist():
            if name.lower() == "comicinfo.xml":
                self.cix_metadata = ComicInfoXml().metadataFromString(
                    zf.read(name))
                self.zipinfo = name
                if not count_pages:
                    # nothing else to look for in the member list
                    break
            elif count_pages and name.lower().rpartition(
                    '.')[-1] in IMG_EXTENSIONS:
                self.pages += 1

        # get the cbi metadata
        if ComicBookInfo().validateString(zf.comment):
            self.cbi_metadata = ComicBookInfo().metadataFromString(zf.comment)
    finally:
        # Close the archive even if parsing the metadata fails
        zf.close()

    # get combined metadata
    self._get_combined_metadata()
def __enter__(self, *args):
    '''
    Add this plugin to the python path so that its contents become directly
    importable. Useful when bundling large python libraries into the plugin.
    Use it like this::

        with plugin:
            import something

    A plugin zip that contains files with a native extension suffix
    (``pyd``, ``so``, ``dll``, ``dylib``) is unpacked into a temporary
    directory which is then added to ``sys.path``; any other zip is added
    to ``sys.path`` directly. Nothing happens if ``self.plugin_path`` is
    ``None``.
    '''
    if self.plugin_path is not None:
        from calibre.utils.zipfile import ZipFile
        native_exts = ('pyd', 'so', 'dll', 'dylib')
        zf = ZipFile(self.plugin_path)
        try:
            extensions = set(x.rpartition('.')[-1].lower() for x in zf.namelist())
            zip_safe = not any(ext in extensions for ext in native_exts)
            if zip_safe:
                sys.path.insert(0, self.plugin_path)
                self.sys_insertion_path = self.plugin_path
            else:
                from calibre.ptempfile import TemporaryDirectory
                self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
                self.sys_insertion_path = self._sys_insertion_tdir.__enter__(*args)
                zf.extractall(self.sys_insertion_path)
                sys.path.insert(0, self.sys_insertion_path)
        finally:
            # Do not leak the archive handle when unpacking fails
            zf.close()
def count_pages(self):
    """Return the number of archive members whose extension is an image type.

    The extension is compared case-insensitively against ``IMG_EXTENSIONS``.
    """
    self.make_temp_cbz_file()
    zf = ZipFile(self.file)
    try:
        return sum(
            1 for name in zf.namelist()
            if name.lower().rpartition('.')[-1] in IMG_EXTENSIONS)
    finally:
        # The archive was previously left open
        zf.close()
def run(self, archive):
    """Pick the single ebook out of a ZIP/RAR archive, if there is one.

    :param archive: path to the archive; a name ending in ``.rar``
        (case-insensitive) is read as RAR, anything else as ZIP.
    :return: path of a temporary copy of the archive with a ``.cbr``/``.cbz``
        suffix when its contents look like a comic; path of a temporary file
        holding the sole member when exactly one relevant member remains and
        it has a supported ebook extension; otherwise ``archive`` unchanged.
    """
    from calibre.utils.zipfile import ZipFile
    is_rar = archive.lower().endswith('.rar')
    zf = None
    if is_rar:
        from calibre.utils.unrar import extract_member, names
    else:
        zf = ZipFile(archive, 'r')

    def fname_ok(fname):
        # Ignore thumbnail caches, extension-less entries, release-info
        # files and macOS resource-fork folders.
        bn = os.path.basename(fname).lower()
        if bn == 'thumbs.db':
            return False
        if '.' not in bn:
            return False
        if bn.rpartition('.')[-1] in {'diz', 'nfo'}:
            return False
        if '__MACOSX' in fname.split('/'):
            return False
        return True

    try:
        if is_rar:
            with open(archive, 'rb') as rf:
                fnames = list(names(rf))
        else:
            fnames = zf.namelist()

        fnames = list(filter(fname_ok, fnames))
        if is_comic(fnames):
            ext = '.cbr' if is_rar else '.cbz'
            of = self.temporary_file('_archive_extract' + ext)
            with closing(of):
                with open(archive, 'rb') as f:
                    of.write(f.read())
            return of.name
        if len(fnames) > 1 or not fnames:
            return archive
        fname = fnames[0]
        ext = os.path.splitext(fname)[1][1:]
        if ext.lower() not in {
                'lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
                'azw', 'azw1', 'azw3', 'fb2', 'docx', 'doc', 'odt'
        }:
            return archive

        of = self.temporary_file('_archive_extract.' + ext)
        with closing(of):
            if is_rar:
                with open(archive, 'rb') as f:
                    data = extract_member(f, match=None, name=fname)[1]
                of.write(data)
            else:
                of.write(zf.read(fname))
        return of.name
    finally:
        # The ZIP handle used to stay open on every return path
        if zf is not None:
            zf.close()
def run(self, archive):
    """Extract the lone ebook from a ZIP or RAR archive when possible.

    :param archive: archive path. Paths ending in ``.rar`` (any case) are
        handled with the unrar helpers, all others with ``ZipFile``.
    :return: ``archive`` itself when nothing can be extracted (no relevant
        member, more than one, or an unsupported extension); otherwise the
        path of a temporary file that holds either a copy of the whole
        archive (comic detected, suffix ``.cbr``/``.cbz``) or the single
        extracted member.
    """
    from calibre.utils.zipfile import ZipFile
    is_rar = archive.lower().endswith('.rar')
    zf = None
    if is_rar:
        from calibre.utils.unrar import extract_member, names
    else:
        zf = ZipFile(archive, 'r')

    def fname_ok(fname):
        # Filter out entries that never count as book content
        bn = os.path.basename(fname).lower()
        if bn == 'thumbs.db' or '.' not in bn:
            return False
        if bn.rpartition('.')[-1] in {'diz', 'nfo'}:
            return False
        return '__MACOSX' not in fname.split('/')

    try:
        if is_rar:
            with open(archive, 'rb') as rf:
                fnames = list(names(rf))
        else:
            fnames = zf.namelist()
        fnames = [x for x in fnames if fname_ok(x)]

        if is_comic(fnames):
            ext = '.cbr' if is_rar else '.cbz'
            of = self.temporary_file('_archive_extract'+ext)
            with closing(of):
                with open(archive, 'rb') as f:
                    of.write(f.read())
            return of.name

        if len(fnames) != 1:
            return archive
        fname = fnames[0]
        ext = os.path.splitext(fname)[1][1:]
        if ext.lower() not in {
                'lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
                'azw', 'azw1', 'azw3', 'fb2', 'docx', 'doc', 'odt'}:
            return archive

        of = self.temporary_file('_archive_extract.'+ext)
        with closing(of):
            if is_rar:
                with open(archive, 'rb') as f:
                    data = extract_member(f, match=None, name=fname)[1]
                of.write(data)
            else:
                of.write(zf.read(fname))
        return of.name
    finally:
        # Always release the ZIP handle (it was never closed before)
        if zf is not None:
            zf.close()
def get_fb2_data(stream):
    """Return the raw FB2 data from a stream that may be zipped.

    If ``stream`` is a ZIP archive, the first member whose name ends in
    ``.fb2`` (case-insensitive) is read; if there is none, the first member
    of the archive is read. If ``stream`` is not a ZIP archive, it is
    rewound to its starting position and read in full.

    :param stream: seekable file-like object.
    :return: ``(data, zip_file_name)``; ``zip_file_name`` is ``None`` when
        the stream was not a ZIP archive.
    """
    from calibre.utils.zipfile import ZipFile, BadZipfile
    pos = stream.tell()
    try:
        zf = ZipFile(stream)
    except BadZipfile:
        # Not a ZIP: rewind and treat the whole stream as the FB2 document
        stream.seek(pos)
        return stream.read(), None

    try:
        names = zf.namelist()
        names = [x for x in names if x.lower().endswith('.fb2')] or names
        zip_file_name = names[0]
        member = zf.open(zip_file_name)
        try:
            ans = member.read()
        finally:
            member.close()
    finally:
        # The archive object was previously left open
        zf.close()
    return ans, zip_file_name
def count_pages(self):
    """Count the image members of the archive and store the total in calibre.

    Each member whose extension is in ``IMG_EXTENSIONS`` increments
    ``self.pages``. When the resulting count is non-zero, it is written to
    the custom column named by ``prefs['pages_column']`` and the metadata is
    saved to the database.

    :return: ``False`` if ``self.pages`` is 0 after counting, else ``True``.
    """
    self.make_temp_cbz_file()

    # open the zipfile
    zf = ZipFile(self.file)
    try:
        # count the pages
        # NOTE(review): adds to the existing self.pages rather than
        # resetting it -- confirm callers start from 0.
        for name in zf.namelist():
            if name.lower().rpartition('.')[-1] in IMG_EXTENSIONS:
                self.pages += 1
    finally:
        # The archive was previously left open
        zf.close()

    if self.pages == 0:
        return False

    update_custom_column(prefs['pages_column'], self.pages,
                         self.calibre_metadata,
                         self.db.field_metadata.custom_field_metadata())
    self.db.set_metadata(self.book_id, self.calibre_metadata)
    return True
def run(self, archive):
    """Pick the single ebook out of a ZIP/RAR archive, if there is one.

    Members without a ``.`` in their name and ``thumbs.db`` files are
    ignored.

    :param archive: path to the archive; a name ending in ``.rar``
        (case-insensitive) is read as RAR, anything else as ZIP.
    :return: path of a temporary copy of the archive (suffix
        ``.cbr``/``.cbz``) when it looks like a comic; path of a temporary
        file with the sole remaining member when it has a supported ebook
        extension; otherwise ``archive`` unchanged.
    """
    from calibre.utils.zipfile import ZipFile
    is_rar = archive.lower().endswith('.rar')
    zf = None
    if is_rar:
        from calibre.utils.unrar import extract_member, names
    else:
        zf = ZipFile(archive, 'r')

    try:
        if is_rar:
            with open(archive, 'rb') as rf:
                fnames = list(names(rf))
        else:
            fnames = zf.namelist()

        fnames = [
            x for x in fnames
            if '.' in x and x.lower().rpartition('/')[-1] != 'thumbs.db'
        ]
        if is_comic(fnames):
            ext = '.cbr' if is_rar else '.cbz'
            of = self.temporary_file('_archive_extract' + ext)
            with closing(of):
                with open(archive, 'rb') as f:
                    of.write(f.read())
            return of.name
        if len(fnames) > 1 or not fnames:
            return archive
        fname = fnames[0]
        ext = os.path.splitext(fname)[1][1:]
        if ext.lower() not in ('lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf',
                               'mp3', 'pdb', 'azw', 'azw1', 'azw3', 'fb2'):
            return archive

        of = self.temporary_file('_archive_extract.' + ext)
        with closing(of):
            if is_rar:
                with open(archive, 'rb') as f:
                    data = extract_member(f, match=None, name=fname)[1]
                of.write(data)
            else:
                of.write(zf.read(fname))
        return of.name
    finally:
        # The ZIP handle used to stay open on every return path
        if zf is not None:
            zf.close()
def __enter__(self, *args):
    '''
    Put the plugin on ``sys.path`` so its bundled modules can be imported.

    When the plugin zip holds no file with a native extension suffix
    (``pyd``, ``so``, ``dll``, ``dylib``) the zip path itself is inserted at
    the front of ``sys.path``. Otherwise the zip is extracted into a
    temporary directory and that directory is inserted instead. The chosen
    location is recorded in ``self.sys_insertion_path``. Does nothing when
    ``self.plugin_path`` is ``None``.
    '''
    if self.plugin_path is None:
        return
    from calibre.utils.zipfile import ZipFile
    zf = ZipFile(self.plugin_path)
    try:
        extensions = set(x.rpartition('.')[-1].lower() for x in zf.namelist())
        zip_safe = True
        for ext in ('pyd', 'so', 'dll', 'dylib'):
            if ext in extensions:
                zip_safe = False
                break
        if zip_safe:
            sys.path.insert(0, self.plugin_path)
            self.sys_insertion_path = self.plugin_path
        else:
            from calibre.ptempfile import TemporaryDirectory
            self._sys_insertion_tdir = TemporaryDirectory('plugin_unzip')
            self.sys_insertion_path = self._sys_insertion_tdir.__enter__(*args)
            zf.extractall(self.sys_insertion_path)
            sys.path.insert(0, self.sys_insertion_path)
    finally:
        # Close the archive even if extraction raises
        zf.close()
def run(self, archive):
    """Extract the lone ebook from a ZIP or RAR archive when possible.

    Entries named ``thumbs.db`` and entries without a ``.`` are skipped
    before deciding.

    :param archive: archive path. Paths ending in ``.rar`` (any case) are
        handled with the unrar helpers, all others with ``ZipFile``.
    :return: ``archive`` itself when nothing can be extracted; otherwise the
        path of a temporary file holding a copy of the archive (comic
        detected, suffix ``.cbr``/``.cbz``) or the single extracted member.
    """
    from calibre.utils.zipfile import ZipFile
    supported = ('lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
                 'azw', 'azw1', 'azw3', 'fb2')
    is_rar = archive.lower().endswith('.rar')
    zf = None
    if is_rar:
        from calibre.utils.unrar import extract_member, names
    else:
        zf = ZipFile(archive, 'r')

    try:
        if is_rar:
            with open(archive, 'rb') as rf:
                fnames = list(names(rf))
        else:
            fnames = zf.namelist()
        fnames = [x for x in fnames if '.' in x and
                  x.lower().rpartition('/')[-1] != 'thumbs.db']

        if is_comic(fnames):
            ext = '.cbr' if is_rar else '.cbz'
            of = self.temporary_file('_archive_extract'+ext)
            with closing(of):
                with open(archive, 'rb') as f:
                    of.write(f.read())
            return of.name

        if len(fnames) != 1:
            return archive
        fname = fnames[0]
        ext = os.path.splitext(fname)[1][1:]
        if ext.lower() not in supported:
            return archive

        of = self.temporary_file('_archive_extract.'+ext)
        with closing(of):
            if is_rar:
                with open(archive, 'rb') as f:
                    data = extract_member(f, match=None, name=fname)[1]
                of.write(data)
            else:
                of.write(zf.read(fname))
        return of.name
    finally:
        # Always release the ZIP handle (it was never closed before)
        if zf is not None:
            zf.close()
def get_comic_metadata_from_cbz(self):
    '''
    Reads the comic metadata from the comic cbz file as comictagger metadata.

    The first member named ``comicinfo.xml`` (case-insensitive) is parsed
    into ``self.cix_metadata`` and its name kept in ``self.zipinfo``. If the
    zip comment validates as ComicBookInfo, it is parsed into
    ``self.cbi_metadata``. The combined metadata is computed afterwards.
    '''
    self.make_temp_cbz_file()

    # open the zipfile
    zf = ZipFile(self.file)
    try:
        # get cix metadata
        for name in zf.namelist():
            if name.lower() == "comicinfo.xml":
                self.cix_metadata = ComicInfoXml().metadataFromString(zf.read(name))
                self.zipinfo = name
                break

        # get the cbi metadata
        if ComicBookInfo().validateString(zf.comment):
            self.cbi_metadata = ComicBookInfo().metadataFromString(zf.comment)
    finally:
        # Close the archive even if a parser raises
        zf.close()

    # get combined metadata
    self._get_combined_metadata()
class DOCX(object):
    """Access the parts of a DOCX package by name.

    With ``extract=True`` the package is unpacked into a persistent
    temporary directory and parts are read from disk; with ``extract=False``
    parts are read directly from the ZIP archive. Call :meth:`close` when
    done.
    """

    def __init__(self, path_or_stream, log=None, extract=True):
        """
        :param path_or_stream: a filesystem path, or an object with ``read``.
        :param log: logger; ``default_log`` is used when falsy.
        :param extract: unpack to a temporary directory instead of keeping
            the ZIP archive open.
        :raises InvalidDOCX: if ``[Content_Types].xml`` or ``_rels/.rels``
            is missing.
        """
        opened_here = not hasattr(path_or_stream, 'read')
        stream = open(path_or_stream, 'rb') if opened_here else path_or_stream
        # Only a file opened by this constructor is ours to close
        self._owned_stream = stream if opened_here else None
        self.name = getattr(stream, 'name', None) or '<stream>'
        self.log = log or default_log
        if extract:
            try:
                self.extract(stream)
            finally:
                # After extraction all reads go to disk
                self._close_owned_stream()
        else:
            self.init_zipfile(stream)
        self.read_content_types()
        self.read_package_relationships()

    def _close_owned_stream(self):
        # Close the file opened by __init__, if any (idempotent)
        stream = getattr(self, '_owned_stream', None)
        if stream is not None:
            self._owned_stream = None
            stream.close()

    def init_zipfile(self, stream):
        """Keep the package as an open ZIP archive and index its names."""
        self.zipf = ZipFile(stream)
        self.names = frozenset(self.zipf.namelist())

    def extract(self, stream):
        """Unpack the package into a temporary directory.

        ``self.names`` maps each part name (``/``-separated, relative to the
        directory) to its path on disk.
        """
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except Exception:
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

    def exists(self, name):
        """Return True if the package has a part called ``name``."""
        return name in self.names

    def read(self, name):
        """Return the raw bytes of part ``name``.

        :raises KeyError: if there is no such part.
        """
        if hasattr(self, 'zipf'):
            member = self.zipf.open(name)
            try:
                return member.read()
            finally:
                member.close()
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()

    def read_content_types(self):
        """Parse ``[Content_Types].xml`` into the two content-type maps."""
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
        root = fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')

    def content_type(self, name):
        """Return the content type of ``name``.

        Per-part overrides win, then per-extension defaults, then a guess
        from the file name.
        """
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return guess_type(name)[0]

    def read_package_relationships(self):
        """Parse ``_rels/.rels`` into type->target and target->type maps."""
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
        root = fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ

    @property
    def document_name(self):
        """Name of the main document part.

        Falls back to any part called ``document.xml`` when the package
        relationships do not name one.

        :raises InvalidDOCX: if no main document can be found.
        """
        name = self.relationships.get(DOCUMENT, None)
        if name is None:
            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
            name = names[0]
        return name

    @property
    def document(self):
        """The parsed main document part."""
        return fromstring(self.read(self.document_name))

    @property
    def document_relationships(self):
        """Relationships of the main document, see :meth:`get_relationships`."""
        return self.get_relationships(self.document_name)

    def get_relationships(self, name):
        """Return ``(by_id, by_type)`` target maps for part ``name``.

        Reads ``<dir>/_rels/<file>.rels``; both maps are empty if that part
        is missing. Targets that are neither external nor ``#`` fragments
        are resolved against the directory of ``name``.
        """
        base = '/'.join(name.split('/')[:-1])
        by_id, by_type = {}, {}
        parts = name.split('/')
        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            root = fromstring(raw)
            for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
                target = item.get('Target')
                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
                    target = '/'.join((base, target.lstrip('/')))
                typ = item.get('Type')
                Id = item.get('Id')
                by_id[Id] = by_type[typ] = target
        return by_id, by_type

    @property
    def metadata(self):
        """Metadata assembled from the core/app property parts and styles."""
        mi = Metadata(_('Unknown'))
        name = self.relationships.get(DOCPROPS, None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
            if names:
                name = names[0]
        if name:
            try:
                raw = self.read(name)
            except KeyError:
                pass
            else:
                read_doc_props(raw, mi)
        if mi.is_null('language'):
            # Language missing from the core properties: try the styles part
            try:
                raw = self.read('word/styles.xml')
            except KeyError:
                pass
            else:
                read_default_style_language(raw, mi)

        name = self.relationships.get(APPPROPS, None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
            if names:
                name = names[0]
        if name:
            try:
                raw = self.read(name)
            except KeyError:
                pass
            else:
                read_app_props(raw, mi)

        return mi

    def close(self):
        """Release the ZIP archive or remove the extraction directory."""
        if hasattr(self, 'zipf'):
            self.zipf.close()
        else:
            try:
                shutil.rmtree(self.tdir)
            except EnvironmentError:
                pass
        self._close_owned_stream()
def _get_epub_toc(self, cid=None, path=None, prepend_title=None):
    '''
    Return the epub TOC indexed by spine section.

    The book is read from the device at ``path``, which may be an unzipped
    epub directory or a zipped epub file. ``cid`` is accepted but not used.

    Returns an ordered mapping of spine index (as a string) to TOC title, or
    None when the zipped epub cannot be read. Sections without a TOC entry
    take the title of the preceding entry. When ``prepend_title`` is given,
    matched titles are formatted as "<prepend_title> · <title>". If parsing
    fails part-way, the error is logged and the partially built mapping is
    returned.
    '''
    toc = None
    fpath = path

    # iBooks stores books unzipped
    # Marvin stores books zipped
    # Need spine, manifest, ncx_tree to construct toc
    st = self.ios.stat(fpath)
    if st and st['st_ifmt'] == 'S_IFDIR':
        # Find the OPF in the unzipped ePub
        fp = '/'.join([fpath, 'META-INF', 'container.xml'])
        cf = io.BytesIO(self.ios.read(fp))
        container = etree.parse(cf)
        opf_file = container.xpath('.//*[local-name()="rootfile"]')[0].get(
            'full-path')
        oebps = opf_file.rpartition('/')[0]

        fp = '/'.join([fpath, opf_file])
        opf = io.BytesIO(self.ios.read(fp))
        opf_tree = etree.parse(opf)
        spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
        ncx_fs = spine.get('toc')
        manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
        ncx_file = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')

        fp = '/'.join([fpath, oebps, ncx_file])
        ncxf = io.BytesIO(self.ios.read(fp))
        ncx_tree = etree.parse(ncxf)
    else:
        # Find the OPF file in the zipped ePub
        zfo = io.BytesIO(self.ios.read(fpath, mode='rb'))
        try:
            zf = ZipFile(zfo, 'r')
            container = etree.fromstring(zf.read('META-INF/container.xml'))
            opf_tree = etree.fromstring(
                zf.read(
                    container.xpath('.//*[local-name()="rootfile"]')
                    [0].get('full-path')))
            spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
            ncx_fs = spine.get('toc')
            manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
            ncx = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')

            # Find the ncx file
            fnames = zf.namelist()
            _ncx = [x for x in fnames if ncx in x][0]
            ncx_tree = etree.fromstring(zf.read(_ncx))
        except Exception:
            import traceback
            self._log_location()
            self._log(" unable to unzip '%s'" % fpath)
            self._log(traceback.format_exc())
            return toc

    # fpath points to epub (zipped or unzipped dir)
    # spine, ncx_tree populated
    try:
        toc = OrderedDict()
        # 1. capture idrefs from spine
        for i, el in enumerate(spine):
            toc[str(i)] = el.get('idref')

        # 2. Resolve <spine> idrefs to <manifest> hrefs
        for el in toc:
            toc[el] = manifest.find('.//*[@id="%s"]' % toc[el]).get('href')

        # 3. Build a dict of src:toc_entry
        src_map = OrderedDict()
        navMap = ncx_tree.xpath('.//*[local-name()="navMap"]')[0]
        for navPoint in navMap:
            # Get the first-level entry (fragment identifiers are dropped)
            src = re.sub(
                r'#.*$', '',
                navPoint.xpath('.//*[local-name()="content"]')[0].get(
                    'src'))
            toc_entry = navPoint.xpath('.//*[local-name()="text"]')[0].text
            src_map[src] = toc_entry

            # Get any nested navPoints
            nested_navPts = navPoint.xpath('.//*[local-name()="navPoint"]')
            for nnp in nested_navPts:
                src = re.sub(
                    r'#.*$', '',
                    nnp.xpath('.//*[local-name()="content"]')[0].get(
                        'src'))
                toc_entry = nnp.xpath('.//*[local-name()="text"]')[0].text
                src_map[src] = toc_entry

        # 4. Resolve src paths to toc_entry
        for section in toc:
            if toc[section] in src_map:
                if prepend_title:
                    toc[section] = "%s · %s" % (
                        prepend_title, src_map[toc[section]])
                else:
                    toc[section] = src_map[toc[section]]
            else:
                toc[section] = None

        # 5. Fill in the gaps
        current_toc_entry = None
        for section in toc:
            if toc[section] is None:
                toc[section] = current_toc_entry
            else:
                current_toc_entry = toc[section]
    except Exception:
        import traceback
        self._log_location()
        self._log("{:~^80}".format(" error parsing '%s' " % fpath))
        self._log(traceback.format_exc())
        self._log("{:~^80}".format(" end traceback "))

    return toc
class DOCX(object):
    """Access the parts of a DOCX package by name.

    With ``extract=True`` the package is unpacked into a persistent
    temporary directory and parts are read from disk; with ``extract=False``
    parts are read directly from the ZIP archive. ``self.namespace`` is
    built from ``self.docx_is_transitional``, which is decided while reading
    the package relationships. Call :meth:`close` when done.
    """

    def __init__(self, path_or_stream, log=None, extract=True):
        """
        :param path_or_stream: a filesystem path, or an object with ``read``.
        :param log: logger; ``default_log`` is used when falsy.
        :param extract: unpack to a temporary directory instead of keeping
            the ZIP archive open.
        :raises InvalidDOCX: if ``[Content_Types].xml`` or ``_rels/.rels``
            is missing.
        """
        self.docx_is_transitional = True
        opened_here = not hasattr(path_or_stream, 'read')
        stream = open(path_or_stream, 'rb') if opened_here else path_or_stream
        # Only a file opened by this constructor is ours to close
        self._owned_stream = stream if opened_here else None
        self.name = getattr(stream, 'name', None) or '<stream>'
        self.log = log or default_log
        if extract:
            try:
                self.extract(stream)
            finally:
                # After extraction all reads go to disk
                self._close_owned_stream()
        else:
            self.init_zipfile(stream)
        self.read_content_types()
        self.read_package_relationships()
        self.namespace = DOCXNamespace(self.docx_is_transitional)

    def _close_owned_stream(self):
        # Close the file opened by __init__, if any (idempotent)
        stream = getattr(self, '_owned_stream', None)
        if stream is not None:
            self._owned_stream = None
            stream.close()

    def init_zipfile(self, stream):
        """Keep the package as an open ZIP archive and index its names."""
        self.zipf = ZipFile(stream)
        self.names = frozenset(self.zipf.namelist())

    def extract(self, stream):
        """Unpack the package into a temporary directory.

        ``self.names`` maps each part name (``/``-separated, relative to the
        directory) to its path on disk.
        """
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except Exception:
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

    def exists(self, name):
        """Return True if the package has a part called ``name``."""
        return name in self.names

    def read(self, name):
        """Return the raw bytes of part ``name``.

        :raises KeyError: if there is no such part.
        """
        if hasattr(self, 'zipf'):
            member = self.zipf.open(name)
            try:
                return member.read()
            finally:
                member.close()
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()

    def read_content_types(self):
        """Parse ``[Content_Types].xml`` into the two content-type maps."""
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
        root = fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')

    def content_type(self, name):
        """Return the content type of ``name``.

        Per-part overrides win, then per-extension defaults, then a guess
        from the file name.
        """
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return guess_type(name)[0]

    def read_package_relationships(self):
        """Parse ``_rels/.rels`` into type->target and target->type maps.

        Also sets ``self.docx_is_transitional`` from the relationship type
        that targets ``word/document.xml``.
        """
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
        root = fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            if target == 'word/document.xml':
                self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ

    @property
    def document_name(self):
        """Name of the main document part.

        Falls back to any part called ``document.xml`` when the package
        relationships do not name one.

        :raises InvalidDOCX: if no main document can be found.
        """
        name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
        if name is None:
            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
            name = names[0]
        return name

    @property
    def document(self):
        """The parsed main document part."""
        return fromstring(self.read(self.document_name))

    @property
    def document_relationships(self):
        """Relationships of the main document, see :meth:`get_relationships`."""
        return self.get_relationships(self.document_name)

    def get_relationships(self, name):
        """Return ``(by_id, by_type)`` target maps for part ``name``.

        Reads ``<dir>/_rels/<file>.rels``; both maps are empty if that part
        is missing. Targets that are neither external nor ``#`` fragments
        are resolved against the directory of ``name``.
        """
        base = '/'.join(name.split('/')[:-1])
        by_id, by_type = {}, {}
        parts = name.split('/')
        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            root = fromstring(raw)
            for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
                target = item.get('Target')
                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
                    target = '/'.join((base, target.lstrip('/')))
                typ = item.get('Type')
                Id = item.get('Id')
                by_id[Id] = by_type[typ] = target
        return by_id, by_type

    def get_document_properties_names(self):
        """Yield the core-properties part name, then the app-properties one.

        Each is taken from the package relationships, falling back to a
        part named ``docprops/core.xml`` / ``docprops/app.xml``
        (case-insensitive); ``None`` is yielded when neither exists.
        """
        name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
            if names:
                name = names[0]
        yield name
        name = self.relationships.get(self.namespace.names['APPPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
            if names:
                name = names[0]
        yield name

    @property
    def metadata(self):
        """Metadata assembled from the core/app property parts and styles."""
        mi = Metadata(_('Unknown'))
        dp_name, ap_name = self.get_document_properties_names()
        if dp_name:
            try:
                raw = self.read(dp_name)
            except KeyError:
                pass
            else:
                read_doc_props(raw, mi, self.namespace.XPath)
        if mi.is_null('language'):
            # Language missing from the core properties: try the styles part
            try:
                raw = self.read('word/styles.xml')
            except KeyError:
                pass
            else:
                read_default_style_language(raw, mi, self.namespace.XPath)

        # ap_name already includes the docprops/app.xml fallback; it was
        # previously overwritten by a relationships-only lookup here.
        if ap_name:
            try:
                raw = self.read(ap_name)
            except KeyError:
                pass
            else:
                read_app_props(raw, mi)

        return mi

    def close(self):
        """Release the ZIP archive or remove the extraction directory."""
        if hasattr(self, 'zipf'):
            self.zipf.close()
        else:
            try:
                shutil.rmtree(self.tdir)
            except EnvironmentError:
                pass
        self._close_owned_stream()
def _get_epub_toc(self, cid=None, path=None, prepend_title=None):
    '''
    Return the epub TOC indexed by spine section.

    The book is read from the device at ``path``, which may be an unzipped
    epub directory or a zipped epub file. ``cid`` is accepted but not used.

    Returns an ordered mapping of spine index (as a string) to TOC title, or
    None when the zipped epub cannot be read. Sections without a TOC entry
    take the title of the preceding entry. When ``prepend_title`` is given,
    matched titles are formatted as "<prepend_title> · <title>". If parsing
    fails part-way, the error is logged and the partially built mapping is
    returned.
    '''
    toc = None
    fpath = path

    # iBooks stores books unzipped
    # Marvin stores books zipped
    # Need spine, manifest, ncx_tree to construct toc
    st = self.ios.stat(fpath)
    if st and st['st_ifmt'] == 'S_IFDIR':
        # Find the OPF in the unzipped ePub
        fp = '/'.join([fpath, 'META-INF', 'container.xml'])
        cf = cStringIO.StringIO(self.ios.read(fp))
        container = etree.parse(cf)
        opf_file = container.xpath('.//*[local-name()="rootfile"]')[0].get('full-path')
        oebps = opf_file.rpartition('/')[0]

        fp = '/'.join([fpath, opf_file])
        opf = cStringIO.StringIO(self.ios.read(fp))
        opf_tree = etree.parse(opf)
        spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
        ncx_fs = spine.get('toc')
        manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
        ncx_file = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')

        fp = '/'.join([fpath, oebps, ncx_file])
        ncxf = cStringIO.StringIO(self.ios.read(fp))
        ncx_tree = etree.parse(ncxf)
    else:
        # Find the OPF file in the zipped ePub
        zfo = cStringIO.StringIO(self.ios.read(fpath, mode='rb'))
        try:
            zf = ZipFile(zfo, 'r')
            container = etree.fromstring(zf.read('META-INF/container.xml'))
            opf_tree = etree.fromstring(zf.read(container.xpath('.//*[local-name()="rootfile"]')[0].get('full-path')))
            spine = opf_tree.xpath('.//*[local-name()="spine"]')[0]
            ncx_fs = spine.get('toc')
            manifest = opf_tree.xpath('.//*[local-name()="manifest"]')[0]
            ncx = manifest.find('.//*[@id="%s"]' % ncx_fs).get('href')

            # Find the ncx file
            fnames = zf.namelist()
            _ncx = [x for x in fnames if ncx in x][0]
            ncx_tree = etree.fromstring(zf.read(_ncx))
        except Exception:
            import traceback
            self._log_location()
            self._log(" unable to unzip '%s'" % fpath)
            self._log(traceback.format_exc())
            return toc

    # fpath points to epub (zipped or unzipped dir)
    # spine, ncx_tree populated
    try:
        toc = OrderedDict()
        # 1. capture idrefs from spine
        for i, el in enumerate(spine):
            toc[str(i)] = el.get('idref')

        # 2. Resolve <spine> idrefs to <manifest> hrefs
        for el in toc:
            toc[el] = manifest.find('.//*[@id="%s"]' % toc[el]).get('href')

        # 3. Build a dict of src:toc_entry
        src_map = OrderedDict()
        navMap = ncx_tree.xpath('.//*[local-name()="navMap"]')[0]
        for navPoint in navMap:
            # Get the first-level entry (fragment identifiers are dropped)
            src = re.sub(r'#.*$', '', navPoint.xpath('.//*[local-name()="content"]')[0].get('src'))
            toc_entry = navPoint.xpath('.//*[local-name()="text"]')[0].text
            src_map[src] = toc_entry

            # Get any nested navPoints
            nested_navPts = navPoint.xpath('.//*[local-name()="navPoint"]')
            for nnp in nested_navPts:
                src = re.sub(r'#.*$', '', nnp.xpath('.//*[local-name()="content"]')[0].get('src'))
                toc_entry = nnp.xpath('.//*[local-name()="text"]')[0].text
                src_map[src] = toc_entry

        # 4. Resolve src paths to toc_entry
        for section in toc:
            if toc[section] in src_map:
                if prepend_title:
                    toc[section] = "%s · %s" % (prepend_title, src_map[toc[section]])
                else:
                    toc[section] = src_map[toc[section]]
            else:
                toc[section] = None

        # 5. Fill in the gaps
        current_toc_entry = None
        for section in toc:
            if toc[section] is None:
                toc[section] = current_toc_entry
            else:
                current_toc_entry = toc[section]
    except Exception:
        import traceback
        self._log_location()
        self._log("{:~^80}".format(" error parsing '%s' " % fpath))
        self._log(traceback.format_exc())
        self._log("{:~^80}".format(" end traceback "))

    return toc