def __init__(self, pathtoepub, log, clone_data=None, tdir=None): if clone_data is not None: super(EpubContainer, self).__init__(None, None, log, clone_data=clone_data) for x in ('pathtoepub', 'obfuscated_fonts'): setattr(self, x, clone_data[x]) return self.pathtoepub = pathtoepub if tdir is None: tdir = PersistentTemporaryDirectory('_epub_container') tdir = os.path.abspath(os.path.realpath(tdir)) self.root = tdir with open(self.pathtoepub, 'rb') as stream: try: zf = ZipFile(stream) zf.extractall(tdir) except: log.exception('EPUB appears to be invalid ZIP file, trying a' ' more forgiving ZIP parser') from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream) try: os.remove(join(tdir, 'mimetype')) except EnvironmentError: pass container_path = join(self.root, 'META-INF', 'container.xml') if not exists(container_path): raise InvalidEpub('No META-INF/container.xml in epub') container = etree.fromstring(open(container_path, 'rb').read()) opf_files = container.xpath( (r'child::ocf:rootfiles/ocf:rootfile' '[@media-type="%s" and @full-path]' % guess_type('a.opf')), namespaces={'ocf': OCF_NS}) if not opf_files: raise InvalidEpub( 'META-INF/container.xml contains no link to OPF file') opf_path = os.path.join( self.root, *(urlunquote(opf_files[0].get('full-path')).split('/'))) if not exists(opf_path): raise InvalidEpub('OPF file does not exist at location pointed to' ' by META-INF/container.xml') super(EpubContainer, self).__init__(tdir, opf_path, log) self.obfuscated_fonts = {} if 'META-INF/encryption.xml' in self.name_path_map: self.process_encryption() self.parsed_cache['META-INF/container.xml'] = container
def __init__(self, pathtoepub, log, clone_data=None, tdir=None): if clone_data is not None: super(EpubContainer, self).__init__(None, None, log, clone_data=clone_data) for x in ('pathtoepub', 'obfuscated_fonts'): setattr(self, x, clone_data[x]) return self.pathtoepub = pathtoepub if tdir is None: tdir = PersistentTemporaryDirectory('_epub_container') tdir = os.path.abspath(os.path.realpath(tdir)) self.root = tdir with open(self.pathtoepub, 'rb') as stream: try: zf = ZipFile(stream) zf.extractall(tdir) except: log.exception('EPUB appears to be invalid ZIP file, trying a' ' more forgiving ZIP parser') from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream) try: os.remove(join(tdir, 'mimetype')) except EnvironmentError: pass container_path = join(self.root, 'META-INF', 'container.xml') if not exists(container_path): raise InvalidEpub('No META-INF/container.xml in epub') container = etree.fromstring(open(container_path, 'rb').read()) opf_files = container.xpath(( r'child::ocf:rootfiles/ocf:rootfile' '[@media-type="%s" and @full-path]'%guess_type('a.opf') ), namespaces={'ocf':OCF_NS} ) if not opf_files: raise InvalidEpub('META-INF/container.xml contains no link to OPF file') opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/'))) if not exists(opf_path): raise InvalidEpub('OPF file does not exist at location pointed to' ' by META-INF/container.xml') super(EpubContainer, self).__init__(tdir, opf_path, log) self.obfuscated_fonts = {} if 'META-INF/encryption.xml' in self.name_path_map: self.process_encryption() self.parsed_cache['META-INF/container.xml'] = container
def extract(self): if self.path.lower().endswith('.zip'): from calibre.utils.zipfile import ZipFile try: with ZipFile(self.path) as zf: zf.extractall(self.tdir) except Exception: prints('Corrupt ZIP file, trying to use local headers') from calibre.utils.localunzip import extractall extractall(self.path, self.tdir) elif self.path.lower().endswith('.rar'): from calibre.utils.unrar import extract extract(self.path, self.tdir) else: raise ValueError('Can only process ZIP or RAR archives')
def extract(source): tdir = tempfile.mkdtemp(suffix='_archive', dir=self.tdir) if source.lower().endswith('.zip'): from calibre.utils.zipfile import ZipFile try: with ZipFile(source) as zf: zf.extractall(tdir) except Exception: prints('Corrupt ZIP file, trying to use local headers') from calibre.utils.localunzip import extractall extractall(source, tdir) elif source.lower().endswith('.rar'): from calibre.utils.unrar import extract extract(source, tdir) return tdir
def extract(self, stream): self.tdir = PersistentTemporaryDirectory('docx_container') try: zf = ZipFile(stream) zf.extractall(self.tdir) except: self.log.exception('DOCX appears to be invalid ZIP file, trying a' ' more forgiving ZIP parser') from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream, self.tdir) self.names = {} for f in walk(self.tdir): name = os.path.relpath(f, self.tdir).replace(os.sep, '/') self.names[name] = f
def __init__(self, pathtoepub, log, clone_data=None, tdir=None): if clone_data is not None: super(EpubContainer, self).__init__(None, None, log, clone_data=clone_data) for x in ("pathtoepub", "obfuscated_fonts"): setattr(self, x, clone_data[x]) return self.pathtoepub = pathtoepub if tdir is None: tdir = PersistentTemporaryDirectory("_epub_container") tdir = os.path.abspath(os.path.realpath(tdir)) self.root = tdir with open(self.pathtoepub, "rb") as stream: try: zf = ZipFile(stream) zf.extractall(tdir) except: log.exception("EPUB appears to be invalid ZIP file, trying a" " more forgiving ZIP parser") from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream) try: os.remove(join(tdir, "mimetype")) except EnvironmentError: pass container_path = join(self.root, "META-INF", "container.xml") if not exists(container_path): raise InvalidEpub("No META-INF/container.xml in epub") container = etree.fromstring(open(container_path, "rb").read()) opf_files = container.xpath( (r"child::ocf:rootfiles/ocf:rootfile" '[@media-type="%s" and @full-path]' % guess_type("a.opf")), namespaces={"ocf": OCF_NS}, ) if not opf_files: raise InvalidEpub("META-INF/container.xml contains no link to OPF file") opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get("full-path")).split("/"))) if not exists(opf_path): raise InvalidEpub("OPF file does not exist at location pointed to" " by META-INF/container.xml") super(EpubContainer, self).__init__(tdir, opf_path, log) self.obfuscated_fonts = {} if "META-INF/encryption.xml" in self.name_path_map: self.process_encryption() self.parsed_cache["META-INF/container.xml"] = container
def download_and_extract(self, uri): ''' Downloads a zip file and extracts it ''' tdir = tempfile.mkdtemp(suffix='_archive', dir=self.tdir) response = self._get({}, uri) the_zip = response.read() io = StringIO.StringIO() io.write(the_zip) # now extract from calibre.utils.zipfile import ZipFile try: with ZipFile(io) as zf: zf.extractall(tdir) except Exception: print('Corrupt ZIP file, trying to use local headers') from calibre.utils.localunzip import extractall extractall(io, tdir) return tdir
def convert(self, stream, options, file_ext, log, accelerators): """Convert a KePub file into a structure calibre can process.""" log("KEPUBInput::convert - start") from calibre.utils.zipfile import ZipFile from calibre import walk from calibre.ebooks import DRMError from calibre.ebooks.metadata.opf2 import OPF try: zf = ZipFile(stream) cwd = os.getcwdu() if sys.version_info.major == 2 else os.getcwd() zf.extractall(cwd) except Exception: log.exception("KEPUB appears to be invalid ZIP file, trying a " "more forgiving ZIP parser") from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream) opf = self.find_opf() if opf is None: for f in walk("."): if (f.lower().endswith(".opf") and "__MACOSX" not in f and not os.path.basename(f).startswith(".")): opf = os.path.abspath(f) break path = getattr(stream, "name", "stream") if opf is None: raise ValueError( _( # noqa: F821 "{0} is not a valid KEPUB file (could not find opf)"). format(path)) encfile = os.path.abspath("rights.xml") if os.path.exists(encfile): raise DRMError(os.path.basename(path)) cwd = os.getcwdu() if sys.version_info.major == 2 else os.getcwd() opf = os.path.relpath(opf, cwd) parts = os.path.split(opf) opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) self.encrypted_fonts = [] if len(parts) > 1 and parts[0]: delta = "/".join(parts[:-1]) + "/" for elem in opf.itermanifest(): elem.set("href", delta + elem.get("href")) for elem in opf.iterguide(): elem.set("href", delta + elem.get("href")) f = (self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2) self.removed_cover = f(opf, log) self.optimize_opf_parsing = opf for x in opf.itermanifest(): if x.get("media-type", "") == "application/x-dtbook+xml": raise ValueError( _("EPUB files with DTBook markup are not supported" ) # noqa: F821 ) not_for_spine = set() for y in opf.itermanifest(): id_ = y.get("id", None) if id_ and y.get("media-type", None) in { "application/vnd.adobe-page-template+xml", "application/vnd.adobe.page-template+xml", "application/adobe-page-template+xml", "application/adobe.page-template+xml", "application/text", }: not_for_spine.add(id_) seen = set() for x in list(opf.iterspine()): ref = x.get("idref", None) if not ref or ref in not_for_spine or ref in seen: x.getparent().remove(x) continue seen.add(ref) if len(list(opf.iterspine())) == 0: raise ValueError( _("No valid entries in the spine of this EPUB") # noqa: F821 ) with open("content.opf", "wb") as nopf: nopf.write(opf.render()) return os.path.abspath("content.opf")
def convert(self, stream, options, file_ext, log, accelerators): from calibre.utils.zipfile import ZipFile from calibre import walk from calibre.ebooks import DRMError from calibre.ebooks.metadata.opf2 import OPF try: zf = ZipFile(stream) zf.extractall(os.getcwdu()) except: log.exception('EPUB appears to be invalid ZIP file, trying a' ' more forgiving ZIP parser') from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream) encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) opf = self.find_opf() if opf is None: for f in walk(u'.'): if f.lower().endswith('.opf') and '__MACOSX' not in f and \ not os.path.basename(f).startswith('.'): opf = os.path.abspath(f) break path = getattr(stream, 'name', 'stream') if opf is None: raise ValueError( '%s is not a valid EPUB file (could not find opf)' % path) opf = os.path.relpath(opf, os.getcwdu()) parts = os.path.split(opf) opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) self._encrypted_font_uris = [] if os.path.exists(encfile): if not self.process_encryption(encfile, opf, log): raise DRMError(os.path.basename(path)) self.encrypted_fonts = self._encrypted_font_uris if len(parts) > 1 and parts[0]: delta = '/'.join(parts[:-1]) + '/' def normpath(x): return posixpath.normpath(delta + elem.get('href')) for elem in opf.itermanifest(): elem.set('href', normpath(elem.get('href'))) for elem in opf.iterguide(): elem.set('href', normpath(elem.get('href'))) f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2 self.removed_cover = f(opf, log) if self.removed_cover: self.removed_items_to_ignore = (self.removed_cover, ) epub3_nav = opf.epub3_nav if epub3_nav is not None: self.convert_epub3_nav(epub3_nav, opf, log, options) for x in opf.itermanifest(): if x.get('media-type', '') == 'application/x-dtbook+xml': raise ValueError( 'EPUB files with DTBook markup are not supported') not_for_spine = set() for y in opf.itermanifest(): id_ = y.get('id', None) if id_: mt = y.get('media-type', None) if mt in { 'application/vnd.adobe-page-template+xml', 'application/vnd.adobe.page-template+xml', 'application/adobe-page-template+xml', 'application/adobe.page-template+xml', 'application/text' }: not_for_spine.add(id_) ext = y.get('href', '').rpartition('.')[-1].lower() if mt == 'text/plain' and ext in {'otf', 'ttf'}: # some epub authoring software sets font mime types to # text/plain not_for_spine.add(id_) y.set('media-type', 'application/font') seen = set() for x in list(opf.iterspine()): ref = x.get('idref', None) if not ref or ref in not_for_spine or ref in seen: x.getparent().remove(x) continue seen.add(ref) if len(list(opf.iterspine())) == 0: raise ValueError('No valid entries in the spine of this EPUB') with lopen('content.opf', 'wb') as nopf: nopf.write(opf.render()) return os.path.abspath(u'content.opf')
def convert(self, stream, options, file_ext, log, accelerators): from calibre.utils.zipfile import ZipFile from calibre import walk from calibre.ebooks import DRMError from calibre.ebooks.metadata.opf2 import OPF try: zf = ZipFile(stream) zf.extractall(os.getcwdu()) except: log.exception('EPUB appears to be invalid ZIP file, trying a' ' more forgiving ZIP parser') from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream) encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml')) opf = self.find_opf() if opf is None: for f in walk(u'.'): if f.lower().endswith('.opf') and '__MACOSX' not in f and \ not os.path.basename(f).startswith('.'): opf = os.path.abspath(f) break path = getattr(stream, 'name', 'stream') if opf is None: raise ValueError('%s is not a valid EPUB file (could not find opf)'%path) opf = os.path.relpath(opf, os.getcwdu()) parts = os.path.split(opf) opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) self._encrypted_font_uris = [] if os.path.exists(encfile): if not self.process_encryption(encfile, opf, log): raise DRMError(os.path.basename(path)) self.encrypted_fonts = self._encrypted_font_uris epub3_nav = opf.epub3_nav if epub3_nav is not None: self.convert_epub3_nav(epub3_nav, opf, log) if len(parts) > 1 and parts[0]: delta = '/'.join(parts[:-1])+'/' for elem in opf.itermanifest(): elem.set('href', delta+elem.get('href')) for elem in opf.iterguide(): elem.set('href', delta+elem.get('href')) f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2 self.removed_cover = f(opf, log) for x in opf.itermanifest(): if x.get('media-type', '') == 'application/x-dtbook+xml': raise ValueError( 'EPUB files with DTBook markup are not supported') not_for_spine = set() for y in opf.itermanifest(): id_ = y.get('id', None) if id_ and y.get('media-type', None) in { 'application/vnd.adobe-page-template+xml', 'application/vnd.adobe.page-template+xml', 'application/adobe-page-template+xml', 'application/adobe.page-template+xml', 'application/text'}: not_for_spine.add(id_) seen = set() for x in list(opf.iterspine()): ref = x.get('idref', None) if not ref or ref in not_for_spine or ref in seen: x.getparent().remove(x) continue seen.add(ref) if len(list(opf.iterspine())) == 0: raise ValueError('No valid entries in the spine of this EPUB') with lopen('content.opf', 'wb') as nopf: nopf.write(opf.render()) return os.path.abspath(u'content.opf')
def convert(self, stream, options, file_ext, log, accelerators): from calibre.utils.zipfile import ZipFile from calibre import walk from calibre.ebooks import DRMError from calibre.ebooks.metadata.opf2 import OPF try: zf = ZipFile(stream) zf.extractall(os.getcwdu()) except: log.exception("EPUB appears to be invalid ZIP file, trying a" " more forgiving ZIP parser") from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream) encfile = os.path.abspath(os.path.join("META-INF", "encryption.xml")) opf = self.find_opf() if opf is None: for f in walk(u"."): if f.lower().endswith(".opf") and "__MACOSX" not in f and not os.path.basename(f).startswith("."): opf = os.path.abspath(f) break path = getattr(stream, "name", "stream") if opf is None: raise ValueError("%s is not a valid EPUB file (could not find opf)" % path) opf = os.path.relpath(opf, os.getcwdu()) parts = os.path.split(opf) opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) self._encrypted_font_uris = [] if os.path.exists(encfile): if not self.process_encryption(encfile, opf, log): raise DRMError(os.path.basename(path)) self.encrypted_fonts = self._encrypted_font_uris if len(parts) > 1 and parts[0]: delta = "/".join(parts[:-1]) + "/" for elem in opf.itermanifest(): elem.set("href", delta + elem.get("href")) for elem in opf.iterguide(): elem.set("href", delta + elem.get("href")) self.removed_cover = self.rationalize_cover(opf, log) self.optimize_opf_parsing = opf for x in opf.itermanifest(): if x.get("media-type", "") == "application/x-dtbook+xml": raise ValueError("EPUB files with DTBook markup are not supported") not_for_spine = set() for y in opf.itermanifest(): id_ = y.get("id", None) if id_ and y.get("media-type", None) in ("application/vnd.adobe-page-template+xml", "application/text"): not_for_spine.add(id_) seen = set() for x in list(opf.iterspine()): ref = x.get("idref", None) if not ref or ref in not_for_spine or ref in seen: x.getparent().remove(x) continue seen.add(ref) if len(list(opf.iterspine())) == 0: raise ValueError("No valid entries in the spine of this EPUB") with open("content.opf", "wb") as nopf: nopf.write(opf.render()) return os.path.abspath(u"content.opf")
def convert(self, stream, options, file_ext, log, accelerators): log("KEPUBInput::convert - start") from calibre.utils.zipfile import ZipFile from calibre import walk from calibre.ebooks import DRMError from calibre.ebooks.metadata.opf2 import OPF try: zf = ZipFile(stream) zf.extractall(os.getcwdu()) except: log.exception('KEPUB appears to be invalid ZIP file, trying a ' 'more forgiving ZIP parser') from calibre.utils.localunzip import extractall stream.seek(0) extractall(stream) opf = self.find_opf() if opf is None: for f in walk(u'.'): if f.lower().endswith('.opf') and '__MACOSX' not in f and \ not os.path.basename(f).startswith('.'): opf = os.path.abspath(f) break path = getattr(stream, 'name', 'stream') if opf is None: raise ValueError( _('%s is not a valid KEPUB file (could not find opf)') % path) encfile = os.path.abspath('rights.xml') if os.path.exists(encfile): raise DRMError(os.path.basename(path)) opf = os.path.relpath(opf, os.getcwdu()) parts = os.path.split(opf) opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) self.encrypted_fonts = [] if len(parts) > 1 and parts[0]: delta = '/'.join(parts[:-1]) + '/' for elem in opf.itermanifest(): elem.set('href', delta + elem.get('href')) for elem in opf.iterguide(): elem.set('href', delta + elem.get('href')) f = self.rationalize_cover3 if opf.package_version >= 3.0 else \ self.rationalize_cover2 self.removed_cover = f(opf, log) self.optimize_opf_parsing = opf for x in opf.itermanifest(): if x.get('media-type', '') == 'application/x-dtbook+xml': raise ValueError( _('EPUB files with DTBook markup are not supported')) not_for_spine = set() for y in opf.itermanifest(): id_ = y.get('id', None) if id_ and y.get('media-type', None) in { 'application/vnd.adobe-page-template+xml', 'application/vnd.adobe.page-template+xml', 'application/adobe-page-template+xml', 'application/adobe.page-template+xml', 'application/text' }: not_for_spine.add(id_) seen = set() for x in list(opf.iterspine()): ref = x.get('idref', None) if not ref or ref in not_for_spine or ref in seen: x.getparent().remove(x) continue seen.add(ref) if len(list(opf.iterspine())) == 0: raise ValueError(_('No valid entries in the spine of this EPUB')) with open('content.opf', 'wb') as nopf: nopf.write(opf.render()) return os.path.abspath(u'content.opf')