def removeCDPwatermark(object, path_to_ebook):
    """Strip the "META-INF/cdp.info" watermark file from an ePub.

    "META-INF/cdp.info" is a watermark file used by some Tolino vendors.
    We don't want that in our eBooks, so lets remove that file.

    object:        plugin instance providing temporary_file(ext) for the output.
    path_to_ebook: path to the (possibly watermarked) ePub.

    Returns the path of the cleaned copy, or the original path when no
    watermark is present or anything goes wrong (best-effort behavior).
    """
    try:
        # Close both the raw handle and the ZipFile when done (the previous
        # version leaked the input file handle).
        with open(path_to_ebook, 'rb') as fh, closing(ZipFile(fh)) as infile:
            namelist = infile.namelist()
            if 'META-INF/cdp.info' not in namelist:
                # No watermark present - return the book untouched.
                return path_to_ebook

            # "mimetype" must be the first (and uncompressed) entry of an ePub,
            # so pull it out of the list and re-add it at the front.  Guard
            # against malformed books that lack a mimetype entry entirely
            # (previously this raised ValueError and aborted the cleanup).
            has_mimetype = "mimetype" in namelist
            if has_mimetype:
                namelist.remove("mimetype")
            namelist.remove("META-INF/cdp.info")

            output = object.temporary_file(".epub").name
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with open(output, 'wb') as outfh, \
                    closing(ZipFile(outfh, 'w', **kwds)) as outf:
                first = ["mimetype"] if has_mimetype else []
                for path in (first + namelist):
                    data = infile.read(path)
                    zi = ZipInfo(path)
                    oldzi = infile.getinfo(path)
                    try:
                        zi.compress_type = oldzi.compress_type
                        if path == "mimetype":
                            # ePub OCF requires mimetype to be stored, not deflated.
                            zi.compress_type = ZIP_STORED
                        # Preserve the original entry's metadata.
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        # External attributes depend on the create system, so copy both.
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                        if any(ord(c) >= 128 for c in path) or any(
                                ord(c) >= 128 for c in zi.comment):
                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                            zi.flag_bits |= 0x800
                    except Exception:
                        # Metadata is nice-to-have; never fail the copy over it.
                        pass
                    outf.writestr(zi, data)

        print("Watermark: Successfully removed cdp.info watermark")
        return output
    except Exception:
        # Best effort: on any error return the book unmodified.
        traceback.print_exc()
        return path_to_ebook
def test_parseextra(self):
    """parse_extra must split a raw extra blob into {header_id: payload}."""
    # Build a single 0x5455 ("UT" extended-timestamp) extra field:
    # header id, payload length (5), then a flags byte and a 32-bit time.
    timestamp = 978307200
    expected_payload = struct.pack('<Bl', 1, timestamp)
    zip_entry = ZipInfo("foo")
    zip_entry.extra = struct.pack('<HHBl', 0x5455, 5, 1, timestamp)
    parsed = Archive.parse_extra(zip_entry)
    assert_true(0x5455 in parsed)
    assert_equal(parsed[0x5455], expected_payload)
def decryptBook(userkey, inpath, outpath):
    """Decrypt an Adobe Adept ePub (Python 2 code path).

    userkey: RSA private key material accepted by the RSA() wrapper.
    inpath:  path of the encrypted ePub; outpath: path for the decrypted copy.
    Returns 0 on success, 1 when the book is DRM-free or not Adept, 2 on
    decryption failure.
    """
    if AES is None:
        raise ADEPTError(u"PyCrypto or OpenSSL must be installed.")
    rsa = RSA(userkey)
    with closing(ZipFile(open(inpath, 'rb'))) as inf:
        namelist = set(inf.namelist())
        if 'META-INF/rights.xml' not in namelist or \
           'META-INF/encryption.xml' not in namelist:
            print u"{0:s} is DRM-free.".format(os.path.basename(inpath))
            return 1
        # Drop the DRM bookkeeping entries (META_NAMES) from the copy list.
        for name in META_NAMES:
            namelist.remove(name)
        try:
            # The base64 session key lives in rights.xml under adept:encryptedKey.
            rights = etree.fromstring(inf.read('META-INF/rights.xml'))
            adept = lambda tag: '{%s}%s' % (NSMAP['adept'], tag)
            expr = './/%s' % (adept('encryptedKey'),)
            bookkey = ''.join(rights.findtext(expr))
            # 172 base64 chars == 128 raw bytes, i.e. a 1024-bit RSA block.
            if len(bookkey) != 172:
                print u"{0:s} is not a secure Adobe Adept ePub.".format(os.path.basename(inpath))
                return 1
            bookkey = rsa.decrypt(bookkey.decode('base64'))
            # Padded as per RSAES-PKCS1-v1_5: a valid block has a 0x00
            # separator just before the 16-byte AES key at the end.
            if bookkey[-17] != '\x00':
                print u"Could not decrypt {0:s}. Wrong key".format(os.path.basename(inpath))
                return 2
            encryption = inf.read('META-INF/encryption.xml')
            decryptor = Decryptor(bookkey[-16:], encryption)
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with closing(ZipFile(open(outpath, 'wb'), 'w', **kwds)) as outf:
                # mimetype must be the first, uncompressed entry of an ePub.
                zi = ZipInfo('mimetype')
                zi.compress_type=ZIP_STORED
                try:
                    # if the mimetype is present, get its info, including time-stamp
                    oldzi = inf.getinfo('mimetype')
                    # copy across fields to be preserved
                    zi.date_time = oldzi.date_time
                    zi.comment = oldzi.comment
                    zi.extra = oldzi.extra
                    zi.internal_attr = oldzi.internal_attr
                    # external attributes are dependent on the create system, so copy both.
                    zi.external_attr = oldzi.external_attr
                    zi.create_system = oldzi.create_system
                except:
                    pass
                outf.writestr(zi, inf.read('mimetype'))
                for path in namelist:
                    data = inf.read(path)
                    zi = ZipInfo(path)
                    zi.compress_type=ZIP_DEFLATED
                    try:
                        # get the file info, including time-stamp
                        oldzi = inf.getinfo(path)
                        # copy across useful fields
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        # external attributes are dependent on the create system, so copy both.
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                    except:
                        pass
                    outf.writestr(zi, decryptor.decrypt(path, data))
        except:
            print u"Could not decrypt {0:s} because of an exception:\n{1:s}".format(os.path.basename(inpath), traceback.format_exc())
            return 2
    return 0
def decryptFontsBook(inpath, outpath):
    """De-obfuscate embedded fonts in an ePub (IETF and Adobe algorithms).

    Derives the two possible obfuscation keys from the book's own OPF
    metadata, then rewrites the ePub with fonts deobfuscated by the
    project's Decryptor helper.  Returns 0 on success, 1 when no fonts /
    no OPF can be handled, 2 on error.
    """
    with closing(ZipFile(open(inpath, 'rb'))) as inf:
        namelist = inf.namelist()
        if 'META-INF/encryption.xml' not in namelist:
            # Nothing is obfuscated, nothing to do.
            return 1

        # Font key handling:
        font_master_key = None              # IETF key (SHA-1 of the unique identifier)
        adobe_master_encryption_key = None  # Adobe key (raw UUID bytes)

        contNS = lambda tag: '{%s}%s' % ('urn:oasis:names:tc:opendocument:xmlns:container', tag)
        path = None

        try:
            # Locate the content OPF through META-INF/container.xml.
            container = etree.fromstring(inf.read("META-INF/container.xml"))
            rootfiles = container.find(contNS("rootfiles")).findall(contNS("rootfile"))
            for rootfile in rootfiles:
                path = rootfile.get("full-path", None)
                if (path is not None):
                    break
        except:
            pass

        # If path is None, we didn't find an OPF, so we probably don't have a font key.
        # If path is set, it's the path to the main content OPF file.
        if (path is None):
            print("FontDecrypt: No OPF for font obfuscation found")
            return 1
        else:
            packageNS = lambda tag: '{%s}%s' % ('http://www.idpf.org/2007/opf', tag)
            metadataDCNS = lambda tag: '{%s}%s' % ('http://purl.org/dc/elements/1.1/', tag)

            try:
                container = etree.fromstring(inf.read(path))
            except:
                container = []

            ## IETF font key algorithm:
            # Key is SHA-1 of the package's unique identifier (whitespace removed).
            print("FontDecrypt: Checking {0} for IETF font obfuscation keys ... ".format(path), end='')
            secret_key_name = None
            try:
                secret_key_name = container.get("unique-identifier")
            except:
                pass

            try:
                identify_element = container.find(packageNS("metadata")).find(metadataDCNS("identifier"))
                if (secret_key_name is None or secret_key_name == identify_element.get("id")):
                    font_master_key = identify_element.text
            except:
                pass

            if (font_master_key is not None):
                if (secret_key_name is None):
                    print("found '%s'" % (font_master_key))
                else:
                    print("found '%s' (%s)" % (font_master_key, secret_key_name))

                # Trim / remove forbidden characters from the key, then hash it:
                font_master_key = font_master_key.replace(' ', '')
                font_master_key = font_master_key.replace('\t', '')
                font_master_key = font_master_key.replace('\r', '')
                font_master_key = font_master_key.replace('\n', '')
                font_master_key = font_master_key.encode('utf-8')
                font_master_key = hashlib.sha1(font_master_key).digest()
            else:
                print("not found")

            ## Adobe font key algorithm
            # Key is the book's UUID identifier with dashes/whitespace removed,
            # hex-decoded to 16 raw bytes.
            print("FontDecrypt: Checking {0} for Adobe font obfuscation keys ... ".format(path), end='')
            try:
                metadata = container.find(packageNS("metadata"))
                identifiers = metadata.findall(metadataDCNS("identifier"))
                uid = None
                uidMalformed = False

                for identifier in identifiers:
                    if identifier.get(packageNS("scheme")) == "UUID":
                        if identifier.text[:9] == "urn:uuid:":
                            uid = identifier.text[9:]
                        else:
                            uid = identifier.text
                        break
                    # Fallback: any identifier carrying a urn:uuid prefix.
                    if identifier.text[:9] == "urn:uuid:":
                        uid = identifier.text[9:]
                        break

                if uid is not None:
                    uid = uid.replace(chr(0x20), '').replace(chr(0x09), '')
                    uid = uid.replace(chr(0x0D), '').replace(chr(0x0A), '').replace('-', '')

                    if len(uid) < 16:
                        uidMalformed = True
                    if not all(c in "0123456789abcdefABCDEF" for c in uid):
                        uidMalformed = True

                    if not uidMalformed:
                        print("found '{0}'".format(uid))
                        # Duplicate so short UUID strings still yield 32 hex chars.
                        uid = uid + uid
                        adobe_master_encryption_key = binascii.unhexlify(uid[:32])

                if adobe_master_encryption_key is None:
                    print("not found")
            except:
                print("exception")
                pass

        # Begin decrypting.
        try:
            encryption = inf.read('META-INF/encryption.xml')
            decryptor = Decryptor(font_master_key, adobe_master_encryption_key, encryption)
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with closing(ZipFile(open(outpath, 'wb'), 'w', **kwds)) as outf:
                # Mimetype needs to be the first entry, so remove it from the list
                # whereever it is, then add it at the beginning.
                namelist.remove("mimetype")
                for path in (["mimetype"] + namelist):
                    data = inf.read(path)
                    zi = ZipInfo(path)
                    zi.compress_type = ZIP_DEFLATED
                    if path == "mimetype":
                        # mimetype must not be compressed
                        zi.compress_type = ZIP_STORED
                    elif path == "META-INF/encryption.xml":
                        # Check if there's still other entries not related to fonts
                        if (decryptor.check_if_remaining()):
                            data = decryptor.get_xml()
                            print("FontDecrypt: There's remaining entries in encryption.xml, adding file ...")
                        else:
                            # No remaining entries, no need for that file.
                            continue
                    try:
                        # get the file info, including time-stamp
                        oldzi = inf.getinfo(path)
                        # copy across useful fields
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        # external attributes are dependent on the create system, so copy both.
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                        if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                            zi.flag_bits |= 0x800
                    except:
                        pass
                    if path == "mimetype":
                        outf.writestr(zi, inf.read('mimetype'))
                    elif path == "META-INF/encryption.xml":
                        outf.writestr(zi, data)
                    else:
                        outf.writestr(zi, decryptor.decrypt(path, data))
        except:
            print("FontDecrypt: Could not decrypt fonts in {0:s} because of an exception:\n{1:s}".format(os.path.basename(inpath), traceback.format_exc()))
            traceback.print_exc()
            return 2
    return 0
def decryptBook(keyb64, inpath, outpath):
    """Decrypt a Barnes & Noble (Ignoble) ePub (Python 2 code path).

    keyb64:  base64-encoded B&N user key; only the first 16 decoded bytes
             are used as the AES key.
    Returns 0 on success, 1 when the book is DRM-free or not B&N, 2 on error.
    """
    if AES is None:
        raise IGNOBLEError(u"PyCrypto or OpenSSL must be installed.")
    key = keyb64.decode("base64")[:16]
    aes = AES(key)
    with closing(ZipFile(open(inpath, "rb"))) as inf:
        namelist = set(inf.namelist())
        if "META-INF/rights.xml" not in namelist or "META-INF/encryption.xml" not in namelist:
            print u"{0:s} is DRM-free.".format(os.path.basename(inpath))
            return 1
        # Drop the DRM bookkeeping entries (META_NAMES) from the copy list.
        for name in META_NAMES:
            namelist.remove(name)
        try:
            # The encrypted session key lives in rights.xml (adept namespace).
            rights = etree.fromstring(inf.read("META-INF/rights.xml"))
            adept = lambda tag: "{%s}%s" % (NSMAP["adept"], tag)
            expr = ".//%s" % (adept("encryptedKey"),)
            bookkey = "".join(rights.findtext(expr))
            # B&N keys are 64 base64 chars long.
            if len(bookkey) != 64:
                print u"{0:s} is not a secure Barnes & Noble ePub.".format(os.path.basename(inpath))
                return 1
            bookkey = aes.decrypt(bookkey.decode("base64"))
            # Strip PKCS#7-style padding (last byte is the pad length).
            bookkey = bookkey[: -ord(bookkey[-1])]
            encryption = inf.read("META-INF/encryption.xml")
            decryptor = Decryptor(bookkey[-16:], encryption)
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with closing(ZipFile(open(outpath, "wb"), "w", **kwds)) as outf:
                # mimetype must be the first, uncompressed entry of an ePub.
                zi = ZipInfo("mimetype")
                zi.compress_type = ZIP_STORED
                try:
                    # if the mimetype is present, get its info, including time-stamp
                    oldzi = inf.getinfo("mimetype")
                    # copy across fields to be preserved
                    zi.date_time = oldzi.date_time
                    zi.comment = oldzi.comment
                    zi.extra = oldzi.extra
                    zi.internal_attr = oldzi.internal_attr
                    # external attributes are dependent on the create system, so copy both.
                    zi.external_attr = oldzi.external_attr
                    zi.create_system = oldzi.create_system
                except:
                    pass
                outf.writestr(zi, inf.read("mimetype"))
                for path in namelist:
                    data = inf.read(path)
                    zi = ZipInfo(path)
                    zi.compress_type = ZIP_DEFLATED
                    try:
                        # get the file info, including time-stamp
                        oldzi = inf.getinfo(path)
                        # copy across useful fields
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        # external attributes are dependent on the create system, so copy both.
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                    except:
                        pass
                    outf.writestr(zi, decryptor.decrypt(path, data))
        except:
            print u"Could not decrypt {0:s} because of an exception:\n{1:s}".format(
                os.path.basename(inpath), traceback.format_exc()
            )
            return 2
    return 0
def decryptBook(userkey, inpath, outpath):
    """Decrypt an Adobe Adept or PassHash (B&N) ePub using PyCryptodome.

    userkey: DER RSA private key (Adept) or base64 key (PassHash); the key
             kind is inferred from the length of the stored book key.
    Returns 0 on success, 1 for DRM-free / non-Adobe books, 2 on failure.
    """
    with closing(ZipFile(open(inpath, 'rb'))) as inf:
        namelist = inf.namelist()
        if 'META-INF/rights.xml' not in namelist or \
           'META-INF/encryption.xml' not in namelist:
            print("{0:s} is DRM-free.".format(os.path.basename(inpath)))
            return 1
        # Drop the DRM bookkeeping entries (META_NAMES) from the copy list.
        for name in META_NAMES:
            namelist.remove(name)
        try:
            rights = etree.fromstring(inf.read('META-INF/rights.xml'))
            adept = lambda tag: '{%s}%s' % (NSMAP['adept'], tag)
            expr = './/%s' % (adept('encryptedKey'), )
            bookkeyelem = rights.find(expr)
            bookkey = bookkeyelem.text
            # keyType > 2 marks Adobe's "hardened" key wrapping.
            keytype = bookkeyelem.attrib.get('keyType', '0')
            if len(bookkey) >= 172 and int(keytype, 10) > 2:
                print("{0:s} is a secure Adobe Adept ePub with hardening.".format(os.path.basename(inpath)))
            elif len(bookkey) == 172:
                print("{0:s} is a secure Adobe Adept ePub.".format(os.path.basename(inpath)))
            elif len(bookkey) == 64:
                print("{0:s} is a secure Adobe PassHash (B&N) ePub.".format(os.path.basename(inpath)))
            else:
                print("{0:s} is not an Adobe-protected ePub!".format(os.path.basename(inpath)))
                return 1
            if len(bookkey) != 64:
                # Normal or "hardened" Adobe ADEPT
                rsakey = RSA.import_key(userkey)  # parses the ASN1 structure
                bookkey = base64.b64decode(bookkey)
                if int(keytype, 10) > 2:
                    # Undo the extra key-hardening layer first.
                    bookkey = removeHardening(rights, keytype, bookkey)
                try:
                    bookkey = PKCS1_v1_5.new(rsakey).decrypt(bookkey, None)  # automatically unpads
                except ValueError:
                    bookkey = None

                if bookkey is None:
                    print("Could not decrypt {0:s}. Wrong key".format(os.path.basename(inpath)))
                    return 2
            else:
                # Adobe PassHash / B&N
                key = base64.b64decode(userkey)[:16]
                bookkey = base64.b64decode(bookkey)
                bookkey = unpad(AES.new(key, AES.MODE_CBC, b'\x00' * 16).decrypt(bookkey), 16)  # PKCS#7

                if len(bookkey) > 16:
                    # Keep only the trailing 16 bytes (the AES book key).
                    bookkey = bookkey[-16:]

            encryption = inf.read('META-INF/encryption.xml')
            decryptor = Decryptor(bookkey, encryption)
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with closing(ZipFile(open(outpath, 'wb'), 'w', **kwds)) as outf:
                for path in (["mimetype"] + namelist):
                    data = inf.read(path)
                    zi = ZipInfo(path)
                    zi.compress_type = ZIP_DEFLATED

                    if path == "mimetype":
                        # mimetype must be first and uncompressed (ePub OCF).
                        zi.compress_type = ZIP_STORED
                    elif path == "META-INF/encryption.xml":
                        # Check if there's still something in there
                        if (decryptor.check_if_remaining()):
                            data = decryptor.get_xml()
                            print("Adding encryption.xml for the remaining embedded files.")
                            # We removed DRM, but there's still stuff like obfuscated fonts.
                        else:
                            continue

                    try:
                        # get the file info, including time-stamp
                        oldzi = inf.getinfo(path)
                        # copy across useful fields
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        # external attributes are dependent on the create system, so copy both.
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                        if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                            zi.flag_bits |= 0x800
                    except:
                        pass

                    if path == "META-INF/encryption.xml":
                        outf.writestr(zi, data)
                    else:
                        outf.writestr(zi, decryptor.decrypt(path, data))
        except:
            print("Could not decrypt {0:s} because of an exception:\n{1:s}".format(os.path.basename(inpath), traceback.format_exc()))
            return 2
    return 0
def removeOPFwatermarks(object, path_to_ebook):
    """Strip Amazon/DuMont and eLibri/LemonInk watermarks from the OPF.

    object:        plugin instance providing temporary_file(ext).
    path_to_ebook: path of the ePub to clean.
    Returns the path of the cleaned copy, or the original path when no
    watermark was found or an error occurred (best-effort).

    NOTE(review): `inf` (the input ZipFile and its raw file handle) is
    never explicitly closed on any path — relies on GC; confirm intended.
    """
    contNS = lambda tag: '{%s}%s' % ('urn:oasis:names:tc:opendocument:xmlns:container', tag)
    opf_path = None

    try:
        inf = ZipFile(open(path_to_ebook, 'rb'))
        # Locate the content OPF via META-INF/container.xml.
        container = etree.fromstring(inf.read("META-INF/container.xml"))
        rootfiles = container.find(contNS("rootfiles")).findall(contNS("rootfile"))
        for rootfile in rootfiles:
            opf_path = rootfile.get("full-path", None)
            if (opf_path is not None):
                break
    except:
        traceback.print_exc()
        return path_to_ebook

    # If path is None, we didn't find an OPF, so we probably don't have a font key.
    # If path is set, it's the path to the main content OPF file.
    if (opf_path is None):
        # No OPF found - no watermark
        return path_to_ebook
    else:
        try:
            container_str = inf.read(opf_path).decode("utf-8")
            container_str_new = container_str

            had_amazon = False
            had_elibri = False

            # Remove Amazon hex watermarks
            # Match optional newline at the beginning, then spaces, then a "meta" tag with name = "Watermark" or "Watermark_(hex)" and a "content" element.
            # This regex also matches DuMont watermarks with meta name="watermark", with the case-insensitive match on the "w" in watermark.
            pre_remove = container_str_new
            container_str_new = re.sub(r'((\r\n|\r|\n)\s*)?\<meta\s+name=\"[Ww]atermark(_\(hex\))?\"\s+content=\"[0-9a-fA-F]+\"\s*\/>', '', container_str_new)
            container_str_new = re.sub(r'((\r\n|\r|\n)\s*)?\<meta\s+content=\"[0-9a-fA-F]+\"\s+name=\"[Ww]atermark(_\(hex\))?\"\s*\/>', '', container_str_new)
            if pre_remove != container_str_new:
                had_amazon = True

            # Remove elibri / lemonink watermark
            # Lemonink replaces all "id" fields in the opf with "idX_Y", with X being the watermark and Y being a number for that particular ID.
            # This regex replaces all "idX_Y" IDs with "id_Y", removing the watermark IDs.
            pre_remove = container_str_new
            container_str_new = re.sub(r'((\r\n|\r|\n)\s*)?\<\!\-\-\s*Wygenerowane przez elibri dla zamówienia numer [0-9a-fA-F]+\s*\-\-\>', '', container_str_new)
            if pre_remove != container_str_new:
                # To prevent this Regex from applying to books without that watermark, only do that if the watermark above was found.
                container_str_new = re.sub(r'\=\"id[0-9]+_([0-9]+)\"', r'="id_\1"', container_str_new)
                if pre_remove != container_str_new:
                    had_elibri = True
        except:
            traceback.print_exc()
            return path_to_ebook

    if (container_str == container_str_new):
        # container didn't change - no watermark
        return path_to_ebook

    # Re-package without watermark
    namelist = inf.namelist()
    namelist.remove("mimetype")

    try:
        output = object.temporary_file(".epub").name
        kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
        with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf:
            for path in (["mimetype"] + namelist):
                data = inf.read(path)
                if path == opf_path:
                    # Found OPF, replacing ...
                    data = container_str_new
                zi = ZipInfo(path)
                oldzi = inf.getinfo(path)
                try:
                    zi.compress_type = oldzi.compress_type
                    if path == "mimetype":
                        # ePub OCF: mimetype must be stored uncompressed.
                        zi.compress_type = ZIP_STORED
                    zi.date_time = oldzi.date_time
                    zi.comment = oldzi.comment
                    zi.extra = oldzi.extra
                    zi.internal_attr = oldzi.internal_attr
                    zi.external_attr = oldzi.external_attr
                    zi.create_system = oldzi.create_system
                    if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
                        # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                        zi.flag_bits |= 0x800
                except:
                    pass
                outf.writestr(zi, data)
    except:
        traceback.print_exc()
        return path_to_ebook

    if had_elibri:
        print("Watermark: Successfully stripped eLibri watermark from OPF file.")
    if had_amazon:
        print("Watermark: Successfully stripped Amazon watermark from OPF file.")

    return output
def decryptBook(userkey, inpath, outpath):
    """Decrypt an Adobe Adept or PassHash (B&N) ePub (custom-crypto path).

    Uses the project's RSA/AES wrappers instead of PyCryptodome.
    Returns 0 on success, 1 for DRM-free / unsupported books, 2 on failure.
    Raises ADEPTNewVersionError for books wrapped with Adobe's new DRM.
    """
    if AES is None:
        raise ADEPTError("PyCrypto or OpenSSL must be installed.")
    with closing(ZipFile(open(inpath, 'rb'))) as inf:
        namelist = inf.namelist()
        if 'META-INF/rights.xml' not in namelist or \
           'META-INF/encryption.xml' not in namelist:
            print("{0:s} is DRM-free.".format(os.path.basename(inpath)))
            return 1
        # Drop the DRM bookkeeping entries (META_NAMES) from the copy list.
        for name in META_NAMES:
            namelist.remove(name)
        try:
            rights = etree.fromstring(inf.read('META-INF/rights.xml'))
            adept = lambda tag: '{%s}%s' % (NSMAP['adept'], tag)
            expr = './/%s' % (adept('encryptedKey'), )
            bookkey = ''.join(rights.findtext(expr))
            # 192 base64 chars indicates Adobe's newer (unsupported) DRM.
            if len(bookkey) == 192:
                print("{0:s} seems to be an Adobe ADEPT ePub with Adobe's new DRM".format(os.path.basename(inpath)))
                print("This DRM cannot be removed yet. ")
                print("Try getting your distributor to give you a new ACSM file, then open that in an old version of ADE (2.0).")
                print("If your book distributor is not enforcing the new DRM yet, this will give you a copy with the old DRM.")
                raise ADEPTNewVersionError("Book uses new ADEPT encryption")

            if len(bookkey) == 172:
                print("{0:s} is a secure Adobe Adept ePub.".format(os.path.basename(inpath)))
            elif len(bookkey) == 64:
                print("{0:s} is a secure Adobe PassHash (B&N) ePub.".format(os.path.basename(inpath)))
            else:
                print("{0:s} is not an Adobe-protected ePub!".format(os.path.basename(inpath)))
                return 1

            if len(bookkey) != 64:
                # Normal Adobe ADEPT
                rsa = RSA(userkey)
                bookkey = rsa.decrypt(base64.b64decode(bookkey.encode('ascii')))
            else:
                # Adobe PassHash / B&N
                key = base64.b64decode(userkey)[:16]
                aes = AES(key)
                bookkey = aes.decrypt(base64.b64decode(bookkey))
                # Strip PKCS#7 padding; bookkey items are str on py2, int on py3.
                if type(bookkey[-1]) != int:
                    pad = ord(bookkey[-1])
                else:
                    pad = bookkey[-1]
                bookkey = bookkey[:-pad]

            # Padded as per RSAES-PKCS1-v1_5: verify the padding, then keep
            # only the trailing 16 bytes (the AES book key).
            if len(bookkey) > 16:
                if verify_book_key(bookkey):
                    bookkey = bookkey[-16:]
                else:
                    print("Could not decrypt {0:s}. Wrong key".format(os.path.basename(inpath)))
                    return 2

            encryption = inf.read('META-INF/encryption.xml')
            decryptor = Decryptor(bookkey, encryption)
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with closing(ZipFile(open(outpath, 'wb'), 'w', **kwds)) as outf:
                for path in (["mimetype"] + namelist):
                    data = inf.read(path)
                    zi = ZipInfo(path)
                    zi.compress_type = ZIP_DEFLATED

                    if path == "mimetype":
                        # mimetype must be first and uncompressed (ePub OCF).
                        zi.compress_type = ZIP_STORED
                    elif path == "META-INF/encryption.xml":
                        # Check if there's still something in there
                        if (decryptor.check_if_remaining()):
                            data = decryptor.get_xml()
                            print("Adding encryption.xml for the remaining embedded files.")
                            # We removed DRM, but there's still stuff like obfuscated fonts.
                        else:
                            continue

                    try:
                        # get the file info, including time-stamp
                        oldzi = inf.getinfo(path)
                        # copy across useful fields
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        # external attributes are dependent on the create system, so copy both.
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                        if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
                            # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                            zi.flag_bits |= 0x800
                    except:
                        pass

                    if path == "META-INF/encryption.xml":
                        outf.writestr(zi, data)
                    else:
                        outf.writestr(zi, decryptor.decrypt(path, data))
        except:
            print("Could not decrypt {0:s} because of an exception:\n{1:s}".format(os.path.basename(inpath), traceback.format_exc()))
            return 2
    return 0
def fix_zip(self):
    """Rebuild self.filelist / self.NameToInfo for a damaged zip archive.

    Scans the raw bytes for local-file and central-directory records and
    reconstructs a ZipInfo for each surviving entry.  Entries present in
    the central directory are preferred; orphaned local headers are
    reconstructed with guessed metadata carried over from the last
    central-directory entry.  Returns False when self.broken is not set;
    otherwise returns None after repairing in place.
    """
    if not self.broken:
        return False

    def decode_extra(info):
        # ZipInfo._decodeExtra grew a required filename-CRC argument in
        # Python 3.13; support both signatures.
        try:
            # noinspection PyProtectedMember
            info._decodeExtra()
        except TypeError:
            from binascii import crc32
            # noinspection PyProtectedMember
            info._decodeExtra(crc32(info.orig_filename.encode('utf-8')))

    self.fp.seek(0, 2)
    file_len = self.fp.tell()
    mm = mmap.mmap(self.fp.fileno(), 0, access=mmap.ACCESS_READ)
    offset = 0
    file_list = {}
    cd_list = {}
    try:
        # Pass one: parse the zip file, indexing every record signature.
        while offset + 4 < file_len:
            hdr_off = mm.find(b"PK", offset)
            if hdr_off == -1:
                break
            hdr_type = mm[hdr_off:hdr_off + 4]
            if hdr_type == stringFileHeader:
                # local file header
                if hdr_off + sizeFileHeader > file_len:
                    break
                fheader = mm[hdr_off:hdr_off + sizeFileHeader]
                fheader = struct.unpack(structFileHeader, fheader)
                start = hdr_off
                size = sizeFileHeader + fheader[_FH_COMPRESSED_SIZE] + fheader[_FH_FILENAME_LENGTH] + \
                    fheader[_FH_EXTRA_FIELD_LENGTH]
                name = mm[hdr_off + sizeFileHeader:hdr_off + sizeFileHeader + fheader[_FH_FILENAME_LENGTH]]
                file_list[name] = [start, size, fheader]
                offset = hdr_off + size
            elif hdr_type == stringCentralDir:
                # central directory record
                if hdr_off + sizeCentralDir > file_len:
                    break
                centdir = mm[hdr_off:hdr_off + sizeCentralDir]
                centdir = struct.unpack(structCentralDir, centdir)
                start = hdr_off
                size = sizeCentralDir + centdir[_CD_FILENAME_LENGTH] + centdir[_CD_EXTRA_FIELD_LENGTH] + \
                    centdir[_CD_COMMENT_LENGTH]
                name = mm[hdr_off + sizeCentralDir:hdr_off + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]]
                cd_list[name] = [start, size, centdir]
                offset = hdr_off + size
            elif hdr_type == stringEndArchive:
                offset = hdr_off + sizeEndCentDir
            else:
                # Stray "PK" inside file data - keep scanning byte by byte.
                offset = hdr_off + 1

        # Guesses carried into pass two for orphaned local headers.
        last_cv = 20
        last_ea = 0
        last_cs = 0
        last_dt = (0, 0)

        # Pass two, repair: entries with a central-directory record first.
        for filename, (start, end, centdir) in cd_list.items():
            if filename not in file_list:
                continue
            if isinstance(filename, bytes):
                x = ZipInfo(filename.decode('utf-8', 'backslashreplace'))
            else:
                x = ZipInfo(filename)
            # A central-directory record is laid out as: fixed header,
            # file name, extra field, comment.  BUGFIX: the previous code
            # read the extra field at the file-name offset and read the
            # comment using the extra field's length.
            extra_off = start + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
            x.extra = mm[extra_off:extra_off + centdir[_CD_EXTRA_FIELD_LENGTH]]
            comment_off = extra_off + centdir[_CD_EXTRA_FIELD_LENGTH]
            x.comment = mm[comment_off:comment_off + centdir[_CD_COMMENT_LENGTH]]
            x.header_offset = file_list[filename][0]
            (x.create_version, x.create_system,
             x.extract_version, x.reserved,
             x.flag_bits, x.compress_type, t, d,
             x.CRC, x.compress_size, x.file_size) = centdir[1:12]
            x.volume, x.internal_attr, x.external_attr = centdir[15:18]
            # Convert date/time code to (year, month, day, hour, min, sec)
            x._raw_time = t
            x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                           t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
            last_ea = x.external_attr
            last_cs = x.create_system
            last_cv = x.create_version
            last_dt = (d, t)
            decode_extra(x)
            # x.filename = x._decodeFilename()
            self.filelist.append(x)
            self.NameToInfo[x.filename] = x

        # Local headers with no matching central-directory record.
        for filename, (start, end, fheader) in file_list.items():
            if filename in cd_list:
                continue
            x = ZipInfo(filename.decode('utf-8', 'backslashreplace'))
            # BUGFIX: extra/comment are bytes fields; were str "".
            x.extra = b""
            x.comment = b""
            x.header_offset = file_list[filename][0]
            x.create_version = last_cv
            x.create_system = last_cs
            x.extract_version = fheader[_FH_EXTRACT_VERSION]
            x.reserved = 0
            x.flag_bits = fheader[_FH_GENERAL_PURPOSE_FLAG_BITS]
            x.compress_type = fheader[_FH_COMPRESSION_METHOD]
            d, t = last_dt
            x.CRC = fheader[_FH_CRC]
            x.compress_size = fheader[_FH_COMPRESSED_SIZE]
            x.file_size = fheader[_FH_UNCOMPRESSED_SIZE]
            x.volume = 0
            x.internal_attr = 0
            x.external_attr = last_ea
            # Convert date/time code to (year, month, day, hour, min, sec)
            x._raw_time = t
            x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                           t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
            decode_extra(x)
            # x.filename = x._decodeFilename()
            self.filelist.append(x)
            self.NameToInfo[x.filename] = x
    finally:
        mm.close()
def decryptLCPbook(inpath, passphrases, parent_object):
    """Decrypt a Readium LCP-protected book (ePub, PDF or generic ZIP).

    inpath:        path to the LCP-protected container.
    passphrases:   iterable of user-supplied candidate passphrases.
    parent_object: plugin instance providing temporary_file(ext).

    Returns the path of the decrypted output file.
    Raises LCPError when the book is not LCP, uses an unknown profile /
    algorithm, or no passphrase matches.
    """
    if not isLCPbook(inpath):
        raise LCPError("This is not an LCP-encrypted book")

    # NOTE(review): `file` shadows the py2 builtin; kept for compatibility.
    file = ZipFile(open(inpath, 'rb'))
    license = json.loads(file.read('META-INF/license.lcpl'))
    print("LCP: Found LCP-encrypted book {0}".format(license["id"]))

    user_info_string1 = returnUserInfoStringForLicense(license, None)
    if (user_info_string1 is not None):
        print("LCP: Account information: " + user_info_string1)

    # Check algorithm: pick the key-transform matching the LCP profile.
    if license["encryption"]["profile"] == "http://readium.org/lcp/basic-profile":
        print("LCP: Book is using lcp/basic-profile encryption.")
        transform_algo = LCPTransform.secret_transform_basic
    elif license["encryption"]["profile"] == "http://readium.org/lcp/profile-1.0":
        print("LCP: Book is using lcp/profile-1.0 encryption")
        transform_algo = LCPTransform.secret_transform_profile10
    else:
        file.close()
        raise LCPError("Book is using an unknown LCP encryption standard: {0}".format(license["encryption"]["profile"]))

    if ("algorithm" in license["encryption"]["content_key"] and
            license["encryption"]["content_key"]["algorithm"] != "http://www.w3.org/2001/04/xmlenc#aes256-cbc"):
        file.close()
        raise LCPError("Book is using an unknown LCP encryption algorithm: {0}".format(license["encryption"]["content_key"]["algorithm"]))

    key_check = license["encryption"]["user_key"]["key_check"]
    encrypted_content_key = license["encryption"]["content_key"]["encrypted_value"]

    # Prepare a list of encryption keys to test:
    password_hashes = []

    # Some providers hard-code the passphrase in the LCPL file. That doesn't happen often,
    # but when it does, these files can be decrypted without knowing any passphrase.
    if "value" in license["encryption"]["user_key"]:
        try:
            password_hashes.append(binascii.hexlify(base64.decodebytes(license["encryption"]["user_key"]["value"].encode())).decode("ascii"))
        except AttributeError:
            # Python 2
            password_hashes.append(binascii.hexlify(base64.decodestring(license["encryption"]["user_key"]["value"].encode())).decode("ascii"))
    if "hex_value" in license["encryption"]["user_key"]:
        password_hashes.append(binascii.hexlify(bytearray.fromhex(license["encryption"]["user_key"]["hex_value"])).decode("ascii"))

    # Hash all the passwords provided by the user:
    for possible_passphrase in passphrases:
        algo = "http://www.w3.org/2001/04/xmlenc#sha256"
        if "algorithm" in license["encryption"]["user_key"]:
            algo = license["encryption"]["user_key"]["algorithm"]
        algo, tmp_pw = LCPTransform.userpass_to_hash(possible_passphrase.encode('utf-8'), algo)
        if tmp_pw is not None:
            password_hashes.append(tmp_pw)

    # For all the password hashes, check if one of them decrypts the book:
    correct_password_hash = None
    for possible_hash in password_hashes:
        transformed_hash = transform_algo(possible_hash)
        try:
            decrypted = None
            decrypted = dataDecryptLCP(key_check, transformed_hash)
        except:
            pass
        # A matching key decrypts the key_check back into the license id.
        if (decrypted is not None and decrypted.decode("ascii", errors="ignore") == license["id"]):
            # Found correct password hash, hooray!
            correct_password_hash = transformed_hash
            break

    # Print an error message if none of the passwords worked
    if (correct_password_hash is None):
        print("LCP: Tried {0} passphrases, but none of them could decrypt the book ...".format(len(password_hashes)))
        # Print password hint, if available
        if ("text_hint" in license["encryption"]["user_key"] and license["encryption"]["user_key"]["text_hint"] != ""):
            print("LCP: The book distributor has given you the following passphrase hint: \"{0}\"".format(license["encryption"]["user_key"]["text_hint"]))
        print("LCP: Enter the correct passphrase in the DeDRM plugin settings, then try again.")
        # Print password reset instructions, if available
        for link in license["links"]:
            if ("rel" in link and link["rel"] == "hint"):
                print("LCP: You may be able to find or reset your LCP passphrase on the following webpage: {0}".format(link["href"]))
                break
        file.close()
        raise LCPError("No correct passphrase found")

    print("LCP: Found correct passphrase, decrypting book ...")
    user_info_string2 = returnUserInfoStringForLicense(license, correct_password_hash)
    if (user_info_string2 is not None):
        if (user_info_string1 != user_info_string2):
            print("LCP: Account information: " + user_info_string2)

    # Take the key we found and decrypt the content key:
    decrypted_content_key = dataDecryptLCP(encrypted_content_key, correct_password_hash)
    if decrypted_content_key is None:
        raise LCPError("Decrypted content key is None")

    # Begin decrypting
    encryption = file.read('META-INF/encryption.xml')
    decryptor = Decryptor(decrypted_content_key, encryption)
    kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)

    mimetype = file.read("mimetype").decode("latin-1")
    if mimetype == "application/pdf":
        # Check how many PDF files there are.
        # Usually, an LCP-protected PDF/ZIP is only supposed to contain one
        # PDF file, but if there are multiple, return a ZIP that contains them all.
        pdf_files = []
        for filename in file.namelist():
            if filename.endswith(".pdf"):
                pdf_files.append(filename)

        if len(pdf_files) == 0:
            file.close()
            raise LCPError("Error: Book is an LCP-protected PDF, but doesn't contain any PDF files ...")
        elif len(pdf_files) == 1:
            # One PDF file found - extract and return that.
            pdfdata = file.read(pdf_files[0])
            outputname = parent_object.temporary_file(".pdf").name
            print("LCP: Successfully decrypted, exporting to {0}".format(outputname))
            with open(outputname, 'wb') as f:
                f.write(decryptor.decrypt(pdf_files[0], pdfdata))
            file.close()
            return outputname
        else:
            # Multiple PDFs found
            outputname = parent_object.temporary_file(".zip").name
            with closing(ZipFile(open(outputname, 'wb'), 'w', **kwds)) as outfile:
                for path in pdf_files:
                    data = file.read(path)
                    outfile.writestr(path, decryptor.decrypt(path, data))
            print("LCP: Successfully decrypted a multi-PDF ZIP file, exporting to {0}".format(outputname))
            file.close()
            return outputname
    else:
        # Not a PDF -> EPUB
        if mimetype == "application/epub+zip":
            outputname = parent_object.temporary_file(".epub").name
        else:
            outputname = parent_object.temporary_file(".zip").name

        with closing(ZipFile(open(outputname, 'wb'), 'w', **kwds)) as outfile:
            # mimetype must be 1st file. Remove from list and manually add at the beginning
            namelist = file.namelist()
            namelist.remove("mimetype")
            namelist.remove("META-INF/license.lcpl")
            for path in (["mimetype"] + namelist):
                data = file.read(path)
                zi = ZipInfo(path)
                if path == "META-INF/encryption.xml":
                    # Check if that's still needed
                    if (decryptor.check_if_remaining()):
                        data = decryptor.get_xml()
                        print("LCP: Adding encryption.xml for the remaining files.")
                    else:
                        continue
                try:
                    oldzi = file.getinfo(path)
                    if path == "mimetype":
                        # ePub OCF: mimetype must be stored uncompressed.
                        zi.compress_type = ZIP_STORED
                    else:
                        zi.compress_type = ZIP_DEFLATED
                    zi.date_time = oldzi.date_time
                    zi.comment = oldzi.comment
                    zi.extra = oldzi.extra
                    zi.internal_attr = oldzi.internal_attr
                    zi.external_attr = oldzi.external_attr
                    zi.create_system = oldzi.create_system
                    if any(ord(c) >= 128 for c in path) or any(ord(c) >= 128 for c in zi.comment):
                        # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                        zi.flag_bits |= 0x800
                except:
                    pass
                if path == "META-INF/encryption.xml":
                    outfile.writestr(zi, data)
                else:
                    outfile.writestr(zi, decryptor.decrypt(path, data))

        print("LCP: Successfully decrypted, exporting to {0}".format(outputname))
        file.close()
        return outputname
def DecryptBook(bookId):
    """Decrypt one downloaded ePub identified by *bookId*.

    Reads the encrypted book from ``gOutDir/ENC_BOOKS_DIR/<bookId>.epub``
    and writes the decrypted copy to ``gOutDir/DEC_BOOKS_DIR/<bookId>.epub``.
    The per-book AES key is recovered by RSA-decrypting the CipherValue
    found in META-INF/encryption.xml with each key in ``gRsaKeys`` until
    one succeeds.

    Returns:
        EResult.OKAY on success (also for DRM-free books, which are just
        copied), EResult.NO_GOOD on any failure.

    Side effects: updates the global ``gCurBook`` (id + hex AES key),
    removes a stale output file on failure, and on success calls
    RenameBook()/SaveBookInfo().
    """
    global gCurBook
    print("[I] Decrypt book: " + bookId + " [" + gBookData[bookId][0] + "]")
    # Input/output paths are derived purely from the book id.
    encFile = os.path.join(gOutDir, ENC_BOOKS_DIR, bookId + EXT_EPUB)
    decFile = os.path.join(gOutDir, DEC_BOOKS_DIR, bookId + EXT_EPUB)
    if not os.path.isfile(encFile):
        print("[E] File not found: " + encFile)
        return EResult.NO_GOOD
    if not CheckEpubIntegrity(bookId):
        print("[E] Corrupted ePub file! (Re-download)")
        return EResult.NO_GOOD
    with closing(ZipFile(open(encFile, "rb"))) as inf:
        namelist = set(inf.namelist())
        if ENCRYPTION_XML not in namelist:
            # No encryption manifest: treat the book as DRM-free and copy it through.
            print("[W] Can't find " + ENCRYPTION_XML + ". Assume it's DRM-free book")
            if os.path.isfile(decFile):
                os.remove(decFile)
            shutil.copyfile(encFile, decFile)
            return EResult.OKAY
        # DRM bookkeeping files must not be copied into the decrypted ePub.
        for name in META_NAMES:
            namelist.remove(name)
        try:
            # get book AES key from META-INF/encryption.xml
            encryption = etree.fromstring(inf.read(ENCRYPTION_XML))
            aesKeyB64 = encryption.findtext('.//enc:CipherValue', None, NSMAP)
            if aesKeyB64 is None:
                print("[E] Can't find encrypted AES key!")
                return EResult.NO_GOOD
            # Try every known RSA key; the first that yields a non-None
            # plaintext wins.  NOTE(review): if gRsaKeys is empty, `bookkey`
            # is unbound here and the NameError is swallowed by the outer
            # `except Exception` — confirm gRsaKeys is always non-empty.
            for k in gRsaKeys:
                bookkey = k.decrypt(base64.b64decode(aesKeyB64))
                if bookkey is not None:
                    break
            if bookkey is None:
                print("[E] Can't decrypt AES key!")
                return EResult.NO_GOOD
            # Record the recovered key (upper-case hex) for later reporting.
            gCurBook._id = bookId
            gCurBook._aeskey = ''.join(hex(x)[2:].zfill(2) for x in bookkey).upper()
            print(" AES KEY = {0}".format(gCurBook._aeskey))
            decryptor = Decryptor(bookkey, encryption)
            if len(decryptor._encFontIdpf) > 0:
                # Obfuscated fonts need the book UID for de-obfuscation.
                decryptor.SetBookUid(GetBookUid(decryptor, inf))
            opfs = GetOpfNamesFromEpub(inf)
            if len(opfs) > 1:
                print("[W] Num of rootfile = " + str(len(opfs)))
            kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
            with closing(ZipFile(open(decFile, 'wb'), 'w', **kwds)) as outf:
                # Per the ePub spec, "mimetype" must be the first entry and stored uncompressed.
                zi = ZipInfo(MIMETYPE)
                zi.compress_type = ZIP_STORED
                try:
                    # if the mimetype is present, get its info, including time-stamp
                    oldzi = inf.getinfo(MIMETYPE)
                    # copy across fields to be preserved
                    zi.date_time = oldzi.date_time
                    zi.comment = oldzi.comment
                    zi.extra = oldzi.extra
                    zi.internal_attr = oldzi.internal_attr
                    # external attributes are dependent on the create system, so copy both.
                    zi.external_attr = oldzi.external_attr
                    zi.create_system = oldzi.create_system
                except:
                    # Best-effort metadata copy; a missing mimetype entry is tolerated.
                    pass
                outf.writestr(zi, inf.read(MIMETYPE))
                # process files in ePub
                for path in namelist:
                    data = inf.read(path)
                    zi = ZipInfo(path)
                    zi.compress_type = ZIP_DEFLATED
                    try:
                        # get the file info, including time-stamp
                        oldzi = inf.getinfo(path)
                        # copy across useful fields
                        zi.date_time = oldzi.date_time
                        zi.comment = oldzi.comment
                        zi.extra = oldzi.extra
                        zi.internal_attr = oldzi.internal_attr
                        # external attributes are dependent on the create system, so copy both.
                        zi.external_attr = oldzi.external_attr
                        zi.create_system = oldzi.create_system
                    except:
                        pass
                    data = decryptor.decrypt(path, data)
                    if path in opfs:
                        # This entry is a package (OPF) document: optionally rewrite
                        # its title/author from the user-supplied maps, then report.
                        if bookId in gTitleMap:
                            data = ChangeTitle(data, gTitleMap[bookId])
                        if bookId in gAuthorMap:
                            data = ChangeAuthor(data, gAuthorMap[bookId])
                        ShowBookInfo(data)
                    outf.writestr(zi, data)
        except Exception as e:
            # Any failure: report, remove the partial output, and give up on this book.
            print("[E] Can't decrypt book! (" + str(e) + ")")
            if os.path.isfile(decFile):
                os.remove(decFile)
            return EResult.NO_GOOD
    RenameBook(bookId)
    SaveBookInfo()
    return EResult.OKAY
def get_zip_infos(self, *filenames):
    """Yield a ZipInfo for each of *filenames* found in the archive.

    Scans the ZIP central directory via ``self.fp`` (the logic mirrors
    CPython's ``ZipFile._RealGetContents``) and yields a ``ZipInfo`` for
    every entry whose name is in *filenames*, in central-directory order.
    Returns early once all requested names have been yielded, so callers
    asking for a few members of a large archive avoid a full scan.

    Raises:
        RuntimeError: the underlying file object was already closed.
        BadZipFile: missing/invalid end record, truncated or corrupt
            central directory.
        NotImplementedError: an entry requires a ZIP feature version
            newer than MAX_EXTRACT_VERSION.
        TooManyFiles: more than ``self.max_file_count`` entries were
            scanned (guards against zip-bomb style archives).
    """
    fp = self.fp
    max_file_count = self.max_file_count
    if not fp:
        raise RuntimeError(
            "Attempt to read ZIP archive that was already closed")
    filenames = set(filenames)
    if len(filenames) == 0:
        # Nothing requested: this is an empty generator.
        return
    try:
        endrec = _EndRecData(fp)
    except OSError:
        raise BadZipFile("File is not a zip file")
    if not endrec:
        raise BadZipFile("File is not a zip file")
    size_cd = endrec[_ECD_SIZE]             # bytes in central directory
    offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
    # "concat" is zero, unless zip was concatenated to another file
    concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
    if endrec[_ECD_SIGNATURE] == stringEndArchive64:
        # If Zip64 extension structures are present, account for them
        concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
    # start_dir: Position of start of central directory
    start_dir = offset_cd + concat
    fp.seek(start_dir, 0)
    # Read the whole central directory into memory and parse it from there.
    data = fp.read(size_cd)
    fp = BytesIO(data)
    total = 0
    file_count = 0
    while total < size_cd:
        centdir = fp.read(sizeCentralDir)
        if len(centdir) != sizeCentralDir:
            raise BadZipFile("Truncated central directory")
        centdir = struct.unpack(structCentralDir, centdir)
        if centdir[_CD_SIGNATURE] != stringCentralDir:
            raise BadZipFile("Bad magic number for central directory")
        filename = fp.read(centdir[_CD_FILENAME_LENGTH])
        # NOTE(review): index 5 of the unpacked central-directory record is
        # the general-purpose flag bits field (CPython calls it _CD_FLAG_BITS).
        flags = centdir[5]
        if flags & _UTF8_EXTENSION_FLAG:
            # UTF-8 file names extension
            filename = filename.decode('utf-8')
        else:
            # Historical ZIP filename encoding
            filename = filename.decode('cp437')
        # Create ZipInfo instance to store file information
        x = ZipInfo(filename)
        x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
        x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
        x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
        (x.create_version, x.create_system, x.extract_version, x.reserved,
         x.flag_bits, x.compress_type, t, d,
         x.CRC, x.compress_size, x.file_size) = centdir[1:12]
        if x.extract_version > MAX_EXTRACT_VERSION:
            raise NotImplementedError("zip file version %.1f" %
                                      (x.extract_version / 10))
        x.volume, x.internal_attr, x.external_attr = centdir[15:18]
        # Convert date/time code to (year, month, day, hour, min, sec)
        x._raw_time = t
        x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                       t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
        x._decodeExtra()
        # header_offset was relative to the nominal archive start; shift it
        # by any prepended data.
        x.header_offset = x.header_offset + concat
        # update total bytes read from central directory
        total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
                 + centdir[_CD_EXTRA_FIELD_LENGTH]
                 + centdir[_CD_COMMENT_LENGTH])
        file_count += 1
        if max_file_count is not None and file_count > max_file_count:
            raise TooManyFiles('Too many files in egg')
        if x.filename in filenames:
            filenames.discard(x.filename)
            yield x
            if len(filenames) == 0:
                # All requested members found; stop scanning early.
                return
def removeHTMLwatermarks(object, path_to_ebook):
    """Strip known HTML/XHTML watermarks from an ePub.

    Scans every .html/.xhtml/.xml entry for three watermark schemes and
    removes them:
      * Adobe ADEPT ``<meta name="Adept[.expected].resource" ...>`` tags,
      * Pocketbook invisible ``<div>`` watermarks,
      * eLibri / LemonInk tracking IDs and their visible ``<div>`` blocks
        (applied in a loop, since a file may be watermarked twice).

    Parameters:
        object        -- plugin host; only ``temporary_file(ext)`` is used
                         (returns an object with a ``.name`` path).
        path_to_ebook -- path to the source ePub.

    Returns the path of a cleaned temporary ePub, or *path_to_ebook*
    unchanged when nothing was removed or an error occurred.
    """
    try:
        # Fix: hold the archive in closing() so the ZipFile (and the file
        # handle passed to it) is released on every exit path.  The original
        # code leaked the handle, notably when no watermark was found.
        with closing(ZipFile(open(path_to_ebook, 'rb'))) as inf:
            namelist = inf.namelist()

            # Parallel lists: names of modified entries and their new contents.
            modded_names = []
            modded_contents = []

            count_adept = 0
            count_pocketbook = 0
            count_lemonink_invisible = 0
            count_lemonink_visible = 0
            lemonink_trackingID = None

            for file in namelist:
                if not (file.endswith('.html') or file.endswith('.xhtml') or file.endswith('.xml')):
                    continue
                try:
                    file_str = inf.read(file).decode("utf-8")
                    str_new = file_str

                    # Remove Adobe ADEPT watermarks
                    # Match optional newline at the beginning, then a "meta" tag with
                    # name = "Adept.expected.resource" or "Adept.resource"
                    # and either a "value" or a "content" element with an Adobe UUID
                    pre_remove = str_new
                    str_new = re.sub(
                        r'((\r\n|\r|\n)\s*)?\<meta\s+name=\"(Adept\.resource|Adept\.expected\.resource)\"\s+(content|value)=\"urn:uuid:[0-9a-fA-F\-]+\"\s*\/>',
                        '', str_new)
                    str_new = re.sub(
                        r'((\r\n|\r|\n)\s*)?\<meta\s+(content|value)=\"urn:uuid:[0-9a-fA-F\-]+\"\s+name=\"(Adept\.resource|Adept\.expected\.resource)\"\s*\/>',
                        '', str_new)
                    if (str_new != pre_remove):
                        count_adept += 1

                    # Remove Pocketbook watermarks
                    pre_remove = str_new
                    str_new = re.sub(
                        r'\<div style\=\"padding\:0\;border\:0\;text\-indent\:0\;line\-height\:normal\;margin\:0 1cm 0.5cm 1cm\;[^\"]*opacity:0.0\;[^\"]*text\-decoration\:none\;[^\"]*background\:none\;[^\"]*\"\>(.*?)\<\/div\>',
                        '', str_new)
                    if (str_new != pre_remove):
                        count_pocketbook += 1

                    # Remove eLibri / LemonInk watermark
                    # Run this in a loop, as it is possible a file has been watermarked twice ...
                    while True:
                        pre_remove = str_new
                        unique_id = re.search(
                            r'<body[^>]+class="[^"]*(t0x[0-9a-fA-F]{25})[^"]*"[^>]*>',
                            str_new)
                        if (unique_id):
                            # Invisible part: the tracking id inside the body class.
                            lemonink_trackingID = unique_id.groups()[0]
                            count_lemonink_invisible += 1
                            str_new = re.sub(lemonink_trackingID, '', str_new)
                            pre_remove = str_new
                            # Visible part: two consecutive styled <div> blocks
                            # right after the <body> tag.
                            pm = r'(<body[^>]+class="[^"]*"[^>]*>)'
                            pm += r'\<div style\=\'padding\:0\;border\:0\;text\-indent\:0\;line\-height\:normal\;margin\:0 1cm 0.5cm 1cm\;[^\']*text\-decoration\:none\;[^\']*background\:none\;[^\']*\'\>(.*?)</div>'
                            pm += r'\<div style\=\'padding\:0\;border\:0\;text\-indent\:0\;line\-height\:normal\;margin\:0 1cm 0.5cm 1cm\;[^\']*text\-decoration\:none\;[^\']*background\:none\;[^\']*\'\>(.*?)</div>'
                            str_new = re.sub(pm, r'\1', str_new)
                            if (str_new != pre_remove):
                                count_lemonink_visible += 1
                        else:
                            break
                except:
                    # Undecodable or otherwise broken entry: skip it, keep going.
                    traceback.print_exc()
                    continue

                if (file_str == str_new):
                    continue
                modded_names.append(file)
                modded_contents.append(str_new)

            if len(modded_names) == 0:
                # No file modified, return original
                return path_to_ebook

            if len(modded_names) != len(modded_contents):
                # Something went terribly wrong, return original
                print("Watermark: Error during watermark removal")
                return path_to_ebook

            # Re-package with modified files.  "mimetype" must be the first
            # entry, so pull it out of the list and write it manually.
            namelist.remove("mimetype")

            try:
                output = object.temporary_file(".epub").name
                kwds = dict(compression=ZIP_DEFLATED, allowZip64=False)
                with closing(ZipFile(open(output, 'wb'), 'w', **kwds)) as outf:
                    for path in (["mimetype"] + namelist):
                        data = inf.read(path)
                        try:
                            modded_index = None
                            modded_index = modded_names.index(path)
                        except:
                            pass
                        if modded_index is not None:
                            # Found modified file - replace contents
                            data = modded_contents[modded_index]
                        zi = ZipInfo(path)
                        oldzi = inf.getinfo(path)
                        try:
                            zi.compress_type = oldzi.compress_type
                            if path == "mimetype":
                                # ePub spec: mimetype must be stored uncompressed.
                                zi.compress_type = ZIP_STORED
                            zi.date_time = oldzi.date_time
                            zi.comment = oldzi.comment
                            zi.extra = oldzi.extra
                            zi.internal_attr = oldzi.internal_attr
                            zi.external_attr = oldzi.external_attr
                            zi.create_system = oldzi.create_system
                            # NOTE(review): zi.comment is bytes in Python 3, so
                            # ord(c) raises for a non-empty comment and the
                            # swallowing except skips only the flag line —
                            # confirm whether the UTF-8 flag matters here.
                            if any(ord(c) >= 128 for c in path) or any(
                                    ord(c) >= 128 for c in zi.comment):
                                # If the file name or the comment contains any non-ASCII char, set the UTF8-flag
                                zi.flag_bits |= 0x800
                        except:
                            pass
                        outf.writestr(zi, data)
            except:
                traceback.print_exc()
                return path_to_ebook

            if (count_adept > 0):
                print(
                    "Watermark: Successfully stripped {0} ADEPT watermark(s) from ebook."
                    .format(count_adept))
            if (count_lemonink_invisible > 0 or count_lemonink_visible > 0):
                print(
                    "Watermark: Successfully stripped {0} visible and {1} invisible LemonInk watermark(s) (\"{2}\") from ebook."
                    .format(count_lemonink_visible, count_lemonink_invisible, lemonink_trackingID))
            if (count_pocketbook > 0):
                print(
                    "Watermark: Successfully stripped {0} Pocketbook watermark(s) from ebook."
                    .format(count_pocketbook))
            return output
    except:
        traceback.print_exc()
        return path_to_ebook
def _RealGetContents(self):
    """Read in the table of contents for the ZIP file.

    Python 2 implementation that locates the end-of-central-directory
    record at ``self.url``, fetches only the central directory bytes
    (``_http_get_partial_data`` — presumably an HTTP Range request;
    confirm against its definition), and populates ``self.filelist`` /
    ``self.NameToInfo`` without downloading the archive body.

    Raises BadZipfile when the end record or central directory is
    missing or corrupt.
    """
    try:
        endrec = _EndRecData(self.url)
    except IOError:
        raise BadZipfile("File is not a zip file")
    if not endrec:
        raise BadZipfile, "File is not a zip file"
    if self.debug > 1:
        print endrec
    size_cd = endrec[_ECD_SIZE]             # bytes in central directory
    offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
    self.comment = endrec[_ECD_COMMENT]     # archive comment
    # "concat" is zero, unless zip was concatenated to another file
    concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
    # Zip64 adjustment deliberately disabled in this variant:
    # if endrec[_ECD_SIGNATURE] == stringEndArchive64:
    #     # If Zip64 extension structures are present, account for them
    #     concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
    if self.debug > 2:
        inferred = concat + offset_cd
        print "given, inferred, offset", offset_cd, inferred, concat
    # self.start_dir: Position of start of central directory
    self.start_dir = offset_cd + concat
    # Fetch exactly the central directory byte range, then parse it locally.
    ECD = _http_get_partial_data(self.url, self.start_dir,
                                 self.start_dir + size_cd - 1)
    data = ECD.read()
    ECD.close()
    fp = cStringIO.StringIO(data)
    total = 0
    while total < size_cd:
        centdir = fp.read(sizeCentralDir)
        if centdir[0:4] != stringCentralDir:
            raise BadZipfile, "Bad magic number for central directory"
        centdir = struct.unpack(structCentralDir, centdir)
        if self.debug > 2:
            print centdir
        filename = fp.read(centdir[_CD_FILENAME_LENGTH])
        # Create ZipInfo instance to store file information
        x = ZipInfo(filename)
        x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
        x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
        x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
        (
            x.create_version,
            x.create_system,
            x.extract_version,
            x.reserved,
            x.flag_bits,
            x.compress_type,
            t,
            d,
            x.CRC,
            x.compress_size,
            x.file_size,
        ) = centdir[1:12]
        x.volume, x.internal_attr, x.external_attr = centdir[15:18]
        # Convert date/time code to (year, month, day, hour, min, sec)
        x._raw_time = t
        x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                       t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
        x._decodeExtra()
        # header_offset was relative to the nominal archive start; shift it
        # by any prepended data.
        x.header_offset = x.header_offset + concat
        x.filename = x._decodeFilename()
        self.filelist.append(x)
        self.NameToInfo[x.filename] = x
        # update total bytes read from central directory
        total = (
            total
            + sizeCentralDir
            + centdir[_CD_FILENAME_LENGTH]
            + centdir[_CD_EXTRA_FIELD_LENGTH]
            + centdir[_CD_COMMENT_LENGTH]
        )
        if self.debug > 2:
            print "total", total
def next(self):
    """Return the next member of the archive as a ZipInfo object. Returns
    None if there is no more available. This method is analogous to
    TarFile.next().

    We construct a ZipInfo object using the information stored in the next
    file header. The logic here is based on the implementation of
    ZipFile._RealGetContents(), which constructs a ZipInfo object from
    information in a central directory file header, but modified to work
    with the file-header-specific struct (for the implementation of
    ZipFile._RealGetContents(), see
    https://github.com/python/cpython/blob/048f54dc75d51e8a1c5822ab7b2828295192aaa5/Lib/zipfile.py#L1316).

    Side effects: appends the ZipInfo to self.filelist, records it in
    self.NameToInfo, advances self._next_header_pos, and sets self._loaded
    once the central directory is reached.
    """
    fp = self.fp

    # First, advance to the next header, if needed.  A forward read() is
    # used instead of seek() so non-seekable streams also work.
    fp.read(self._next_header_pos - fp.tell())

    # Read the next header.
    fheader = fp.read(sizeFileHeader)
    if len(fheader) != sizeFileHeader:
        raise BadZipFile("Truncated file header")
    fheader = struct.unpack(structFileHeader, fheader)
    if fheader[_FH_SIGNATURE] == stringCentralDir:
        # We've reached the central directory. This means that we've finished iterating through
        # all entries in the zip file. We can do this check because the file header signature
        # and central directory signature are stored in the same spot (index 0) and with the same format.
        self._loaded = True
        return None
    if fheader[_FH_SIGNATURE] != stringFileHeader:
        raise BadZipFile("Bad magic number for file header")

    filename = fp.read(fheader[_FH_FILENAME_LENGTH])
    flags = fheader[_FH_GENERAL_PURPOSE_FLAG_BITS]
    if flags & 0x800:
        # UTF-8 file names extension
        filename = filename.decode('utf-8')
    else:
        # Historical ZIP filename encoding
        filename = filename.decode('cp437')

    # Create ZipInfo instance to store file information
    x = ZipInfo(filename)
    x.extra = fp.read(fheader[_FH_EXTRA_FIELD_LENGTH])
    x.header_offset = self._next_header_pos

    # The file header stores nearly all the same information needed for ZipInfo as what the
    # central directory file header stores, except for a couple of missing fields.
    # Fill those with neutral defaults.  Bug fix: ZipInfo.comment is a
    # bytes attribute (default b""), so use b'' rather than the integer 0,
    # which broke consumers calling x.comment.decode().
    x.comment = b''
    x.create_version, x.create_system = 0, 0
    x.volume, x.internal_attr, x.external_attr = 0, 0, 0

    (x.extract_version, x.reserved, x.flag_bits, x.compress_type,
     t, d, x.CRC, x.compress_size, x.file_size) = fheader[1:10]
    if x.extract_version > MAX_EXTRACT_VERSION:
        raise NotImplementedError("zip file version %.1f" %
                                  (x.extract_version / 10))

    # Convert date/time code to (year, month, day, hour, min, sec)
    # This comes from the original cpython code.
    x._raw_time = t
    x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                   t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
    x._decodeExtra()

    self.filelist.append(x)
    self.NameToInfo[x.filename] = x
    # NOTE(review): compress_size read from the local header is 0 for
    # entries written with a data descriptor (general-purpose flag bit 3);
    # this iterator assumes such entries do not occur — confirm against
    # the archives being read.
    self._next_header_pos = (fp.tell() + x.compress_size
                             )  # Beginning of the next file's header.
    return x