def _RealGetContents(self):
    """Read in the table of contents for the remote ZIP file.

    Locates the end-of-central-directory record via ``_EndRecData``, fetches
    exactly the central-directory byte range over HTTP with a ranged request
    (instead of seeking a local file), and populates ``self.filelist`` and
    ``self.NameToInfo`` with one ZipInfo per entry.

    Raises:
        BadZipfile: if the end record cannot be found or a central-directory
            entry has a bad magic number.
    """
    # Local import keeps this block self-contained; the Python 2 original
    # used cStringIO.StringIO here.
    import io

    try:
        endrec = _EndRecData(self.url)
    except IOError:
        raise BadZipfile("File is not a zip file")
    if not endrec:
        # NOTE: the original raised with Python 2 `raise E, "msg"` syntax,
        # which is a syntax error under Python 3; normalized to a call.
        raise BadZipfile("File is not a zip file")
    if self.debug > 1:
        print(endrec)
    size_cd = endrec[_ECD_SIZE]             # bytes in central directory
    offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
    self.comment = endrec[_ECD_COMMENT]     # archive comment

    # "concat" is zero, unless zip was concatenated to another file
    concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
    # if endrec[_ECD_SIGNATURE] == stringEndArchive64:
    #     # If Zip64 extension structures are present, account for them
    #     concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)

    if self.debug > 2:
        inferred = concat + offset_cd
        print("given, inferred, offset", offset_cd, inferred, concat)
    # self.start_dir: Position of start of central directory
    self.start_dir = offset_cd + concat
    # Fetch exactly the central-directory byte range [start_dir, start_dir+size_cd).
    ECD = _http_get_partial_data(self.url, self.start_dir,
                                 self.start_dir + size_cd - 1)
    data = ECD.read()
    ECD.close()
    fp = io.BytesIO(data)
    total = 0
    while total < size_cd:
        centdir = fp.read(sizeCentralDir)
        if centdir[0:4] != stringCentralDir:
            raise BadZipfile("Bad magic number for central directory")
        centdir = struct.unpack(structCentralDir, centdir)
        if self.debug > 2:
            print(centdir)
        filename = fp.read(centdir[_CD_FILENAME_LENGTH])
        # Decode the filename here (the Python 2 original deferred to
        # ZipInfo._decodeFilename(), which does not exist in Python 3).
        # Bit 11 of the general-purpose flags marks a UTF-8 filename.
        if centdir[5] & 0x800:
            filename = filename.decode('utf-8')
        else:
            # Historical ZIP filename encoding
            filename = filename.decode('cp437')
        # Create ZipInfo instance to store file information
        x = ZipInfo(filename)
        x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
        x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
        x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
        (x.create_version, x.create_system,
         x.extract_version, x.reserved,
         x.flag_bits, x.compress_type, t, d,
         x.CRC, x.compress_size, x.file_size) = centdir[1:12]
        x.volume, x.internal_attr, x.external_attr = centdir[15:18]
        # Convert date/time code to (year, month, day, hour, min, sec)
        x._raw_time = t
        x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                       t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)

        x._decodeExtra()
        x.header_offset = x.header_offset + concat
        self.filelist.append(x)
        self.NameToInfo[x.filename] = x

        # update total bytes read from central directory
        total = (total + sizeCentralDir
                 + centdir[_CD_FILENAME_LENGTH]
                 + centdir[_CD_EXTRA_FIELD_LENGTH]
                 + centdir[_CD_COMMENT_LENGTH])

    if self.debug > 2:
        print("total", total)
def get_zip_infos(self, *filenames):
    """Yield a ZipInfo for each requested member of the ZIP file.

    Walks the central directory in order and yields only entries whose
    names appear in *filenames*, returning early once every requested
    name has been produced.

    Raises:
        RuntimeError: if the archive was already closed.
        BadZipFile: if the archive or its central directory is malformed.
        NotImplementedError: for an entry requiring an unsupported version.
        TooManyFiles: if more than ``self.max_file_count`` entries exist.
    """
    fp = self.fp
    max_file_count = self.max_file_count
    if not fp:
        raise RuntimeError(
            "Attempt to read ZIP archive that was already closed")
    wanted = set(filenames)
    if not wanted:
        return
    try:
        endrec = _EndRecData(fp)
    except OSError:
        raise BadZipFile("File is not a zip file")
    if not endrec:
        raise BadZipFile("File is not a zip file")
    size_cd = endrec[_ECD_SIZE]        # bytes in central directory
    offset_cd = endrec[_ECD_OFFSET]    # offset of central directory
    # "concat" is zero, unless zip was concatenated to another file
    concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
    if endrec[_ECD_SIGNATURE] == stringEndArchive64:
        # If Zip64 extension structures are present, account for them
        concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
    # Read the whole central directory in one go and parse it from an
    # in-memory buffer.
    fp.seek(offset_cd + concat, 0)
    buf = BytesIO(fp.read(size_cd))
    consumed = 0
    entry_count = 0
    while consumed < size_cd:
        raw = buf.read(sizeCentralDir)
        if len(raw) != sizeCentralDir:
            raise BadZipFile("Truncated central directory")
        centdir = struct.unpack(structCentralDir, raw)
        if centdir[_CD_SIGNATURE] != stringCentralDir:
            raise BadZipFile("Bad magic number for central directory")
        raw_name = buf.read(centdir[_CD_FILENAME_LENGTH])
        if centdir[5] & _UTF8_EXTENSION_FLAG:
            # UTF-8 file names extension
            decoded = raw_name.decode('utf-8')
        else:
            # Historical ZIP filename encoding
            decoded = raw_name.decode('cp437')
        # Create ZipInfo instance to store file information
        member = ZipInfo(decoded)
        member.extra = buf.read(centdir[_CD_EXTRA_FIELD_LENGTH])
        member.comment = buf.read(centdir[_CD_COMMENT_LENGTH])
        member.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
        (member.create_version, member.create_system,
         member.extract_version, member.reserved,
         member.flag_bits, member.compress_type, t, d,
         member.CRC, member.compress_size, member.file_size) = centdir[1:12]
        if member.extract_version > MAX_EXTRACT_VERSION:
            raise NotImplementedError("zip file version %.1f" %
                                      (member.extract_version / 10))
        member.volume, member.internal_attr, member.external_attr = \
            centdir[15:18]
        # Convert date/time code to (year, month, day, hour, min, sec)
        member._raw_time = t
        member.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                            t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
        member._decodeExtra()
        member.header_offset += concat
        # update total bytes read from central directory
        consumed += (sizeCentralDir + centdir[_CD_FILENAME_LENGTH] +
                     centdir[_CD_EXTRA_FIELD_LENGTH] +
                     centdir[_CD_COMMENT_LENGTH])
        entry_count += 1
        if max_file_count is not None and entry_count > max_file_count:
            raise TooManyFiles('Too many files in egg')
        if member.filename in wanted:
            wanted.discard(member.filename)
            yield member
            if not wanted:
                return
# NOTE(review): this function is an exact, byte-for-byte duplicate of the
# get_zip_infos defined earlier in this file; at import time this second
# definition silently shadows the first. Confirm and delete one of the two.
def get_zip_infos(self, *filenames):
    """Read in the table of contents for the ZIP file.

    Generator: walks the central directory and yields a ZipInfo for each
    entry whose name is in *filenames*, returning early once every
    requested name has been yielded.

    Raises RuntimeError if the archive is closed, BadZipFile on a malformed
    archive, NotImplementedError on an unsupported extract version, and
    TooManyFiles when more than self.max_file_count entries are seen.
    """
    fp = self.fp
    max_file_count = self.max_file_count
    if not fp:
        raise RuntimeError(
            "Attempt to read ZIP archive that was already closed")
    filenames = set(filenames)
    if len(filenames) == 0:
        # Nothing requested; nothing to yield.
        return
    try:
        endrec = _EndRecData(fp)
    except OSError:
        raise BadZipFile("File is not a zip file")
    if not endrec:
        raise BadZipFile("File is not a zip file")
    size_cd = endrec[_ECD_SIZE]             # bytes in central directory
    offset_cd = endrec[_ECD_OFFSET]         # offset of central directory
    # "concat" is zero, unless zip was concatenated to another file
    concat = endrec[_ECD_LOCATION] - size_cd - offset_cd
    if endrec[_ECD_SIGNATURE] == stringEndArchive64:
        # If Zip64 extension structures are present, account for them
        concat -= (sizeEndCentDir64 + sizeEndCentDir64Locator)
    # start_dir: Position of start of central directory
    start_dir = offset_cd + concat
    fp.seek(start_dir, 0)
    # Slurp the whole central directory, then parse it from memory.
    data = fp.read(size_cd)
    fp = BytesIO(data)
    total = 0
    file_count = 0
    while total < size_cd:
        centdir = fp.read(sizeCentralDir)
        if len(centdir) != sizeCentralDir:
            raise BadZipFile("Truncated central directory")
        centdir = struct.unpack(structCentralDir, centdir)
        if centdir[_CD_SIGNATURE] != stringCentralDir:
            raise BadZipFile("Bad magic number for central directory")
        filename = fp.read(centdir[_CD_FILENAME_LENGTH])
        # Index 5 is the general-purpose flag bits field.
        flags = centdir[5]
        if flags & _UTF8_EXTENSION_FLAG:
            # UTF-8 file names extension
            filename = filename.decode('utf-8')
        else:
            # Historical ZIP filename encoding
            filename = filename.decode('cp437')
        # Create ZipInfo instance to store file information
        x = ZipInfo(filename)
        x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH])
        x.comment = fp.read(centdir[_CD_COMMENT_LENGTH])
        x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET]
        (x.create_version, x.create_system,
         x.extract_version, x.reserved,
         x.flag_bits, x.compress_type, t, d,
         x.CRC, x.compress_size, x.file_size) = centdir[1:12]
        if x.extract_version > MAX_EXTRACT_VERSION:
            raise NotImplementedError("zip file version %.1f" %
                                      (x.extract_version / 10))
        x.volume, x.internal_attr, x.external_attr = centdir[15:18]
        # Convert date/time code to (year, month, day, hour, min, sec)
        x._raw_time = t
        x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                       t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
        x._decodeExtra()
        # Account for any prepended data when recording the local header
        # offset.
        x.header_offset = x.header_offset + concat
        # update total bytes read from central directory
        total = (total + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
                 + centdir[_CD_EXTRA_FIELD_LENGTH]
                 + centdir[_CD_COMMENT_LENGTH])
        file_count += 1
        if max_file_count is not None and file_count > max_file_count:
            raise TooManyFiles('Too many files in egg')
        if x.filename in filenames:
            filenames.discard(x.filename)
            yield x
            if len(filenames) == 0:
                # All requested members delivered; stop scanning.
                return
def fix_zip(self):
    """Rebuild the table of contents of a damaged ZIP archive.

    Pass one scans the memory-mapped file for "PK" record signatures and
    collects local file headers and central-directory entries.  Pass two
    builds ZipInfo objects: central-directory entries that also have a
    matching local header are used directly; local headers with no central
    entry are recovered with metadata guessed from the last good entry.

    Returns:
        False when ``self.broken`` is not set (nothing to do); otherwise
        None, after appending recovered entries to ``self.filelist`` and
        ``self.NameToInfo``.
    """
    if not self.broken:
        return False
    self.fp.seek(0, 2)
    file_len = self.fp.tell()
    mm = mmap.mmap(self.fp.fileno(), 0, access=mmap.ACCESS_READ)
    offset = 0
    file_list = {}
    cd_list = {}
    try:
        # Pass one: scan the whole file for record signatures.
        while offset + 4 < file_len:
            hdr_off = mm.find(b"PK", offset)
            if hdr_off == -1:
                break
            hdr_type = mm[hdr_off:hdr_off + 4]
            if hdr_type == stringFileHeader:
                # Local file header.
                if hdr_off + sizeFileHeader > file_len:
                    break
                fheader = struct.unpack(
                    structFileHeader, mm[hdr_off:hdr_off + sizeFileHeader])
                start = hdr_off
                size = (sizeFileHeader +
                        fheader[_FH_COMPRESSED_SIZE] +
                        fheader[_FH_FILENAME_LENGTH] +
                        fheader[_FH_EXTRA_FIELD_LENGTH])
                name = mm[hdr_off + sizeFileHeader:
                          hdr_off + sizeFileHeader +
                          fheader[_FH_FILENAME_LENGTH]]
                file_list[name] = [start, size, fheader]
                offset = hdr_off + size
            elif hdr_type == stringCentralDir:
                # Central-directory entry.
                if hdr_off + sizeCentralDir > file_len:
                    break
                centdir = struct.unpack(
                    structCentralDir, mm[hdr_off:hdr_off + sizeCentralDir])
                start = hdr_off
                size = (sizeCentralDir +
                        centdir[_CD_FILENAME_LENGTH] +
                        centdir[_CD_EXTRA_FIELD_LENGTH] +
                        centdir[_CD_COMMENT_LENGTH])
                name = mm[hdr_off + sizeCentralDir:
                          hdr_off + sizeCentralDir +
                          centdir[_CD_FILENAME_LENGTH]]
                cd_list[name] = [start, size, centdir]
                offset = hdr_off + size
            elif hdr_type == stringEndArchive:
                offset = hdr_off + sizeEndCentDir
            else:
                # "PK" that is not a known record type; resume scanning one
                # byte further on.
                offset = hdr_off + 1

        # Guesses carried over to entries recovered from bare local headers.
        last_cv = 20
        last_ea = 0
        last_cs = 0
        last_dt = (0, 0)

        # Pass two (a): entries present in the central directory.
        for filename, (start, size, centdir) in cd_list.items():
            if filename not in file_list:
                continue
            if isinstance(filename, bytes):
                x = ZipInfo(filename.decode('utf-8', 'backslashreplace'))
            else:
                x = ZipInfo(filename)
            # BUG FIX: the extra field starts after the fixed header AND the
            # filename; the old code started it right at the end of the
            # fixed header, so x.extra captured the filename bytes.
            extra_off = start + sizeCentralDir + centdir[_CD_FILENAME_LENGTH]
            x.extra = mm[extra_off:extra_off + centdir[_CD_EXTRA_FIELD_LENGTH]]
            extra_off += centdir[_CD_EXTRA_FIELD_LENGTH]
            # BUG FIX: the comment is _CD_COMMENT_LENGTH bytes long; the old
            # code sliced _CD_EXTRA_FIELD_LENGTH bytes instead.
            x.comment = mm[extra_off:extra_off + centdir[_CD_COMMENT_LENGTH]]
            # Trust the offset of the local header actually found on disk.
            x.header_offset = file_list[filename][0]
            (x.create_version, x.create_system,
             x.extract_version, x.reserved,
             x.flag_bits, x.compress_type, t, d,
             x.CRC, x.compress_size, x.file_size) = centdir[1:12]
            x.volume, x.internal_attr, x.external_attr = centdir[15:18]
            # Convert date/time code to (year, month, day, hour, min, sec)
            x._raw_time = t
            x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                           t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
            last_ea = x.external_attr
            last_cs = x.create_system
            last_cv = x.create_version
            last_dt = (d, t)
            # noinspection PyProtectedMember
            x._decodeExtra()
            self.filelist.append(x)
            self.NameToInfo[x.filename] = x

        # Pass two (b): orphan local headers with no central-directory entry.
        for filename, (start, size, fheader) in file_list.items():
            if filename in cd_list:
                continue
            x = ZipInfo(filename.decode('utf-8', 'backslashreplace'))
            # BUG FIX: ZipInfo.extra / .comment are bytes in Python 3; the
            # old code assigned str ("").
            x.extra = b""
            x.comment = b""
            x.header_offset = file_list[filename][0]
            x.create_version = last_cv
            x.create_system = last_cs
            x.extract_version = fheader[_FH_EXTRACT_VERSION]
            x.reserved = 0
            x.flag_bits = fheader[_FH_GENERAL_PURPOSE_FLAG_BITS]
            x.compress_type = fheader[_FH_COMPRESSION_METHOD]
            d, t = last_dt
            x.CRC = fheader[_FH_CRC]
            x.compress_size = fheader[_FH_COMPRESSED_SIZE]
            x.file_size = fheader[_FH_UNCOMPRESSED_SIZE]
            x.volume = 0
            x.internal_attr = 0
            x.external_attr = last_ea
            # Convert date/time code to (year, month, day, hour, min, sec)
            x._raw_time = t
            x.date_time = ((d >> 9) + 1980, (d >> 5) & 0xF, d & 0x1F,
                           t >> 11, (t >> 5) & 0x3F, (t & 0x1F) * 2)
            # noinspection PyProtectedMember
            x._decodeExtra()
            self.filelist.append(x)
            self.NameToInfo[x.filename] = x
    finally:
        mm.close()
def next(self):
    """Return the next member of the archive as a ZipInfo object.

    Returns None once the central directory is reached, i.e. after every
    member has been produced.  Analogous to TarFile.next(): each call
    parses one local file header and builds a ZipInfo from it.  The logic
    follows ZipFile._RealGetContents(), which builds ZipInfo objects from
    central-directory records, adapted here to the local-file-header
    struct (for that implementation, see
    https://github.com/python/cpython/blob/048f54dc75d51e8a1c5822ab7b2828295192aaa5/Lib/zipfile.py#L1316).
    """
    stream = self.fp
    # Skip any unread bytes of the previous member so we land exactly on
    # the next header.
    stream.read(self._next_header_pos - stream.tell())

    raw = stream.read(sizeFileHeader)
    if len(raw) != sizeFileHeader:
        raise BadZipFile("Truncated file header")
    header = struct.unpack(structFileHeader, raw)

    signature = header[_FH_SIGNATURE]
    if signature == stringCentralDir:
        # The central directory starts where the local headers end, and
        # both record types keep their signature at index 0 in the same
        # format — so seeing it here means iteration is complete.
        self._loaded = True
        return None
    if signature != stringFileHeader:
        raise BadZipFile("Bad magic number for file header")

    name_bytes = stream.read(header[_FH_FILENAME_LENGTH])
    if header[_FH_GENERAL_PURPOSE_FLAG_BITS] & 0x800:
        # UTF-8 file names extension
        member_name = name_bytes.decode('utf-8')
    else:
        # Historical ZIP filename encoding
        member_name = name_bytes.decode('cp437')

    # Create ZipInfo instance to store file information.
    x = ZipInfo(member_name)
    x.extra = stream.read(header[_FH_EXTRA_FIELD_LENGTH])
    x.header_offset = self._next_header_pos

    # The local file header lacks a few fields the central directory has;
    # zero them out here.
    x.comment = 0
    x.create_version, x.create_system = 0, 0
    x.volume, x.internal_attr, x.external_attr = 0, 0, 0
    (x.extract_version, x.reserved, x.flag_bits, x.compress_type,
     mod_time, mod_date, x.CRC, x.compress_size, x.file_size) = header[1:10]
    if x.extract_version > MAX_EXTRACT_VERSION:
        raise NotImplementedError("zip file version %.1f" %
                                  (x.extract_version / 10))

    # Decode the DOS date/time words into (year, month, day, hour, min, sec).
    x._raw_time = mod_time
    x.date_time = ((mod_date >> 9) + 1980, (mod_date >> 5) & 0xF,
                   mod_date & 0x1F, mod_time >> 11, (mod_time >> 5) & 0x3F,
                   (mod_time & 0x1F) * 2)
    x._decodeExtra()

    self.filelist.append(x)
    self.NameToInfo[x.filename] = x
    # The member's data follows immediately; the next header begins after
    # compress_size bytes of it.
    # NOTE(review): assumes the local header's compress_size is accurate
    # (no data-descriptor entries) — confirm for streamed archives.
    self._next_header_pos = stream.tell() + x.compress_size
    return x