def add_inode(self, fd, offset, factories):
    """ We think we have a zip file here. """
    b = Zip.Buffer(fd=fd)[offset:]
    try:
        header = Zip.ZipFileHeader(b)
        size = int(header['uncompr_size'])
        compressed_length = int(header['compr_size'])

        ## Some zip programs seem to leave this at 0 - because it's
        ## already in the central directory. Unfortunately the
        ## carver currently does not look at the central directory
        ## - so we just make it a reasonable value
        if compressed_length == 0:
            compressed_length = 100*1024

        name = header['zip_path'].get_value()
        if len(name) == 0 or invalid_filename.search(name):
            pyflaglog.log(pyflaglog.DEBUG,
                          "Thought the name %r is invalid - skipping file" % name[:10])
            return 10

        header_offset = header['data'].buffer.offset
    except:
        return 10

    new_inode = "%s|Z%s:%s" % (fd.inode, offset, compressed_length)
    self._add_inode(new_inode, size, name, fd, factories)
    return size
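## The inode string built above has the form
## "<parent>|Z<offset>:<compressed_length>", which ZipFile.__init__
## below parses back out. A minimal sketch of the round trip, using
## hypothetical offset/length values (the parent inode "Itest" is made
## up for illustration):
def _example_zip_inode(parent_inode="Itest", offset=1024, compressed_length=5000):
    ## Build the specifier exactly as add_inode does:
    inode = "%s|Z%s:%s" % (parent_inode, offset, compressed_length)
    ## ...and parse it back the way ZipFile.__init__ does:
    ourpart = inode.split('|')[-1][1:]
    off, size = ourpart.split(":")
    assert (int(off), int(size)) == (offset, compressed_length)
    return inode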
class ZipFile(File):
    """ A file like object to read files from within zip files.

    We essentially decompress the file on the disk because the file
    may be exceptionally large.
    """
    specifier = 'Z'

    def __init__(self, case, fd, inode):
        File.__init__(self, case, fd, inode)

        ## Make sure our parent is cached:
        self.fd.cache()

        ## Parse out inode - if we got the compressed length provided,
        ## we use that, otherwise we calculate it from the zipfile
        ## header
        parts = inode.split('|')
        ourpart = parts[-1][1:]
        try:
            offset, size = ourpart.split(":")
            self.compressed_length = int(size)
            offset = int(offset)
        except:
            ## No length provided in the inode - we will fall back to
            ## the header value below:
            self.compressed_length = None
            offset = int(ourpart)

        self.offset = offset

        ## Ensure that we can read the file header:
        b = Zip.Buffer(fd=fd)[offset:]
        self.header = Zip.ZipFileHeader(b)

        ## This is sometimes invalid and set to zero - should we query
        ## the db?
        self.size = int(self.header['uncompr_size'])

        if not self.compressed_length:
            self.compressed_length = int(self.header['compr_size'])

        self.type = int(self.header['compression_method'])

        ## Where does the data start?
        self.init()
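## For reference, the data of a zip member begins immediately after
## its local file header: a fixed 30 byte header followed by the
## variable length filename and extra field. A self-contained sketch
## of that calculation from raw bytes (field offsets per the zip
## application note; this is an illustration only, not how
## Zip.ZipFileHeader computes it internally):
import struct

def _example_data_start(header_bytes, header_offset):
    ## The filename and extra-field lengths are the two little-endian
    ## shorts at bytes 26-30 of the local file header:
    name_len, extra_len = struct.unpack("<HH", header_bytes[26:30])
    return header_offset + 30 + name_len + extra_len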
def __init__(self, reassembler):
    self.r = reassembler

    ## Try to load the central directory if possible: This may
    ## fail if the cd is fragmented. FIXME: be able to handle
    ## fragmentation at the CD.
    cd_x = self.r.get_point("Central_Directory")
    self.cds = []
    if cd_x:
        b = Buffer(self.r)[cd_x:]
        while 1:
            try:
                cd = Zip.CDFileHeader(b)
            except RuntimeError, e:
                print "Finished reading CD (%s items)" % len(self.cds)
                break

            self.cds.append(cd)
            b = b[cd.size():]
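## The hits consumed by build_maps below come from an index of zip
## structure signatures found in the image. A minimal sketch of
## building such an index by brute scan, assuming the standard magic
## values from the zip application note (the real load_index reads a
## precomputed index file instead):
import re

zip_magics = {
    'ZipFileHeader':       'PK\x03\x04',
    'CDFileHeader':        'PK\x01\x02',
    'EndCentralDirectory': 'PK\x05\x06',
    }

def _example_load_index(fd):
    data = fd.read()
    hits = {}
    for name, magic in zip_magics.items():
        hits[name] = [ m.start() for m in re.finditer(re.escape(magic), data) ]
    return hits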
def build_maps(self, index_file):
    hits = self.load_index(index_file)
    image_fd = open(self.args[0], 'r')
    zip_files = {}

    for ecd_offset in hits['EndCentralDirectory']:
        ## Each EndCentralDirectory represents a new Zip file
        r = Carver.Reassembler(None)
        b = Buffer(image_fd)[ecd_offset:]
        ecd = Zip.EndCentralDirectory(b)
        print "End Central Directory at offset %s:" % (ecd_offset,)

        ## Find the CD:
        offset_of_cd = ecd['offset_of_cd'].get_value()

        ## Check if the cd is where we think it should be:
        possibles = []
        for x in hits['CDFileHeader']:
            if x == ecd_offset - ecd['size_of_cd'].get_value():
                ## No fragmentation in CD:
                print "No fragmentation in Central Directory at offset %s discovered... good!" % x
                possibles = [ x, ]
                break

            if x % 512 == offset_of_cd % 512:
                print "Possible Central Directory starts at %s" % x
                possibles.append(x)

        ## FIXME: this needs to be made to estimate the most similar
        ## possibility - we really have very little to go on here -
        ## how can we distinguish between two different CDs that occur
        ## in the same spot? I don't think it's very likely in reality
        ## because the CD will be at the end of the zip file which
        ## will be of varying sizes.

        ## We probably should prefer the CD found at image offset
        ## of ecd - ecd['size_of_cd'] which will be the case if
        ## the CD is not fragmented.

        ## For now we go with the first possibility:
        cd_image_offset = possibles[0]

        ## Identify the central directory:
        r.add_point(offset_of_cd, cd_image_offset, "Central_Directory")

        ## We can calculate the offset of ecd here:
        r.add_point(offset_of_cd + ecd['size_of_cd'].get_value(),
                    ecd_offset, "End_Central_Directory")

        ## The file end - this is used to stop the carver:
        r.add_point(offset_of_cd + ecd['size_of_cd'].get_value() + ecd.size(),
                    ecd_offset + ecd.size(),
                    "EOF")

        ## Read all entries in the CD and try to locate their
        ## corresponding ZipFileHeaders:
        for i in range(ecd['total_entries_in_cd_on_disk'].get_value()):
            b = Buffer(image_fd)[cd_image_offset:]
            cd = Zip.CDFileHeader(b)

            ## Now try to find the ZipFileHeader for this cd entry:
            fh_offset = cd['relative_offset_local_header'].get_value()

            for fh_image_offset in hits['ZipFileHeader']:
                ## Apply the modulo rule:
                if fh_image_offset % 512 == fh_offset % 512:
                    print "Possible File header at image offset %s" % fh_image_offset

                    b = Buffer(image_fd)[fh_image_offset:]
                    try:
                        fh = Zip.ZipFileHeader(b)
                    except:
                        print "Oops - no File Header here... continuing"
                        continue

                    ## Is it the file we expect?
                    path = fh['zip_path'].get_value()
                    expected_path = cd['filename'].get_value()

                    ## Check the paths:
                    if path and expected_path and path != expected_path:
                        print "This ZipFileHeader is for %s, while we wanted %s" % (
                            path, expected_path)
                        continue

                    ## Check the expected lengths with the central directory:
                    cd_compr_size = cd['compressed_size'].get_value()
                    cd_uncompr_size = cd['uncompr_size'].get_value()
                    fh_compr_size = fh['compr_size'].get_value()
                    fh_uncompr_size = fh['uncompr_size'].get_value()

                    if cd_compr_size and fh_compr_size and cd_compr_size != fh_compr_size:
                        print "Compressed size does not match (%s - expected %s)" % (
                            cd_compr_size, fh_compr_size)
                        continue

                    if cd_uncompr_size and fh_uncompr_size and cd_uncompr_size != fh_uncompr_size:
                        print "Uncompressed size does not match (%s - expected %s)" % (
                            cd_uncompr_size, fh_uncompr_size)
                        continue

                    print "Will use Zip File Header at %s." % (fh_image_offset)

                    ## Identify point:
                    r.add_point(fh_offset, fh_image_offset, "File_%s" % path)

            ## Progress to the next file in the archive:
            cd_image_offset += cd.size()

        r.save_map("%s.map" % ecd_offset)
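## The modulo rule used above assumes fragmentation happens on sector
## boundaries: if a structure claims zip offset Z but actually lies at
## image offset I, both still share the same position within a 512
## byte sector, so I % 512 == Z % 512. A worked example with made-up
## numbers:
def _example_modulo_rule():
    SECTOR = 512
    offset_of_cd = 1000                    ## zip offset claimed by the ECD
    for candidate in (5608, 7656, 9000):   ## hypothetical index hits
        ## 5608 % 512 == 7656 % 512 == 488 == 1000 % 512, so the first
        ## two are kept as possibles; 9000 % 512 == 296, so it is rejected:
        print candidate, candidate % SECTOR == offset_of_cd % SECTOR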
def decode_ecd_header(self, b, length_to_test):
    ecd = Zip.EndCentralDirectory(b)
    print "Found ECD %s" % ecd
    return ecd.size()
def decode_cd_file(self, b, length_to_test):
    cd = Zip.CDFileHeader(b)
    print "Found CD Header: %s" % cd['filename']
    return cd.size()
def decode_file(self, b, length_to_test):
    """ Attempts to decode and verify a ZipFileHeader """
    fh = Zip.ZipFileHeader(b)
    #print "Zip File Header @ offset %s (name %s) " % (b.offset, fh['zip_path'])

    ## The following is necessary because some Zip writers do not
    ## write the same information in both the ZipFileHeader and
    ## CDFileHeader - FIXME: what do we do if the information is
    ## actually different but set? (This is a common way for
    ## malware to break email filtering or virus scanners, a la zip
    ## bombs).
    compression_method = fh['compression_method'].get_value()
    compressed_size = fh['compr_size'].get_value()
    uncompr_size = fh['uncompr_size'].get_value()
    crc32 = fh['crc32'].get_value()

    for cd in self.cds:
        if cd['filename'] == fh['zip_path']:
            ## Found the CD entry for our file, if any of the
            ## above parameters are not set in the ZipFileHeader,
            ## try to get them from the CD:
            if not compression_method:
                compression_method = cd['compression'].get_value()

            if not compressed_size:
                compressed_size = cd['compressed_size'].get_value()

            if not uncompr_size:
                uncompr_size = cd['uncompr_size'].get_value()

            if not crc32:
                crc32 = cd['crc-32'].get_value()

    ## Deflate:
    if compression_method == 8:
        dc = zlib.decompressobj(-15)
        crc = 0

        self.offset = b.offset + fh.size()
        self.r.seek(self.offset)

        total = 0
        to_read = compressed_size
        while to_read > 0:
            cdata = self.r.read(min(SECTOR_SIZE, to_read))
            ## Guard against a truncated stream - a zero length read
            ## would otherwise loop forever:
            if not cdata:
                break
            #print "Read %s" % len(cdata)
            to_read -= len(cdata)
            data = dc.decompress(cdata)
            total += len(data)
            self.offset += len(cdata)
            crc = binascii.crc32(data, crc)

            ## Only test as much as was asked
            if self.offset > length_to_test:
                return length_to_test

        ## Finalise the data - feed the decompressor a dummy byte to
        ## force it to flush whatever is buffered:
        ex = dc.decompress('Z') + dc.flush()
        total += len(ex)
        crc = binascii.crc32(ex, crc)

        if total != uncompr_size:
            print "Total decompressed data: %s (%s)" % (total, uncompr_size)
            raise IOError("Decompressed file does not have the expected length")

        ## binascii.crc32 returns a signed int - convert to unsigned
        ## before comparing:
        if crc < 0:
            crc = crc + (1 << 32)

        if crc != crc32:
            print "CRC is %d %s" % (crc, crc32)
            raise IOError("CRC does not match")
    else:
        print "Unable to verify compression_method %s - not implemented, skipping file" % compression_method

    ## Sometimes there is some padding before the next file is
    ## written. We try to account for this if possible by scanning
    ## ahead a little bit. This occurs if the file has a data
    ## descriptor record. We ignore this record because its values
    ## are usually present in the CD anyway.
    total_size = fh.size() + compressed_size
    data = self.r.read(SECTOR_SIZE)
    m = zip_header_re.search(data)
    if m:
        total_size += m.start()

    #print fh
    return total_size
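## A self-contained sketch of the verification technique decode_file
## uses: decompress a raw deflate stream (wbits = -15, i.e. no zlib
## header) a sector at a time while accumulating a CRC-32, then check
## the totals. All data here is synthetic:
import zlib, binascii

def _example_verify_deflate(raw, expected_len, expected_crc, chunk=512):
    dc = zlib.decompressobj(-15)
    crc, total = 0, 0
    for i in range(0, len(raw), chunk):
        data = dc.decompress(raw[i:i + chunk])
        total += len(data)
        crc = binascii.crc32(data, crc)
    ex = dc.flush()
    total += len(ex)
    crc = binascii.crc32(ex, crc) & 0xffffffff   ## force unsigned
    return total == expected_len and crc == expected_crc

## Usage with a synthetic stream - zlib.compress output is a zlib
## stream, so strip the 2 byte header and 4 byte Adler-32 trailer to
## get raw deflate data:
_plain = "hello world" * 100
_raw = zlib.compress(_plain)[2:-4]
assert _example_verify_deflate(_raw, len(_plain),
                               binascii.crc32(_plain) & 0xffffffff)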