import os
import struct
import sys
from hashlib import sha256

# AnimatedProgressBar, DiskError, Memory, DeltaItem, tool, and LOG are assumed
# to be provided by the surrounding package.


def hashing(disk_path, meta_path, chunk_size=4096, window_size=512):
    # TODO: need more efficient implementation, e.g. bisect
    # Generate hash of base disk.
    #   disk_path   : raw disk path
    #   meta_path   : output path for the packed hash records
    #   chunk_size  : hash chunk size
    #   window_size : sliding window size
    prog_bar = AnimatedProgressBar(end=100, width=80, stdout=sys.stdout)
    total_iteration = os.path.getsize(disk_path) // window_size
    iter_count = 0
    prog_interval = 100

    disk_file = open(disk_path, "rb")
    out_file = open(meta_path, "w+b")
    data = disk_file.read(chunk_size)
    if (not data) or len(data) < chunk_size:
        raise DiskError("invalid raw disk size")
    entire_hashing = sha256()
    entire_hashing.update(data)

    s_offset = 0
    data_len = len(data)
    hash_dic = dict()
    while True:
        if iter_count % prog_interval == 0:
            prog_bar.process(100.0 * prog_interval / total_iteration)
            prog_bar.show_progress()
        iter_count += 1

        # record each distinct chunk hash with its first (offset, length)
        hashed_data = sha256(data).digest()
        if hash_dic.get(hashed_data) is None:
            hash_dic[hashed_data] = (hashed_data, s_offset, data_len)

        # slide the window forward by window_size bytes
        added_data = disk_file.read(window_size)
        if (not added_data) or len(added_data) != window_size:
            break
        s_offset += window_size
        data = data[window_size:] + added_data
        entire_hashing.update(added_data)

    for hashed_data, s_offset, data_len in list(hash_dic.values()):
        out_file.write(struct.pack("!QI%ds" % len(hashed_data),
                                   s_offset, data_len, hashed_data))
    disk_file.close()
    out_file.close()
    return entire_hashing.hexdigest()
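
# A minimal reader sketch, not part of the original module: each record that
# hashing() writes is a fixed big-endian "!QI32s" tuple of (start offset,
# chunk length, 32-byte sha256 digest), 44 bytes per record, so the meta file
# can be walked back like this. The function name is hypothetical.
def read_disk_hash_records(meta_path):
    record = struct.Struct("!QI32s")    # 8B offset + 4B length + 32B digest
    with open(meta_path, "rb") as meta_file:
        while True:
            buf = meta_file.read(record.size)
            if len(buf) < record.size:
                break
            yield record.unpack(buf)    # (s_offset, data_len, hashed_data)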
def _get_mem_hash(self, fin, end_offset, hash_list, **kwargs):
    # kwargs
    #   diff              : compare hash_list with self object
    #   apply_free_memory : skip pages marked free instead of diffing them
    #   free_pfn_dict     : free memory physical frame numbers as a
    #                       dictionary, e.g. {pfn: 1, ...}
    diff = kwargs.get("diff", None)
    apply_free_memory = kwargs.get("apply_free_memory", True)
    free_pfn_dict = kwargs.get("free_pfn_dict", None)
    LOG.info("Get hash list of memory page")
    prog_bar = AnimatedProgressBar(end=100, width=80, stdout=sys.stdout)

    total_size = end_offset
    ram_offset = 0
    freed_page_counter = 0
    base_hashlist_length = len(self.hash_list)
    while total_size != ram_offset:
        data = fin.read(Memory.RAM_PAGE_SIZE)
        if not diff:
            hash_list.append((ram_offset, len(data), sha256(data).digest()))
        else:
            # compare input with the hash of the corresponding base memory
            # page; save only when it differs
            hash_list_index = ram_offset // Memory.RAM_PAGE_SIZE
            if hash_list_index < base_hashlist_length:
                self_hash_value = self.hash_list[hash_list_index][2]
            else:
                self_hash_value = None

            if self_hash_value != sha256(data).digest():
                is_free_memory = False
                if (free_pfn_dict is not None) and \
                        (free_pfn_dict.get(ram_offset // Memory.RAM_PAGE_SIZE, None) == 1):
                    is_free_memory = True

                if is_free_memory and apply_free_memory:
                    # Do not compare. It is free memory.
                    freed_page_counter += 1
                else:
                    # get xdelta comparing self.raw
                    source_data = self.get_raw_data(ram_offset, len(data))
                    # save xdelta as DeltaItem only when it is smaller
                    try:
                        if source_data is None:
                            raise IOError("launch memory snapshot is bigger than base vm")
                        patch = tool.diff_data(source_data, data, 2 * len(source_data))
                        if len(patch) < len(data):
                            delta_item = DeltaItem(DeltaItem.DELTA_MEMORY,
                                                   ram_offset, len(data),
                                                   hash_value=sha256(data).digest(),
                                                   ref_id=DeltaItem.REF_XDELTA,
                                                   data_len=len(patch),
                                                   data=patch)
                        else:
                            raise IOError("xdelta3 patch is bigger than original")
                    except IOError as e:
                        # LOG.info("xdelta failed, so save it as raw (%s)" % str(e))
                        delta_item = DeltaItem(DeltaItem.DELTA_MEMORY,
                                               ram_offset, len(data),
                                               hash_value=sha256(data).digest(),
                                               ref_id=DeltaItem.REF_RAW,
                                               data_len=len(data),
                                               data=data)
                    hash_list.append(delta_item)

            # memory over-usage protection
            if len(hash_list) > Memory.RAM_PAGE_SIZE * 1000000:  # 400MB for hashlist
                raise MemoryError("possibly comparing with wrong base VM")

        ram_offset += len(data)
        # print progress bar for every 100 pages
        if (ram_offset % (Memory.RAM_PAGE_SIZE * 100)) == 0:
            prog_bar.set_percent(100.0 * ram_offset / total_size)
            prog_bar.show_progress()
    prog_bar.finish()
    return freed_page_counter
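
# Usage sketch, assuming "base_memory" is an already-loaded Memory object and
# "overlay_mem_path" is a hypothetical raw memory snapshot of the launch VM.
# With diff=True the passed-in list is filled with DeltaItem objects for the
# pages that differ from the base; without diff it gets plain
# (offset, length, sha256 digest) tuples for every page.
def _example_mem_hash(base_memory, overlay_mem_path, end_offset):
    delta_list = []
    with open(overlay_mem_path, "rb") as fin:
        freed = base_memory._get_mem_hash(fin, end_offset, delta_list,
                                          diff=True,
                                          apply_free_memory=False,
                                          free_pfn_dict=None)
    return delta_list, freed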