Example #1
import os
import struct
import sys
from hashlib import sha256
# AnimatedProgressBar and DiskError are defined elsewhere in the
# surrounding project and are assumed to be importable here.

def hashing(disk_path, meta_path, chunk_size=4096, window_size=512):
    """Generate chunk hashes of a base disk image with a sliding window.

    disk_path   : path to the raw disk image
    meta_path   : output path for the hash metadata
    chunk_size  : size of each hashed chunk in bytes
    window_size : step size of the sliding window in bytes
    """
    # TODO: need a more efficient implementation, e.g. bisect

    prog_bar = AnimatedProgressBar(end=100, width=80, stdout=sys.stdout)
    total_iteration = os.path.getsize(disk_path) // window_size
    iter_count = 0
    prog_interval = 100

    disk_file = open(disk_path, "rb")
    out_file = open(meta_path, "w+b")
    data = disk_file.read(chunk_size)
    if (not data) or len(data) < chunk_size:
        raise DiskError("invalid raw disk size")

    entire_hashing = sha256()
    entire_hashing.update(data)

    s_offset = 0
    data_len = len(data)
    hash_dic = dict()
    while True:
        if iter_count % prog_interval == 0:
            prog_bar.process(100.0*prog_interval/total_iteration)
            prog_bar.show_progress()
        iter_count += 1

        hashed_data = sha256(data).digest()
        # record only the first occurrence of each chunk hash (deduplication)
        if hashed_data not in hash_dic:
            hash_dic[hashed_data] = (hashed_data, s_offset, data_len)

        # slide the window forward: drop the oldest window_size bytes and
        # append the newly read bytes
        added_data = disk_file.read(window_size)
        if len(added_data) != window_size:
            break
        s_offset += window_size
        data = data[window_size:] + added_data
        entire_hashing.update(added_data)

    # each record: 8-byte offset (Q), 4-byte length (I), 32-byte SHA-256 digest
    for hashed_data, s_offset, data_len in hash_dic.values():
        out_file.write(struct.pack("!QI%ds" % len(hashed_data),
                                   s_offset, data_len, hashed_data))
    disk_file.close()
    out_file.close()

    return entire_hashing.hexdigest()
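
The metadata file written above is a flat sequence of fixed-size records. As a rough illustration (not part of the original project), a reader for that file might look like the sketch below; the record layout "!QI32s" follows from the struct.pack call above (a SHA-256 digest is 32 bytes), and read_disk_meta is a hypothetical helper name.

import struct

RECORD_FMT = "!QI32s"  # offset (uint64), length (uint32), SHA-256 digest
RECORD_SIZE = struct.calcsize(RECORD_FMT)  # 44 bytes per record

def read_disk_meta(meta_path):
    # hypothetical reader: yields (offset, length, digest) tuples
    with open(meta_path, "rb") as f:
        while True:
            record = f.read(RECORD_SIZE)
            if len(record) != RECORD_SIZE:
                break
            yield struct.unpack(RECORD_FMT, record)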
Example #2
    def _get_mem_hash(self, fin, end_offset, hash_list, **kwargs):
        # kwargs
        #  diff: compare hash_list against this object instead of hashing from scratch
        #  apply_free_memory: if True, skip pages marked as free instead of diffing them
        #  free_pfn_dict: free-memory physical frame numbers as a dictionary {'#': 1, ...}
        diff = kwargs.get("diff", None)
        apply_free_memory = kwargs.get("apply_free_memory", True)
        free_pfn_dict = kwargs.get("free_pfn_dict", None)
        LOG.info("Get hash list of memory page")
        prog_bar = AnimatedProgressBar(end=100, width=80, stdout=sys.stdout)

        total_size = end_offset
        ram_offset = 0
        freed_page_counter = 0
        base_hashlist_length = len(self.hash_list)
        while total_size != ram_offset:
            data = fin.read(Memory.RAM_PAGE_SIZE)
            if not data:
                # unexpected EOF before end_offset; stop instead of looping forever
                break
            if not diff:
                hash_list.append((ram_offset, len(data), sha256(data).digest()))
            else:
                # compare input with hash or corresponding base memory, save only when it is different
                hash_list_index = ram_offset // Memory.RAM_PAGE_SIZE
                if hash_list_index < base_hashlist_length:
                    self_hash_value = self.hash_list[hash_list_index][2]
                else:
                    self_hash_value = None

                if self_hash_value != sha256(data).digest():
                    is_free_memory = False
                    if (free_pfn_dict is not None) and \
                            (free_pfn_dict.get(ram_offset // Memory.RAM_PAGE_SIZE, None) == 1):
                        is_free_memory = True

                    if is_free_memory and apply_free_memory:
                        # Do not compare. It is free memory
                        freed_page_counter += 1
                    else:
                        # compute an xdelta patch against the corresponding
                        # region of the base memory (self.raw)
                        source_data = self.get_raw_data(ram_offset, len(data))
                        # save the xdelta patch as a DeltaItem only when it is
                        # smaller than the raw page
                        try:
                            if source_data is None:
                                raise IOError("launch memory snapshot is bigger than base VM")
                            patch = tool.diff_data(source_data, data, 2*len(source_data))
                            if len(patch) < len(data):
                                delta_item = DeltaItem(DeltaItem.DELTA_MEMORY,
                                        ram_offset, len(data),
                                        hash_value=sha256(data).digest(),
                                        ref_id=DeltaItem.REF_XDELTA,
                                        data_len=len(patch),
                                        data=patch)
                            else:
                                raise IOError("xdelta3 patch is bigger than origianl")
                        except IOError as e:
                            # fall back to storing the raw page data
                            #LOG.info("xdelta failed, so save it as raw (%s)" % str(e))
                            delta_item = DeltaItem(DeltaItem.DELTA_MEMORY,
                                    ram_offset, len(data),
                                    hash_value=sha256(data).digest(),
                                    ref_id=DeltaItem.REF_RAW,
                                    data_len=len(data),
                                    data=data)
                        hash_list.append(delta_item)

                # memory over-usage protection: bail out if the hash list
                # grows past a sanity limit
                if len(hash_list) > Memory.RAM_PAGE_SIZE*1000000:
                    raise MemoryError("possibly comparing with wrong base VM")
            ram_offset += len(data)
            # update the progress bar every 100 pages
            if (ram_offset % (Memory.RAM_PAGE_SIZE*100)) == 0:
                prog_bar.set_percent(100.0*ram_offset/total_size)
                prog_bar.show_progress()
        prog_bar.finish()
        return freed_page_counter
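
The method above is tied to the surrounding Memory and DeltaItem machinery, but its core idea, hashing a snapshot page by page and keeping only the pages whose hashes differ from a base hash list, can be sketched on its own. Everything below (the page_hashes and changed_pages helpers, the 4 KiB PAGE_SIZE constant standing in for Memory.RAM_PAGE_SIZE) is illustrative and not part of the original API.

from hashlib import sha256

PAGE_SIZE = 4096  # assumed RAM page size, mirroring Memory.RAM_PAGE_SIZE

def page_hashes(path):
    # hash a memory snapshot page by page: [(offset, digest), ...]
    hashes = []
    with open(path, "rb") as f:
        offset = 0
        while True:
            page = f.read(PAGE_SIZE)
            if not page:
                break
            hashes.append((offset, sha256(page).digest()))
            offset += len(page)
    return hashes

def changed_pages(base_hashes, snapshot_path):
    # yield offsets of pages whose content differs from the base list,
    # mirroring the diff branch of _get_mem_hash
    base = dict(base_hashes)
    for offset, digest in page_hashes(snapshot_path):
        if base.get(offset) != digest:
            yield offset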