Example #1
0
def hashing(disk_path, meta_path, chunk_size=4096, window_size=512):
    # TODO: need more efficient implementation, e.g. bisect
    # generate hash of base disk
    # disk_path : raw disk path
    # chunk_size : hash chunk size
    # window_size : slicing window size

    prog_bar = AnimatedProgressBar(end=100, width=80, stdout=sys.stdout)
    total_iteration = os.path.getsize(disk_path)/window_size
    iter_count = 0
    prog_interval = 100

    disk_file = open(disk_path, "rb")
    out_file = open(meta_path, "w+b")
    data = disk_file.read(chunk_size)
    if (not data) or len(data) < chunk_size:
        raise DiskError("invalid raw disk size")

    entire_hashing = sha256()
    entire_hashing.update(data)

    s_offset = 0
    data_len = len(data)
    hash_dic = dict()
    while True:
        if (iter_count)%prog_interval == 0:
            prog_bar.process(100.0*prog_interval/total_iteration)
            prog_bar.show_progress()
        iter_count += 1

        hashed_data = sha256(data).digest()
        if hash_dic.get(hashed_data) == None:
            hash_dic[hashed_data]= (hashed_data, s_offset, data_len)

        added_data = disk_file.read(window_size)
        if (not added_data) or len(added_data) != window_size:
            break
        s_offset += window_size
        data = data[window_size:] + added_data
        entire_hashing.update(added_data)

    for hashed_data, s_offset, data_len in list(hash_dic.values()):
        out_file.write(struct.pack("!QI%ds" % len(hashed_data), 
            s_offset, data_len, hashed_data))
    disk_file.close()
    out_file.close()

    return entire_hashing.hexdigest()