def test_build_tree():
    string = "abbb"
    tree = build_tree(string)
    assert isinstance(tree, Node)
    assert tree[1] is None
    assert tree[2][1] == 'a'
    assert tree[3][1] == 'b'

def test_decoding_real_world(self):
    frequency_map = get_real_world_input()
    tree = build_tree(frequency_map)
    text = get_real_world_text()
    encoded = encode(text, tree)
    decoded = decode(encoded, tree)
    self.assertEqual(text, decoded)

def test_decoding(self):
    frequency_map = get_wikipedia_input()
    tree = build_tree(frequency_map)
    samples = ['a', 'abc', 'adeaddadcededabadbabeabeadedabacabed']
    for text in samples:
        encoded = encode(text, tree)
        decoded = decode(encoded, tree)
        self.assertEqual(text, decoded)

def test_encoding(self):
    frequency_map = get_wikipedia_input()
    tree = build_tree(frequency_map)
    encoded = encode('adeaddadcededabadbabeabeadedabacabed', tree)
    self.assertEqual(
        encoded.to01(),
        '01000100100000010001110001000010011010000110100111001001110010001000010011010110100111000'
    )

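# The tests above exercise build_tree()/encode()/decode() from the module under test,
# which is not shown here. The following is a minimal, self-contained sketch of how
# such a trio could work; the helper names, the (left, right)-tuple tree layout, and
# the string-based bit output are assumptions, not the tested module's actual API.
import heapq
from collections import Counter


def sketch_build_tree(frequency_map):
    # Heap entries are (frequency, tiebreaker, tree); a tree is either a bare symbol
    # (leaf) or a (left, right) tuple (internal node).
    heap = [(freq, i, sym) for i, (sym, freq) in enumerate(sorted(frequency_map.items()))]
    heapq.heapify(heap)
    if not heap:
        return None
    counter = len(heap)
    while len(heap) > 1:
        f1, _, left = heapq.heappop(heap)
        f2, _, right = heapq.heappop(heap)
        heapq.heappush(heap, (f1 + f2, counter, (left, right)))
        counter += 1
    return heap[0][2]


def sketch_codes(tree, prefix=''):
    # Walk the tree, appending '0' for the left branch and '1' for the right.
    if not isinstance(tree, tuple):
        return {tree: prefix or '0'}
    codes = {}
    codes.update(sketch_codes(tree[0], prefix + '0'))
    codes.update(sketch_codes(tree[1], prefix + '1'))
    return codes


def sketch_encode(text, tree):
    codes = sketch_codes(tree)
    return ''.join(codes[c] for c in text)


def sketch_decode(bits, tree):
    out, node = [], tree
    for bit in bits:
        node = node[0] if bit == '0' else node[1]
        if not isinstance(node, tuple):  # reached a leaf: emit its symbol and restart
            out.append(node)
            node = tree
    return ''.join(out)


sample = 'adeaddadcededabadbabeabeadedabacabed'
sketch_tree = sketch_build_tree(Counter(sample))
assert sketch_decode(sketch_encode(sample, sketch_tree), sketch_tree) == sample
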
def compress_image(input_file):
    before = time.time()
    path = os.path.dirname(__file__) + "/images/" + input_file
    print("Compressing %s ..." % input_file)
    image = Image.open(path)
    print("Image is %d x %d" % (image.size[0], image.size[1]))
    img_raw_size = huffman.raw_size(image.size[0], image.size[1])
    print("Image size is: %d bytes." % img_raw_size)
    data = np.asarray(image)
    hilbert_array = hilbert_compression(data)
    hilbert_array = hilbert_array.astype('uint8')
    print("Counting symbols...")
    counts = huffman.count_hilbert(hilbert_array)
    print("Building tree...")
    tree = huffman.build_tree(counts)
    print("Trimming tree...")
    trimmed_tree = huffman.trim_tree(tree)
    print("Assigning codes to pixels...")
    codes = huffman.assign_codes(trimmed_tree)
    estimated_size = huffman.compressed_size(counts, codes)
    print("Estimated size: %d bytes" % estimated_size)
    output_path = os.path.dirname(__file__) + "/output/" + input_file[:-3] + "txt"
    print("Writing to %s..." % (input_file[:-3] + "txt"))
    stream = OutputBitStream(output_path)
    print("Encoding header...")
    huffman.encode_header(image, stream)
    stream.flush_buffer()
    print("Encoding tree...")
    huffman.encode_tree(trimmed_tree, stream)
    stream.flush_buffer()
    print("Encoding pixels...")
    huffman.encode_hilbert(hilbert_array, codes, stream)
    stream.close()
    after = time.time()
    real_size = stream.bytes_written
    print("Wrote %d bytes." % real_size)
    print("Estimate is %scorrect." % ('' if estimated_size == real_size else 'in'))
    print("Compression ratio: %0.2f" % (float(img_raw_size) / real_size))
    print("Took %0.2f seconds." % (after - before))
    return hilbert_array

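# compress_image() above relies on huffman.compressed_size() for its size estimate.
# A plausible stand-in for that arithmetic (an assumption, not the module's code):
# weight each symbol's count by its code length, round up to whole bytes, and add
# whatever header/tree overhead the container needs.
import math


def sketch_compressed_size(counts, codes, overhead_bytes=0):
    # counts: symbol -> occurrences; codes: symbol -> code (anything with a len()).
    payload_bits = sum(counts[symbol] * len(codes[symbol]) for symbol in counts)
    return math.ceil(payload_bits / 8) + overhead_bytes
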
def decode(filename_in, filename_out):
    with open(filename_in, 'rb') as fi:
        n_header = int.from_bytes(fi.read(8), 'big')
        u = int.from_bytes(fi.read(1), 'big')
        header = fi.read(n_header)
        data = fi.read()
    freq = pickle.loads(header)
    tree = huffman.build_tree(freq)
    map_code = huffman.build_map_code(tree)
    out = huffman.decode(data, map_code, u)
    with open(filename_out, 'wb') as fo:
        fo.write(out)

def test_building_encoding_for_uniform(self):
    frequency_map = get_uniform_input()
    tree = build_tree(frequency_map)
    lookup = build_lookup(tree)
    # Human readable
    lookup_readable = {k: bits.to01() for k, bits in lookup.items()}
    self.assertEqual(lookup_readable, {
        'a': '110',
        'b': '111',
        'c': '00',
        'd': '01',
        'e': '10'
    })

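# Why the uniform-frequency case splits into 2-bit and 3-bit codes: five equally
# weighted symbols merge as 1+1=2, 1+1=2, 1+2=3, 2+3=5, which leaves three leaves at
# depth 2 and two at depth 3 (12 bits total, 2.4 bits per symbol). Which symbols get
# the longer codes depends on tie-breaking, so the exact mapping asserted above is
# specific to this implementation.
uniform_code_lengths = sorted(len(code) for code in ['110', '111', '00', '01', '10'])
assert uniform_code_lengths == [2, 2, 2, 3, 3]
assert sum(uniform_code_lengths) == 12
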
def encode(filename_in, filename_out):
    with open(filename_in, 'rb') as fi:
        freq = huffman.freq_str(read_each(fi))
        tree = huffman.build_tree(freq)
        map_code = huffman.build_map_code(tree)
        fi.seek(0)
        out = huffman.encode(read_each(fi), map_code)
    u = out.buffer_info()[3]  # unused bits of the last byte
    header = pickle.dumps(freq, pickle.HIGHEST_PROTOCOL)
    n_header = len(header)
    with open(filename_out, 'wb') as fo:
        fo.write(
            n_header.to_bytes(8, 'big') + u.to_bytes(1, 'big') + header + out.tobytes())

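# encode() and decode() above agree on a small container layout: an 8-byte big-endian
# header length, one byte recording how many bits of the final byte are unused, the
# pickled frequency table, and then the packed code stream. read_each() is not shown;
# a plausible implementation (an assumption) streams the input file one byte value at
# a time so freq_str() and encode() never need the whole file in memory.
def sketch_read_each(fileobj, chunk_size=4096):
    # Yield the file's byte values (ints 0-255) one at a time, reading in chunks.
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            return
        yield from chunk
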
number_of_symbols = 10
decay_rate = 0.01

# Generate `number_of_symbols` distinct random upper-case ASCII characters
random_symbols = huff.generate_n_random_symbols(number_of_symbols)
freqs = huff.generate_random_symbol_frequencies(
    type_of_distribution, number_of_realization, number_of_symbols, decay_rate)
input_text = huff.generate_random_text_with_predefined_frequencies(
    freqs, random_symbols, 200)

# Calculate the frequency of each symbol in the text
symbols_and_freqs, unique_symbols = huff.calculate_frequency_of_symbols_from_text(
    input_text)

# Build the Huffman tree
tree = huff.build_tree(symbols_and_freqs)

# Traverse the tree and build the code dictionary.
# `tree_type=0` means left -> 0 and right -> 1; `tree_type=1` means left -> 1 and right -> 0.
dictionary = huff.traverse_tree(tree, unique_symbols, tree_type=0)

# Encode the input text with the dictionary
encoded_text = huff.encode_message(dictionary, input_text)

# Decode the encoded text with the same dictionary
decoded_text = huff.decode_message(encoded_text, dictionary)

# Create reports
huff.produce_huffman_report(symbols_and_freqs, dictionary)
huff.huffman_coded_string_report(input_text, encoded_text, symbols_and_freqs, dictionary)
huff.huffman_decoded_string_report(encoded_text, decoded_text, symbols_and_freqs, dictionary)

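# A sketch of the bit-assignment convention described in the comment above. The node
# structure (left/right children and a symbol attribute) is an assumption; huff's
# actual traverse_tree() may represent the tree differently.
def sketch_traverse(node, prefix='', tree_type=0, codes=None):
    codes = {} if codes is None else codes
    left_bit, right_bit = ('0', '1') if tree_type == 0 else ('1', '0')
    if node.left is None and node.right is None:
        codes[node.symbol] = prefix or left_bit  # a single-symbol tree still gets one bit
        return codes
    if node.left is not None:
        sketch_traverse(node.left, prefix + left_bit, tree_type, codes)
    if node.right is not None:
        sketch_traverse(node.right, prefix + right_bit, tree_type, codes)
    return codes
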
        frequency_map[c] += 1
        ccount += 1
    text += line
    words += [w.strip(' \n.,”“') for w in line.split()]

print('= Stats =')
print('Number of characters', ccount)
print('Number of words', len(words))

min_entropy = compute_entropy(frequency_map.values())
print('Minimum entropy', min_entropy)
huffman_entropy = compute_huffman_entropy(frequency_map)
print('Huffman entropy', huffman_entropy)

tree = build_tree(frequency_map)
encoded_text = encode(text, tree)
print('Length of raw text: {} bytes'.format(len(text)))
print('Length of encoded text: {} bytes'.format(len(encoded_text) / 8))
print('Compression rate: {}'.format(len(text) * 8 / len(encoded_text)))

print('= Word-based =')
text_length = 0
frequency_map = {}
for w in words:
    text_length += len(w)
    if w not in frequency_map:
        frequency_map[w] = 0
    frequency_map[w] += 1
avg_word_size = text_length / len(words)

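# The two figures printed above, under the standard definitions: the minimum entropy
# is the Shannon entropy of the character distribution (a lower bound, in bits per
# symbol), and the Huffman entropy is the expected code length under the same
# frequencies. These are stand-in implementations; compute_entropy() and
# compute_huffman_entropy() are not shown here and may differ in signature.
import math


def sketch_entropy(counts):
    total = sum(counts)
    return -sum((c / total) * math.log2(c / total) for c in counts if c)


def sketch_huffman_expected_length(frequency_map, code_lengths):
    # code_lengths: symbol -> length of its Huffman code, e.g. derived from the tree.
    total = sum(frequency_map.values())
    return sum(frequency_map[s] * code_lengths[s] for s in frequency_map) / total
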
def test_build_tree_empty():
    string = ""
    tree = build_tree(string)
    assert tree is None

from pprint import pprint

import huffman
from view import viz_tree

data = b"huffman"
tree = huffman.build_tree(data)
map_code = huffman.build_map_code(tree)

# encode
bin_data = huffman.encode(data, map_code)

print("Map code")
for k, v in map_code.items():
    print("{}: {}".format(chr(k), v.to01()))

print("Encoded data")
print(bin_data.to01())

viz_tree(tree)

# decode
print("After decode")
print(huffman.decode(bin_data.tobytes(), map_code, bin_data.buffer_info()[3]))

# compression ratio: encoded bits relative to the raw 8 bits per input byte
p = len(bin_data) / (len(data) * 8)
print(f"Compressed to {p * 100:.1f}% of original size")

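# Worked check of the figure printed above, assuming a standard Huffman code: the
# exact codewords depend on tie-breaking, but the optimal total length does not.
# In b"huffman", 'f' occurs twice and h/u/m/a/n once each; an optimal code spends
# 18 bits on the payload, so p should come out near 18 / 56, i.e. about 32%.
original_bits = len(b"huffman") * 8     # 56
optimal_payload_bits = 18               # total weighted code length for these frequencies
print(f"Expected ratio: {optimal_payload_bits / original_bits:.3f}")  # ~0.321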