def savingQuantizedDctBlocks(blocks, yLen, xLen, useHuffman, img, bitBits, runBits, rbBits, h, w):
    """Encode quantized DCT blocks into a byte string and report sizes.

    Symbol statistics and the zig-zag ordered blocks come from
    ``huffmanCounterWholeImg``. A Huffman codebook is always built from those
    statistics, but it is only passed to ``runLength`` when ``useHuffman`` is
    truthy (otherwise ``None`` is passed). Size statistics are printed as a
    side effect.

    Args:
        blocks, h, w: forwarded unchanged to ``huffmanCounterWholeImg``.
        yLen, xLen: grid dimensions; ``yLen * xLen * 3`` zig-zag entries are
            encoded and each dimension is stored as a 12-bit header field.
        useHuffman: whether the Huffman codebook is given to ``runLength``.
        img: object exposing ``size``; used only for the printed statistics.
        bitBits, runBits, rbBits: forwarded to ``huffmanCounterWholeImg`` and
            ``runLength``.

    Returns:
        tuple: ``(header + payload, sortedHfm)`` where ``header`` is three
        bytes packing ``xLen`` and ``yLen`` and ``sortedHfm`` is a list of
        ``[code, symbol]`` pairs in ``rbCount.most_common()`` order.
    """
    rbCount, zigZag = huffmanCounterWholeImg(blocks, yLen, xLen, h, w, bitBits, runBits, rbBits)
    hfm = huffman.codebook(rbCount.items())
    sortedHfm = [[hfm[symbol], symbol] for symbol, _ in rbCount.most_common()]

    table = hfm if useHuffman else None
    # Collect the per-entry codes and join once instead of growing a string
    # inside the triple loop.
    pieces = []
    DC = 0  # threaded through every runLength call, in traversal order
    for y in range(yLen):
        for x in range(xLen):
            for i in range(3):
                codeNew, DC = runLength(zigZag[y * xLen * 3 + x * 3 + i], DC, table,
                                        bitBits, runBits, rbBits)
                pieces.append(codeNew)
    code = "".join(pieces)

    savedImg = runLength2bytes(code)
    print(str(code[:100]) + "......")
    print(str(savedImg[:20]) + "......")
    print("Image original size: %.3f MB" % (img.size / (2**20)))
    print("Compression image size: %.3f MB" % (len(savedImg) / 2**20))
    print("Compression ratio: %.2f : 1" % (img.size / 2**20 / (len(savedImg) / 2**20)))

    # Header: xLen and yLen as two 12-bit fields packed into three bytes.
    xBits = format(xLen, '012b')
    yBits = format(yLen, '012b')
    header = bytes([
        int(xBits[:8], 2),
        int(xBits[8:] + yBits[:4], 2),
        int(yBits[4:], 2),
    ])
    return header + savedImg, sortedHfm
def buttonListener1(self, event):
    """Ask the user for a dictionary file and build a Huffman codebook from it.

    The chosen file is read and split on ``#``. If that yields more than one
    piece, every piece except the last gets a random weight in ``[0, Max]``,
    a codebook is built from them, and ``self.keyDict`` is refilled with the
    resulting ``(word, code)`` pairs. Otherwise a warning dialog is shown.
    ``self.dict`` is overwritten in both cases.

    Args:
        event: the triggering event; not used by this handler.
    """
    root = tkinter.filedialog.askopenfilename(
        title="选择字典",
        initialdir=(os.path.expanduser(default_root)),
        filetypes=[("Text file", "*.txt*")])
    if not root:
        # Nothing was selected.
        return

    # Store the dictionary. The context manager closes the file on every
    # path, including the invalid-content one.
    with open(root, "r") as new_file:
        self.dict = new_file.read().split("#")

    if len(self.dict) <= 1:
        tkinter.messagebox.showwarning(
            title="打开字典失败",
            message="字典内容不正确,请确保字典内容格式正确且数目大于一个单词")
        return

    print("打开字典" + root + "成功")
    print(self.dict)
    self.hasDict = True
    self.label = Label(self.frame, text=root, bg="#fafaff", fg="#9966CC")
    self.label.grid(row=0, column=1, padx=5, pady=5, sticky=(W, E))

    # Generate random weights; the piece after the final '#' is left out.
    _dict = [(word, random.randint(0, Max)) for word in self.dict[:-1]]

    # Build the Huffman tree and publish its (word, code) pairs.
    tree = huffman.codebook(_dict)
    self.keyDict.clear()
    self.keyDict.extend(tree.items())
    print(self.keyDict)
def Huf(inputs):
    """Print the Huffman codebook for the entries in ``inputs``.

    Args:
        inputs: collection indexable by ``0 .. len(inputs) - 1`` whose
            elements provide a ``'song_name'`` (symbol) and a ``'count'``
            (weight) entry.
    """
    weighted = []
    for index in range(len(inputs)):
        entry = inputs[index]
        weighted.append((entry['song_name'], entry['count']))
    print(huffman.codebook(weighted))
def huffman_encode(s1):
    """Huffman-encode ``s1`` and return the bit string in 6-bit chunks.

    Each distinct element of ``s1`` is weighted by the number of elements of
    ``s1`` that contain it (``str.find`` match, so substrings count). The
    resulting codebook is printed, the codes of all elements are concatenated,
    and every 6-character slice of that string is passed through
    ``encode_b64``.

    Args:
        s1: re-iterable collection of strings (or a string).

    Returns:
        list: the ``encode_b64`` result for each 6-character slice.
    """
    word_dict = dict.fromkeys(set(s1), 0)
    for item in s1:
        for word in word_dict:
            if item.find(word) != -1:
                word_dict[word] += 1

    temp_content = huffman.codebook(list(word_dict.items()))
    print(temp_content)

    ans = ''.join(temp_content[item] for item in s1)
    return [encode_b64(ans[start:start + 6]) for start in range(0, len(ans), 6)]
def huffmanEncode(data):
    """Return Huffman-encoded data along with its codebook.

    Args:
        data: iterable of hashable symbols to encode.

    Returns:
        tuple: ``(packed, codebook)`` where ``codebook`` maps each symbol to
        its bit string and ``packed`` is ``pack64`` applied to the
        concatenated '0'/'1' string of the encoded data.
    """
    codebook = huffman.codebook(collections.Counter(data).items())
    encoded = bitarray()
    encoded.encode(bitarrayDict(codebook), data)
    # Ask the bitarray for its bits directly rather than slicing them out of
    # its repr, which silently depends on the exact "bitarray('...')" format.
    return pack64(encoded.to01()), codebook
def __init__(self, dataset):
    """Build the symbol/weight list and the Huffman codebook for ``dataset``.

    ``self.items`` holds ``(str(symbol), count)`` pairs ordered by symbol;
    ``self.codebook`` is the codebook built from them, with the keys ``" "``
    and ``"/n"`` removed when present.
    """
    frequencies = collections.Counter(dataset)
    self.items = [(str(symbol), count) for symbol, count in sorted(frequencies.items())]
    self.codebook = huffman.codebook(self.items)
    # NOTE(review): "/n" is a slash followed by 'n', not the newline escape
    # "\n" -- confirm which key was meant to be dropped.
    for unwanted in (" ", "/n"):
        self.codebook.pop(unwanted, None)
def generate_codes(text, verbose=False):
    """Assign a Huffman code to every non-stopword in ``text``.

    Words are runs of ``a-z`` that may be joined by single apostrophes or
    hyphens, matched on the lower-cased text; words in ``STOPWORDS`` are
    skipped.

    Args:
        text: input text.
        verbose: accepted but not used here.

    Returns:
        dict: word -> ``{"word": ..., "count": ..., "code": ...}``.
    """
    word_counts = collections.Counter()
    for match in re.finditer("([a-z]+(['-][a-z]+)*)", text.lower()):
        word = match.group(1)
        if word not in STOPWORDS:
            word_counts[word] += 1

    codebook = huffman.codebook(word_counts.items())

    result = {}
    for word, count in word_counts.items():
        result[word] = {"word": word, "count": count, "code": codebook[word]}
    return result
def main():
    """Huffman-encode the ``fileinput`` stream into ``<last file>.huffman``.

    The module-level ``dictionary`` is seeded with a count of one for the
    newline character and for code points 33..126, then incremented for
    every character read. Any character not in ``dictionary`` prints its code
    point and an error, then exits with status 1. The input is read a second
    time to build the bit string, which is written out in 8-bit groups.
    """
    # Seed every supported symbol so each one gets a code.
    dictionary[chr(10)] = 1  # LF / newline
    for codepoint in range(33, 127):
        dictionary[chr(codepoint)] = 1

    # First pass: count occurrences of every character in the input.
    for rows in fileinput.input():
        for key in rows:
            if key not in dictionary:
                print(ord(key))
                print('error not in dict')
                sys.exit(1)
            dictionary[key] += 1

    # Build the codebook once and reuse it for both printing and encoding.
    huff_dict = huffman.codebook(dictionary.items())
    print(huff_dict)

    # Second pass: concatenate the code of every known character.
    bitArrayStr = ''.join(
        huff_dict[key]
        for rows in fileinput.input()
        for key in rows
        if key in huff_dict
    )

    # NOTE(review): a trailing group shorter than 8 bits is converted as-is,
    # so its original bit length is not recorded in the output.
    buffer = bytearray(
        int(bitArrayStr[i:i + 8], 2) for i in range(0, len(bitArrayStr), 8)
    )

    # The with-block closes the file; no explicit close() is needed.
    with open(fileinput.filename() + ".huffman", "bw") as f:
        f.write(buffer)
def coding(self, _rle):
    """Encode ``_rle`` with a Huffman code built from its own frequencies.

    Args:
        _rle: re-iterable sequence of hashable symbols.

    Returns:
        tuple: ``(codes, table)`` where ``codes`` lists the code of each
        element of ``_rle`` in order and ``table`` is the symbol -> code
        mapping.
    """
    table = huffman.codebook(Counter(_rle).items())
    return [table[symbol] for symbol in _rle], table
def generate_codes():
    """Return the Huffman codebook items for the module's symbol groups.

    Every symbol in a group shares that group's fixed weight. The weighted
    list is assembled in the order funchars, numchars, symchars,
    short_lispnames, long_lispnames2.

    Returns:
        The ``items()`` view of the codebook (symbol, code pairs).
    """
    weighted_groups = (
        (funchars, 9),
        (numchars, 10),
        (symchars, 20),
        (short_lispnames, 100),
        (long_lispnames2, 1),
    )
    all_w = [(symbol, weight) for group, weight in weighted_groups for symbol in group]
    return huffman.codebook(all_w).items()
def encode(quantised_model_dict):
    """Build one Huffman codebook over three classifier weight tensors.

    The entries ``classifier.1.weight``, ``classifier.4.weight`` and
    ``classifier.6.weight`` are converted with ``.numpy()``, flattened and
    concatenated; the codebook is built from the value frequencies of the
    result.

    Returns:
        tuple: ``(codebook, number of distinct values)``.
    """
    print("\n")
    print("Generating Codebook...")
    layer_names = ('classifier.1.weight', 'classifier.4.weight', 'classifier.6.weight')
    flattened = [numpy.ravel(quantised_model_dict[name].numpy()) for name in layer_names]
    concat = numpy.concatenate(flattened, axis=0)
    distinct_count = numpy.unique(concat).size
    value_counts = collections.Counter(concat.tolist())
    codebook = huffman.codebook(value_counts.items())
    print("Codebook Generated")
    return codebook, distinct_count
def __init__(self, input: PacketizedStream, distribution):
    """Set up the code table and output stream for ``distribution``.

    Args:
        input: stream stored as ``self.input``.
        distribution: mapping of input word -> weight; its keys must support
            ``max()`` and ``+ 1``.

    Attributes set:
        table: Huffman code per word, with each code string reversed.
        max_input_word: largest key of ``distribution`` plus one.
        max_code_len: length of the longest code in ``table``.
        output: ``VariableWidthStream(max_code_len + 1)``.
    """
    self.input = input
    self.distribution = distribution

    codes = huffman.codebook(self.distribution.items())
    self.table = {}
    for word, bits in codes.items():
        self.table[word] = bits[::-1]

    self.max_input_word = max(self.distribution) + 1
    self.max_code_len = max(len(bits) for bits in self.table.values())
    self.output = VariableWidthStream(self.max_code_len + 1)
def compute_huffman_coding(translations, qstrs, compression_filename):
    """Derive a canonical Huffman table over all strings and write it as C.

    The second element of every ``translations`` entry and the third element
    of every ``qstrs`` value are concatenated; a codebook is built from the
    character frequencies. Symbols are then ordered by (code length, symbol)
    and renumbered canonically. Diagnostics are printed as ``//`` lines and
    the tables are written to ``compression_filename``.

    Returns:
        tuple: ``(values, lengths)`` -- the ordered symbol list and a
        ``bytearray`` holding the number of codes for each bit length from 1
        to the maximum length plus one.
    """
    texts = [entry[1] for entry in translations]
    texts.extend(qstr for _, _, qstr in qstrs.values())
    all_strings_concat = "".join(texts)
    counts = collections.Counter(all_strings_concat)
    cb = huffman.codebook(counts.items())

    values = []
    length_count = {}
    canonical = {}
    next_code = 0
    previous_width = None
    ordered = sorted(cb.items(), key=lambda item: (len(item[1]), item[0]))
    for ch, code in ordered:
        values.append(ch)
        width = len(code)
        length_count[width] = length_count.get(width, 0) + 1
        # Moving to a longer code length shifts the running code left.
        if previous_width:
            next_code <<= (width - previous_width)
        canonical[ch] = format(next_code, '0{}b'.format(width))
        shown = C_ESCAPES.get(ch, ch)
        print("//", ord(ch), shown, counts[ch], canonical[ch], next_code)
        next_code += 1
        previous_width = width

    print("// length count", length_count)
    lengths = bytearray(
        length_count.get(bits, 0) for bits in range(1, max(length_count) + 2)
    )
    print("// values", values, "lengths", len(lengths), lengths)
    encoded_bits = sum(len(cb[u]) for u in all_strings_concat)
    print("// estimated total memory size",
          len(lengths) + 2 * len(values) + encoded_bits)
    print("//", values, lengths)

    # Use 16-bit entries only when some symbol does not fit in a byte.
    values_type = "uint16_t" if max(ord(u) for u in values) > 255 else "uint8_t"
    max_translation_encoded_length = max(
        len(translation.encode("utf-8")) for original, translation in translations)

    with open(compression_filename, "w") as f:
        f.write("const uint8_t lengths[] = {{ {} }};\n".format(
            ", ".join(str(count) for count in lengths)))
        f.write("const {} values[] = {{ {} }};\n".format(
            values_type, ", ".join(str(ord(u)) for u in values)))
        f.write("#define compress_max_length_bits ({})\n".format(
            max_translation_encoded_length.bit_length()))
    return values, lengths
def test_huffman():
    """Endlessly compare ``HuffmanEncoder`` against ``huffman.codebook``.

    Each round fits the encoder on a random paragraph of 1-99 words and
    asserts that every reference code matches the encoder's ``_item2code``
    entry. The loop has no exit; it stops only on an assertion failure or an
    interrupt.
    """
    mismatch = "their_dict['{}'] = {}, but my_dict['{}'] = {}"
    while True:
        para = random_paragraph(np.random.randint(1, 100))
        encoder = HuffmanEncoder()
        encoder.fit(para)
        my_dict = encoder._item2code
        their_dict = huffman.codebook(Counter(para).items())

        for k, v in their_dict.items():
            assert k in my_dict, "key `{}` not in my_dict".format(k)
            assert my_dict[k] == v, mismatch.format(k, v, k, my_dict[k])
        print("PASSED")
def huffman_compress(seq):
    """Huffman-encode ``seq`` and return the bit string with a decode table.

    Args:
        seq: re-iterable sequence of hashable symbols.

    Returns:
        tuple: ``(binstring, huffdic_decode)`` where ``binstring`` is the
        concatenation of the code of every element and ``huffdic_decode``
        maps each code back to its symbol. An empty ``seq`` yields
        ``("", {})``.
    """
    probs = list(Counter(seq).items())  # symbol weights

    # Nothing to encode: return empty results without consulting the
    # codebook builder at all.
    if not probs:
        return ("", {})

    # With a single symbol Huffman has no information to encode, so handle
    # it separately: one '1' bit per occurrence.
    if len(probs) == 1:
        symbol, occurrences = probs[0]
        return ('1' * occurrences, {'1': symbol})

    huffdic_encode = huffman.codebook(probs)
    # probs is in first-appearance order, matching the order in which the
    # decode table used to be filled while walking seq.
    huffdic_decode = {huffdic_encode[symbol]: symbol for symbol, _ in probs}
    binstring = "".join(huffdic_encode[x] for x in seq)
    return (binstring, huffdic_decode)
def test_encode_success_small_file_data(self):
    """Check the codebook header that ``AT.Encoder`` writes.

    The encoded stream is expected to start with one byte giving the width
    of the size field (asserted to be 1), then the size field itself, then
    that many bytes of msgpack data that unpack to the reference codebook.
    """
    # NOTE(review): the test name says "small" but the fixture used is
    # file_data_big -- confirm this is intended.
    expected_coding = huffman.codebook(collections.Counter(file_data_big).items())
    stream = bitstring.ConstBitStream(AT.Encoder(file_data_big).encode())

    size_field_width = int.from_bytes(stream.read('bytes:1'), byteorder='big')
    self.assertEqual(size_field_width, 1)

    size_field = stream.read("bytes:{}".format(size_field_width))
    packed_size = int.from_bytes(size_field, byteorder='big')
    packed_coding = stream.read("bytes:{}".format(packed_size))
    actual_coding = msgpack.unpackb(packed_coding, raw=False)

    self.assertEqual(expected_coding, actual_coding)
    self.assertEqual(len(expected_coding), len(actual_coding))
def generate_huffman_trees_for_windows_kmers(all_huffmans_trees):
    """Assign a Huffman code to every tree key, weighted by its frequency.

    The frequencies in ``all_huffmans_trees["tree_key_freq"]`` act as
    weights, so keys with a larger value receive shorter codewords. A lone
    key is given the fixed code ``"0"``.

    Args:
        all_huffmans_trees: dict with a ``"tree_key_freq"`` mapping
            (key -> frequency) and a ``"tree_code"`` mapping that is filled
            in place.

    Returns:
        The same ``all_huffmans_trees`` object.
    """
    key_freq = all_huffmans_trees["tree_key_freq"]
    tree_code = all_huffmans_trees["tree_code"]
    temp = list(key_freq.items())

    if len(temp) == 1:
        # A single key needs no codebook; it always gets "0".
        tree_code[temp[0][0]] = "0"
        return all_huffmans_trees

    # Only build the codebook on the path that actually reads it.
    huffman_for_trees = huffman.codebook(temp)
    # Fill in descending-frequency order so tree_code keeps that ordering.
    for key, _ in sorted(key_freq.items(), key=lambda item: item[1], reverse=True):
        tree_code[key] = huffman_for_trees[key]
    return all_huffmans_trees
def test_basic(numer):
    """Interactively print a codebook for the first ``numer`` random symbols.

    Symbols ``A`` to ``I`` each get a random weight from 1 to 9. The first
    ``numer`` pairs are printed, then the function waits for a line of
    input; the codebook is built and printed only if that line is empty.

    Args:
        numer: how many of the nine symbols to use.
    """
    randy = [(letter, randint(1, 9)) for letter in "ABCDEFGHI"]
    inpu = [randy[position] for position in range(numer)]
    print(inpu)
    if input() == "":
        print(huffman.codebook(inpu))
def test_counter(self):
    """The codebook for sorted character counts matches the known codes."""
    frequencies = collections.Counter("man the stand banana man")
    codes = huffman.codebook(sorted(frequencies.items()))
    self.assertEqual(
        codes,
        {
            " ": "111",
            "a": "10",
            "b": "0101",
            "d": "0110",
            "e": "11000",
            "h": "0100",
            "m": "0111",
            "n": "00",
            "s": "11001",
            "t": "1101",
        },
    )
def compute_huffman_coding(translations, qstrs, compression_filename):
    """Derive a canonical Huffman table over UTF-8 bytes and write it as C.

    The second element of every ``translations`` entry and the third element
    of every ``qstrs`` value are concatenated and UTF-8 encoded. All 256 byte
    values take part in the codebook; bytes that never occur get weight 0.
    Symbols are ordered by (code length, byte value) and renumbered
    canonically. Diagnostics are printed as ``//`` lines and the tables are
    written to ``compression_filename``.

    Returns:
        tuple: ``(values, lengths)`` -- a ``bytearray`` of byte values in
        canonical order and a ``bytearray`` holding the number of codes for
        each bit length from 1 to the maximum length.
    """
    texts = [entry[1] for entry in translations]
    texts.extend(qstr for _, _, qstr in qstrs.values())
    all_strings_concat = "".join(texts).encode("utf-8")
    counts = collections.Counter(all_strings_concat)

    # Give every byte value an entry so each one ends up with a code.
    for byte in range(256):
        counts.setdefault(byte, 0)
    cb = huffman.codebook(counts.items())

    values = bytearray()
    length_count = {}
    canonical = {}
    next_code = 0
    previous_width = None
    ordered = sorted(cb.items(), key=lambda item: (len(item[1]), item[0]))
    for ch, code in ordered:
        values.append(ch)
        width = len(code)
        length_count[width] = length_count.get(width, 0) + 1
        # Moving to a longer code length shifts the running code left.
        if previous_width:
            next_code <<= (width - previous_width)
        canonical[ch] = format(next_code, '0{}b'.format(width))
        as_char = chr(ch)
        shown = C_ESCAPES[as_char] if as_char in C_ESCAPES else as_char
        print("//", ch, shown, counts[ch], canonical[ch], next_code)
        next_code += 1
        previous_width = width

    lengths = bytearray(
        length_count.get(bits, 0) for bits in range(1, max(length_count) + 1)
    )
    print("//", values, lengths)

    with open(compression_filename, "w") as f:
        f.write("const uint8_t lengths[] = {{ {} }};\n".format(
            ", ".join(str(count) for count in lengths)))
        f.write("const uint8_t values[256] = {{ {} }};\n".format(
            ", ".join(str(byte) for byte in values)))
    return values, lengths
def test_counter(self):
    """Sorted ``Counter`` items produce the expected Huffman codebook."""
    phrase = "man the stand banana man"
    weighted = sorted(collections.Counter(phrase).items())
    expected = dict([
        (" ", "111"),
        ("a", "10"),
        ("b", "0101"),
        ("d", "0110"),
        ("e", "11000"),
        ("h", "0100"),
        ("m", "0111"),
        ("n", "00"),
        ("s", "11001"),
        ("t", "1101"),
    ])
    self.assertEqual(huffman.codebook(weighted), expected)
def test_counter(self):
    """Codebook built from sorted character frequencies of a fixed phrase."""
    char_counts = collections.Counter('man the stand banana man')
    input_ = sorted(char_counts.items())
    symbols = [' ', 'a', 'b', 'd', 'e', 'h', 'm', 'n', 's', 't']
    codes = ['111', '10', '0101', '0110', '11000',
             '0100', '0111', '00', '11001', '1101']
    expected = dict(zip(symbols, codes))
    output = huffman.codebook(input_)
    self.assertEqual(output, expected)
def generate_huffman_tables(symbol_frequencies, levels, input_range, quantization, max_table_size=1024):
    """Build one Huffman table per region code.

    For every region code except ``1`` the ``max_table_size`` most frequent
    symbols get their own code, and one extra escape symbol (``nr.max + 1``)
    stands in for all remaining symbols, weighted by their combined
    frequency. Region code ``1`` gets an empty table.

    Args:
        symbol_frequencies: mapping of region code -> array of frequencies,
            indexed in step with the symbols ``nr.min .. nr.max``.
        levels, input_range, quantization: forwarded to
            ``numeric_range_from_region_code_with_rle``.
        max_table_size: number of real symbols kept per table.

    Returns:
        dict: region code -> ``{symbol: bitarray(code)}``.
    """
    to_return = {}
    for rc, frequencies in symbol_frequencies.items():
        nr = numeric_range_from_region_code_with_rle(rc, levels, input_range, quantization)
        symbols = np.arange(nr.min, nr.max + 1, dtype=ty)
        if rc == 1:
            cb = {}
        else:
            order = np.argsort(frequencies)  # ascending by frequency
            kept = order[-max_table_size:]
            dropped = order[:-max_table_size]
            escape_symbol = nr.max + 1
            real_symbols = np.append(symbols[kept], [escape_symbol])
            # The escape weight must cover exactly the symbols left out of
            # the table, so index through the sort order. Slicing the
            # unsorted array summed an unrelated set of frequencies.
            escape_frequency = np.sum(frequencies[dropped])
            real_frequencies = np.append(frequencies[kept], [escape_frequency])
            cb = codebook(zip(real_symbols, real_frequencies))
        to_return[rc] = {k: bitarray(v) for k, v in cb.items()}
    return to_return
def minimum_total_bits_codes():
    """Grid-search group weights for the codebook with the fewest total bits.

    Four scale factors, each taken from 100..700 in steps of 100, weight the
    numchars, symchars, funchars and lispnames groups. Every combination is
    turned into a codebook and scored with ``total_bits``; the first
    combination that reaches the lowest score below 1000000 wins.

    Returns:
        tuple: ``(min_total_num_bits, min_codes)``; ``min_codes`` is ``[]``
        if no combination scored below the initial bound.
    """
    min_total_num_bits = 1000000
    min_codes = []
    scales = [100, 200, 300, 400, 500, 600, 700]

    for num_scale in scales:
        for sym_scale in scales:
            for fun_scale in scales:
                for lisp_scale in scales:
                    numchars_w = [(c, (num_scale * 10) / len(numchars)) for c in numchars]
                    symchars_w = [(c, (sym_scale * 5) / len(symchars)) for c in symchars]
                    funchars_w = [(c, (fun_scale * 20) / len(funchars)) for c in funchars]
                    lispnames_w = [(c, (lisp_scale * 15) / len(lispnames)) for c in lispnames]

                    all_w = symchars_w + lispnames_w + funchars_w + numchars_w
                    codes = huffman.codebook(all_w).items()
                    size = total_bits(codes)
                    if size < min_total_num_bits:
                        min_total_num_bits, min_codes = size, codes
    return (min_total_num_bits, min_codes)
def huffman_compression(error_strings):
    """Replace each error string's value with an octal-escaped Huffman form.

    A codebook is built over the characters of all keys of ``error_strings``.
    For each key the bit string is a leading ``"1"`` followed by the codes of
    its characters, rendered as ``\\ooo`` octal escapes for every group of up
    to 8 bits. If that rendering is longer than four times the original, the
    original text is stored instead. ``error_strings`` is updated in place.

    Returns:
        str: underscores, ``10 + len(codebook)`` of them.
    """
    # https://github.com/tannewt/huffman
    import huffman

    cb = huffman.codebook(collections.Counter("".join(error_strings)).items())

    for line in error_strings:
        bits = "1" + "".join(cb[c] for c in line)
        # Round the length up to a whole number of bytes.
        padded = len(bits) + (-len(bits)) % 8
        escaped = "".join(
            "\\{:03o}".format(int(bits[start:start + 8], 2))
            for start in range(0, padded, 8)
        )
        error_strings[line] = line if len(escaped) > len(line) * 4 else escaped

    # TODO: This would be the prefix lengths and the table ordering.
    return "_" * (10 + len(cb))
def compress(filename, args):
    """Huffman-compress ``filename`` and write ``<args.file>.dx``.

    The file is read as bytes and decoded as UTF-8 with undecodable bytes
    dropped. The output holds the ``str()`` of the reversed codebook
    (code -> character), then ``delim_``, then the packed bits. Progress and
    size statistics are printed.

    Args:
        filename: path of the file to read.
        args: object whose ``file`` attribute names the output (with ``.dx``
            appended) and the file whose size is reported as the original.
    """
    print('Generating codebook')
    with open(filename, 'rb') as source:
        txt = source.read().decode('utf8', 'ignore')
    freq = Counter(txt)
    codebook = huffman.codebook(freq.items())

    print('Compressing files')
    mean_length = 0.0
    enc_ = bitarray.bitarray()
    for position, symbol in enumerate(txt):
        bits = codebook[symbol]
        # Running mean of the code length over the characters seen so far.
        mean_length = (mean_length * position + len(bits)) / (position + 1)
        enc_.extend(bits)
    print(f'Average code length {mean_length:.3f}')
    print(' .. done.')

    # Write to compressed file. Add the codebook as well. This does not
    # change the compression ratio very much.
    outfile = '%s.dx' % args.file
    reverse_book = {code: symbol for symbol, code in codebook.items()}
    with open(outfile, 'wb') as fo:
        fo.write(str(reverse_book).encode())
        fo.write(delim_)
        fo.write(enc_.tobytes())

    bestCode = entropy(freq)
    print('Average codeword length : %f' % mean_length)
    print('| Optimal average code length: %f' % bestCode)
    print('Compressed files is written to %s' % outfile)

    original_size = os.path.getsize(args.file)
    compressed_size = os.path.getsize(outfile)
    print('| Original file size : %d' % original_size)
    print('| Compressed file size : %d' % compressed_size)
    print('| Compression ratio : %f' % (original_size / float(compressed_size)))
def encodeFile(fileName):
    """Encode a text file with a letter-only Huffman code written in g/G.

    The codebook is built from the ASCII letters of the file content (with
    trailing whitespace stripped) and each code is rewritten with ``g`` for
    ``0`` and ``G`` for ``1``. The result starts with a header line of
    ``"<symbol> <code> "`` pairs, followed by the content with every letter
    replaced by its code; other characters are copied through unchanged.

    Args:
        fileName: path of the file to read.

    Returns:
        str: the encoded text, or ``""`` after printing ``Invalid filename``
        if the file cannot be opened.
    """
    try:
        # The context manager makes sure the handle is closed after reading.
        with open(fileName, "r") as file:
            fileContents = file.read().rstrip()
    except (OSError, TypeError, ValueError):
        print("Invalid filename")
        return ""

    freq = Counter(re.sub("[^A-Za-z]", "", fileContents)).items()
    codec = huffman.codebook(freq)

    parts = []
    for key in codec:
        codec[key] = codec[key].replace('0', 'g').replace('1', 'G')
        parts.append("%s " % key)
        parts.append("%s " % codec[key])
    parts.append("\n")

    # Characters without a code (anything that is not a letter) pass through.
    parts.extend(codec.get(ch, ch) for ch in fileContents)
    return "".join(parts)
def test_basic(self):
    """Four weighted symbols map to the expected prefix codes."""
    weighted = [("A", 2), ("B", 4), ("C", 1), ("D", 1)]
    self.assertEqual(
        huffman.codebook(weighted),
        {"A": "10", "B": "0", "C": "110", "D": "111"},
    )
p[4] = p[4] + 1 for i in range(0, 5): p[i] = p[i] / 20 return p probabilidad = prob(x) print("probabilidad de aparición de cada simbolo: ") print(probabilidad) print("\n") #5 huffVec = ([('1', probabilidad[0] * 20), ('2', probabilidad[1] * 20), ('3', probabilidad[2] * 20), ('4', probabilidad[3] * 20), ('5', probabilidad[4] * 20)]) xH = hf.codebook(huffVec) print("Codigo Huffman: ") print(xH) longT = len(xH['1']) + len(xH['2']) + len(xH['3']) + len(xH['4']) + len( xH['5']) longP = longT / 5 print("Longitud pormedio por simbolo: " + str(longP)) print("\n") #6 cadH = ["" for x in range(len(x))] longT = 0 for i in range(0, len(x)): if (x[i] == 1): cadH[i] = xH['1'] elif (x[i] == 2):
def get_huffman_codebook(big_line):
    """Return the Huffman codebook for the frequencies of ``big_line``.

    The frequencies come from ``get_frequencies(big_line)`` and are passed
    straight to ``huffman.codebook``.
    """
    return huffman.codebook(get_frequencies(big_line))
def doHuffman(prob):
    """Build a Huffman codebook from a symbol -> probability mapping.

    Args:
        prob: mapping that is iterated for its keys and indexed by them,
            e.g. ``{(level, run, last): probability, ...}``.

    Returns:
        The codebook returned by ``huffman.codebook``.
    """
    weighted = []
    for symbol in prob:
        weighted.append((symbol, prob[symbol]))
    return huffman.codebook(weighted)
def make_dict(self):
    """Build the codebook for ``self.hist`` and its inverse.

    Each index of ``self.hist`` becomes a symbol (as a string) weighted by
    the value at that index. ``self.dict`` maps symbol -> code and
    ``self.inv_dict`` maps code -> symbol.
    """
    weighted = [(str(level), self.hist[level]) for level in range(len(self.hist))]
    self.dict = huffman.codebook(weighted)
    self.inv_dict = dict(zip(self.dict.values(), self.dict.keys()))
import huffman
from utils.functions import *

# Script: Huffman-encode the first part of the split data and write the
# resulting bit string to huffman.txt.
data = read_from_origin()
sdata, _ = split_data(data)

# Count how often each element of sdata occurs.
count_ = {}
for i in range(len(sdata)):
    count_[sdata[i]] = count_.get(sdata[i], 0) + 1

codebook_ = convert_to_codebook(count_)
print(codebook_)
dic = huffman.codebook(codebook_)

# Join the per-element codes once instead of growing a string in a loop.
str_ = "".join(dic[sdata[i]] for i in range(len(sdata)))
print(len(str_))
print(str_)

# The context manager flushes and closes the file; previously the handle
# was left open.
with open("huffman.txt", "w+") as f:
    f.write(str_)