Ejemplo n.º 1
0
def savingQuantizedDctBlocks(blocks, yLen, xLen, useHuffman, img, bitBits,
                             runBits, rbBits, h, w):
    """Run-length/Huffman encode quantized DCT blocks into a byte string.

    Args:
        blocks, h, w: Forwarded unchanged to ``huffmanCounterWholeImg``.
        yLen, xLen: Number of block rows / columns. Each is written into the
            header as a 12-bit field.
        useHuffman: When true the Huffman codebook is handed to ``runLength``;
            otherwise ``None`` is passed instead.
        img: Object exposing ``.size``; only used for the size report.
        bitBits, runBits, rbBits: Forwarded unchanged to the helpers.

    Returns:
        A tuple ``(data, sortedHfm)`` where ``data`` is a 3-byte header
        (12 bits of ``xLen`` followed by 12 bits of ``yLen``) plus the packed
        payload, and ``sortedHfm`` is a list of ``[code, symbol]`` pairs
        ordered by ``rbCount.most_common()``.
    """
    rbCount, zigZag = huffmanCounterWholeImg(blocks, yLen, xLen, h, w, bitBits,
                                             runBits, rbBits)
    hfm = huffman.codebook(rbCount.items())
    sortedHfm = [[hfm[symbol], symbol] for symbol, _ in rbCount.most_common()]
    table = hfm if useHuffman else None

    # Collect the per-entry codes in a list and join once; repeated string
    # concatenation inside the triple loop is quadratic in the output size.
    pieces = []
    DC = 0  # threaded through runLength from one entry to the next
    for y in range(yLen):
        for x in range(xLen):
            # Three consecutive zigZag entries per (y, x) position
            # (presumably one per colour channel -- TODO confirm).
            base = y * xLen * 3 + x * 3
            for i in range(3):
                codeNew, DC = runLength(zigZag[base + i], DC, table, bitBits,
                                        runBits, rbBits)
                pieces.append(codeNew)
    code = "".join(pieces)

    savedImg = runLength2bytes(code)
    print(str(code[:100]) + "......")
    print(str(savedImg[:20]) + "......")
    print("Image original size:    %.3f MB" % (img.size / (2**20)))
    print("Compression image size: %.3f MB" % (len(savedImg) / 2**20))
    print("Compression ratio:      %.2f : 1" % (img.size / 2**20 /
                                                (len(savedImg) / 2**20)))

    # Header layout: xLen (12 bits) followed by yLen (12 bits) -> 3 bytes.
    xBits = format(xLen, '012b')
    yBits = format(yLen, '012b')
    header = bytes([
        int(xBits[:8], 2),
        int(xBits[8:] + yBits[:4], 2),
        int(yBits[4:], 2),
    ])
    return header + savedImg, sortedHfm
Ejemplo n.º 2
0
 def buttonListener1(self, event):
     """Ask the user for a dictionary file and build Huffman codes from it.

     The chosen file is split on ``#``. If that yields more than one
     fragment, every fragment except the last receives a random weight, a
     Huffman codebook is built, and ``self.keyDict`` is refilled with
     ``(word, code)`` pairs. Otherwise a warning dialog is shown.

     Args:
         event: The triggering GUI event (unused).
     """
     root = tkinter.filedialog.askopenfilename(
         title="选择字典",
         initialdir=(os.path.expanduser(default_root)),
         filetypes=[("Text file", "*.txt*")])
     # Nothing selected: leave the current state untouched.
     if not len(root):
         return

     # Store the dictionary. Reading inside a context manager releases the
     # handle on every path; previously it stayed open when the content was
     # rejected below.
     with open(root, "r") as new_file:
         self.dict = new_file.read().split("#")

     if len(self.dict) <= 1:
         tkinter.messagebox.showwarning(
             title="打开字典失败", message="字典内容不正确,请确保字典内容格式正确且数目大于一个单词")
         return

     print("打开字典" + root + "成功")
     print(self.dict)
     self.hasDict = True
     self.label = Label(self.frame,
                        text=root,
                        bg="#fafaff",
                        fg="#9966CC")
     self.label.grid(row=0, column=1, padx=5, pady=5, sticky=(W, E))

     # Generate random weights for every fragment except the last one.
     _dict = [(word, random.randint(0, Max)) for word in self.dict[:-1]]

     # Build the Huffman tree and replace the previous key table.
     tree = huffman.codebook(_dict)
     self.keyDict.clear()
     for word in tree:
         self.keyDict.append((word, tree.get(word)))
     print(self.keyDict)
Ejemplo n.º 3
0
def Huf(inputs):
    """Print the Huffman codebook for a sequence of song records.

    Args:
        inputs: Indexable collection whose items provide ``'song_name'``
            (the symbol) and ``'count'`` (its weight).
    """
    weighted_songs = [(inputs[pos]['song_name'], inputs[pos]['count'])
                      for pos in range(len(inputs))]
    codes = huffman.codebook(weighted_songs)
    print(codes)
Ejemplo n.º 4
0
def huffman_encode(s1):
    """Huffman-encode ``s1`` and return the result as base64-style chunks.

    Each distinct item of ``s1`` is weighted by the number of items in ``s1``
    that contain it (``str.find``), a codebook is built and printed, and the
    concatenated bit string is cut into 6-character pieces that are each
    passed through ``encode_b64``.

    Args:
        s1: Iterable of strings (for example a plain string of characters).

    Returns:
        List with one ``encode_b64`` result per 6-character slice.
    """
    distinct = list(set(list(s1)))
    occurrences = dict.fromkeys(distinct, 0)
    # A symbol is counted whenever it occurs *inside* an item, not only when
    # it equals the item.
    for item in s1:
        for symbol in distinct:
            if item.find(symbol) != -1:
                occurrences[symbol] += 1

    temp_content = huffman.codebook(list(occurrences.items()))
    print(temp_content)

    bit_string = ''.join(temp_content[item] for item in s1)
    return [encode_b64(bit_string[start:start + 6])
            for start in range(0, len(bit_string), 6)]
Ejemplo n.º 5
0
def huffmanEncode(data):
    """Return ``(encoded, codebook)`` for ``data``.

    The codebook is built from the symbol counts of ``data``; the encoded
    bits are passed through ``pack64`` before being returned.
    """
    symbol_counts = collections.Counter(data)
    codebook = huffman.codebook(symbol_counts.items())

    bits = bitarray()
    bits.encode(bitarrayDict(codebook), data)

    # Drop the first 10 and last 2 characters of the textual form
    # (presumably the "bitarray('" prefix and "')" suffix -- TODO confirm).
    bit_text = str(bits)[10:-2]
    return pack64(bit_text), codebook
Ejemplo n.º 6
0
    def __init__(self, dataset):
        """Build a Huffman codebook from the item frequencies of ``dataset``.

        ``self.items`` holds ``(str(item), count)`` pairs sorted by item, and
        ``self.codebook`` is the codebook built from them with the ``" "``
        and ``"/n"`` entries removed.
        """
        counts = collections.Counter(dataset)
        self.items = [(str(symbol), weight)
                      for symbol, weight in sorted(counts.items())]
        self.codebook = huffman.codebook(self.items)

        # NOTE(review): "/n" looks like a typo for the newline escape "\n";
        # kept as written -- confirm the intended key.
        for unwanted in (" ", "/n"):
            self.codebook.pop(unwanted, None)
Ejemplo n.º 7
0
def generate_codes(text, verbose=False):
    """Return per-word Huffman codes for the non-stopwords in ``text``.

    Args:
        text: Input text; it is lower-cased before tokenising.
        verbose: Accepted but not used.

    Returns:
        Dict mapping each word to ``{"word", "count", "code"}``.
    """
    lowered = text.lower()

    # Each match is a tuple of groups; the first group is the whole word,
    # including any apostrophe/hyphen-joined parts.
    matches = re.findall("([a-z]+(['-][a-z]+)*)", lowered)
    word_counts = collections.Counter(
        match[0] for match in matches if match[0] not in STOPWORDS)

    codebook = huffman.codebook(word_counts.items())

    result = {}
    for word in word_counts:
        result[word] = {
            "word": word,
            "count": word_counts[word],
            "code": codebook[word],
        }
    return result
def main():
    """Huffman-encode the ``fileinput`` sources into ``<name>.huffman``.

    Steps: seed the module-level ``dictionary`` with newline and the
    characters 33..126, count every character of the input, build a codebook
    from those counts, re-read the input to produce the bit string, and
    write it out eight bits per byte.

    Exits with status 1 if the input contains a character that was not
    seeded into ``dictionary``.
    """
    # to-do (comment truncated in the original): include all symbols in the
    # dictionary at the beginning.
    dictionary.update({chr(10): 1})  # LF NL newline, linefeed
    # Code points 33..126 only; space (32) is not seeded, so it takes the
    # error path below.
    for code_point in range(33, 127):
        dictionary.update({chr(code_point): 1})

    # For each row in the input, increment the occurrence count per character.
    for rows in fileinput.input():
        for key in rows:
            if key in dictionary:
                dictionary[key] += 1
            else:
                print(ord(key))
                print('error not in dict')
                sys.exit(1)

    # Build the codebook once and reuse it for printing and encoding
    # (it used to be computed twice from the same counts).
    huff_dict = huffman.codebook(dictionary.items())
    print(huff_dict)

    # Second pass over the input to collect the code of every character.
    # NOTE(review): this re-reads the sources, which cannot work for stdin.
    pieces = []
    for rows in fileinput.input():
        for key in rows:
            if key in huff_dict:
                pieces.append(huff_dict[key])
    bitArrayStr = ''.join(pieces)

    # Pack eight bits per byte.
    # NOTE(review): a final group shorter than 8 bits is converted as-is,
    # i.e. without right-padding -- confirm the decoder expects that.
    buffer = bytearray(
        int(bitArrayStr[start:start + 8], 2)
        for start in range(0, len(bitArrayStr), 8))

    # The context manager closes the file; no explicit close() is needed.
    with open(fileinput.filename() + ".huffman", "bw") as f:
        f.write(buffer)
Ejemplo n.º 9
0
    def coding(self, _rle):
        """Huffman-encode ``_rle``.

        Returns:
            ``(code, dictionary)`` where ``code`` lists the codeword of every
            item of ``_rle`` in order and ``dictionary`` is the codebook.
        """
        # Huffman coding: weights are the item frequencies.
        frequencies = Counter(_rle)
        dictionary = huffman.codebook(frequencies.items())

        code = [dictionary[symbol] for symbol in _rle]
        return code, dictionary
Ejemplo n.º 10
0
def generate_codes():
    """Return the ``items()`` of a codebook over all symbol groups.

    Every symbol of a group gets that group's fixed weight; the groups are
    concatenated in the order listed below before the codebook is built.
    """
    weighted_groups = (
        (funchars, 9),
        (numchars, 10),
        (symchars, 20),
        (short_lispnames, 100),
        (long_lispnames2, 1),
    )
    all_w = [(symbol, weight)
             for group, weight in weighted_groups
             for symbol in group]
    return huffman.codebook(all_w).items()
Ejemplo n.º 11
0
def encode(quantised_model_dict):
    """Build a Huffman codebook over three classifier weight tensors.

    Args:
        quantised_model_dict: Mapping that contains the keys
            ``classifier.1.weight``, ``classifier.4.weight`` and
            ``classifier.6.weight``; each value must offer ``.numpy()``.

    Returns:
        ``(codebook, n_unique)`` -- the codebook over all flattened values
        and the number of distinct values.
    """
    print("\n")
    print("Generating Codebook...")
    layer_names = ('classifier.1.weight',
                   'classifier.4.weight',
                   'classifier.6.weight')
    flattened = tuple(
        numpy.ravel(quantised_model_dict[layer].numpy())
        for layer in layer_names)
    concat = numpy.concatenate(flattened, axis=0)
    huffmanUnique = numpy.unique(concat)
    value_counts = collections.Counter(concat.tolist())
    codebook = huffman.codebook(value_counts.items())
    print("Codebook Generated")
    return codebook, huffmanUnique.size
Ejemplo n.º 12
0
    def __init__(self, input: PacketizedStream, distribution):
        """Set up the code table and output stream for ``input``.

        Args:
            input: Stream to be encoded.
            distribution: Mapping of input word -> weight used to build the
                Huffman codebook.
        """
        self.input = input
        self.distribution = distribution

        # Every codeword is stored reversed.
        codes = huffman.codebook(self.distribution.items())
        reversed_codes = {}
        for word, bits in codes.items():
            reversed_codes[word] = bits[::-1]
        self.table = reversed_codes

        self.max_input_word = max(self.distribution.keys()) + 1
        self.max_code_len = max(len(bits) for bits in self.table.values())

        self.output = VariableWidthStream(self.max_code_len + 1)
Ejemplo n.º 13
0
def compute_huffman_coding(translations, qstrs, compression_filename):
    """Derive canonical Huffman tables from the strings and write a C header.

    The corpus is the second element of every item in ``translations`` plus
    the third field of every value in ``qstrs``. Character frequencies over
    that corpus feed ``huffman.codebook``; the resulting code lengths are
    then turned into canonical codes. Diagnostic lines prefixed with ``//``
    are printed along the way.

    Args:
        translations: Iterable of 2-item entries; index 1 is the text that is
            counted and whose UTF-8 length is measured.
        qstrs: Mapping whose values unpack into three fields, the last being
            a string.
        compression_filename: Path of the file that receives the C
            definitions.

    Returns:
        ``(values, lengths)``: the symbols ordered by (code length, symbol)
        and a bytearray holding the number of codes per length.
    """
    all_strings = [x[1] for x in translations]

    # Add the string field of every qstr entry to the corpus.
    for _, _, qstr in qstrs.values():
        all_strings.append(qstr)
    all_strings_concat = "".join(all_strings)
    counts = collections.Counter(all_strings_concat)
    cb = huffman.codebook(counts.items())
    values = []
    length_count = {}
    renumbered = 0
    last_l = None
    canonical = {}
    # Visit symbols by increasing code length, ties broken by symbol, and
    # hand out consecutive code numbers.
    for ch, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
        values.append(ch)
        l = len(code)
        if l not in length_count:
            length_count[l] = 0
        length_count[l] += 1
        # When the length grows, shift the running number left by the
        # difference. Skipped on the first iteration (last_l is None).
        if last_l:
            renumbered <<= (l - last_l)
        canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
        s = C_ESCAPES.get(ch, ch)
        print("//", ord(ch), s, counts[ch], canonical[ch], renumbered)
        renumbered += 1
        last_l = l
    lengths = bytearray()
    print("// length count", length_count)
    # One entry per code length from 1 up to max length + 1, so the last
    # entry is always 0. NOTE(review): reason for the extra slot is not
    # visible here -- confirm with the consumer of this table.
    for i in range(1, max(length_count) + 2):
        lengths.append(length_count.get(i, 0))
    print("// values", values, "lengths", len(lengths), lengths)
    print(
        "// estimated total memory size",
        len(lengths) + 2 * len(values) +
        sum(len(cb[u]) for u in all_strings_concat))
    print("//", values, lengths)
    # Use 16-bit entries as soon as any symbol's code point exceeds 255.
    values_type = "uint16_t" if max(ord(u)
                                    for u in values) > 255 else "uint8_t"
    # Longest translation measured in UTF-8 bytes; its bit_length() is
    # emitted as compress_max_length_bits below.
    max_translation_encoded_length = max(
        len(translation.encode("utf-8"))
        for original, translation in translations)
    with open(compression_filename, "w") as f:
        f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(
            map(str, lengths))))
        f.write("const {} values[] = {{ {} }};\n".format(
            values_type, ", ".join(str(ord(u)) for u in values)))
        f.write("#define compress_max_length_bits ({})\n".format(
            max_translation_encoded_length.bit_length()))
    return values, lengths
Ejemplo n.º 14
0
def test_huffman():
    """Endlessly compare ``HuffmanEncoder`` against ``huffman.codebook``.

    Each round encodes a random paragraph of 1-99 words with both
    implementations and asserts that every reference code is reproduced.
    The loop never terminates on its own; it stops at the first failing
    assertion.
    """
    mismatch = "their_dict['{}'] = {}, but my_dict['{}'] = {}"
    while True:
        n_words = np.random.randint(1, 100)
        para = random_paragraph(n_words)

        encoder = HuffmanEncoder()
        encoder.fit(para)
        mine = encoder._item2code
        reference = huffman.codebook(Counter(para).items())

        for symbol, expected_code in reference.items():
            assert symbol in mine, "key `{}` not in my_dict".format(symbol)
            assert mine[symbol] == expected_code, mismatch.format(
                symbol, expected_code, symbol, mine[symbol])
        print("PASSED")
Ejemplo n.º 15
0
def huffman_compress(seq):
    """Huffman-encode ``seq`` into a string of '0'/'1' characters.

    Args:
        seq: Re-iterable sequence of hashable symbols.

    Returns:
        ``(binstring, huffdic_decode)`` where ``binstring`` is the
        concatenation of the codewords and ``huffdic_decode`` maps each
        codeword back to its symbol. An empty ``seq`` yields ``("", {})``.
    """
    probs = list(Counter(seq).items())  # symbol weights
    # Nothing to encode: return an empty result instead of asking the
    # codebook builder to work on an empty weight list.
    if not probs:
        return ("", {})
    # With a single distinct symbol Huffman considers there is no
    # information to encode, so handle it separately: one '1' per occurrence.
    if len(probs) == 1:
        symbol, count = probs[0]
        return ('1' * count, {'1': symbol})

    huffdic_encode = huffman.codebook(probs)
    huffdic_decode = {}
    pieces = []
    # Collect codewords in a list and join once instead of growing the
    # string with repeated concatenation.
    for x in seq:
        codeword = huffdic_encode[x]
        pieces.append(codeword)
        huffdic_decode[codeword] = x
    return ("".join(pieces), huffdic_decode)
Ejemplo n.º 16
0
    def test_encode_success_small_file_data(self):
        """The encoder output embeds the expected msgpack-packed codebook.

        Stream layout checked here: one byte giving the width of the size
        field (asserted to be 1), the size field itself, then that many bytes
        of packed codebook.

        NOTE(review): the name says "small" but the fixture used is
        ``file_data_big`` -- confirm which one is intended.
        """
        expected_codebook = huffman.codebook(
            collections.Counter(file_data_big).items())
        stream = bitstring.ConstBitStream(AT.Encoder(file_data_big).encode())

        size_field_width = int.from_bytes(stream.read('bytes:1'),
                                          byteorder='big')
        self.assertEqual(size_field_width, 1)

        codebook_size = int.from_bytes(
            stream.read("bytes:{}".format(size_field_width)), byteorder='big')
        packed_codebook = stream.read("bytes:{}".format(codebook_size))
        decoded_codebook = msgpack.unpackb(packed_codebook, raw=False)

        self.assertEqual(expected_codebook, decoded_codebook)
        self.assertEqual(len(expected_codebook), len(decoded_codebook))
Ejemplo n.º 17
0
def generate_huffman_trees_for_windows_kmers(all_huffmans_trees):
    """Fill ``all_huffmans_trees["tree_code"]`` with a code per tree key.

    The weights in ``all_huffmans_trees["tree_key_freq"]`` drive a Huffman
    codebook over the tree keys, so keys with a larger weight (saving) get
    shorter codewords. A single key is given the fixed code ``"0"``.

    Returns:
        The same ``all_huffmans_trees`` object, updated in place.
    """
    weighted_keys = list(all_huffmans_trees["tree_key_freq"].items())
    huffman_for_trees = huffman.codebook(weighted_keys)

    if len(weighted_keys) == 1:
        only_key = weighted_keys[0][0]
        all_huffmans_trees["tree_code"][only_key] = "0"
        return all_huffmans_trees

    # Store the codes in order of decreasing weight.
    by_weight_desc = sorted(all_huffmans_trees["tree_key_freq"].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
    for tree_key, _ in by_weight_desc:
        all_huffmans_trees["tree_code"][tree_key] = huffman_for_trees[tree_key]

    return all_huffmans_trees
Ejemplo n.º 18
0
def test_basic(numer):
    """Interactively print a codebook for the first ``numer`` random symbols.

    Nine symbols 'A'..'I' each get a random weight in 1..9. The first
    ``numer`` pairs are printed; if the user then submits an empty line the
    codebook for those pairs is built and printed.
    """
    randy = [(letter, randint(1, 9)) for letter in "ABCDEFGHI"]
    inpu = [randy[u] for u in range(numer)]
    print(inpu)
    if input() != "":
        return
    output = huffman.codebook(inpu)
    print(output)
Ejemplo n.º 19
0
    def test_counter(self):
        """Codebook built from sorted character counts matches a fixed table."""
        text = "man the stand banana man"
        weights = sorted(collections.Counter(text).items())

        actual = huffman.codebook(weights)
        expected = {
            " ": "111",
            "a": "10",
            "b": "0101",
            "d": "0110",
            "e": "11000",
            "h": "0100",
            "m": "0111",
            "n": "00",
            "s": "11001",
            "t": "1101",
        }

        self.assertEqual(actual, expected)
Ejemplo n.º 20
0
def compute_huffman_coding(translations, qstrs, compression_filename):
    """Derive canonical Huffman tables over UTF-8 bytes and write a C header.

    The corpus is the second element of every item in ``translations`` plus
    the third field of every value in ``qstrs``, joined and encoded as
    UTF-8. Byte frequencies feed ``huffman.codebook``; the resulting code
    lengths are then turned into canonical codes. One diagnostic line
    prefixed with ``//`` is printed per byte value.

    Args:
        translations: Iterable of entries whose index 1 is a string.
        qstrs: Mapping whose values unpack into three fields, the last being
            a string.
        compression_filename: Path of the file that receives the C arrays.

    Returns:
        ``(values, lengths)``: a bytearray of byte values ordered by
        (code length, value) and a bytearray with the number of codes per
        length.
    """
    all_strings = [x[1] for x in translations]

    # Add the string field of every qstr entry to the corpus.
    for _, _, qstr in qstrs.values():
        all_strings.append(qstr)
    all_strings_concat = "".join(all_strings).encode("utf-8")
    # Iterating bytes yields ints, so the counter is keyed by byte value.
    counts = collections.Counter(all_strings_concat)
    # Give every byte value that never occurs a zero weight so all 256
    # values are present in the codebook.
    for i in range(256):
        if i not in counts:
            counts[i] = 0
    cb = huffman.codebook(counts.items())
    values = bytearray()
    length_count = {}
    renumbered = 0
    last_l = None
    canonical = {}
    # Visit byte values by increasing code length, ties broken by value,
    # and hand out consecutive code numbers.
    for ch, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
        values.append(ch)
        l = len(code)
        if l not in length_count:
            length_count[l] = 0
        length_count[l] += 1
        # When the length grows, shift the running number left by the
        # difference. Skipped on the first iteration (last_l is None).
        if last_l:
            renumbered <<= (l - last_l)
        canonical[ch] = '{0:0{width}b}'.format(renumbered, width=l)
        # Printable form for the diagnostic line only.
        if chr(ch) in C_ESCAPES:
            s = C_ESCAPES[chr(ch)]
        else:
            s = chr(ch)
        print("//", ch, s, counts[ch], canonical[ch], renumbered)
        renumbered += 1
        last_l = l
    lengths = bytearray()
    # One entry per code length from 1 up to the longest length in use.
    for i in range(1, max(length_count) + 1):
        lengths.append(length_count.get(i, 0))
    print("//", values, lengths)
    with open(compression_filename, "w") as f:
        f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(
            map(str, lengths))))
        f.write("const uint8_t values[256] = {{ {} }};\n".format(", ".join(
            map(str, values))))
    return values, lengths
Ejemplo n.º 21
0
    def test_counter(self):
        """Sorted ``Counter`` items produce the expected fixed codebook."""
        char_counts = collections.Counter("man the stand banana man")
        weights = sorted(char_counts.items())

        expected = {
            " ": "111",
            "a": "10",
            "b": "0101",
            "d": "0110",
            "e": "11000",
            "h": "0100",
            "m": "0111",
            "n": "00",
            "s": "11001",
            "t": "1101",
        }
        actual = huffman.codebook(weights)

        self.assertEqual(actual, expected)
Ejemplo n.º 22
0
    def test_counter(self):
        """Check the codebook for a known phrase against a fixed table."""
        phrase = 'man the stand banana man'
        weights = sorted(collections.Counter(phrase).items())

        actual = huffman.codebook(weights)
        expected = {
            ' ': '111',
            'a': '10',
            'b': '0101',
            'd': '0110',
            'e': '11000',
            'h': '0100',
            'm': '0111',
            'n': '00',
            's': '11001',
            't': '1101',
        }

        self.assertEqual(actual, expected)
Ejemplo n.º 23
0
def generate_huffman_tables(symbol_frequencies, levels, input_range, quantization, max_table_size=1024):
    """Build one Huffman table per region code.

    For every region code except ``1`` the ``max_table_size`` most frequent
    symbols get their own codeword; all remaining symbols share a single
    escape symbol (``nr.max + 1``) whose weight is the sum of their
    frequencies. Region code ``1`` gets an empty table.

    Args:
        symbol_frequencies: Mapping of region code -> array of frequencies,
            one per symbol of that region's numeric range.
        levels, input_range, quantization: Forwarded to
            ``numeric_range_from_region_code_with_rle``.
        max_table_size: Maximum number of real symbols kept per table.

    Returns:
        Dict of region code -> ``{symbol: bitarray(code)}``.
    """
    to_return = {}
    for rc, frequencies in symbol_frequencies.items():
        nr = numeric_range_from_region_code_with_rle(rc, levels, input_range, quantization)
        symbols = np.arange(nr.min, nr.max + 1, dtype=ty)
        if rc == 1:
            cb = {}
        else:
            # Indices ordered by ascending frequency: the tail is kept, the
            # head is folded into the escape symbol.
            order = np.argsort(frequencies)
            kept = order[-max_table_size:]
            dropped = order[:-max_table_size]

            escape_symbol = nr.max + 1
            real_symbols = np.append(symbols[kept], [escape_symbol])

            # Sum the frequencies of the symbols actually left out. The
            # previous unsorted slice ``frequencies[:-max_table_size]``
            # summed an unrelated prefix of the array.
            escape_frequency = np.sum(frequencies[dropped])
            real_frequencies = np.append(frequencies[kept], [escape_frequency])

            cb = codebook(zip(real_symbols, real_frequencies))

        to_return[rc] = {k: bitarray(v) for k, v in cb.items()}

    return to_return
Ejemplo n.º 24
0
def minimum_total_bits_codes():
    """Grid-search group weights for the codebook with the fewest total bits.

    Each of the four symbol groups gets a scale from 100..700; the scale is
    multiplied by a fixed group factor and divided by the group size to give
    the per-symbol weight. The combination whose codebook minimises
    ``total_bits`` wins.

    Returns:
        ``(min_total_num_bits, min_codes)``; the defaults ``1000000`` and
        ``[]`` are returned when no combination scores below one million.
    """
    best_size = 1000000
    best_codes = []
    r = [100, 200, 300, 400, 500, 600, 700]
    space = [(x, y, z, v) for x in r for y in r for z in r for v in r]
    for num_scale, sym_scale, fun_scale, lisp_scale in space:
        numchars_w = [(c, (num_scale * 10) / len(numchars)) for c in numchars]
        symchars_w = [(c, (sym_scale * 5) / len(symchars)) for c in symchars]
        funchars_w = [(c, (fun_scale * 20) / len(funchars)) for c in funchars]
        lispnames_w = [(c, (lisp_scale * 15) / len(lispnames))
                       for c in lispnames]

        all_w = symchars_w + lispnames_w + funchars_w + numchars_w
        codes = huffman.codebook(all_w).items()
        size = total_bits(codes)
        if size < best_size:
            best_size = size
            best_codes = codes

    return (best_size, best_codes)
Ejemplo n.º 25
0
def huffman_compression(error_strings):
    """Replace each entry of ``error_strings`` by its octal-escaped encoding.

    A codebook is built over all characters of all keys. Each key is encoded
    as a leading ``1`` bit followed by its characters' codewords, then
    rendered as ``\\ooo`` escapes, one per group of up to eight bits. If that
    text is longer than four times the original line, the line itself is
    stored instead.

    Args:
        error_strings: Mapping keyed by the strings to compress; the values
            are overwritten in place.

    Returns:
        A placeholder string of ``10 + len(codebook)`` underscores.
    """
    # https://github.com/tannewt/huffman
    import huffman

    char_counts = collections.Counter("".join(error_strings))
    cb = huffman.codebook(char_counts.items())

    for line in error_strings:
        bits = "1" + "".join(cb[c] for c in line)
        # Stepping through the bit string by 8 yields the same groups as
        # first rounding the length up to a multiple of 8.
        escapes = ["\\{:03o}".format(int(bits[start:start + 8], 2))
                   for start in range(0, len(bits), 8)]
        result = "".join(escapes)
        if len(result) > len(line) * 4:
            result = line
        error_strings[line] = result

    # TODO: This would be the prefix lengths and the table ordering.
    return "_" * (10 + len(cb))
Ejemplo n.º 26
0
def compress(filename, args ):
    """Huffman-compress ``filename`` and write ``<args.file>.dx``.

    The output file holds the textual reverse codebook, the ``delim_``
    marker, and the encoded bits. Progress and statistics are printed.

    NOTE(review): the input is read from ``filename`` while the output name
    and the reported original size come from ``args.file`` -- confirm these
    always refer to the same file.
    """
    print( 'Generating codebook')
    with open(filename, 'rb') as src:
        txt = src.read().decode('utf8', 'ignore')
    freq = Counter(txt)
    codebook = huffman.codebook(freq.items())

    print( 'Compressing files')
    encoded_bits = bitarray.bitarray()
    avgCode = 0.0
    for position, char in enumerate(txt):
        code = codebook[char]
        # Running mean of the codeword length over the characters so far.
        avgCode = (avgCode * position + len(code) ) / (position+1)
        encoded_bits.extend(code)
    print(f'Average code length {avgCode:.3f}')
    print( '      .. done.' )

    # Write to the compressed file, codebook included; this does not change
    # the compression ratio very much.
    outfile = '%s.dx' % args.file
    revCodeBook = {code: char for char, code in codebook.items()}
    with open(outfile, 'wb') as dst:
        dst.write(str(revCodeBook).encode())
        dst.write(delim_)
        dst.write(encoded_bits.tobytes())

    bestCode = entropy(freq)
    print( 'Average codeword length      : %f' % avgCode )
    print( '| Optimal average code length: %f' % bestCode )

    print( 'Compressed files is written to %s' % outfile )

    original_size = os.path.getsize(args.file)
    compressed_size = os.path.getsize(outfile)
    print( '| Original   file size : %d' % original_size )
    print( '| Compressed file size : %d' % compressed_size )
    print( '| Compression ratio    : %f' % (original_size / float(compressed_size)) )
Ejemplo n.º 27
0
def encodeFile(fileName):
    """Encode a text file with a letter-only Huffman code written in g/G.

    The codebook is built from the ASCII letters of the file; in every
    codeword ``0`` becomes ``g`` and ``1`` becomes ``G``. The output starts
    with a header line of ``"<symbol> <code> "`` pairs, followed by the file
    content in which every coded character is replaced by its codeword and
    all other characters are copied through unchanged.

    Args:
        fileName: Path of the text file to encode.

    Returns:
        The encoded text, or ``""`` (after printing "Invalid filename") if
        the file cannot be opened.
    """
    # Only failures of open() itself are reported as an invalid filename;
    # a bare ``except`` would also swallow KeyboardInterrupt and the like.
    try:
        file = open(fileName, "r")
    except (OSError, TypeError, ValueError):
        print("Invalid filename")
        return ""
    # Close the handle as soon as the content has been read.
    with file:
        fileContents = file.read().rstrip()

    freq = Counter(re.sub("[^A-Za-z]", "", fileContents)).items()
    codec = {
        symbol: bits.replace('0', 'g').replace('1', 'G')
        for symbol, bits in huffman.codebook(freq).items()
    }

    # Assemble the pieces in a list and join once instead of growing a
    # string character by character.
    parts = ["%s %s " % (symbol, code) for symbol, code in codec.items()]
    parts.append("\n")
    # Characters without a codeword (non-letters) are copied verbatim.
    parts.extend(codec.get(ch, ch) for ch in fileContents)
    return "".join(parts)
Ejemplo n.º 28
0
    def test_basic(self):
        """Four weighted symbols produce the expected prefix codes."""
        weights = [("A", 2), ("B", 4), ("C", 1), ("D", 1)]
        expected = {"A": "10", "B": "0", "C": "110", "D": "111"}

        actual = huffman.codebook(weights)

        self.assertEqual(actual, expected)
Ejemplo n.º 29
0
            p[4] = p[4] + 1
    for i in range(0, 5):
        p[i] = p[i] / 20
    return p


# Report the probability of each symbol as returned by prob(x).
probabilidad = prob(x)
print("probabilidad de aparición de cada simbolo: ")
print(probabilidad)
print("\n")

# 5: Huffman codebook for the symbols '1'..'5', weighted by probability * 20.
huffVec = [(str(k + 1), probabilidad[k] * 20) for k in range(5)]
xH = hf.codebook(huffVec)
print("Codigo Huffman: ")
print(xH)
# Unweighted mean of the five codeword lengths.
longT = sum(len(xH[simbolo]) for simbolo in ('1', '2', '3', '4', '5'))
longP = longT / 5
print("Longitud pormedio por simbolo: " + str(longP))
print("\n")

#6
cadH = ["" for x in range(len(x))]
longT = 0
for i in range(0, len(x)):
    if (x[i] == 1):
        cadH[i] = xH['1']
    elif (x[i] == 2):
Ejemplo n.º 30
0
def get_huffman_codebook(big_line):
    """Return the Huffman codebook for the frequencies of ``big_line``.

    The frequencies come from ``get_frequencies(big_line)`` and are passed
    straight to ``huffman.codebook``.
    """
    return huffman.codebook(get_frequencies(big_line))
Ejemplo n.º 31
0
def doHuffman(prob):
    """Build a Huffman codebook from ``prob``.

    Args:
        prob: Container that is iterated for its symbols and indexed by each
            symbol to obtain that symbol's probability (i.e. used like a
            mapping; symbols are presumably ``(level, run, last)`` tuples --
            TODO confirm).

    Returns:
        The codebook returned by ``huffman.codebook``.
    """
    weighted_symbols = [(symbol, prob[symbol]) for symbol in prob]
    return huffman.codebook(weighted_symbols)
Ejemplo n.º 32
0
 def make_dict(self):
     """Build ``self.dict`` (symbol -> code) and ``self.inv_dict`` (code -> symbol).

     Symbols are the string forms of the indices of ``self.hist``; each is
     weighted by the histogram value at that index.
     """
     weighted_bins = [(str(index), self.hist[index])
                      for index in range(len(self.hist))]
     self.dict = huffman.codebook(weighted_bins)
     self.inv_dict = dict(
         (code, symbol) for symbol, code in self.dict.items())
Ejemplo n.º 33
0
import huffman
from utils.functions import *
# Load the data and keep the first part returned by split_data.
data = read_from_origin()
sdata, _ = split_data(data)

# Count how often every item occurs, in order of first appearance.
count_ = {}
for i, item in enumerate(sdata):
    count_[item] = count_.get(item, 0) + 1

codebook_ = convert_to_codebook(count_)

print(codebook_)

dic = huffman.codebook(codebook_)

# Concatenate the codeword of every item into one bit string.
str_ = "".join(dic[entry] for entry in sdata)
print(len(str_))
print(str_)

# NOTE(review): the handle is not closed in the visible part of the script;
# confirm it is closed further down.
f = open("huffman.txt", "w+")
f.write(str_)