Example #1
0
def build_codec(freq_list):
    """
    Purpose:
        Build a dictionary containing character:code pairs by repeatedly
        merging the two trees dequeued from a Huffman heap until a single
        tree is left.
    Pre-conditions:
        :param freq_list: A list of pairs. Each pair is unpacked as
            (character, frequency) when the leaf trees are created.
    Return:
        :return: the result of build_codec() on the final tree
            (presumably the character:code dictionary)
    """
    # One leaf per pair, ordered by ascending frequency.
    leaves = sorted(
        (HT.HuffmanTree(freq=f, char=c) for c, f in freq_list),
        key=lambda tree: tree.get_freq())
    heap = HP.HuffmanHeap(leaves, [])

    # Phase 1: merge pairs for as long as the "old" queue still holds trees.
    while len(heap.old) != 0:
        first = heap.dequeue()
        second = heap.dequeue()
        combined = HT.HuffmanTree(first.get_freq() + second.get_freq(),
                                  left=first, right=second)
        heap.enqueue(combined)

    # Phase 2: merge whatever is left in the "new" queue down to one tree.
    if len(heap.new) >= 2:
        finished = False
        while not finished:
            first = heap.dequeue()
            second = heap.dequeue()
            combined = HT.HuffmanTree(first.get_freq() + second.get_freq(),
                                      left=first, right=second)
            # The queue being empty *before* re-inserting the merged tree
            # means that tree is the final one.
            finished = len(heap.new) == 0
            heap.enqueue(combined)
    return heap.new[0].build_codec()
Example #2
0
def ec_dc(input):
    '''
    : ec_dc: entropy-coding module for the DC coefficients
    : param input: list[tuple(int, str)], the sequence of tuples obtained by DPCM-encoding the DC coefficient of each 8*8 image block
    : return: (dict{int, str}, list[tuple(str, str)]), i.e. (Huffman code dictionary, Huffman-encoded result)
    : note: for DC coefficients, static Huffman coding is applied to the first integer of each input tuple (the bit-string length, "size"); the binary bit string of the actual DC value is passed through unencoded
    '''
    # Count every distinct size in a single pass. The explicit lists keep the
    # symbols in first-occurrence order, exactly as the nested scan did.
    position = {}
    num_lis = []  # distinct sizes, in order of first appearance
    num_freq = []  # occurrence count of the size at the same index
    for item in input:
        size = item[0]
        idx = position.get(size)
        if idx is None:
            position[size] = len(num_lis)
            num_lis.append(size)
            num_freq.append(1)
        else:
            num_freq[idx] += 1

    hftree = hf.build_huffman_tree(num_lis, num_freq)  # build the Huffman tree
    dic = hf.generate_huffman_code(hftree)  # derive the codes from the tree
    # Replace each size by its Huffman code; the value bit string is kept as is.
    res = [(dic[item[0]], item[1]) for item in input]
    return (dic, res)
Example #3
0
def build_codec(freq_list):
    """
    Purpose:
        Build a dictionary containing character:code pairs from
        the frequency list.
    Pre-conditions:
        :param freq_list: A list of (character,frequency) pairs.
            The list is not modified.
    Return:
        :return: the result of build_codec() on the final tree
            (presumably the character:code dictionary)
    """
    # Order the pairs by frequency on a copy, so the caller's list keeps
    # its original order (sorted() is stable, like list.sort()).
    ordered = sorted(freq_list, key=lambda p: p[1])

    # create the queue of Huffman trees
    hq = HH.HuffmanHeap([HT.HuffmanTree(freq=f, char=c) for c, f in ordered])

    # dequeue 2 trees, combine them, and enqueue the resulting tree
    while len(hq) > 1:
        t1 = hq.dequeue()
        t2 = hq.dequeue()
        hq.enqueue(HT.HuffmanTree(left=t1, right=t2))

    # build a codec from the only tree that's left
    survivor = hq.dequeue()
    return survivor.build_codec()
Example #4
0
def getHuffmanCode(src):
    """
    Build the Huffman codes for ``src`` and report their mean length.

    :param src: the source handed to Huffman.createTree and
        Huffman.getRelatedCodes
    :return: a tuple (result of getOnlyCodes, result of computeMeanLength),
        both computed on the code entries sorted by the length of their
        element at index 2
    """
    tree = Huffman.createTree(src)
    entries = Huffman.getRelatedCodes(tree, src)
    # Shortest first, judged by the length of each entry's third element.
    entries.sort(key=lambda entry: len(entry[2]))
    average = computeMeanLength(entries)
    return (getOnlyCodes(entries), average)
Example #5
0
def main():
    """
    Read the input text, run the Huffman steps on it in order
    (frequency, tree, codes, encode) and hand the outcome to store().
    """
    coder = Huffman(getText())
    # Each step is called for its effect on the coder object.
    coder.setFrequency()
    coder.setTree()
    coder.setCodes()
    coder.encode()
    store(
        coder.result,
        coder.frequency,
        coder.probability,
        coder.codes,
    )
Example #6
0
 def get_encodeStr(self):
     """
     Encode the content as the concatenation of its characters' codes.

     The character/frequency table and the code list are computed on first
     use and stored on the instance. The code of the entry at index i of
     ``chars_freqs`` is ``Huffman_codes[i]``. A character that has no entry
     in ``chars_freqs`` contributes nothing to the output.

     :return: the encoded string
     """
     if not self.chars_freqs:
         self.chars_freqs = Huffman.cal_count_freq(self.get_content())
     if not self.Huffman_codes:
         self.Huffman_codes = Huffman.cal_Huffman_codes(self.chars_freqs)
     # Map each character to the table indices that carry it, so every
     # character of the content is resolved by one lookup rather than by
     # a scan of the whole table.
     positions = {}
     for i, item in enumerate(self.chars_freqs):
         positions.setdefault(item[0], []).append(i)
     codes = self.Huffman_codes
     # join() builds the result once instead of growing a string with +=.
     return ''.join(
         codes[i]
         for char in self.get_content()
         for i in positions.get(char, ()))
Example #7
0
def traiterFichier(bitsDuFichier, nomFichier, bavard):
    """
    Rebuild the Huffman tree stored in a compressed file, decode the text and save it.

    Calls separerCarac to split off the character list, rebuilds the tree
    structure from the tree bits, lets Huffman.convertirHuffman recover the
    data, then calls enregistrerFichier.
    :param bitsDuFichier: list of bits
    :param nomFichier: name passed on to enregistrerFichier
    :param bavard: when truthy, the rebuilt tree is printed
    :return: None
    """
    listeBits, listeCaracs = separerCarac(
        bitsDuFichier
    )  # original note: bitsDuFichier only holds tree + text after this line (presumably separerCarac trims it in place - TODO confirm)

    nombreDeFeuilles = 0  # number of leaves created so far by reconArbre

    def reconArbre(arbre, listeBits):
        # Recursively rebuild the tree from its bit description: a 1 bit is a
        # leaf, a 0 bit is a node followed by its left and right subtrees.
        # The `arbre` argument is only passed along, never read.
        nonlocal nombreDeFeuilles
        if (listeBits[0] == 1):  # We are on a leaf
            if (nombreDeFeuilles != listeCaracs[0]):
                if (
                        nombreDeFeuilles >= listeCaracs[0]
                ):  # listeCaracs[0] is the rank of the end-of-file leaf, which shifts the indices of the leaves before it
                    carac = listeCaracs[nombreDeFeuilles]
                else:
                    carac = listeCaracs[nombreDeFeuilles + 1]
            else:
                carac = -1  # End-of-file character
            nombreDeFeuilles += 1
            return [0, carac]  # Return the leaf
        else:
            k = 0  # This part finds where the right branch starts: it counts node bits (0) and leaf bits (1) until both counts are equal; the left branch then ends at the k-th bit
            count = [0, 0]
            for i in listeBits:
                k += 1
                if (i == 0):
                    count[0] += 1
                else:
                    count[1] += 1
                if (count[0] == count[1]):
                    break

            return [
                0,
                reconArbre(arbre, listeBits[1:]),
                reconArbre(arbre, listeBits[k:])
            ]  # Return the node

    arbre = reconArbre([], listeBits)
    if (bavard):
        print(Huffman.afficher_arbre(arbre, [1], "(" + str(arbre[0]) + ")"))
    # The text is decoded from the bits starting at index 2 * len(listeCaracs) - 1.
    donnees = Huffman.convertirHuffman(
        bitsDuFichier[2 * len(listeCaracs) - 1:], arbre)
    enregistrerFichier(donnees, nomFichier)
Example #8
0
 def get_decodeStr(self, huffmanStr):
     """
     Decode a string of concatenated codes back into the original text.

     The character/frequency table and the code list are computed on first
     use and stored on the instance. The code at index i of
     ``Huffman_codes`` stands for the character ``chars_freqs[i][0]``.

     :param huffmanStr: the encoded string
     :return: the decoded string
     :raises ValueError: if the remaining input does not start with any
         known code (previously this case looped forever)
     """
     if not self.chars_freqs:
         self.chars_freqs = Huffman.cal_count_freq(self.get_content())
     if not self.Huffman_codes:
         self.Huffman_codes = Huffman.cal_Huffman_codes(self.chars_freqs)
     decoded = []
     while huffmanStr != '':
         consumed = False
         for i, code in enumerate(self.Huffman_codes):
             # An empty code matches everywhere without consuming input,
             # so it can never make progress and is skipped.
             if code and huffmanStr.startswith(code):
                 decoded.append(self.chars_freqs[i][0])
                 huffmanStr = huffmanStr[len(code):]
                 consumed = True
         if not consumed:
             # A full pass over the codes matched nothing: the input is not
             # a valid encoding, and looping again cannot change that.
             raise ValueError(
                 'cannot decode: no Huffman code matches %r' % huffmanStr[:16])
     return ''.join(decoded)
Example #9
0
 def get_huffman_codes(self):
     """
     List the (character, code) pairs of the content's distinct characters.

     The character/frequency table and the code list are computed on first
     use and stored on the instance. Pairs are returned in the order in
     which the characters first appear in the content. A character with no
     entry in ``chars_freqs`` is left out.

     :return: list of (character, code) tuples
     """
     if not self.chars_freqs:
         self.chars_freqs = Huffman.cal_count_freq(self.get_content())
     if not self.Huffman_codes:
         self.Huffman_codes = Huffman.cal_Huffman_codes(self.chars_freqs)
     # Index of the first table entry for each character; only the first
     # match was ever used, since a character is emitted just once.
     first_index = {}
     for i, item in enumerate(self.chars_freqs):
         first_index.setdefault(item[0], i)
     codes = []
     seen = set()
     for char in self.get_content():
         if char in seen:
             continue
         idx = first_index.get(char)
         if idx is None:
             continue
         codes.append((char, self.Huffman_codes[idx]))
         seen.add(char)
     return codes
def get_file_data_p(project_name, path, set_file, dict_file_label, dict_file_hand_craft, gv=global_var):
    """
    Collect the label, full class name, hand-crafted features and word2vec
    token vectors for the selected files of one project.

    The result is cached under a name built from ``path``, this function's
    name and ``gv.w2v_cnn_params['vec_size']``; a cached result is returned
    as is.

    :param project_name: project whose word2vec data is loaded into ``gv``
    :param path: root directory walked for source files
    :param set_file: collection of full class names to process
    :param dict_file_label: maps full class name -> label
    :param dict_file_hand_craft: maps full class name -> hand-crafted features
    :param gv: object providing the loaders, the cache and ``word_to_vec``
    :return: a list with one record per parsed file:
        [label, full_class_name, hand_craft_data, token_vec, token_vec, ...]
    """
    gv.load_word2vec(project_name)
    gv.load_token_vec_length(project_name)
    method_name = 'get_file_data_p'
    cache_name = '%s_%s_%d' % (path, method_name, gv.w2v_cnn_params['vec_size'])
    result = gv.load_cache(cache_name)
    if result is not None:
        logging.info('load cache success in %s' % cache_name)
        return result
    result = []
    for root, dirs, files in os.walk(path):
        for file_name in files:
            full_class_name = hf.get_full_class_name(root, file_name)
            if full_class_name not in set_file:
                continue
            file_path = os.path.join(root, file_name)
            # This message used to be a bare string expression, i.e. a no-op.
            logging.debug('processing file %s in get_file_data_p' % file_path)
            input_data = []  # embedded token vectors of one file
            with open(file_path, 'r', encoding='utf-8') as file_obj:
                try:
                    ast_tree = jl.parse.parse(file_obj.read())
                    # The first element of each pair is unused; it is not
                    # bound to ``path`` so the parameter is not shadowed.
                    for _, node in ast_tree:
                        node_name = hf.get_node_name(node)
                        if node_name is not None:
                            input_data.append(gv.word_to_vec[node_name].cpu().numpy().tolist())
                    # padding(input_data, gv.params['token_vec_length'])
                    # Prepend in reverse so the record starts with
                    # label, full class name, hand-crafted features.
                    input_data.insert(0, dict_file_hand_craft[full_class_name])
                    input_data.insert(0, full_class_name)
                    input_data.insert(0, dict_file_label[full_class_name])
                    result.append(input_data)
                except jl.parser.JavaSyntaxError:
                    logging.error('parse file %s error' % file_path)
                except AttributeError:
                    logging.error('parse file %s attribute error in get_file_data_p ' % file_path)
                except UnicodeDecodeError:
                    logging.error('parse file %s unicode decode error' % file_path)
    gv.dump_cache(cache_name, result)
    return result
Example #11
0
def ouvrirFichier(arguments):
    """
    Open the file named on the command line and start its compression.

    The first argument may be "-b" / "--bavard" (verbose); the file name then
    follows it. A file whose name ends in ".hff" is rejected as already
    compressed. Otherwise the file is read through Huffman.listerCarac and
    handed to traiterFichier.

    :param arguments: list of command-line arguments (without the program name)
    :return: an exception *class* (not raised) on error - EnvironmentError
        when the arguments are missing, FileNotFoundError when the file
        cannot be opened, ValueError for a ".hff" file - otherwise None
    """
    if (len(arguments) == 0):
        print("Pas d'arguments donnes. Le programme ne s'executera pas")
        return EnvironmentError
    # The first argument may be the verbose flag.
    bavard = arguments[0] in ("-b", "--bavard")
    if bavard:
        if len(arguments) < 2:
            # The flag alone used to crash with IndexError.
            print("Pas de fichier donne. Le programme ne s'executera pas")
            return EnvironmentError
        nomFichier = arguments[1]
    else:
        nomFichier = arguments[0]
    try:
        fichier = open(nomFichier, 'rb')
    except IOError:
        print("Fichier invalide")
        return FileNotFoundError
    # The handle is released on every path, including the early return.
    with fichier:
        if nomFichier.endswith(".hff"):
            print("Ce fichier est deja compresse !")
            return ValueError
        texteInBytes, dictCarac = Huffman.listerCarac(fichier)
    if (len(texteInBytes) <= 1):  # the file is empty or holds a single character
        print("Fichier vide ou trop petit. Le programme ne s'executera pas")
        return
    traiterFichier(dictCarac, texteInBytes, nomFichier, bavard)
 def load_posting_list_parts(self, stem):
     """
     Read and decode the posting list stored for ``stem``.

     The first entry of ``self.seek_list[stem]`` gives the (offset, size) of
     the encoded bytes inside ``self.index_file``.

     :param stem: key looked up in the seek list
     :return: a list starting with ``stem`` followed by the decoded text
         split on ``posting_list_separator``
     """
     location = self.seek_list[stem][0]
     start, length = location
     self.index_file.seek(start)
     raw = self.index_file.read(length)
     text = Huffman.decode(raw, self.symbol_to_encoding_dict)
     parts = text.split(posting_list_separator)
     return [stem] + parts
Example #13
0
def ec_ac(input):
    '''
    : ec_ac: entropy-coding module for the AC coefficients
    : param input: list[tuple(int, int)], the sequence of tuples obtained by RLC-encoding the AC coefficients of a single 8*8 block; the first integer is the run length, the second is the first value met after the run. The list is left unmodified.
    : return: (dict{tuple(int, int), str}, list[tuple(str, str)]), where the dict is the Huffman code table for (run length, number of binary digits) and the list is the encoded result
    '''
    # Binarize the value of every tuple into new tuples, rather than
    # overwriting the caller's list (which made a second call fail).
    # Each entry becomes ((run length, number of digits), bit string).
    coded = []
    for item in input:
        digits = binarize(item[1])  # (number of digits, bit string)
        coded.append(((item[0], digits[0]), digits[1]))

    extend_input = []
    for entry in coded:
        run, size = entry[0]
        if run > 15:  # split long runs into (15, 0) markers so every run length is <= 15
            extend_input.extend([((15, 0), '')] * (run // 15))
            extend_input.append(((run % 15, size), entry[1]))
        else:
            extend_input.append(entry)

    # Count each distinct (run length, number of digits) key in one pass,
    # keeping the keys in first-occurrence order.
    position = {}
    num_lis = []  # distinct keys
    num_freq = []  # occurrence count of the key at the same index
    for entry in extend_input:
        key = entry[0]
        idx = position.get(key)
        if idx is None:
            position[key] = len(num_lis)
            num_lis.append(key)
            num_freq.append(1)
        else:
            num_freq[idx] += 1

    hftree = hf.build_huffman_tree(num_lis, num_freq)  # build the Huffman tree
    dic = hf.generate_huffman_code(hftree)  # derive the codes from the tree

    # Replace every key by its Huffman code; the bit string is kept as is.
    extend_input = [(dic[entry[0]], entry[1]) for entry in extend_input]

    return (dic, extend_input)
Example #14
0
class TestHuffman(unittest.TestCase):
    """Round-trip check of the Huffman coder on a fixed sample phrase."""

    def setUp(self):
        """Create a coder and build it from the sample phrase."""
        coder = Huffman()
        sample = 'The quick brown fox jumped over the lazy dog. Filler text to create a bigger shift in the character probabilities.'
        self.huffman = coder
        self.phrase = sample
        coder.build(sample)

    def testPhrase(self):
        """Encoding then decoding the phrase must give the phrase back."""
        coder, text = self.huffman, self.phrase
        coder.build(text)
        encoded = coder.encode(text)
        self.assertEqual(text, coder.decode(encoded))
Example #15
0
class TestHuffman(unittest.TestCase):
    """Checks that a phrase survives a Huffman encode/decode cycle."""

    def setUp(self):
        """Prepare the coder: instantiate it and build it from the phrase."""
        text = 'The quick brown fox jumped over the lazy dog. Filler text to create a bigger shift in the character probabilities.'
        self.huffman = Huffman()
        self.phrase = text
        self.huffman.build(text)

    def testPhrase(self):
        """Rebuild from the phrase, then compare it with its decoded encoding."""
        self.huffman.build(self.phrase)
        decoded = self.huffman.decode(self.huffman.encode(self.phrase))
        self.assertEqual(self.phrase, decoded)
Example #16
0
def traiterFichier(dictCarac, texteInBytes, nomFichier, bavard):
    """
    Compress a text with Huffman coding and save the result.

    Builds the tree and the code table, serialises the tree and the encoded
    text (followed by the end-of-text code) as bits, packs the bits into
    bytes, prepends the character list and calls enregistrerFichier.

    :param dictCarac: character data handed to Huffman.creerArbre
    :param texteInBytes: the text handed to Huffman.convertirTexte
    :param nomFichier: name passed on to enregistrerFichier
    :param bavard: when truthy, the tree and the table are printed
    :return: None
    """
    arbre = Huffman.creerArbre(dictCarac)
    table = Huffman.etablirTable(arbre)

    # Tree structure as a string of 0s and 1s.
    strBitsArbre = Huffman.arbreVersBits(arbre)

    strBitsTexte = Huffman.convertirTexte(texteInBytes, table)
    strBitsTexte += str(table[-1])  # append the end-of-text character code

    strBitsArbreEtTexte = strBitsArbre + strBitsTexte

    # Pad with 0s to a whole number of bytes. The loop used to run
    # len // 8 times, which silently dropped the last partial byte.
    reste = len(strBitsArbreEtTexte) % 8
    if reste:
        strBitsArbreEtTexte += "0" * (8 - reste)
    bytesArbreEtTexte = [
        int(strBitsArbreEtTexte[debut:debut + 8], 2)
        for debut in range(0, len(strBitsArbreEtTexte), 8)
    ]

    listeBytesCarac = []
    listeCaracTemporaire = []
    for k, (carac, _) in enumerate(table.items()):
        if (carac == -1):  # rank of the end-of-file character goes first
            listeBytesCarac.append(k)
        else:  # the other characters follow, in table order
            listeCaracTemporaire.append(carac)
    # Rank of the end-of-file character, then the list of other characters.
    listeBytesCarac += listeCaracTemporaire
    listeBytesCarac.append(listeBytesCarac[-1])  # the last character is doubled

    if (bavard):
        print(
            Huffman.afficher_arbre(
                arbre, [1], "##### ARBRE #####\n(" + str(arbre[0]) + ")"))
        Huffman.afficher_table(table)

    enregistrerFichier(listeBytesCarac + bytesArbreEtTexte, nomFichier)
Example #17
0
def testsHuffman():
    """Run assertion checks on the Huffman module, printing a banner before and after."""
    print("### Tests Huffman ###")

    # creerArbre: an empty input gives a weight-0 tree holding only the
    # end-of-file character (-1).
    arbreVide = Huffman.creerArbre({})
    assert arbreVide == [0, -1]
    arbreDeuxCaracs = Huffman.creerArbre({100: 3, 101: 1})
    assert arbreDeuxCaracs == [4, [1, [0, -1], [1, 101]], [3, 100]]
    arbrePoidsNul = Huffman.creerArbre({100: 0})
    assert arbrePoidsNul == [0, [0, 100], [0, -1]]

    # remonterAuNoeudNonExplore: must climb back to just before the last
    # time the left branch was taken (1 means left, 2 means right).
    chemin = Huffman.remonterAuNoeudNonExplore([1, 1, 2, 2, 2])
    assert chemin == [1]
    cheminCourt = Huffman.remonterAuNoeudNonExplore([1])
    assert cheminCourt == []

    # arbreVersBits
    bits = Huffman.arbreVersBits([4, [1, [0, -1], [1, 101]], [3, 100]])
    assert bits == "00111"

    print("### Huffman - OK ###")
Example #18
0
 def setUp(self):
     """Create the coder and build it from the sample phrase."""
     coder = Huffman()
     sample = 'The quick brown fox jumped over the lazy dog. Filler text to create a bigger shift in the character probabilities.'
     self.huffman = coder
     self.phrase = sample
     coder.build(sample)
Example #19
0
        fin = open(input_file_name, "r")
        raw_file = fin.read()
        fin.close()
        # fake transmission
        message_out = str()
        for i in raw_file:
            if random() > Z[int(i)][0]:
                message_out = message_out + "1"
            else:
                message_out = message_out + "0"
        # check, not necessary
        # print message_out
        print "[*]Length after transport:", len(message_out)
        # write output-file
        f = open(output_file_name, "w+")
        f.write(message_out)
        f.close()

    except Exception, e:
        raise e


#test if function is correct
if __name__ == "__main__":
    # Manual check: push the Hamming-coded file through the channel, then
    # decode the channel output with Hamming followed by Huffman.
    transmission("hamming_out.txt", "channel_out.txt")
    import Hamming, Huffman
    # Hamming decoding of the channel output
    Hamming.decode("hamming_table_out.txt", "channel_out.txt", "hamming_decode_out_from_channel.txt")
    # Huffman decoding of the Hamming-decoded output
    Huffman.decode("huffman_table_out.txt", "hamming_decode_out_from_channel.txt", "huffman_decode_out_from_channel.txt")
Example #20
0
 def ejecutarHM(self, texto=""):
     """
     Encode ``texto`` with a fresh ``hm.Huffman`` coder.

     :param texto: text handed to the coder's codificar(); defaults to ""
     :return: whatever codificar() returns
     """
     return hm.Huffman().codificar(texto)
Example #21
0
def jpgPipeline(im, Qmat):
    """
    Run an image through a JPEG-style encode/decode round trip.

    Forward steps: RGB -> YCbCr, 2x subsampling of Cb and Cr, split into
    blocks, DCT, quantification with ``Qmat``, zigzag, Huffman encoding
    (used only to measure the encoded size). The inverse steps then rebuild
    an RGB image from the zigzag output. Both the YCbCr image and the final
    image are shown with matplotlib, and the sizes are printed.

    NOTE(review): ``originalSize`` is read near the end but is not assigned
    in this function - presumably a module-level variable; confirm.

    :param im: input image handed to rgb2ycbcr.rgb2ycbcr
    :param Qmat: quantification matrix used for all three channels
    :return: the reconstructed image
    """
    # conversion to YCbCr
    immod = rgb2ycbcr.rgb2ycbcr(im)
    plt.figure()
    plt.imshow(immod, cmap=plt.get_cmap('gray'))
    plt.draw()

    # separation of each channel
    Y = immod[:,:,0]
    Cb = immod[:,:,1]
    Cr = immod[:,:,2]

    # subsampling of Cb and Cr: keep every second row and column
    Cbsub = Cb[0::2, 0::2]
    Crsub = Cr[0::2, 0::2]

    # Split into 8x8 regions
    Ysplit = imgsplit.imgsplit(Y)
    Cbsplit = imgsplit.imgsplit(Cbsub)
    Crsplit = imgsplit.imgsplit(Crsub)

    # DCT
    YDCT = DCT.dctloop(Ysplit)
    CbDCT = DCT.dctloop(Cbsplit)
    CrDCT = DCT.dctloop(Crsplit)

    # quantification
    Yq = quantification.quantify(YDCT, Qmat)
    Cbq = quantification.quantify(CbDCT, Qmat)
    Crq = quantification.quantify(CrDCT, Qmat)

    # zigzag
    Yz = zigzag.zigzag(Yq)
    Cbz = zigzag.zigzag(Cbq)
    Crz = zigzag.zigzag(Crq)

    # Huffman
    YH = Huffman.huffman_encoding(Yz)
    CbH = Huffman.huffman_encoding(Cbz)
    CrH = Huffman.huffman_encoding(Crz)

    # size calculation: sum of the three encoded channels
    finalSize = YH.getSize()
    finalSize += CbH.getSize()
    finalSize += CrH.getSize()

    # convert back

    # Huffman decoding
    # skipped, as we were told we could skip it; the inverse path starts
    # from the zigzag output instead


    # reverse zigzag
    Yq = zigzag.zagzig(Yz)
    Cbq = zigzag.zagzig(Cbz)
    Crq = zigzag.zagzig(Crz)

    # quantification inverse
    YDCT = quantification.dequantify(Yq, Qmat)
    CbDCT = quantification.dequantify(Cbq, Qmat)
    CrDCT = quantification.dequantify(Crq, Qmat)

    # inverse DCT
    Ysplit = DCT.idctloop(YDCT)
    Cbsplit = DCT.idctloop(CbDCT)
    Crsplit = DCT.idctloop(CrDCT)

    # recombine the 8x8 regions
    Yunsplit = imgsplit.imgunsplit(Ysplit)
    Cbunsplit = imgsplit.imgunsplit(Cbsplit)
    Crunsplit = imgsplit.imgunsplit(Crsplit)

    # unsubsampling of Cb and Cr: repeat each sample twice along both axes
    Cbunsubx = np.repeat(Cbunsplit, 2, axis=0)
    Cbunsub = np.repeat(Cbunsubx, 2, axis=1)
    Crunsubx = np.repeat(Crunsplit, 2, axis=0)
    Crunsub = np.repeat(Crunsubx, 2, axis=1)

    # stack the three channels back into one image
    im_recombine = np.dstack((Yunsplit, Cbunsub, Crunsub))

    # convert back to RGB
    imfinal = rgb2ycbcr.ycbcr2rgb(im_recombine)
    plt.figure()
    plt.imshow(imfinal, cmap=plt.get_cmap('gray'))
    plt.draw()
    plt.show()

    # fraction of the original size that was saved
    compression = 1-(float(finalSize)/float(originalSize))

    print "original size: " + str(originalSize)
    print "final size: " + str(finalSize)
    print "compression rate: " + str(compression)

    return imfinal
Example #22
0
def make_huffman_tree():
    """Return the result of Huffman.huffman() applied to ``queue``.

    NOTE(review): ``queue`` is not defined in this function - presumably a
    module-level priority queue; confirm.
    """
    return Huffman.huffman(queue)
        Decodes a single audio channel of data based on the values of its scale factors,
        bit allocations, quantized mantissas, and overall scale factor.
        """
        #Passes decoding logic to the Decode function defined in the codec module
        return codec.Decode(scaleFactor,bitAlloc,mantissa, overallScaleFactor,codingParams,LRMS)

#-----------------------------------------------------------------------------

# Testing the full PAC coder
if __name__=="__main__":

    import time
    from pcmfile import * # to get access to WAV file handling

    '''==============Huffman coder init====================='''
    huffman = Huffman()
    '''====================================================='''

    test = [True, False, False, False, False]

    coded_filename = "../coded/coded.wak"

    if test[0]:
        input_filename = "../inputs/castanets.wav"
        output_filename = "../outputs/castanets.wav"

    if test[1]:
        input_filename = "../inputs/trumpet.wav"
        output_filename = "../outputs/trumpet.wav"

    if test[2]:
import Huffman as hf
import HuffmanDecode as hfd

# Round-trip check: encode a sample sequence with hf, decode it with hfd and
# compare the decoded symbols with the original ones.
teststr = [
    1, 1, 1, 1, 1, 1, 1, 3, 4, 5, 1, 3, 4, 5, 1, 3, 4, 5, 1, 3, 4, 5, 1, 3, 4,
    5, 1, 3, 4, 5
]
# The first returned value is named code_dict so the builtin dict is not shadowed.
(code_dict, dict_depth, sy) = hf.huffman(teststr)
strr = hfd.generateString(teststr, dict_depth, sy)
decoded = hfd.HuffmanDecode(strr, sy, dict_depth)

# A decoded sequence of a different length is an error in itself; comparing
# index by index alone would either miss it or raise IndexError.
right = len(decoded) == len(teststr)
if not right:
    print("error!\tlength origin=", end='\t')
    print(len(teststr), end='\t')
    print("error!\tlength decoded", end='\t')
    print(len(decoded), end='\n')

for expected, actual in zip(teststr, decoded):
    if actual != expected:
        print("error!\torigin=", end='\t')
        print(expected, end='\t')
        print("error!\tdeocded", end='\t')
        print(actual, end='\n')
        right = False

if right:
    # Print the decoded symbols, with a line break after index 0, 5, 10, ...
    for i, symbol in enumerate(decoded):
        print(symbol, end='\t')
        if i % 5 == 0:
            print()
Example #25
0
 def setUp(self):
     """Instantiate the coder, store the sample phrase and build the coder from it."""
     text = 'The quick brown fox jumped over the lazy dog. Filler text to create a bigger shift in the character probabilities.'
     self.huffman = Huffman()
     self.phrase = text
     self.huffman.build(text)
Example #26
0
        bit allocations, quantized mantissas, and overall scale factor.
        """
        #Passes decoding logic to the Decode function defined in the codec module
        return codec.Decode(scaleFactor, bitAlloc, mantissa,
                            overallScaleFactor, codingParams, LRMS)


#-----------------------------------------------------------------------------

# Testing the full PAC coder
if __name__ == "__main__":

    import time
    from pcmfile import *  # to get access to WAV file handling
    '''==============Huffman coder init====================='''
    huffman = Huffman()
    '''====================================================='''

    test = [True, False, False, False, False]

    coded_filename = "../coded/coded.wak"

    if test[0]:
        input_filename = "../inputs/castanets.wav"
        output_filename = "../outputs/castanets.wav"

    if test[1]:
        input_filename = "../inputs/trumpet.wav"
        output_filename = "../outputs/trumpet.wav"

    if test[2]:
Example #27
0
# Initiates our recursive walk.
def in_order_walk_with_path(T):
    """Start the walk of ``T`` by calling the helper with an empty path string."""
    in_order_walk_with_helper("", T)


# Writes the occurence to output file.
def write_occurences_to_output():
    """Write each entry of ``occurences``, in order, to the output bit stream as a 32-bit integer."""
    for count in occurences:
        bitstreamout.writeint32bits(count)

# Reads the inputfile again and for each byte we write its Huffman code to the output file. 
def writes_codes_to_output():
    """
    Re-read the input file (``sys.argv[1]``) and write the Huffman code of
    every byte, bit by bit, to the output bit stream; then close the stream.

    ``codes`` is indexed by byte value and each code is iterated as a
    sequence of characters convertible with int().
    """
    # The input file is closed as soon as it has been read; it used to
    # stay open. Reading in chunks avoids one read call per byte.
    with open(sys.argv[1], "rb") as inputfile:
        while True:
            chunk = inputfile.read(65536)
            if chunk == b"":
                break
            for value in chunk:
                for bit in codes[value]:
                    bitstreamout.writebit(int(bit))
    bitstreamout.close()


# Driver: count the occurrences, build the queue and the Huffman tree, walk
# the tree, then write the occurrence table followed by the encoded data.
read_file_occurences()
queue = create_priority_queue()
ele = Huffman.huffman(queue)
in_order_walk_with_path(ele)
write_occurences_to_output()
writes_codes_to_output()
    def huffman_compression(self, generate_encoding=False):
        """
        Compress ``index.csv`` with a Huffman encoding and save a seek list.

        The symbol-to-encoding table is either derived from the character
        counts of the index (``generate_encoding=True``) or taken from a
        hard-coded table. It is pickled to ``symbol_to_encoding_dict.pickle``.
        Every index line, minus its term, is then encoded and appended to
        ``compressed_index``; the (offset, length) of each encoded line is
        recorded per term and saved as ``compressed_seek_list.dawg``.

        :param generate_encoding: derive the encoding from the index instead
            of using the built-in table
        """
        # compress using Huffman encoding
        symbol_to_encoding_dict = {}

        # count all occuring UTF-8 characters
        if generate_encoding:
            symbol_to_frequency_dict = Counter()
            with self.report.measure('counting utf8 characters'):
                with open(f'{self.directory}/index.csv') as index_file:
                    chunk_size = 100000

                    def next_chunk_generator():
                        # yield the file in pieces of chunk_size characters
                        # until read() returns an empty string
                        chunk = index_file.read(chunk_size)
                        while chunk:
                            yield chunk
                            chunk = index_file.read(chunk_size)

                    for i, chunk in enumerate(next_chunk_generator(), 1):
                        symbol_to_frequency_dict.update(Counter(chunk))
                        self.report.progress(
                            i, f' chunks counted ({chunk_size} characters '
                            'each)', 100)
                # the newline character is excluded from the encoding
                if '\n' in symbol_to_frequency_dict.keys():
                    del symbol_to_frequency_dict['\n']

            # derive huffman encoding from character counts
            with self.report.measure('deriving huffman encoding'):
                symbol_to_encoding_dict = Huffman.derive_encoding(
                    symbol_to_frequency_dict)
            # NOTE(review): symbol_to_encoding_list is not defined in this
            # method - presumably a module-level table indexed by code
            # point; confirm it exists.
            for key, value in symbol_to_encoding_dict.items():
                assert (len(key) == 1)
                symbol_to_encoding_list[ord(key[0])] = value
            # NOTE(review): the same file is written again after the
            # if/else below, so this dump looks redundant.
            with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
                      mode='wb') as f:
                pickle.dump(symbol_to_encoding_dict, f,
                            pickle.HIGHEST_PROTOCOL)
        else:
            # optimal encoding for guardian
            # character distribution should be similar for all datasets
            symbol_to_encoding_dict = {
                '\a': BitArray('1111'),
                ',': BitArray('001'),
                '0': BitArray('1000'),
                '1': BitArray('011'),
                '2': BitArray('010'),
                '3': BitArray('000'),
                '4': BitArray('1110'),
                '5': BitArray('1101'),
                '6': BitArray('1100'),
                '7': BitArray('1011'),
                '8': BitArray('1010'),
                '9': BitArray('1001')
            }

        # persist whichever encoding table is in use
        with open(f'{self.directory}/symbol_to_encoding_dict.pickle',
                  mode='wb') as f:
            pickle.dump(symbol_to_encoding_dict, f, pickle.HIGHEST_PROTOCOL)

        # save compressed index and corresponding seek_list
        with self.report.measure('saving compressed files'):
            self.compressed_seek_list = []
            with open(f'{self.directory}/compressed_index', mode='wb') \
                    as compressed_index_file:
                offset = 0
                for i, orig_line in enumerate(
                        binary_read_line_generator_path(
                            f'{self.directory}/index.csv'), 1):
                    # the term is the first field of the line
                    term = next(
                        csv.reader(io.StringIO(orig_line),
                                   delimiter=posting_list_separator))[0]
                    # drop the term plus 3 more characters (presumably its
                    # quoting and the separator - TODO confirm)
                    line_without_term = orig_line[len(term) + 3:]
                    encoded_line = Huffman.encode(line_without_term,
                                                  symbol_to_encoding_dict)
                    compressed_index_file.write(encoded_line)

                    # remember where this term's encoded line starts and
                    # how long it is
                    self.compressed_seek_list.append(
                        (term, (offset, len(encoded_line))))

                    self.report.progress(i, ' index lines compressed', 100000)

                    offset += len(encoded_line)
            self.compressed_seek_list = \
                RecordDAWG('>QQ', self.compressed_seek_list)
            self.compressed_seek_list.save(
                f'{self.directory}/compressed_seek_list.dawg')
Example #29
0
# by incrementing the table in the position corresponding to the byte read
# The while loop continues as long as there are more bytes to read
byte = inFile.read(blockSize)
while byte != b'':
    table[byte[0]] += 1
    byte = inFile.read(blockSize)

# Input file is read, and the stream gets closed
bitstreamin.close()

# Writes the frequency table to the output file, as 32 bit integers
for i in range(tableSize):
    bitstreamout.writeint32bits(table[i])

# Here the huffman tree is populated from the frequency table
pq = Huffman.huffman(table)

# Recursive function for writing the Huffman codes of each byte to a list 'dictionary'
# It does an in-order traversal of the tree while keeping track of the path in the 'code'
# takes the arguments:
# 'e', the element to go from, initially the root of the tree
# 'code' an initially empty string, that it uses to keep track of the path down the tree
# 'dictionary' an initially empty list, where the codes are stored.


def populateDictionary(e, code, dictionary):

    if len(e.data) == 2:
        code += '0'
        populateDictionary(e.data[0], code, dictionary)
        code = code[:-1]
Example #30
0
    o = binascii.b2a_hex(gzip.compress(i))
    print(len(i))
    print(len(o))
    outfile.write(o)
    outfile.close()
    infile.close()

def a2b_fast(open_file):
    """
    Decode a hex-encoded, gzip-compressed file back into its original bytes.

    :param open_file: sequence whose first item is the input file object
        (hex text, opened for binary reading) and whose second item is the
        output file object (opened for binary writing)
    :return: None. Both files are closed, even if decoding fails.
    """
    infile = open_file[0]
    outfile = open_file[1]
    try:
        encoded = infile.read()
        outfile.write(gzip.decompress(binascii.a2b_hex(encoded)))
    finally:
        # Close both handles whatever happened above; the nested finally
        # makes sure a failing close of one does not leak the other.
        try:
            outfile.close()
        finally:
            infile.close()

def open_file(source_path,target_path):
    """
    Open the source for binary reading and the target for binary writing.

    :param source_path: path of the file to read
    :param target_path: path of the file to create or overwrite
    :return: the list [infile, outfile]; the caller must close both
    :raises OSError: if either file cannot be opened
    """
    infile = open(source_path,"rb")
    try:
        outfile = open(target_path,"wb")
    except Exception:
        # Do not leak the input handle when the target cannot be opened.
        infile.close()
        raise
    return [infile,outfile]
    
if __name__=="__main__":
    # Manual run: compress one picture with the Huffman class.
    # NOTE(review): the paths are hard-coded to one user's Windows desktop.
    source_path = r'C:\Users\wiwang\Desktop\File\UDM-pic\test_case_1.png'
    target_path = r'C:\Users\wiwang\Desktop\est_case_1.txt'
    h = Huffman.Huffman()
    h.compress(source_path,target_path)
    # Alternative kept for reference: gzip + hex encoding of the same file.
    #b2a_fast(open_file(source_path,target_path))
    
    # Alternative kept for reference: decode a hex text file back to binary.
    #source_path = r'C:\Users\wiwang\Desktop\python-3.8.0.txt'
    #target_path = r'C:\Users\wiwang\Desktop\python-3.8.0.exe'
    #a2b_fast(open_file(source_path,target_path))
Example #31
0
import Huffman

if __name__ == "__main__":
    # Round trip: compress test.txt into an archive, then decompress it again.
    archive = "ana"
    codec = Huffman.Huffman()
    codec.compress(filename="test.txt", save_as=archive, debug=True)
    codec.decompress(filename=archive, save_as="test2", debug=True)
Example #32
0
            code_table[line[5:-1]] = line[0:4]

        decoded_file = str()
        for i in range(len(raw_file)/7):
            mark = "".join(raw_file[7*i:7*i+7]) + "\n" + check("".join(raw_file[7*i:7*i+7]), error_flag)
            decoded_file = decoded_file + code_table[check("".join(raw_file[7*i:7*i+7]), error_flag)]
        # check, not necessary
        # print code_table
        # print decoded_file
        print "[*]Length after Hanmming decode:", len(decoded_file)
        # write output-file
        f = open(output_file_name, "w+")
        f.write(decoded_file)
        f.close()

    except Exception, e:
        print "==========================="
        print mark
        print "+++++++++++++++++++++++++++"
        raise e


#test if function is correct
if __name__ == "__main__":
    # Manual check of the Hamming module: write the error-flag and code
    # tables, Hamming-encode the Huffman output, decode it again, then let
    # Huffman decode the result.
    error_flag("error_flag.txt")
    make_code_table("hamming_table_out.txt")
    encode("hamming_table_out.txt", "huffman_out.txt", "hamming_out.txt")
    decode("hamming_table_out.txt", "hamming_out.txt", "hamming_decode_out.txt")
    import Huffman
    Huffman.decode("huffman_table_out.txt", "hamming_decode_out.txt", "huffman_decode_out.txt")
Example #33
0
# and the required size of the table is calculated from that
blockSize = 1
tableSize = 2**(blockSize * 8)

# Initialize frequency table as an empty list
table = list()
totalBlocks = 0

# Populate the table, from the huffman encoded file
# We keep track of the number of blocks from the original file
for i in range(tableSize):
    x = bitstreamin.readint32bits()
    table.append(x)
    totalBlocks += x

tree = Huffman.huffman(table)

# Initialize variables used in decoding
writtenBlocks = 0
position = tree[0]

# It starts from the root, which is at index 0 in the list
# it sums up the written blocks, to know when it's done
while writtenBlocks < totalBlocks:
    x = bitstreamin.readbit()
    if x == 0:
        position = position.data[0]
    else:
        position = position.data[1]
    if len(position.data) == 1:
        outFile.write(bytes([position.data[0]]))
    # Shannon Fano
    sf = ShannonFano.ShannonFano(name)
    encode_file_name_sf = 'encode/' + i + '.sf'

    start_sf = time.time()
    sf.encode(encode_file_name_sf)
    end_sf = time.time()
    run_time_sf = (end_sf - start_sf)

    before_size = os.path.getsize(name)
    after_size_sf = os.path.getsize(encode_file_name_sf)
    compression_ratio_sf = before_size / after_size_sf

    # Huffman Coding
    hc = Huffman.HuffmanCoding(name)
    encode_file_name_hc = 'encode/' + i + '.hc'

    start_hc = time.time()
    hc.compress(encode_file_name_hc)
    end_hc = time.time()
    run_time_hc = end_hc - start_hc

    after_size_hc = os.path.getsize(encode_file_name_hc)
    compression_ratio_hc = before_size / after_size_hc

    print("Shannon-Fano: ", i, sf.row, 'x', sf.col, compression_ratio_sf,
          run_time_sf, 's')
    print("Huffman Coding: ", i, hc.row, 'x', hc.col, compression_ratio_hc,
          run_time_hc, 's')
Example #35
0
from Huffman import *
import time
import os
import sys


def cls():
    """
    Clear the terminal by running the shell command that fits the
    operating system: ``cls`` when os.name is 'nt', ``clear`` otherwise.
    :return: None
    """
    if os.name == 'nt':
        command = 'cls'
    else:
        command = 'clear'
    os.system(command)


# Module-level Huffman instance, created once at import time.
HH = Huffman()


def menu() -> None:
    """
    A menu system giving user the options
    :return:
    """
    options = input(
        f"Enter an option to begin:\n\n{1}. create codes\n{2}. encode a file\n{3}. decode a file\n\n{0}. to exit\n"
    )
    try:
        options = int(options)
    except:
        pass
        #logging.info("Not an integer")
    if (options == 1) or (options == 2) or (options == 4):
Example #36
0
# -*- coding:utf-8 -*-

import Huffman, Hamming, Channel

#test if function is correct
if __name__ == "__main__":
    # End-to-end run: Huffman-encode, Hamming-encode, send through the
    # channel, then decode with and without the Hamming step.
    # Huffman encoding
    Huffman.make_code_table("frequency.txt", "huffman_table.out")
    Huffman.encode("huffman_table.out", "raw_file.txt", "huffman.out", 4)
    # Hamming encoding
    print "=====message with Hamming-coding====="
    Hamming.error_flag("error_flag.out")
    Hamming.make_code_table("hamming_table.out")
    Hamming.encode("hamming_table.out", "huffman.out", "hamming.out")
    # message is in the channel
    Channel.transmission("hamming.out", "channel.out")
    # Hamming decoding
    Hamming.decode("hamming_table.out", "channel.out", "hamming_decode_out_from_channel.out", "error_flag.out")
    # Huffman decoding
    Huffman.decode("huffman_table.out", "hamming_decode_out_from_channel.out", "huffman_decode_out_from_channel.out")
    print "=====message *without* Hamming-coding====="
    # message is in the channel
    # NOTE(review): this sends the Hamming-coded file ("hamming.out") and
    # then Huffman-decodes the channel output directly; for a run without
    # Hamming coding, "huffman.out" looks like the intended input - confirm.
    Channel.transmission("hamming.out", "channel.out")
    # Huffman decoding
    Huffman.decode("huffman_table.out", "channel.out", "huffman_decode_out_from_channel2.out")
    print "=====Huffman====="
    Huffman.efficiency("huffman_table.out", "frequency.txt")