Beispiel #1
0
    def train_model(self, text_list):
        """Train word vectors on ``text_list`` using a sliding context window.

        When ``self.huffman`` is None the Huffman tree is built from
        ``self.word_dict``. If the dictionary is missing as well, it is first
        generated from a ``WordCounter`` over ``text_list`` (using the entries
        returned by ``count_res.larger_than(5)``) and the counter's
        ``text_list`` is cached in ``self.cutted_text_list``.

        Every token is then handed to the CBOW update routine when
        ``self.model == 'cbow'`` (otherwise the skip-gram routine), together
        with its context: up to ``(win_len - 1) >> 1`` tokens before it and
        up to the remaining ``win_len - 1 - before`` tokens after it.

        :param text_list: iterable of raw text lines. Used to build the word
            dictionary when needed, and tokenised with ``jieba.cut`` when
            ``self.cutted_text_list`` is empty.
        """
        if self.huffman is None:
            if self.word_dict is None:
                wc = WordCounter(text_list)
                self.__generate_word_dict(wc.count_res.larger_than(5))
                self.cutted_text_list = wc.text_list
            self.huffman = HuffmanTree(self.word_dict, vec_len=self.vec_len)
        print('word_dict and huffman tree already generated')

        # Split the window (minus the centre word) into a left and right part.
        before = (self.win_len - 1) >> 1
        after = self.win_len - 1 - before

        if self.model == 'cbow':
            method = self.__deal_gram_cbow
        else:
            method = self.__deal_gram_skipgram

        if self.cutted_text_list:
            # Reuse the tokenised text cached when the dictionary was built.
            token_lines = self.cutted_text_list
        else:
            # Tokenise lazily, one line at a time.
            token_lines = (list(jieba.cut(raw_line, cut_all=False))
                           for raw_line in text_list)

        for line in token_lines:
            line_len = len(line)
            for i in range(line_len):
                # Context = tokens left of i plus tokens right of i, clipped
                # to the line boundaries.
                method(line[i], line[max(0, i - before):i] +
                       line[i + 1:min(line_len, i + after + 1)])
        print('word vector has been generated')
Beispiel #2
0
def create_huffman_tree(occurrences):
    '''
    Build the Huffman tree for the symbols counted in `occurrences`.

    The initial forest comes from `create_forest`. As long as more than one
    tree remains across the leaf list and the merged list, two trees are
    taken with `pop_least_element` and joined under a new `HuffmanTree`
    node, which is appended to the merged list.

    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: The single remaining Huffman tree for the symbols in
        `occurrences`.
    :rtype: huffman_tree.HuffmanTree
    :Examples:

    >>> create_huffman_tree({'a': 4, 'b': 1, 'c': 2})
    |bca:7|_<|bc:3|_<|b:1|, |c:2|>, |a:4|>
    >>> create_huffman_tree({'a': 1, 'b': 1, 'c': 2})
    |cab:4|_<|c:2|, |ab:2|_<|a:1|, |b:1|>>
    '''
    leaves = create_forest(occurrences)
    merged = []

    while len(leaves) + len(merged) != 1:
        # The first tree taken becomes the left child, the second the right.
        left_child = pop_least_element(leaves, merged)
        right_child = pop_least_element(leaves, merged)
        merged.append(HuffmanTree(left=left_child, right=right_child))

    # With a single symbol no merge happens and the lone leaf is the tree.
    return merged[0] if len(merged) == 1 else leaves[0]
Beispiel #3
0
def create_huffman_tree(occurrences):
    r'''
    Return a Huffman tree of the symbols given in `occurrences`.

    The initial forest comes from `create_forest`. While more than one tree
    remains across the symbol list and the tree list, two trees are taken
    with `pop_least_element` and joined under a new `HuffmanTree` node that
    is appended to the tree list.

    The docstring is a raw string because the expected doctest output below
    contains backslashes.

    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: Return a single Huffman tree (obtained with Huffman algorithm)
        of the symbols in `occurrences`.
    :rtype: huffman_tree.HuffmanTree
    :Examples:

    >>> create_huffman_tree({'a': 1, 'b': 1, 'c': 2}) # doctest: +NORMALIZE_WHITESPACE
       /--▮ 'b':1
    /--◯ 'a, b':2
    |  \--▮ 'a':1
    ◯ 'c, a, b':4
    \--▮ 'c':2
    <BLANKLINE>
    >>> create_huffman_tree({'a': 4, 'b': 1, 'c': 2}) # doctest: +NORMALIZE_WHITESPACE
    /--▮ 'a':4
    ◯ 'b, c, a':7
    |  /--▮ 'c':2
    \--◯ 'b, c':3
       \--▮ 'b':1
    <BLANKLINE>
    >>> create_huffman_tree({97: 4, 98: 1, 99: 2}) # doctest: +NORMALIZE_WHITESPACE
    /--▮ 97:4
    ◯ '98, 99, 97':7
    |  /--▮ 99:2
    \--◯ '98, 99':3
       \--▮ 98:1
    <BLANKLINE>
    '''
    symbol_list = create_forest(occurrences)
    tree_list = []

    while len(tree_list) + len(symbol_list) != 1:
        # The first tree taken becomes the left child, the second the right.
        elem1 = pop_least_element(symbol_list, tree_list)
        elem2 = pop_least_element(symbol_list, tree_list)
        tree_list.append(HuffmanTree(left=elem1, right=elem2))

    # With a single symbol no merge happens and the lone leaf is the tree.
    if len(tree_list) == 1:
        return tree_list[0]
    return symbol_list[0]
Beispiel #4
0
 def __init__(self, input_file_name, min_count):
     """Open the data file, build the vocabulary tables and the Huffman tree.

     :param input_file_name: path of the data file, opened as UTF-8 text.
     :param min_count: frequency threshold for discarding low-frequency
         words (stored here; how it is applied is not visible in this
         method).
     """
     self.input_file_name = input_file_name
     # Data file handle; it stays open on the instance.
     self.input_file = open(self.input_file_name, encoding='utf-8')
     # Frequency threshold below which words are to be dropped.
     self.min_count = min_count
     # word id -> number of occurrences
     self.wordId_frequency_dict = {}
     # Number of distinct words (a repeated word counts once).
     self.word_count = 0
     # Total number of words (repetitions are counted).
     self.word_count_sum = 0
     # word id -> word
     self.id2word_dict = {}
     # word -> word id
     self.word2id_dict = {}
     self.word_sequence = []
     # Initialise the dictionaries; must run after the empty containers
     # above have been created.
     self._init_dict()
     # Huffman tree built from the id -> frequency table.
     self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)
     pos_path, neg_path = self.huffman_tree.get_all_pos_and_neg_path()
     self.huffman_pos_path = pos_path
     self.huffman_neg_path = neg_path
     self.word_pairs_queue = deque()
     # Show the results.
     print('Word Count is:', self.word_count)
     print('Word Count Sum is', self.word_count_sum)
    def generate_codes(self) -> Dict[int, Deque[bool]]:
        """
        Build the coding table for the whole input file via a Huffman tree.

        Rewinds ``self.input_file``, reads it in chunks of
        ``INPUT_BUFFER_SIZE``, counts how often each element of the chunks
        occurs and feeds the resulting frequency table to ``HuffmanTree``.

        :return: coding table where key is byte and value is deque of bits
        """
        self.input_file.seek(0)
        occurrences = {}
        chunk = self.input_file.read(INPUT_BUFFER_SIZE)
        # An empty read signals the end of the file.
        while chunk:
            for symbol in chunk:
                occurrences[symbol] = occurrences.get(symbol, 0) + 1
            chunk = self.input_file.read(INPUT_BUFFER_SIZE)
        return HuffmanTree(occurrences).generate_codes()
Beispiel #6
0
def create_forest(occurrences):
    '''
    Create the initial list of Huffman trees based on the dictionary of
    symbols given in parameter.

    Bytes-like symbols (``bytes`` / ``bytearray`` keys) are converted to the
    character of their first byte, as before. Any other symbol, such as the
    ``str`` keys used in the examples below, is passed to ``HuffmanTree``
    unchanged; these keys previously raised ``TypeError``.

    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: A list sorted in ascending order on the number of occurrences
        and on the symbols of Huffman trees of all symbols provided in
        `occurrences`.
    :Examples:

    >>> create_forest({'a': 4, 'c': 2, 'b': 1})
    [|b:1|, |c:2|, |a:4|]
    >>> create_forest({'e': 1, 'f': 1, 'g': 1, 'h': 1, 'a':2})
    [|e:1|, |f:1|, |g:1|, |h:1|, |a:2|]
    '''
    # Sort on the count first and on the symbol second to break ties.
    sorted_occs = sorted(occurrences.items(),
                         key=lambda item: (item[1], item[0]))
    forest = []
    for symbol, count in sorted_occs:
        if isinstance(symbol, (bytes, bytearray)):
            # Indexing bytes yields an int, which chr() turns into a character.
            symbol = chr(symbol[0])
        forest.append(HuffmanTree(symbol, count))
    return forest
def create_forest(occurrences):
    '''
    Turn a symbol -> count dictionary into the starting list of Huffman
    trees, one single-symbol tree per entry.

    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: A list of Huffman trees, one per symbol in `occurrences`,
        sorted in ascending order on the number of occurrences and then on
        the symbols.
    :Examples:

    >>> create_forest({'a': 4, 'c': 2, 'b': 1})
    [|b:1|, |c:2|, |a:4|]
    >>> create_forest({'e': 1, 'f': 1, 'g': 1, 'h': 1, 'a':2})
    [|e:1|, |f:1|, |g:1|, |h:1|, |a:2|]
    '''
    def count_then_symbol(entry):
        # Sort on the count (entry[1]) first; without this key the items
        # would be ordered by symbol only (a, b, c, ..., z).
        return (entry[1], entry[0])

    ranked = sorted(occurrences.items(), key=count_then_symbol)
    return [HuffmanTree(symbol, count) for symbol, count in ranked]
Beispiel #8
0
    def generate_codes(self) -> Dict[int, Deque[bool]]:
        """
        Reads the whole input file and generates coding_table by the Huffman tree

        :return: coding table where key is byte and value is deque of bits
        """
        # Rewind to the first byte of the input file.
        self.input_file.seek(0)
        # Frequency map, filled in while the file is scanned.
        freq_table = {}
        # Read the input file into memory one buffer at a time.
        input_buffer = self.input_file.read(INPUT_BUFFER_SIZE)
        # An empty read means nothing is left (or the file is empty): stop.
        while input_buffer:
            for byte in input_buffer:
                # Start unseen bytes at 0, then count this occurrence.
                freq_table[byte] = freq_table.get(byte, 0) + 1
            input_buffer = self.input_file.read(INPUT_BUFFER_SIZE)
        # Build the tree from the finished frequency table and return the
        # codes it produces for each symbol.
        tree = HuffmanTree(freq_table)
        return tree.generate_codes()
Beispiel #9
0
 def __init__(self, content):
     """Split ``content`` into header and payload and build the Huffman tree.

     The first 4096 items of ``content`` are stored as ``self.header`` and
     the rest as ``self.content``. ``self.get_frequencies()`` is then
     called (presumably it populates ``self.frequencies``; confirm against
     its definition) and the tree is built from ``self.frequencies``.

     :param content: sliceable sequence holding the header followed by the
         payload.
     """
     header_len = 4096
     self.header, self.content = content[:header_len], content[header_len:]
     self.get_frequencies()
     self.tree = HuffmanTree(self.frequencies)
Beispiel #10
0
    freq = {}
    for char in input_string:
        if char not in freq:
            freq[char] = 1
        else:
            freq[char] += 1
    return freq


# Read the whole input up front; the context manager closes the handle.
with open("input.txt", "r") as input_file:
    input_string = input_file.read()

probability = construct_probability(input_string)
encoding_array = {}
if len(probability) == 1:
    # Only one distinct symbol: give it the one-bit code '0' directly
    # instead of building a tree.
    encoding_array[next(iter(probability))] = '0'
else:
    tree = HuffmanTree()
    tree.get_nodes_heap(probability)
    tree.construct_tree()
    encoding_array = tree.get_codes()

# Encode
# Join once instead of growing the string with += for every character.
output_string = "".join(encoding_array[char] for char in input_string)
# The misspelled name `ouput` is kept in case code outside this view uses it.
with open('binary_output.bin', 'w') as ouput:
    ouput.write(output_string)

# Decode
decoder = {}
for key in encoding_array:
Beispiel #11
0
 def build_tree(self):
     """Build a ``HuffmanTree`` from ``self.frequencies`` and store it in ``self.tree``."""
     frequencies = self.frequencies
     self.tree = HuffmanTree(frequencies)