class HuffmanDecoder:
    # Assuming content comes as a huge string.
    def __init__(self, content):
        self.header = content[:4096]
        self.content = content[4096:]
        self.get_frequencies()
        self.tree = HuffmanTree(self.frequencies)

    def get_frequencies(self):
        # The header stores 128 fixed-width counts, one per ASCII character
        # (4096 / 128 = 32 bits each).
        gap = len(self.header) // 128
        cnts = [
            int(self.header[i:i + gap], 2)
            for i in range(0, len(self.header), gap)
        ]
        self.frequencies = {chr(x): cnts[x] for x in range(128) if cnts[x] > 0}

    def get_decoding(self):
        cur = ''
        decoded_list = []
        for c in self.content:
            cur += c
            if self.tree.isDecodable(cur):
                decoded_list.append(self.tree.getChar(cur))
                cur = ''
        return ''.join(decoded_list)
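# The decoder above touches only two methods of HuffmanTree: isDecodable and
# getChar. A minimal stand-in built from a precomputed code table illustrates
# the interface it assumes (the table-based construction and the name
# CodeTableTree are ours; the real HuffmanTree derives codes from the
# frequency tree):

class CodeTableTree:
    """Stand-in exposing the isDecodable/getChar interface the decoder calls."""

    def __init__(self, code_to_char):
        # e.g. {'0': 'a', '10': 'b', '11': 'c'} for a three-symbol alphabet
        self._code_to_char = code_to_char

    def isDecodable(self, bits):
        return bits in self._code_to_char

    def getChar(self, bits):
        return self._code_to_char[bits]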
def create_huffman_tree(occurrences):
    '''
    Return a Huffman tree of the symbols given in `occurrences`.

    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: Return a single Huffman tree (obtained with Huffman algorithm)\
             of the symbols in `occurrences`.
    :rtype: huffman_tree.HuffmanTree

    :Examples:

    >>> create_huffman_tree({'a': 4, 'b': 1, 'c': 2})
    |bca:7|_<|bc:3|_<|b:1|, |c:2|>, |a:4|>
    >>> create_huffman_tree({'a': 1, 'b': 1, 'c': 2})
    |cab:4|_<|c:2|, |ab:2|_<|a:1|, |b:1|>>
    '''
    symbol_list = create_forest(occurrences)
    tree_list = []
    while len(tree_list) + len(symbol_list) != 1:
        # Repeatedly merge the two lightest trees, whichever list they sit in.
        elem1 = pop_least_element(symbol_list, tree_list)
        elem2 = pop_least_element(symbol_list, tree_list)
        new_tree = HuffmanTree(left=elem1, right=elem2)
        tree_list.append(new_tree)
    if len(tree_list) == 1:
        return tree_list[0]
    return symbol_list[0]
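# create_huffman_tree leans on a pop_least_element helper that is not shown
# here. A minimal sketch, assuming both lists stay sorted by ascending weight
# and that trees expose a `weight` attribute (both assumptions are ours):

def pop_least_element(symbol_list, tree_list):
    # The forest starts sorted, and merged trees are appended with
    # non-decreasing weights, so the least element always sits at the head
    # of one of the two lists.
    if not tree_list:
        return symbol_list.pop(0)
    if not symbol_list:
        return tree_list.pop(0)
    if symbol_list[0].weight <= tree_list[0].weight:
        return symbol_list.pop(0)
    return tree_list.pop(0)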
def train_model(self, text_list):
    if self.huffman is None:
        if self.word_dict is None:
            wc = WordCounter(text_list)
            self.__generate_word_dict(wc.count_res.larger_than(5))
            self.cutted_text_list = wc.text_list
        self.huffman = HuffmanTree(self.word_dict, vec_len=self.vec_len)
    print('word_dict and huffman tree already generated')

    # Split the window (minus the centre word) into tokens before and after it.
    before = (self.win_len - 1) >> 1
    after = self.win_len - 1 - before

    if self.model == 'cbow':
        method = self.__deal_gram_cbow
    else:
        method = self.__deal_gram_skipgram

    if self.cutted_text_list:
        total = len(self.cutted_text_list)
        count = 0
        for line in self.cutted_text_list:
            line_len = len(line)
            for i in range(line_len):
                method(line[i],
                       line[max(0, i - before):i] +
                       line[i + 1:min(line_len, i + after + 1)])
    else:
        for line in text_list:
            line = list(jieba.cut(line, cut_all=False))  # needs `import jieba`
            line_len = len(line)
            for i in range(line_len):
                method(line[i],
                       line[max(0, i - before):i] +
                       line[i + 1:min(line_len, i + after + 1)])
    print('word vector has been generated')
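# The window slicing in train_model is easier to check in isolation. The same
# arithmetic as a standalone sketch (the function name is ours):

def context_window(tokens, i, before, after):
    # Up to `before` tokens left of position i and up to `after` tokens right
    # of it, excluding the centre token itself.
    return tokens[max(0, i - before):i] + tokens[i + 1:min(len(tokens), i + after + 1)]

# With win_len = 5, before = (5 - 1) >> 1 = 2 and after = 4 - 2 = 2, so:
# context_window(list('abcdefg'), 3, 2, 2) == ['b', 'c', 'e', 'f']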
def __init__(self, input_file_name, min_count):
    self.input_file_name = input_file_name
    self.input_file = open(self.input_file_name, encoding='utf-8')  # data file
    self.min_count = min_count  # frequency below which words are discarded
    self.wordId_frequency_dict = dict()  # word id -> occurrence count
    self.word_count = 0  # number of distinct words
    self.word_count_sum = 0  # total word count, duplicates included
    self.id2word_dict = dict()  # word id -> word
    self.word2id_dict = dict()  # word -> word id
    self.word_sequence = list()
    self._init_dict()  # initialise the dictionaries
    self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)  # Huffman tree
    self.huffman_pos_path, self.huffman_neg_path = \
        self.huffman_tree.get_all_pos_and_neg_path()
    self.word_pairs_queue = deque()  # needs `from collections import deque`
    # Summary output
    print('Word Count is:', self.word_count)
    print('Word Count Sum is', self.word_count_sum)
def generate_codes(self) -> Dict[int, Deque[bool]]:
    """
    Reads the whole input file and generates coding_table by the Huffman tree.

    :return: coding table where key is byte and value is deque of bits
    """
    # Needs `from typing import Dict, Deque`; the file must be opened in
    # binary mode so iterating a buffer yields ints.
    self.input_file.seek(0)
    freq_table = {}
    while True:
        input_buffer = self.input_file.read(INPUT_BUFFER_SIZE)
        if not input_buffer:
            break
        for byte in input_buffer:
            if byte in freq_table:
                freq_table[byte] += 1
            else:
                freq_table[byte] = 1
    tree = HuffmanTree(freq_table)
    return tree.generate_codes()
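# The manual counting loop can be expressed with collections.Counter. An
# equivalent standalone sketch (the function name and buffer_size default
# stand in for the class context and INPUT_BUFFER_SIZE):

from collections import Counter

def count_byte_frequencies(f, buffer_size=64 * 1024):
    f.seek(0)
    freq = Counter()
    while True:
        chunk = f.read(buffer_size)
        if not chunk:
            break
        # Iterating a bytes buffer yields ints 0-255, matching Dict[int, ...].
        freq.update(chunk)
    return dict(freq)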
class HuffmanEncoder:
    def __init__(self, content_source, type='PATH'):
        self.get_content(content_source, type)
        self.build_frequencies()
        self.build_tree()
        self.encode()
        self.build_header()

    def get_content(self, content_source, type):
        if type == 'PATH':
            with open(content_source, 'r') as f:
                self.content = f.read()
        else:
            self.content = content_source
        # TODO: Figure out what to do with characters outside of 128 ASCII
        self.content = ''.join(filter(lambda x: ord(x) < 128, self.content))

    def build_frequencies(self):
        self.frequencies = {}
        for c in self.content:
            if c not in self.frequencies:
                self.frequencies[c] = 0
            self.frequencies[c] += 1

    def build_tree(self):
        self.tree = HuffmanTree(self.frequencies)

    def encode(self):
        enc_list = [self.tree.getCode(c) for c in self.content]
        self.encoding = ''.join(enc_list)

    def build_header(self):
        # The header is 128 fixed-width 32-bit counts, one per ASCII character.
        header_list = []
        for i in range(128):
            v = self.frequencies.get(chr(i), 0)
            b = bin(v)[2:]
            to_append = '0' * (32 - len(b)) + b
            header_list.append(to_append)
            if chr(i) in self.frequencies:
                assert int(to_append, 2) == self.frequencies[chr(i)]
        self.header = ''.join(header_list)

    # Returns the encoding as a binary string.
    def get_encoding(self, header=True):
        if header:
            return self.header + self.encoding
        return self.encoding
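# Paired with the HuffmanDecoder shown earlier, the two classes round-trip:
# the encoder's 128 x 32-bit header is exactly the 4096 characters the
# decoder strips off. A hedged usage sketch, assuming a HuffmanTree class
# providing the getCode/isDecodable/getChar methods these snippets call
# (any type value other than 'PATH' makes the encoder treat its argument
# as raw text):

encoder = HuffmanEncoder('an ascii-only sample string', type='RAW')
decoder = HuffmanDecoder(encoder.get_encoding())
assert decoder.get_decoding() == 'an ascii-only sample string'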
def generate_codes(self) -> Dict[int, Deque[bool]]:
    """
    Reads the whole input file and generates coding_table by the Huffman tree.

    :return: coding table where key is byte and value is deque of bits
    """
    self.input_file.seek(0)  # seek back to the first byte of the input file
    freq_table = {}  # empty frequency map, filled in below
    while True:
        # Read a chunk of the input file into an in-memory buffer.
        input_buffer = self.input_file.read(INPUT_BUFFER_SIZE)
        if not input_buffer:  # nothing left to read: the file is exhausted
            break
        for byte in input_buffer:  # walk every byte of the buffer
            if byte in freq_table:  # already seen: increment its count
                freq_table[byte] += 1
            else:  # first occurrence of this byte: start its count at 1
                freq_table[byte] = 1
    # Build the tree from the finished frequency table; it yields a code
    # for each symbol.
    tree = HuffmanTree(freq_table)
    return tree.generate_codes()  # return the generated codes
def create_huffman_tree(occurrences):
    '''
    Return a Huffman tree of the symbols given in `occurrences`.

    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: Return a single Huffman tree (obtained with Huffman algorithm)\
             of the symbols in `occurrences`.
    :rtype: huffman_tree.HuffmanTree

    :Examples:

    >>> create_huffman_tree({'a': 1, 'b': 1, 'c': 2}) # doctest: +NORMALIZE_WHITESPACE
         /--▮ 'b':1
     /--◯ 'a, b':2
     |   \--▮ 'a':1
    ◯ 'c, a, b':4
     \--▮ 'c':2
    <BLANKLINE>
    >>> create_huffman_tree({'a': 4, 'b': 1, 'c': 2}) # doctest: +NORMALIZE_WHITESPACE
     /--▮ 'a':4
    ◯ 'b, c, a':7
     |   /--▮ 'c':2
     \--◯ 'b, c':3
         \--▮ 'b':1
    <BLANKLINE>
    >>> create_huffman_tree({97: 4, 98: 1, 99: 2}) # doctest: +NORMALIZE_WHITESPACE
     /--▮ 97:4
    ◯ '98, 99, 97':7
     |   /--▮ 99:2
     \--◯ '98, 99':3
         \--▮ 98:1
    <BLANKLINE>
    '''
    symbol_list = create_forest(occurrences)
    tree_list = []
    while len(tree_list) + len(symbol_list) != 1:
        # Repeatedly merge the two lightest trees, whichever list they sit in.
        elem1 = pop_least_element(symbol_list, tree_list)
        elem2 = pop_least_element(symbol_list, tree_list)
        new_tree = HuffmanTree(left=elem1, right=elem2)
        tree_list.append(new_tree)
    if len(tree_list) == 1:
        return tree_list[0]
    return symbol_list[0]
def create_forest(occurrences):
    '''
    Create the initial list of Huffman trees based on the dictionary of
    symbols given in parameter.

    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: A list sorted in ascending order on the number of occurrences\
             and on the symbols of Huffman trees of all symbols provided in\
             `occurrences`.

    :Examples:

    >>> create_forest({'a': 4, 'c': 2, 'b': 1})
    [|b:1|, |c:2|, |a:4|]
    >>> create_forest({'e': 1, 'f': 1, 'g': 1, 'h': 1, 'a':2})
    [|e:1|, |f:1|, |g:1|, |h:1|, |a:2|]
    '''
    # Sort on (count, symbol) so ties are broken deterministically.
    sorted_occs = sorted(occurrences.items(), key=lambda item: (item[1], item[0]))
    forest = [HuffmanTree(leaf[0], leaf[1]) for leaf in sorted_occs]
    return forest
def create_forest(occurrences):
    '''
    Create the initial list of Huffman trees based on the dictionary of
    symbols given in parameter.

    :param occurrences: Number of occurrences of each symbol.
    :type occurrences: dict
    :return: A list sorted in ascending order on the number of occurrences\
             and on the symbols of Huffman trees of all symbols provided in\
             `occurrences`.

    :Examples:

    >>> create_forest({'a': 4, 'c': 2, 'b': 1})
    [|b:1|, |c:2|, |a:4|]
    >>> create_forest({'e': 1, 'f': 1, 'g': 1, 'h': 1, 'a':2})
    [|e:1|, |f:1|, |g:1|, |h:1|, |a:2|]
    '''
    # key=lambda item: (item[1], item[0]) sorts on the occurrence counts first;
    # otherwise the sort would be purely lexicographic (a, b, c, ..., z).
    sorted_occs = sorted(occurrences.items(), key=lambda item: (item[1], item[0]))
    forest = [HuffmanTree(leaf[0], leaf[1]) for leaf in sorted_occs]
    return forest
class InputData:
    def __init__(self, input_file_name, min_count):
        self.input_file_name = input_file_name
        self.input_file = open(self.input_file_name)  # data file
        self.index = 0
        self.min_count = min_count  # frequency below which words are discarded
        self.wordId_frequency_dict = dict()  # word id -> occurrence count
        self.word_count = 0  # number of distinct words
        self.word_count_sum = 0  # total word count, duplicates included
        self.sentence_count = 0  # number of sentences
        self.id2word_dict = dict()  # word id -> word
        self.word2id_dict = dict()  # word -> word id
        self._init_dict()  # initialise the dictionaries
        self.get_wordId_list()
        self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)  # Huffman tree
        self.huffman_pos_path, self.huffman_neg_path = self.huffman_tree.get_all_pos_and_neg_path()
        self.word_pairs_queue = deque()  # needs `from collections import deque`
        # Summary output
        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        print('Sentence Count is:', self.sentence_count)
        print('Tree Node is:', len(self.huffman_tree.huffman))

    def _init_dict(self):
        word_freq = dict()
        # Tally word frequencies.
        for line in self.input_file:
            line = line.strip().split(' ')  # strip leading/trailing whitespace
            self.word_count_sum += len(line)
            self.sentence_count += 1
            for word in line:
                try:
                    word_freq[word] += 1
                except KeyError:
                    word_freq[word] = 1
        word_id = 0
        # Initialise word2id_dict, id2word_dict and wordId_frequency_dict.
        for per_word, per_count in word_freq.items():
            if per_count < self.min_count:  # drop low-frequency words
                self.word_count_sum -= per_count
                continue
            self.id2word_dict[word_id] = per_word
            self.word2id_dict[per_word] = word_id
            self.wordId_frequency_dict[word_id] = per_count
            word_id += 1
        self.word_count = len(self.word2id_dict)

    def get_wordId_list(self):
        self.input_file = open(self.input_file_name, encoding="utf-8")
        sentence = self.input_file.readline()
        wordId_list = []  # ids of every word in the sentence
        sentence = sentence.strip().split(' ')
        for i, word in enumerate(sentence):
            if i % 1000000 == 0:
                print(i, len(sentence))
            try:
                word_id = self.word2id_dict[word]
                wordId_list.append(word_id)
            except KeyError:
                continue
        self.wordId_list = wordId_list

    # Fetch a mini-batch of positive pairs (Xw, w): Xw is the list of context
    # ids, w the target word id. The context spans window_size on each side,
    # i.e. 2c = 2 * window_size.
    def get_batch_pairs(self, batch_size, window_size):
        while len(self.word_pairs_queue) < batch_size:
            for _ in range(1000):
                if self.index == len(self.wordId_list):
                    self.index = 0
                wordId_w = self.wordId_list[self.index]
                context_ids = []
                for i in range(max(self.index - window_size, 0),
                               min(self.index + window_size + 1, len(self.wordId_list))):
                    if self.index == i:  # skip the centre word itself
                        continue
                    context_ids.append(self.wordId_list[i])
                self.word_pairs_queue.append((context_ids, wordId_w))
                self.index += 1
        result_pairs = []  # return batch_size positive pairs
        for _ in range(batch_size):
            result_pairs.append(self.word_pairs_queue.popleft())
        return result_pairs

    def get_pairs(self, pos_pairs):
        neg_word_pair = []
        pos_word_pair = []
        for pair in pos_pairs:
            pos_word_pair += zip([pair[0]] * len(self.huffman_pos_path[pair[1]]),
                                 self.huffman_pos_path[pair[1]])
            neg_word_pair += zip([pair[0]] * len(self.huffman_neg_path[pair[1]]),
                                 self.huffman_neg_path[pair[1]])
        return pos_word_pair, neg_word_pair

    # Estimate the number of positive pairs in the data, used to size batches.
    def evaluate_pairs_count(self, window_size):
        return self.word_count_sum * (2 * window_size - 1) - \
               (self.sentence_count - 1) * (1 + window_size) * window_size
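# A hypothetical driver for InputData (the file name, batch size and window
# size are ours), showing how the queue of positive pairs feeds the
# hierarchical-softmax paths:

data = InputData('corpus.txt', min_count=5)
pos_pairs = data.get_batch_pairs(batch_size=64, window_size=2)
pos_path_pairs, neg_path_pairs = data.get_pairs(pos_pairs)
# Each returned element pairs a context-id list with one node on the target
# word's Huffman path (positive or negative branch respectively).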
def __init__(self, content):
    self.header = content[:4096]
    self.content = content[4096:]
    self.get_frequencies()
    self.tree = HuffmanTree(self.frequencies)
def construct_probability(input_string):
    freq = {}
    for char in input_string:
        if char not in freq:
            freq[char] = 1
        else:
            freq[char] += 1
    return freq


input_string = open("input.txt", "r").read()
probability = construct_probability(input_string)

encoding_array = {}
if len(probability) == 1:
    # A single distinct symbol gets the one-bit code '0'.
    encoding_array.update({list(probability)[0]: '0'})
else:
    tree = HuffmanTree()
    tree.get_nodes_heap(probability)
    tree.construct_tree()
    encoding_array = tree.get_codes()

# Encode
output_string = ""
for char in input_string:
    output_string += encoding_array[char]

output = open('binary_output.bin', 'w')
output.write(output_string)
output.close()

# Decode
decoder = {}
for key in encoding_array:
    decoder[encoding_array[key]] = key  # invert the table: code -> symbol
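# With the reverse table in hand, decoding is a prefix walk over the bit
# string; Huffman codes are prefix-free, so the first table hit is always a
# complete symbol. A minimal sketch continuing the snippet:

current = ''
decoded = []
for bit in output_string:
    current += bit
    if current in decoder:  # prefix-free: the first hit is a whole symbol
        decoded.append(decoder[current])
        current = ''
assert ''.join(decoded) == input_string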
def build_tree(self):
    self.tree = HuffmanTree(self.frequencies)
class InputData:
    def __init__(self, input_file_name, min_count):
        self.input_file_name = input_file_name
        self.input_file = open(self.input_file_name, encoding='utf-8')  # data file
        self.min_count = min_count  # frequency below which words are discarded
        self.wordId_frequency_dict = dict()  # word id -> occurrence count
        self.word_count = 0  # number of distinct words
        self.word_count_sum = 0  # total word count, duplicates included
        self.id2word_dict = dict()  # word id -> word
        self.word2id_dict = dict()  # word -> word id
        self.word_sequence = list()
        self._init_dict()  # initialise the dictionaries
        self.huffman_tree = HuffmanTree(self.wordId_frequency_dict)  # Huffman tree
        self.huffman_pos_path, self.huffman_neg_path = self.huffman_tree.get_all_pos_and_neg_path()
        self.word_pairs_queue = deque()  # needs `from collections import deque`
        # Summary output
        print('Word Count is:', self.word_count)
        print('Word Count Sum is', self.word_count_sum)
        #print('Tree Node is:', len(self.huffman_tree.huffman))

    def _init_dict(self):
        word_freq = dict()
        # Tally word frequencies; `re` must be imported for the cleanup below.
        for line in self.input_file:
            line = " ".join(line.split('\n'))
            line = re.sub(",", "", line)   # drop commas
            line = re.split('\.', line)    # split on full stops
            line = " ".join(line).split()  # tokenise on whitespace
            self.word_count_sum += len(line)
            self.word_sequence += line
            for word in line:
                try:
                    word_freq[word] += 1
                except KeyError:
                    word_freq[word] = 1
        word_id = 0
        # Initialise word2id_dict, id2word_dict and wordId_frequency_dict.
        for per_word, per_count in word_freq.items():
            if per_count < self.min_count:  # drop low-frequency words
                self.word_count_sum -= per_count
                continue
            self.id2word_dict[word_id] = per_word
            self.word2id_dict[per_word] = word_id
            self.wordId_frequency_dict[word_id] = per_count
            word_id += 1
        self.word_count = len(self.word2id_dict)

    # Fetch a mini-batch of positive pairs (Xw, w): Xw is the list of context
    # ids, w the target word id. The context spans window_size on each side,
    # i.e. 2c = 2 * window_size.
    def get_batch_pairs(self, batch_size, window_size):
        wordId_list = []  # ids of every word in the corpus
        for word in self.word_sequence:
            try:
                word_id = self.word2id_dict[word]
                wordId_list.append(word_id)
            except KeyError:
                continue
        # Collect positive pairs (context(w), w) onto the sampling queue.
        for i, wordId_w in enumerate(wordId_list):
            context_ids = []
            for j, wordId_u in enumerate(
                    wordId_list[max(i - window_size, 0):i + window_size + 1]):
                j += max(i - window_size, 0)
                assert wordId_w < self.word_count
                assert wordId_u < self.word_count
                if i == j:  # skip the centre word itself
                    continue
                elif max(0, i - window_size + 1) <= j <= min(
                        len(wordId_list), i + window_size - 1):
                    context_ids.append(wordId_u)
            if len(context_ids) == 0:
                continue
            self.word_pairs_queue.append((context_ids, wordId_w))
        result_pairs = []  # sample batch_size positive pairs from the queue
        for _ in range(batch_size):
            result_pairs.append(random.choice(self.word_pairs_queue))  # needs `import random`
        return result_pairs

    def get_pairs(self, pos_pairs):
        neg_word_pair = []
        pos_word_pair = []
        for pair in pos_pairs:
            pos_word_pair += zip([pair[0]] * len(self.huffman_pos_path[pair[1]]),
                                 self.huffman_pos_path[pair[1]])
            neg_word_pair += zip([pair[0]] * len(self.huffman_neg_path[pair[1]]),
                                 self.huffman_neg_path[pair[1]])
        return pos_word_pair, neg_word_pair