def compress(text):
    """Huffman-encode ``text``.

    A leaf is created for every distinct symbol (visited in sorted symbol
    order), the two lowest-priority nodes are merged repeatedly until one
    root remains, and each symbol of ``text`` is then replaced by its code.

    Returns:
        A ``(compressed, huffman_tree)`` pair: the concatenated code bits as
        a ``BitArray`` and the root node of the tree that produced them.
    """
    pending = []
    timestamp = 0

    # Every node receives the next value of a running counter as its second
    # constructor argument (presumably a tie-breaker for equal priorities --
    # confirm against HuffmanTree's ordering).
    for symbol, count in sorted(Counter(text).items()):
        heapq.heappush(pending, HuffmanTree(count, timestamp, symbol))
        timestamp += 1

    # Merge the two smallest nodes until only the root is left.
    while len(pending) >= 2:
        first = heapq.heappop(pending)
        second = heapq.heappop(pending)
        parent = HuffmanTree(first.priority + second.priority, timestamp, None)
        parent.left = first
        parent.right = second
        heapq.heappush(pending, parent)
        timestamp += 1

    huffman_tree = heapq.heappop(pending)

    # traverse() fills the mapping symbol -> code in place.
    code_table = {}
    traverse(huffman_tree, BitArray(), code_table)

    compressed = BitArray()
    for symbol in text:
        compressed += code_table[symbol]
    return compressed, huffman_tree
def encode_dna_strings(fnpat, rootdir, encdir):
    """Huffman-encode every file yielded by ``generate_file_names``.

    For each name produced by ``generate_file_names(fnpat, rootdir)`` a
    Huffman tree is built from that file's own character frequencies and the
    file ``work_dir + rootdir + name`` is encoded to
    ``work_dir + encdir + name``.
    """
    ## your code
    for file_name in generate_file_names(fnpat, rootdir):
        source_path = work_dir + rootdir + file_name
        target_path = work_dir + encdir + file_name

        freq_map = CharFreqMap.computeCharFreqMap(source_path)
        leaves = HuffmanTree.freqMapToListOfHuffmanTreeNodes(freq_map)
        tree = HuffmanTree.fromListOfHuffmanTreeNodes(leaves)

        encoder = BinHuffmanTree(root=tree.getRoot())
        encoder.encodeTextFromFileToFile(source_path, target_path)
def unit_test_04():
    """Round-trip ``moby_dick_ch01.txt`` through file-based encoding.

    Builds the tree from the file's character frequencies, encodes the file
    to ``work_dir + 'moby_dick_ch01'``, decodes it again and asserts that the
    decoded text equals the original file content.
    """
    text_path = work_dir + 'moby_dick_ch01.txt'
    encoded_path = work_dir + 'moby_dick_ch01'

    freq_map = CharFreqMap.computeCharFreqMap(text_path)
    leaves = HuffmanTree.freqMapToListOfHuffmanTreeNodes(freq_map)
    tree = HuffmanTree.fromListOfHuffmanTreeNodes(leaves)
    codec = BinHuffmanTree(root=tree.getRoot())
    codec.encodeTextFromFileToFile(text_path, encoded_path)

    with open(text_path, 'r') as inf:
        expected = inf.read()
    decoded = codec.decodeTextFromFile(encoded_path)

    assert decoded == expected
    print('Assertion passed!')
def get_HuffmanCodePath(filename, separate=" "):
    """Return the Huffman root name, codes and paths for a corpus, with caching.

    The corpus is ``data_path / filename / 'train.txt'``.  Results are cached
    as three pickle files under ``cache_path``; when all three exist they are
    loaded instead of rebuilding the tree.

    Args:
        filename: Corpus name; used both as the data sub-directory and as the
            prefix of the cache file names.
        separate: Token separator handed to ``str.split`` for every
            lower-cased, stripped line.

    Returns:
        ``(huffman_nodes, huffman_codes, huffman_paths)`` where
        ``huffman_nodes`` is ``tree.root._name`` and the other two are the
        tree's ``huffman_code`` and ``huffman_path`` attributes.
    """
    filepath = data_path / filename / 'train.txt'
    nodecache = cache_path / (filename + '_nodes.pkl')
    codecache = cache_path / (filename + '_code.pkl')
    pathcache = cache_path / (filename + '_path.pkl')

    if codecache.exists() and pathcache.exists() and nodecache.exists():
        # SECURITY: pickle.load executes arbitrary code from the file; the
        # cache directory must only ever contain files written by this
        # function.
        # Each handle is closed deterministically instead of being left to
        # the garbage collector.
        with open(str(nodecache), 'rb') as cache_file:
            huffman_nodes = pickle.load(cache_file)
        with open(str(codecache), 'rb') as cache_file:
            huffman_codes = pickle.load(cache_file)
        with open(str(pathcache), 'rb') as cache_file:
            huffman_paths = pickle.load(cache_file)
    else:
        # Count tokens line by line rather than materialising the whole
        # corpus as one list first; counts and first-seen order are the same.
        word_counts = Counter()
        with open(str(filepath), 'r') as corpus:
            for line in corpus:
                word_counts.update(line.lower().strip().split(separate))

        tree = HuffmanTree(list(word_counts.items()))
        huffman_codes = tree.huffman_code
        huffman_paths = tree.huffman_path
        huffman_nodes = tree.root._name

        for cache_path_obj, payload in ((codecache, huffman_codes),
                                        (pathcache, huffman_paths),
                                        (nodecache, huffman_nodes)):
            with open(str(cache_path_obj), 'wb') as cache_file:
                pickle.dump(payload, cache_file)

    return huffman_nodes, huffman_codes, huffman_paths
def compress_file_splitted(compressor, filename):
    '''
    Variant of HuffmanCompressor.compress_file that returns the encoded data
    and the encoded tree separately.

    Reads ``filename`` as bytes, builds the Huffman codes with
    ``HuffmanTree.huffman_coding`` (which also yields the encoded tree, stored
    on ``compressor.encoded_tree``) and lets ``compressor.get_encoded_txt``
    encode the content.

    Returns the pair ``(compressor.encoded_txt, compressor.encoded_tree)``.
    '''
    # The with-block closes the file; the explicit close() the original
    # issued on top of it was redundant and has been dropped.
    with open(filename, "rb") as src:
        txt = src.read()

    tree = HuffmanTree()
    codes, compressor.encoded_tree = tree.huffman_coding(txt)
    compressor.get_encoded_txt(txt, codes)
    return compressor.encoded_txt, compressor.encoded_tree
def unit_test_01():
    """Print the codes of symbols A-H and round-trip ``txt0`` and ``txt1``.

    Uses the string-based ``HuffmanTree`` API: both texts are encoded, then
    both are decoded, and each decoded text must equal its original.
    """
    tree = HuffmanTree.fromListOfHuffmanTreeNodes(ht_nodes)

    for symbol in 'ABCDEFGH':
        print(symbol + ' --> ' + tree.encodeSymbol(symbol))

    originals = [txt0, txt1]
    # Encode everything first, then decode, matching the original call order.
    encoded = [tree.encodeText(text) for text in originals]
    decoded = [tree.decode(bits) for bits in encoded]

    for result, expected in zip(decoded, originals):
        assert result == expected
    print('Assertions passed!')
def compress_file(self, filename):
    '''
    filename(str) -> encode_arr(bytearray)
    ------------
    Read ``filename`` as bytes and return its Huffman-encoded byte array.

    The Huffman codes and the encoded tree come from
    ``HuffmanTree.huffman_coding``; the encoded tree is stored on
    ``self.encoded_tree`` and ``self.get_encoded_txt`` encodes the content.
    The result is whatever ``self.encoded_arr`` builds from the encoded text,
    the encoded tree and the file name.
    '''
    # The with-block closes the file; the explicit close() the original
    # issued on top of it was redundant and has been dropped.
    with open(filename, "rb") as src:
        txt = src.read()

    tree = HuffmanTree()
    codes, self.encoded_tree = tree.huffman_coding(txt)
    self.get_encoded_txt(txt, codes)
    return self.encoded_arr(self.encoded_txt, self.encoded_tree, filename)
def unit_test_02():
    """Round-trip ``txt0`` and ``txt1`` through the binary encoder.

    ``BinHuffmanTree.encodeText`` returns a triple whose first element is the
    binary encoding and whose third is the number of padding bits; both are
    printed and passed back to ``decode``.  The decoded texts must equal the
    originals.
    """
    base_tree = HuffmanTree.fromListOfHuffmanTreeNodes(ht_nodes)
    codec = BinHuffmanTree(root=base_tree.getRoot())

    originals = (txt0, txt1)
    # Each result is (binary encoding, byte count, padding bits).
    results = [codec.encodeText(text) for text in originals]

    for payload, _num_bytes, padding in results:
        print(payload, padding)

    decoded = [codec.decode(payload, padding)
               for payload, _num_bytes, padding in results]

    for result, expected in zip(decoded, originals):
        assert result == expected
    print('Assertions passed!')
def compress(self, src, target):
    """Huffman-compress the file ``src`` and append the result to ``self.dest``.

    The content of ``src`` is read as bytes, a Huffman tree is built from it,
    and the coded text, the encoded tree and ``target`` are combined by
    ``self.convertToByteArr`` into the bytes that get appended to
    ``self.dest``.  The code table is then passed to ``self.outputCodes``
    together with the name derived from ``src``.

    Args:
        src: Path of the file to compress.
        target: Passed through to ``convertToByteArr`` (presumably the name
            recorded in the archive header -- confirm against that method).

    Returns:
        float: Seconds elapsed between the start of the call and the end of
        the write; the time spent in ``outputCodes`` is not included.

    Raises:
        EOFError: If ``src`` is empty.
    """
    started = time.time()

    with open(src, "rb") as source_file:
        text = source_file.read()
    if not text:
        raise EOFError("Empty file")

    # Frequency analysis and tree construction.
    tree = HuffmanTree()
    codes = tree.initializeTree(text)

    # Encode the payload and the tree, then pack both into one byte sequence.
    codedText = self.getCodedFile(text, codes)
    codedTree = tree.encodeTree()
    byteArr = self.convertToByteArr(codedText, codedTree, target)

    # Append mode: existing content of self.dest is kept.
    with open(self.dest, "ab") as archive:
        archive.write(bytes(byteArr))
    finished = time.time()

    self.outputCodes(codes, fileIO.fileName(src))
    return finished - started
def unit_test_00():
    """Build a tree from four weighted symbols and print seven of its nodes.

    Printed in order: the root, its left child, its right child, then the
    right child's left subtree (node, left leaf, right leaf) and finally the
    right child's right child.
    """
    weighted_symbols = [('A', 4), ('B', 3), ('C', 1), ('D', 1)]
    hnodes = [
        HuffmanTreeNode(symbols={symbol}, weight=weight)
        for symbol, weight in weighted_symbols
    ]
    ht = HuffmanTree.fromListOfHuffmanTreeNodes(hnodes)

    # Each entry is the chain of accessors to follow from the root.
    paths = [
        (),
        ('getLeftChild',),
        ('getRightChild',),
        ('getRightChild', 'getLeftChild'),
        ('getRightChild', 'getLeftChild', 'getLeftChild'),
        ('getRightChild', 'getLeftChild', 'getRightChild'),
        ('getRightChild', 'getRightChild'),
    ]
    for path in paths:
        node = ht.getRoot()
        for accessor in path:
            node = getattr(node, accessor)()
        print(str(node))
def Train_Model(self, text_list):
    """Train word vectors over a sliding context window.

    If no Huffman tree exists yet, one is built from ``self.word_dict``; if
    the dictionary is missing as well, it is first generated from
    ``text_list`` by a ``WordCounter`` (keeping what
    ``count_res.larger_than(5)`` returns), which also supplies the segmented
    text.

    Every word of every line is then passed, together with its context, to
    the CBOW handler when ``self.model == 'cbow'`` and to the skip-gram
    handler otherwise.  The context is up to ``(win_len - 1) >> 1`` words
    before the word plus the remaining ``win_len - 1 - before`` words after
    it, clipped at the line boundaries.

    Args:
        text_list: Raw text lines.  They are segmented with ``jieba`` only
            when no pre-segmented text is available on
            ``self.cutted_text_list``.
    """
    # Identity tests replace the original ``== None`` comparisons.
    if self.huffman is None:
        # If the dictionary was not loaded, generate a new one.
        if self.word_dict is None:
            wc = WordCounter(text_list)
            self.__Gnerate_Word_Dict(wc.count_res.larger_than(5))
            self.cutted_text_list = wc.text_list
        # Build the Huffman tree from the word dictionary.
        self.huffman = HuffmanTree(self.word_dict, vec_len=self.vec_len)
    print(
        'word_dict and huffman tree already generated, ready to train vector'
    )

    # Split the window around the centre word (win_len = 5 gives 2 and 2).
    before = (self.win_len - 1) >> 1
    after = self.win_len - 1 - before

    if self.model == 'cbow':
        method = self.__Deal_Gram_CBOW
    else:
        method = self.__Deal_Gram_SkipGram

    def train_line(words):
        # Feed every word of one segmented line with its clipped context.
        line_len = len(words)
        for i in range(line_len):
            method(
                words[i],
                words[max(0, i - before):i] +
                words[i + 1:min(line_len, i + after + 1)])

    if self.cutted_text_list:
        # The text has already been segmented.
        total = len(self.cutted_text_list)
        for count, line in enumerate(self.cutted_text_list, 1):
            train_line(line)
            print('{c} of {d}'.format(c=count, d=total))
    else:
        # The text has not been segmented yet; cut each line first.
        for line in text_list:
            train_line(list(jieba.cut(line, cut_all=False)))
    print('word vector has been generated')
def unit_test_03():
    """Round-trip ``txt0`` and ``txt1`` through file-based binary encoding.

    For each text the plain content is written to ``<stem>.txt`` and the
    encoded form is written via ``encodeTextToFile`` to ``<stem>``, both
    under ``work_dir``.  After both have been encoded they are decoded again
    and compared with the originals.
    """
    base_tree = HuffmanTree.fromListOfHuffmanTreeNodes(ht_nodes)
    codec = BinHuffmanTree(root=base_tree.getRoot())

    samples = [('test_txt0', txt0), ('test_txt1', txt1)]

    for stem, text in samples:
        with open(work_dir + stem + '.txt', 'w') as of:
            of.write(text)
            of.flush()
        codec.encodeTextToFile(text, work_dir + stem)

    # Decode only after every sample has been encoded.
    decoded = [codec.decodeTextFromFile(work_dir + stem) for stem, _ in samples]

    for (_, text), result in zip(samples, decoded):
        assert text == result
    print('Assertions passed!')
def zipFile(self, originalFileName, zipFileName):
    """Huffman-compress ``originalFileName`` into ``zipFileName``.

    Archive layout (unchanged from the previous implementation):

    1. 4 bytes, little-endian: number of distinct byte values in the input
       (i.e. the number of leaves of the Huffman tree).
    2. For every distinct value: 1 byte with the value, followed by 4 bytes
       little-endian with its occurrence count (weight).
    3. The Huffman bit stream, packed most-significant-bit first, one byte
       per 8 bits.
    4. A trailer of two bytes: the number of meaningful bits in the last
       byte, then that last byte, left-aligned and zero-padded.

    Weights and the leaf count are stored modulo 2**32.

    The output file is now opened only after the input has been read and the
    code table built, and it is closed even when writing fails; previously
    the handle was opened first and leaked on any exception.

    Args:
        originalFileName: Path of the file to compress.
        zipFileName: Path of the archive to create (overwritten).
    """
    with open(originalFileName, 'rb') as originalFile:
        originalData = originalFile.read()

    # Count how often each byte value occurs (value -> weight).  Insertion
    # order is first-occurrence order and is reused below for the header.
    intValueWeightDict = {}
    for intValue in originalData:
        intValueWeightDict[intValue] = intValueWeightDict.get(intValue, 0) + 1

    # One single-node tree per byte value, then merge them into the final
    # Huffman tree.
    huffmanTreeList = [
        HuffmanTree(rootFlag=0, value=intValue, weight=weight)
        for intValue, weight in intValueWeightDict.items()
    ]
    huffmanTreeOperation = HuffmanTreeOperation()
    huffmanTree = huffmanTreeOperation.getHuffmanTree(huffmanTreeList)

    # {intValue: huffmanCode}, the codes being strings of "0"/"1".
    huffmanCodeDict = huffmanTreeOperation.getHuffmanCode(
        huffmanTree=huffmanTree, huffmanCodeDict={}, binaryCode="")

    def uint32LittleEndian(number):
        # Lowest 8 bits first; anything above 32 bits is discarded.
        return b"".join(
            six.int2byte((number >> shift) & 255) for shift in (0, 8, 16, 24))

    def packBits(bits):
        # Pack up to 8 code characters into one byte, left-aligned and padded
        # with zero bits.  Any character other than "1" counts as 0.
        value = 0
        for bit in bits:
            value = (value << 1) | (1 if bit == "1" else 0)
        return value << (8 - len(bits))

    with open(zipFileName, 'wb') as zipFileWrite:
        # Header: leaf count, then (value, weight) for every leaf.
        zipFileWrite.write(uint32LittleEndian(len(intValueWeightDict)))
        for intValue, weight in intValueWeightDict.items():
            zipFileWrite.write(six.int2byte(intValue))
            zipFileWrite.write(uint32LittleEndian(weight))

        # Body: emit a byte whenever MORE than 8 bits are pending, so that
        # between 1 and 8 bits are always left over for the trailer.
        binaryCode = ''
        for intData in originalData:
            binaryCode += huffmanCodeDict[intData]
            while len(binaryCode) > 8:
                zipFileWrite.write(six.int2byte(packBits(binaryCode[:8])))
                binaryCode = binaryCode[8:]

        # Trailer: count of meaningful bits, then the padded final byte.
        zipFileWrite.write(six.int2byte(len(binaryCode)))
        zipFileWrite.write(six.int2byte(packBits(binaryCode)))
def unzipFile(self, zipFileName, unzipFileName):
    """Decompress an archive written by ``zipFile`` into ``unzipFileName``.

    Archive layout:

    1. 4 bytes, little-endian: number of leaf nodes.
    2. Per leaf: 1 byte value followed by 4 bytes little-endian weight.
    3. The Huffman bit stream, 8 bits per byte, most significant bit first.
    4. Trailer: 1 byte with the number of meaningful bits in the final byte,
       then the final byte itself.

    The Huffman tree is rebuilt from the stored weights in stored order and
    the bit stream is decoded by walking it.

    Fixes over the previous implementation:

    * A trailer length of 8 selected ``binaryCode[-8:0]``, which is an empty
      slice, so the last 8 bits of the stream were silently dropped.  The
      final byte is now cut out first and then truncated.
    * The output file is opened with ``with`` (after the header has been
      parsed) so the handle is closed on every path.

    Args:
        zipFileName: Path of the archive to read.
        unzipFileName: Path of the file to create (overwritten).

    Raises:
        IndexError: If the archive is shorter than its header or trailer
            requires.
    """
    with open(zipFileName, "rb") as zipFile:
        zipFileData = zipFile.read()

    def readUint32(offset):
        # Little-endian; indexing (not slicing) so that a truncated archive
        # raises IndexError instead of yielding a short value.
        return sum(zipFileData[offset + k] << (8 * k) for k in range(4))

    # Header: leaf count followed by (value, weight) records of 5 bytes.
    leafNodeNum = readUint32(0)
    intValueWeightDict = {}
    for i in range(leafNodeNum):
        record = 4 + i * 5
        intValueWeightDict[zipFileData[record]] = readUint32(record + 1)

    # Rebuild the tree exactly as the compressor did: one single-node tree
    # per value, in stored order, merged by HuffmanTreeOperation.
    huffmanTreeList = [
        HuffmanTree(rootFlag=0, value=intValue, weight=weight)
        for intValue, weight in intValueWeightDict.items()
    ]
    huffmanTreeOperation = HuffmanTreeOperation()
    huffmanTree = huffmanTreeOperation.getHuffmanTree(huffmanTreeList)

    # NOTE(review): the returned code table is not used for decoding; the
    # call is kept only in case getHuffmanCode has side effects on the tree.
    # Confirm and remove.
    huffmanTreeOperation.getHuffmanCode(
        huffmanTree=huffmanTree, huffmanCodeDict={}, binaryCode="")

    with open(unzipFileName, "wb") as unzipFileWrite:

        def walk(bits, node):
            # Consume bits one at a time; a leaf is emitted just before the
            # next bit is taken, so the very last symbol is flushed by the
            # caller.  "1" goes right, anything else goes left.
            for bit in bits:
                if node.isLeafNode():
                    unzipFileWrite.write(six.int2byte(node.getValue()))
                    node = huffmanTree.getRoot()
                if bit == "1":
                    node = node.getRightChild()
                else:
                    node = node.getLeftChild()
            return node

        currentNode = huffmanTree.getRoot()
        binaryCode = ""
        for i in range(leafNodeNum * 5 + 4, len(zipFileData)):
            binaryCode += format(zipFileData[i], "08b")
            # Always hold back the newest 24 bits: the last 16 of the stream
            # are the trailer and must not be decoded as data.  (This is
            # unrelated to tree depth; decoding state lives in currentNode.)
            if len(binaryCode) > 24:
                currentNode = walk(binaryCode[:-24], currentNode)
                binaryCode = binaryCode[-24:]

        # Trailer: second-to-last byte is the meaningful-bit count of the
        # last byte.
        lengthBits = binaryCode[-16:-8]
        lastLength = 0
        for k in range(8):
            lastLength = (lastLength << 1) | (1 if lengthBits[k] == "1" else 0)

        # Take the final byte first, then truncate it.  Slicing with
        # [-8:-8 + lastLength] would give an empty result for lastLength == 8.
        lastByteBits = binaryCode[-8:]
        currentNode = walk(binaryCode[:-16] + lastByteBits[:lastLength],
                           currentNode)

        # Flush the symbol completed by the final bit.
        if currentNode.isLeafNode():
            unzipFileWrite.write(six.int2byte(currentNode.getValue()))
# Construct the variables for the NCE loss nce_weights = tf.Variable( tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size))) nce_biases = tf.Variable(tf.zeros([vocabulary_size])) #设置损失函数和优化算法 # Compute the NCE loss, using a sample of the negative labels each time. loss = tf.reduce_mean( tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels, num_sampled, vocabulary_size)) # We use the SGD optimizer. optimizer = tf.train.GradientDescentOptimizer( learning_rate=1.0).minimize(loss) #迭代训练 with tf.Session() as sess: sess.run(tf.global_variables_initializer()) for inputs, labels in generate_batch(batch_size, num_skips, skip_window): feed_dict = {train_inputs: inputs, train_labels: labels} _, cur_loss = sess.run([optimizer, loss], feed_dict=feed_dict) print("loss:", cur_loss) if __name__ == '__main__': text_list = wc.load_txt('../static/wufazhangda.txt') WC = wc.WordCounter(text_list) c = WC.count_res # print(c) print(sum(c.values())) ht = HuffmanTree(c)
def decompress(self, f):
    """Decode one compressed entry from ``f`` and write the restored file.

    The header is read in this order:

    1. 1 byte: number of padding bits appended to the last content byte.
    2. 4 bytes little-endian: size of the encoded Huffman tree, followed by
       the encoded tree itself.
    3. 4 bytes little-endian: byte length of the target path, followed by
       the UTF-8 encoded path.
    4. 4 bytes little-endian: byte length of the encoded content, followed
       by the content.

    The content is turned into a string of bits, the padding bits are
    removed, and the result is decoded with ``self.getDecodedFile`` and
    written to the target path (``fileIO.create_path_nexist`` is called with
    that path first).

    The target path is now decoded from all of its bytes at once.  Decoding
    it byte by byte, as before, raised ``UnicodeDecodeError`` for any name
    containing a multi-byte UTF-8 character.

    Args:
        f (TemporaryFile): Open file object positioned at the start of a
            compressed entry; it is read through its file descriptor.

    Returns:
        None
    """
    fd = f.fileno()

    ############################
    # READING HEADER INFORMATION
    ############################
    # NOTE(review): os.read may return fewer bytes than requested; this code
    # (like the original) assumes full reads -- confirm that f is always a
    # regular file.

    # Number of extra bits that were added to the last byte.
    bits = ord(os.read(fd, 1))

    # Size of the encoded Huffman tree, then the tree itself.
    length = int.from_bytes(os.read(fd, 4), byteorder="little")
    root = generateTree(bytearray(os.read(fd, length)))
    tree = HuffmanTree(root=root)

    # Original file name and path (in case of folder compression).  Read the
    # whole name and decode once so multi-byte characters stay intact.
    ln = int.from_bytes(os.read(fd, 4), byteorder="little")
    target = os.read(fd, ln).decode("utf-8")

    fileContentLn = int.from_bytes(os.read(fd, 4), "little")

    # Read the encoded content and render it as a bit string that is
    # left-padded with zeros to exactly 8 bits per expected byte.
    content = os.read(fd, fileContentLn)
    bit_format = '{0:0' + str(fileContentLn * 8) + 'b}'
    output_bin = bit_format.format(int(content.hex(), 16))

    # Delete the extra (padding) bits.
    if bits != 0:
        output_bin = output_bin[:-1 * bits]

    output = self.getDecodedFile(output_bin, tree)
    fileIO.create_path_nexist(target)
    with open(target, "wb") as f2:
        f2.write(output)
def unit_test_07():
    """Build the tree for ``moby_dick_ch02.txt`` and persist it to disk.

    The tree is derived from the file's character frequencies and saved via
    ``BinHuffmanTree.persist`` to ``work_dir + 'moby_dick_ch02_bht.bin'``.
    """
    source_path = work_dir + 'moby_dick_ch02.txt'
    freq_map = CharFreqMap.computeCharFreqMap(source_path)
    leaves = HuffmanTree.freqMapToListOfHuffmanTreeNodes(freq_map)
    tree = HuffmanTree.fromListOfHuffmanTreeNodes(leaves)

    persisted_path = work_dir + 'moby_dick_ch02_bht.bin'
    BinHuffmanTree(root=tree.getRoot()).persist(persisted_path)
def train_model(self, text_list):
    """Train word vectors with hierarchical softmax or negative sampling.

    With ``self.method == 'hs'`` every word of every segmented line is passed
    with its context to ``self.deal_cbow`` or ``self.deal_skipGram``
    (selected by ``self.model``).  Any other method runs the inline
    negative-sampling update below.  Missing prerequisites (word dictionary,
    segmented text, Huffman tree) are generated from ``text_list`` first.
    Finally every word is printed with its vector.

    The context of position ``i`` is up to ``(win - 1) >> 1`` words before it
    and the remaining ``win - 1 - before`` words after it, clipped at the
    line boundaries.

    Fixes over the previous implementation:

    * The right edge of the window was ``i + after + i`` instead of
      ``i + after + 1``, so the context grew with the position in the line.
    * The loop that removed out-of-vocabulary context words reused ``i`` and
      overwrote the index of the centre word.
    * An empty context executed ``return``, silently ending the whole
      training run; that position is now skipped.
    * An unsupported ``self.model`` raises ``ValueError`` up front instead of
      failing later on an unbound local.

    Args:
        text_list: Raw text; only used when the word dictionary still has to
            be generated.

    Raises:
        ValueError: If ``self.method == 'hs'`` and ``self.model`` is neither
            ``'cbow'`` nor ``'skip-gram'``.
    """
    before = (self.win - 1) >> 1
    after = self.win - 1 - before

    def context_of(line, i):
        # Words around position i, clipped to the line; excludes line[i].
        return (line[max(0, i - before):i] +
                line[i + 1:min(len(line), i + after + 1)])

    if self.method == 'hs':
        if self.huffman is None:
            if self.word_dict is None:
                wc = WordCounter(text_list)
                # Fills self.word_dict.
                self.gen_word_dict(wc.count_res.larger_than(2))
                self.cutted_text_list = wc.text
            self.huffman = HuffmanTree(self.word_dict, vec_len=self.vec_len)
        print("get word_dict and huffman tree, ready to train vector!")

        # Select the per-word training routine.
        if self.model == 'cbow':
            print("==========CBOW===========")
            method = self.deal_cbow
        elif self.model == 'skip-gram':
            print("==========Skip-Gram===========")
            method = self.deal_skipGram
        else:
            raise ValueError("unknown model: {!r}".format(self.model))

        if self.cutted_text_list:
            total = len(self.cutted_text_list)
            for count, line in enumerate(self.cutted_text_list, 1):
                for i in range(len(line)):
                    method(line[i], context_of(line, i))
                print('{c}/{d}'.format(c=count, d=total))
        else:
            print("ERROR: cutted_text_list has not be generate!")
    else:
        if self.word_dict is None:
            wc = WordCounter(text_list)
            # Fills self.word_dict.
            self.gen_word_dict(wc.count_res.larger_than(2))
            self.cutted_text_list = wc.text
        table = UnigramTable(self.word_dict)

        if self.cutted_text_list:
            for line in self.cutted_text_list:
                # One output vector, reset for every line.
                syn1 = np.zeros([1, self.vec_len])
                for i in range(len(line)):
                    # Keep only context words present in the dictionary.
                    gram_word_list = [
                        word for word in context_of(line, i)
                        if word in self.word_dict
                    ]
                    if not gram_word_list:
                        continue

                    # Mean of the context vectors (the "syn0" side).
                    neu1 = np.mean(
                        np.array([
                            self.word_dict[word]['vector']
                            for word in gram_word_list
                        ]),
                        axis=0)
                    neu1e = np.zeros([1, self.vec_len])  # accumulated error

                    # NOTE(review): the original author already questioned
                    # this negative sampling.  The samples are used as
                    # indices into `line`, and `target` is never used in the
                    # update below -- both look wrong; left unchanged.
                    classifiers = [(line[i], 1)] + [
                        (line[target], 0)
                        for target in table.sample(self.n_sampling)
                    ]
                    for target, label in classifiers:
                        q = self.sigmoid(np.dot(neu1, syn1.T))
                        g = self.learning_rate * (label - q)
                        neu1e += g * syn1
                        # NOTE(review): updates syn1 with neu1e rather than
                        # neu1 -- confirm against the intended gradient.
                        syn1 += g * neu1e

                    # Propagate the accumulated error to the context vectors.
                    for gram_word in gram_word_list:
                        self.word_dict[gram_word]['vector'] += neu1e

    print("训练的词向量为:")
    for word, value in self.word_dict.items():
        print(word, value['vector'])
print time.time() binarySearch(l, l[234]) print time.time() """
# NOTE(review): the text up to the closing triple quote above appears to be the
# tail of a triple-quoted block opened outside this view (a disabled timing of
# binarySearch); it is left untouched.  This fragment uses Python 2 print
# statements.

# Build a binary search tree from the values in l, keeping a Node per value.
tree = BSTree()
nodes = []
for v in l:
    nodes.append(Node(v))
    tree.addChild(v)
""" 删除一个节点 """
# (The string above reads "delete a node".)  Walk the tree, remove the node
# referenced by tree.header, then walk it again.
tree.preorderWalk()
tree.removeNode(tree.header)
# NOTE(review): could also have been a bare `print` followed by a separate
# call -- the original line breaks are lost; confirm.
print tree.preorderWalk()
# tree.storeTree("tree.json")
# print tree.findParent(tree.header.left.right)
""" 赫夫曼树 """
# (The string above reads "Huffman tree".)  Build a Huffman tree from the
# collected nodes and walk it in pre-order.
print
from HuffmanTree import HuffmanTree
ht = HuffmanTree(nodes)
ht.preorderWalk()
def buildTreeList(dict0, treelist0):
    """Append one leaf ``HuffmanTree`` per entry of ``dict0`` to ``treelist0``.

    Each leaf is constructed as ``HuffmanTree(0, key, dict0[key], None,
    None)``.  ``treelist0`` is modified in place; nothing is returned.
    """
    # Note: the key (char) and the value (freq) arguments are ints.
    treelist0.extend(
        HuffmanTree(0, symbol, dict0[symbol], None, None)
        for symbol in dict0.keys())