Ejemplo n.º 1
0
def compress(text):
    """Huffman-encode ``text``.

    Builds a Huffman tree from the symbol frequencies of ``text`` and
    concatenates the per-symbol codes collected by ``traverse``.

    Returns:
        A ``(compressed, huffman_tree)`` tuple: the encoded ``BitArray`` and
        the root node of the tree that was used to produce it.
    """
    # Symbols are visited in sorted order so that the creation timestamps
    # handed to the leaves are reproducible from run to run.
    frequencies = OrderedDict(sorted(Counter(text).items()))

    queue = []
    for tick, (symbol, weight) in enumerate(frequencies.items()):
        heapq.heappush(queue, HuffmanTree(weight, tick, symbol))
    # Internal nodes continue the timestamp sequence after the leaves.
    tick = len(frequencies)

    # Repeatedly merge the two lowest-priority nodes until one root remains.
    while len(queue) > 1:
        first = heapq.heappop(queue)
        second = heapq.heappop(queue)
        parent = HuffmanTree(first.priority + second.priority, tick, None)
        parent.left, parent.right = first, second
        heapq.heappush(queue, parent)
        tick += 1

    root = heapq.heappop(queue)
    codes = {}
    traverse(root, BitArray(), codes)

    encoded = BitArray()
    for symbol in text:
        encoded += codes[symbol]

    return encoded, root
Ejemplo n.º 2
0
def encode_dna_strings(fnpat, rootdir, encdir):
    """Huffman-encode every file produced by ``generate_file_names``.

    For each file name yielded for ``fnpat`` / ``rootdir`` a separate Huffman
    tree is built from that file's own character frequencies, and the encoded
    output is written under ``encdir`` using the same file name. Both
    directories are resolved relative to the module-level ``work_dir``.
    """
    for name in generate_file_names(fnpat, rootdir):
        source_path = work_dir + rootdir + name
        freq_map = CharFreqMap.computeCharFreqMap(source_path)
        leaves = HuffmanTree.freqMapToListOfHuffmanTreeNodes(freq_map)
        tree = HuffmanTree.fromListOfHuffmanTreeNodes(leaves)
        encoder = BinHuffmanTree(root=tree.getRoot())
        encoder.encodeTextFromFileToFile(source_path,
                                         work_dir + encdir + name)
Ejemplo n.º 3
0
def unit_test_04():
    """Round-trip check: encode a text file to disk, decode it, compare.

    The Huffman tree is built from the character frequencies of the same
    file that is then encoded.
    """
    text_path = work_dir + 'moby_dick_ch01.txt'
    encoded_path = work_dir + 'moby_dick_ch01'

    freq_map = CharFreqMap.computeCharFreqMap(text_path)
    leaves = HuffmanTree.freqMapToListOfHuffmanTreeNodes(freq_map)
    tree = HuffmanTree.fromListOfHuffmanTreeNodes(leaves)
    codec = BinHuffmanTree(root=tree.getRoot())
    codec.encodeTextFromFileToFile(text_path, encoded_path)

    with open(text_path, 'r') as inf:
        expected = inf.read()
    decoded = codec.decodeTextFromFile(encoded_path)
    assert decoded == expected
    print('Assertion passed!')
Ejemplo n.º 4
0
def get_HuffmanCodePath(filename, separate=" "):
    """Return the Huffman nodes, codes and paths for a corpus, using a cache.

    The corpus is ``data_path / filename / 'train.txt'``. Results are cached
    as three pickle files under ``cache_path``; when all three exist they are
    loaded instead of rebuilding the tree.

    Args:
        filename: Corpus directory name; also the prefix of the cache files.
        separate: Token separator passed to ``str.split`` for every line.

    Returns:
        A ``(huffman_nodes, huffman_codes, huffman_paths)`` tuple, taken from
        ``tree.root._name``, ``tree.huffman_code`` and ``tree.huffman_path``.
    """
    filepath = data_path / filename / 'train.txt'
    nodecache = cache_path / (filename + '_nodes.pkl')
    codecache = cache_path / (filename + '_code.pkl')
    pathcache = cache_path / (filename + '_path.pkl')

    if codecache.exists() and pathcache.exists() and nodecache.exists():
        # NOTE(security): pickle.load can execute arbitrary code; the cache
        # directory must only ever contain files written by this function.
        with open(str(nodecache), 'rb') as f:
            huffman_nodes = pickle.load(f)
        with open(str(codecache), 'rb') as f:
            huffman_codes = pickle.load(f)
        with open(str(pathcache), 'rb') as f:
            huffman_paths = pickle.load(f)
    else:
        # Count line by line instead of materialising every token first.
        # Insertion order (first occurrence) is the same as counting one
        # big list, so the resulting weight list is unchanged.
        word_counts = Counter()
        with open(str(filepath), 'r') as f:
            for line in f:
                word_counts.update(line.lower().strip().split(separate))

        chars_weights = list(word_counts.items())

        tree = HuffmanTree(chars_weights)
        huffman_codes = tree.huffman_code
        huffman_paths = tree.huffman_path
        huffman_nodes = tree.root._name

        # Context managers guarantee each cache file is flushed and closed
        # before the function returns.
        for cache_file, payload in ((codecache, huffman_codes),
                                    (pathcache, huffman_paths),
                                    (nodecache, huffman_nodes)):
            with open(str(cache_file), 'wb') as f:
                pickle.dump(payload, f)

    return huffman_nodes, huffman_codes, huffman_paths
Ejemplo n.º 5
0
def compress_file_splitted(compressor, filename):
    '''
    Variant of the HuffmanCompressor file-compression routine that returns
    the encoded data and the encoded tree separately.

    Args:
        compressor: object whose ``get_encoded_txt`` is called and whose
            ``encoded_txt`` / ``encoded_tree`` attributes are read and set.
        filename: path of the file to compress; it is read in binary mode.

    Returns:
        The tuple ``(compressor.encoded_txt, compressor.encoded_tree)``.
    '''
    # The file only needs to stay open while it is read; the ``with`` block
    # closes it, so no explicit close() is required afterwards.
    with open(filename, "rb") as src:
        txt = src.read()

    tree = HuffmanTree()
    codes, compressor.encoded_tree = tree.huffman_coding(txt)
    compressor.get_encoded_txt(txt, codes)

    return compressor.encoded_txt, compressor.encoded_tree
Ejemplo n.º 6
0
def unit_test_01():
    """Print the code of symbols A-H and round-trip two sample texts.

    Uses the module-level ``ht_nodes``, ``txt0`` and ``txt1`` fixtures.
    """
    tree = HuffmanTree.fromListOfHuffmanTreeNodes(ht_nodes)
    for symbol in 'ABCDEFGH':
        print(symbol + ' --> ' + tree.encodeSymbol(symbol))

    samples = (txt0, txt1)
    # Encode everything first, then decode, mirroring the original order.
    encoded = [tree.encodeText(sample) for sample in samples]
    decoded = [tree.decode(bits) for bits in encoded]
    assert decoded[0] == txt0
    assert decoded[1] == txt1
    print('Assertions passed!')
Ejemplo n.º 7
0
    def compress_file(self, filename):
        '''
        filename(str) -> encode_arr(bytearray)

        ------------
        Read ``filename`` in binary mode, Huffman-encode its content and
        return the result of ``self.encoded_arr`` for the encoded text, the
        encoded tree and the file name.

        Side effects: sets ``self.encoded_tree`` and calls
        ``self.get_encoded_txt``.
        '''
        # The ``with`` block closes the file; it only needs to be open for
        # the read, so the encoding work happens after it is released.
        with open(filename, "rb") as src:
            txt = src.read()

        tree = HuffmanTree()
        codes, self.encoded_tree = tree.huffman_coding(txt)
        self.get_encoded_txt(txt, codes)

        return self.encoded_arr(self.encoded_txt, self.encoded_tree, filename)
Ejemplo n.º 8
0
def unit_test_02():
    """Round-trip two sample texts through the binary Huffman codec.

    Uses the module-level ``ht_nodes``, ``txt0`` and ``txt1`` fixtures and
    prints each encoded payload together with its pad-bit count.
    """
    tree = HuffmanTree.fromListOfHuffmanTreeNodes(ht_nodes)
    codec = BinHuffmanTree(root=tree.getRoot())

    encoded = []
    for sample in (txt0, txt1):
        payload, _num_bytes, pad_bits = codec.encodeText(sample)
        encoded.append((payload, pad_bits))

    for payload, pad_bits in encoded:
        print(payload, pad_bits)

    decoded = [codec.decode(payload, pad_bits) for payload, pad_bits in encoded]
    assert decoded[0] == txt0
    assert decoded[1] == txt1
    print('Assertions passed!')
Ejemplo n.º 9
0
    def compress(self, src, target):
        """Huffman-compress the file ``src`` and append it to ``self.dest``.

        Args:
            src: path of the file to compress (read in binary mode).
            target: value forwarded to ``self.convertToByteArr`` together
                with the coded text and the coded tree.

        Returns:
            float: seconds elapsed between the start of the call and the end
            of the write to ``self.dest`` (the call to ``self.outputCodes``
            is not included).

        Raises:
            EOFError: if ``src`` is empty.
        """
        started = time.time()

        with open(src, "rb") as source:
            text = source.read()

        if not text:
            raise EOFError("Empty file")

        # Build the Huffman tree from the symbol frequencies of the input.
        tree = HuffmanTree()
        codes = tree.initializeTree(text)

        # Encode the payload and the tree, then pack both into one buffer.
        codedText = self.getCodedFile(text, codes)
        codedTree = tree.encodeTree()
        byteArr = self.convertToByteArr(codedText, codedTree, target)

        # Append mode: earlier content of the destination file is kept.
        with open(self.dest, "ab") as sink:
            sink.write(bytes(byteArr))

        finished = time.time()
        self.outputCodes(codes, fileIO.fileName(src))

        return finished - started
Ejemplo n.º 10
0
def unit_test_00():
    """Build a tree from four weighted symbols and print selected nodes.

    Printed in order: the root, its left child, its right child, then the
    right child's left subtree (node, left leaf, right leaf) and finally the
    right child's right child.
    """
    weighted_symbols = (('A', 4), ('B', 3), ('C', 1), ('D', 1))
    hnodes = [
        HuffmanTreeNode(symbols={symbol}, weight=weight)
        for symbol, weight in weighted_symbols
    ]
    ht = HuffmanTree.fromListOfHuffmanTreeNodes(hnodes)

    # Nodes are fetched lazily, right before they are printed, so output
    # produced before a missing child is encountered stays the same.
    root = ht.getRoot()
    print(str(root))
    print(str(root.getLeftChild()))
    right = root.getRightChild()
    print(str(right))
    right_left = right.getLeftChild()
    print(str(right_left))
    print(str(right_left.getLeftChild()))
    print(str(right_left.getRightChild()))
    print(str(right.getRightChild()))
Ejemplo n.º 11
0
    def Train_Model(self, text_list):
        """Train word vectors over a sliding context window.

        If no Huffman tree is loaded yet, one is built from ``self.word_dict``
        (the dictionary itself is generated from ``text_list`` when missing).
        Every word of every line is then passed, together with its
        surrounding context words, to the CBOW or Skip-Gram update routine
        selected by ``self.model``.

        Args:
            text_list: iterable of raw text lines. Used to build the word
                dictionary when none is loaded, and segmented with ``jieba``
                when ``self.cutted_text_list`` is empty.
        """
        # Generate the word dictionary and the Huffman tree when needed.
        if self.huffman is None:
            # If no dictionary has been loaded, generate a new one.
            if self.word_dict is None:
                wc = WordCounter(text_list)
                self.__Gnerate_Word_Dict(wc.count_res.larger_than(5))
                self.cutted_text_list = wc.text_list

            # Build the Huffman tree from the word dictionary.
            self.huffman = HuffmanTree(self.word_dict, vec_len=self.vec_len)
        print(
            'word_dict and huffman tree already generated, ready to train vector'
        )

        # Split the window around the centre word, e.g. win_len = 5 gives
        # two context words before and two after.
        before = (self.win_len - 1) >> 1
        after = self.win_len - 1 - before

        if self.model == 'cbow':
            method = self.__Deal_Gram_CBOW
        else:
            method = self.__Deal_Gram_SkipGram

        def train_line(words):
            # Feed each word with its neighbours inside the window. Slicing
            # clamps the upper bound itself; only the lower bound needs
            # max() to avoid wrapping around to the end of the line.
            for i, word in enumerate(words):
                method(word,
                       words[max(0, i - before):i] +
                       words[i + 1:i + after + 1])

        if self.cutted_text_list:
            # The text has already been segmented: report progress per line.
            total = len(self.cutted_text_list)
            for count, line in enumerate(self.cutted_text_list, start=1):
                train_line(line)
                print('{c} of {d}'.format(c=count, d=total))
        else:
            # The text has not been segmented yet: cut each line first.
            for line in text_list:
                train_line(list(jieba.cut(line, cut_all=False)))
        print('word vector has been generated')
Ejemplo n.º 12
0
def unit_test_03():
    """Encode two sample texts to files and verify they decode unchanged.

    Each text is also written verbatim to a ``.txt`` file next to its
    encoded counterpart. Uses the module-level ``ht_nodes``, ``txt0``,
    ``txt1`` and ``work_dir``.
    """
    tree = HuffmanTree.fromListOfHuffmanTreeNodes(ht_nodes)
    codec = BinHuffmanTree(root=tree.getRoot())

    samples = (
        ('test_txt0.txt', 'test_txt0', txt0),
        ('test_txt1.txt', 'test_txt1', txt1),
    )
    for plain_name, encoded_name, text in samples:
        with open(work_dir + plain_name, 'w') as of:
            of.write(text)
            of.flush()
        codec.encodeTextToFile(text, work_dir + encoded_name)

    decoded = [
        codec.decodeTextFromFile(work_dir + encoded_name)
        for _plain_name, encoded_name, _text in samples
    ]
    assert txt0 == decoded[0]
    assert txt1 == decoded[1]
    print('Assertions passed!')
Ejemplo n.º 13
0
    def zipFile(self, originalFileName, zipFileName):
        """Compress ``originalFileName`` into ``zipFileName`` with Huffman coding.

        Archive layout (multi-byte integers are little-endian):
          1. 4 bytes: number of distinct byte values (leaf nodes of the tree).
          2. Per distinct value: 1 byte value followed by a 4-byte weight.
          3. The Huffman bit stream, packed most-significant-bit first.
          4. 1 byte: how many bits of the final byte are meaningful.
          5. 1 byte: the final bits, left-aligned and zero-padded.

        The input is read completely before the output is opened, and the
        output is written inside a ``with`` block, so a failure while reading
        or building the tree neither leaks a file handle nor truncates an
        existing archive.

        Args:
            originalFileName: path of the file to compress.
            zipFileName: path of the archive to create or overwrite.
        """

        def write_uint32_le(out, number):
            # Only the low 32 bits are stored, lowest byte first.
            for _ in range(4):
                out.write(six.int2byte(number & 255))
                number = number >> 8

        def bits_to_int(bits):
            # Any character other than "1" counts as a zero bit.
            value = 0
            for bit in bits:
                value = (value << 1) | (1 if bit == "1" else 0)
            return value

        with open(originalFileName, 'rb') as originalFile:
            originalData = originalFile.read()

        # Count how often each byte value occurs; the count is its weight.
        intValueWeightDict = {}
        for intValue in originalData:
            intValueWeightDict[intValue] = intValueWeightDict.get(intValue,
                                                                  0) + 1

        # Start with one single-node tree per distinct byte value.
        huffmanTreeList = [
            HuffmanTree(rootFlag=0, value=intValue, weight=weight)
            for intValue, weight in intValueWeightDict.items()
        ]

        # Merge the single-node trees into one complete Huffman tree.
        huffmanTreeOperation = HuffmanTreeOperation()
        huffmanTree = huffmanTreeOperation.getHuffmanTree(huffmanTreeList)

        # Map every byte value to its code string: {intValue: huffmanCode}.
        huffmanCodeDict = huffmanTreeOperation.getHuffmanCode(
            huffmanTree=huffmanTree, huffmanCodeDict={}, binaryCode="")

        with open(zipFileName, 'wb') as zipFileWrite:
            # Header: number of distinct byte values, then the weight table
            # that the decompressor needs to rebuild the same tree.
            write_uint32_le(zipFileWrite, len(intValueWeightDict))
            for intValue, weight in intValueWeightDict.items():
                zipFileWrite.write(six.int2byte(intValue))
                write_uint32_le(zipFileWrite, weight)

            binaryCode = ''
            for intData in originalData:
                binaryCode = binaryCode + huffmanCodeDict[intData]
                # Flush only while MORE than 8 bits are pending, so the bits
                # left over at the end always go into the trailer's final
                # byte.
                while len(binaryCode) > 8:
                    zipFileWrite.write(
                        six.int2byte(bits_to_int(binaryCode[:8])))
                    binaryCode = binaryCode[8:]

            # Trailer: count of meaningful bits, then those bits padded with
            # zeros on the right to a full byte.
            zipFileWrite.write(six.int2byte(len(binaryCode)))
            paddedCode = binaryCode + "0" * (8 - len(binaryCode))
            zipFileWrite.write(six.int2byte(bits_to_int(paddedCode)))
Ejemplo n.º 14
0
    def unzipFile(self, zipFileName, unzipFileName):
        """Decompress an archive produced by ``zipFile``.

        Archive layout (multi-byte integers are little-endian):
          1. 4 bytes: number of leaf nodes (distinct byte values).
          2. Per leaf: 1 byte value followed by a 4-byte weight.
          3. The Huffman bit stream, packed most-significant-bit first.
          4. 1 byte: how many bits of the final byte are meaningful.
          5. 1 byte: the final bits, left-aligned and zero-padded.

        The archive is read and the tree rebuilt before the output file is
        opened, and the output is written inside a ``with`` block so the
        handle is closed even when decoding fails.

        Args:
            zipFileName: path of the archive to read.
            unzipFileName: path of the file to create or overwrite.
        """

        def read_uint32_le(data, offset):
            # Four bytes starting at ``offset``, lowest byte first.
            return (data[offset] | (data[offset + 1] << 8) |
                    (data[offset + 2] << 16) | (data[offset + 3] << 24))

        # Read the whole archive in binary mode.
        with open(zipFileName, "rb") as zipFile:
            zipFileData = zipFile.read()

        # Number of leaf nodes of the Huffman tree.
        leafNodeNum = read_uint32_le(zipFileData, 0)

        # Weight table, starting at zipFileData[4]: 5 bytes per leaf.
        intValueWeightDict = {}
        for i in range(leafNodeNum):
            base = 4 + i * 5
            intValueWeightDict[zipFileData[base]] = read_uint32_le(
                zipFileData, base + 1)

        # Rebuild the tree from the weight table; entries are visited in the
        # order in which they were stored in the archive.
        huffmanTreeList = [
            HuffmanTree(rootFlag=0, value=intValue, weight=weight)
            for intValue, weight in intValueWeightDict.items()
        ]
        huffmanTreeOperation = HuffmanTreeOperation()
        huffmanTree = huffmanTreeOperation.getHuffmanTree(huffmanTreeList)
        # NOTE(review): the code table is not used for decoding (the tree is
        # walked directly); the call is kept in case it has side effects on
        # the tree - confirm and remove if it is pure.
        huffmanTreeOperation.getHuffmanCode(huffmanTree=huffmanTree,
                                            huffmanCodeDict={},
                                            binaryCode="")

        with open(unzipFileName, "wb") as unzipFileWrite:

            def advance(node, bit):
                # Emit the symbol when a leaf was reached by the previous
                # bit, restart from the root, then follow ``bit``.
                if node.isLeafNode():
                    unzipFileWrite.write(six.int2byte(node.getValue()))
                    node = huffmanTree.getRoot()
                if bit == "1":
                    return node.getRightChild()
                return node.getLeftChild()

            binaryCode = ""
            currentNode = huffmanTree.getRoot()
            for i in range(leafNodeNum * 5 + 4, len(zipFileData)):
                binaryCode = binaryCode + format(zipFileData[i], '08b')

                # Keep the newest 24 bits buffered: the last 16 bits of the
                # stream are the trailer and must not be decoded as payload.
                # (The position inside the tree is carried in currentNode, so
                # code length does not limit the buffer size.)
                while len(binaryCode) > 24:
                    currentNode = advance(currentNode, binaryCode[0])
                    binaryCode = binaryCode[1:]

            # Trailer: second-to-last byte = number of valid bits in the
            # last byte.
            lengthBits = binaryCode[-16:-8]
            lastLength = 0
            for i in range(8):
                lastLength = lastLength << 1
                if lengthBits[i] == "1":
                    lastLength = lastLength | 1

            # Take the valid prefix of the final byte. Slicing the final
            # byte first keeps this correct when all 8 bits are valid (an
            # end index of -8 + 8 == 0 would select nothing).
            binaryCode = binaryCode[:-16] + binaryCode[-8:][:lastLength]
            for bit in binaryCode:
                currentNode = advance(currentNode, bit)

            # Flush the symbol reached by the very last bit.
            if currentNode.isLeafNode():
                unzipFileWrite.write(six.int2byte(currentNode.getValue()))
Ejemplo n.º 15
0
    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    # Set up the loss function and the optimization algorithm.
    # Compute the NCE loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
                       num_sampled, vocabulary_size))
    # We use the SGD optimizer.
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=1.0).minimize(loss)
    # Iterative training: one optimizer step per generated batch, printing
    # the loss of that batch.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for inputs, labels in generate_batch(batch_size, num_skips,
                                             skip_window):
            feed_dict = {train_inputs: inputs, train_labels: labels}
            _, cur_loss = sess.run([optimizer, loss], feed_dict=feed_dict)
            print("loss:", cur_loss)


if __name__ == '__main__':
    # Script entry point: load a text file, count its words, print the total
    # of all counts and build a Huffman tree from the count result.
    text_list = wc.load_txt('../static/wufazhangda.txt')
    WC = wc.WordCounter(text_list)
    c = WC.count_res
    # print(c)
    # Sum of all values in the count result.
    print(sum(c.values()))
    ht = HuffmanTree(c)
Ejemplo n.º 16
0
    def decompress(self, f):
        """Decode one Huffman-compressed entry and write the original file.

        The entry is read from the file descriptor of ``f`` in this order
        (all lengths are 4-byte little-endian integers):

          1. 1 byte: number of padding bits appended to the last data byte.
          2. length + bytes of the encoded Huffman tree.
          3. length + UTF-8 bytes of the original file name / path.
          4. length + bytes of the encoded content.

        The decoded content is written to the path stored in the entry.

        Args:
            f (TemporaryFile): open file to be decompressed; it is accessed
                through ``os.read`` on ``f.fileno()``.
        Returns:
            None
        """

        fd = f.fileno()

        ############################
        # READING HEADER INFORMATION
        ############################

        # Number of extra bits that were added to the last byte.
        bits = ord(os.read(fd, 1))

        # Size of the encoded Huffman tree, then the tree itself.
        length = int.from_bytes(os.read(fd, 4), byteorder="little")
        root = generateTree(bytearray(os.read(fd, length)))
        tree = HuffmanTree(root=root)

        # Original file name and path (in case of folder compression).
        # The name is decoded in one piece: decoding byte by byte fails on
        # any character whose UTF-8 encoding is longer than one byte.
        ln = int.from_bytes(os.read(fd, 4), byteorder="little")
        target = os.read(fd, ln).decode("utf-8")

        fileContentLn = int.from_bytes(os.read(fd, 4), "little")

        # Read the encoded content and expand it into a string of '0'/'1'
        # characters, zero-padded on the left to the declared bit length.
        byte = os.read(fd, fileContentLn)
        if byte:
            output_bin = format(int.from_bytes(byte, "big"),
                                "0" + str(fileContentLn * 8) + "b")
        else:
            # An empty payload has no bits at all.
            output_bin = ""

        # Delete the extra bits
        if bits != 0:
            output_bin = output_bin[:-1 * bits]

        output = self.getDecodedFile(output_bin, tree)
        fileIO.create_path_nexist(target)

        with open(target, "wb") as f2:
            f2.write(output)
Ejemplo n.º 17
0
def unit_test_07():
    """Build a binary Huffman tree for a sample text and persist it.

    The tree is derived from the character frequencies of
    ``moby_dick_ch02.txt`` and saved as ``moby_dick_ch02_bht.bin``, both
    under the module-level ``work_dir``.
    """
    freq_map = CharFreqMap.computeCharFreqMap(work_dir + 'moby_dick_ch02.txt')
    leaves = HuffmanTree.freqMapToListOfHuffmanTreeNodes(freq_map)
    tree = HuffmanTree.fromListOfHuffmanTreeNodes(leaves)
    codec = BinHuffmanTree(root=tree.getRoot())
    codec.persist(work_dir + 'moby_dick_ch02_bht.bin')
Ejemplo n.º 18
0
    def train_model(self, text_list):
        """Train word vectors with hierarchical softmax or negative sampling.

        With ``self.method == 'hs'`` every word and its context window are
        handed to ``self.deal_cbow`` or ``self.deal_skipGram`` (chosen by
        ``self.model``). Any other method value runs the negative-sampling
        branch. At the end every word and its vector are printed.

        Fixes relative to the previous version:
          * the context window ended at ``i + after + i`` instead of
            ``i + after + 1``, so it grew with the position in the line;
          * filtering the context reused the loop variable ``i`` and thereby
            replaced the index of the centre word;
          * an empty context aborted the whole training with ``return``; it
            now only skips that word.

        Args:
            text_list: raw text passed to ``WordCounter`` when no word
                dictionary has been generated yet.

        Raises:
            ValueError: if ``self.method == 'hs'`` and ``self.model`` is
                neither ``'cbow'`` nor ``'skip-gram'``.
        """
        before = (self.win - 1) >> 1
        after = self.win - 1 - before

        def context_of(line, i):
            # Words inside the window around position i, without line[i].
            # Slicing clamps the upper bound; only the lower bound needs
            # max() to avoid wrapping around to the end of the line.
            return line[max(0, i - before):i] + line[i + 1:i + after + 1]

        if self.method == 'hs':
            if self.huffman is None:
                if self.word_dict is None:
                    wc = WordCounter(text_list)
                    self.gen_word_dict(
                        wc.count_res.larger_than(2))  # get self.word_dict
                    self.cutted_text_list = wc.text
                self.huffman = HuffmanTree(self.word_dict,
                                           vec_len=self.vec_len)
            print("get word_dict and huffman tree, ready to train vector!")

            # start to train
            if self.model == 'cbow':
                print("==========CBOW===========")
                method = self.deal_cbow
            elif self.model == 'skip-gram':
                print("==========Skip-Gram===========")
                method = self.deal_skipGram
            else:
                raise ValueError(
                    "unknown model: {!r}".format(self.model))
            if self.cutted_text_list:
                total = len(self.cutted_text_list)
                for count, line in enumerate(self.cutted_text_list, start=1):
                    for i, word in enumerate(line):
                        method(word, context_of(line, i))
                    print('{c}/{d}'.format(c=count, d=total))
            else:
                print("ERROR: cutted_text_list has not be generate!")
        else:
            if self.word_dict is None:
                wc = WordCounter(text_list)
                self.gen_word_dict(
                    wc.count_res.larger_than(2))  # get self.word_dict
                self.cutted_text_list = wc.text

            table = UnigramTable(self.word_dict)
            if self.cutted_text_list:
                for line in self.cutted_text_list:
                    # One output vector per line, reset for every line.
                    syn1 = np.zeros([1, self.vec_len])
                    for i, word in enumerate(line):
                        # Keep only context words present in the dictionary.
                        gram_word_list = [
                            gram_word for gram_word in context_of(line, i)
                            if gram_word in self.word_dict
                        ]
                        if not gram_word_list:
                            continue
                        # Hidden layer: mean of the context word vectors.
                        neu1 = np.mean(
                            np.array([
                                self.word_dict[gram_word]['vector']
                                for gram_word in gram_word_list
                            ]),
                            axis=0)
                        neu1e = np.zeros([1, self.vec_len])  # init e
                        # Centre word as the positive sample, sampled
                        # entries as negatives.
                        # NOTE(review): table.sample() results are used as
                        # positions inside `line`; the original author also
                        # flagged the negative sampling as questionable -
                        # confirm what UnigramTable.sample returns.
                        classifiers = [(word, 1)] + [
                            (line[target], 0)
                            for target in table.sample(self.n_sampling)
                        ]
                        for _target, label in classifiers:
                            q = self.sigmoid(np.dot(neu1, syn1.T))
                            g = self.learning_rate * (label - q)
                            neu1e += g * syn1
                            # NOTE(review): reference word2vec updates the
                            # output vector with g * neu1, not g * neu1e -
                            # left unchanged, confirm the intent.
                            syn1 += g * neu1e
                        # Propagate the accumulated error to the context
                        # word vectors.
                        for gram_word in gram_word_list:
                            self.word_dict[gram_word]['vector'] += neu1e
        print("训练的词向量为:")
        for word, value in self.word_dict.items():
            print(word, value['vector'])
Ejemplo n.º 19
0
print time.time()

binarySearch(l, l[234])
print time.time()
"""

# Build a binary search tree from the values in `l`, and in parallel collect
# one Node per value for the Huffman tree built further down.
tree = BSTree()
nodes = []
for v in l:
    nodes.append(Node(v))
    tree.addChild(v)

# The bare string below reads "delete a node": walk the tree, remove the
# node at tree.header, then walk it again.
""" 删除一个节点 """
tree.preorderWalk()
tree.removeNode(tree.header)
# Bare `print`: presumably a Python 2 print statement emitting a blank line.
print
tree.preorderWalk()
# tree.storeTree("tree.json")
# print tree.findParent(tree.header.left.right)

# The bare string below reads "Huffman tree": build one from the collected
# nodes and walk it in pre-order.
""" 赫夫曼树 """
print
from HuffmanTree import HuffmanTree
ht = HuffmanTree(nodes)
ht.preorderWalk()





Ejemplo n.º 20
0
def buildTreeList(dict0, treelist0):
    """Append one leaf ``HuffmanTree`` per entry of ``dict0`` to ``treelist0``.

    ``treelist0`` is modified in place; nothing is returned.
    """
    for symbol, frequency in dict0.items():
        # Note: the symbol and frequency arguments are ints.
        treelist0.append(HuffmanTree(0, symbol, frequency, None, None))