Code Example #1
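    # run() is a method of the project's runner class; time, os, jieba and the
    # helpers loadModel, saveModel, loadWords and TrieNode are assumed to be
    # imported at module level, while loadData and loadData2Root are other
    # methods of the same class.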
    def run(self):
        starttime = time.time()
        rootName = self.rootDir

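        # Reuse a previously cached trie if one exists on disk; otherwise build
        # it from the base dictionary word frequencies and save it for later runs.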
        if os.path.exists(rootName):
            root = loadModel(rootName)
        else:
            dictName = self.dictDir
            word_freq = loadWords(dictName)
            root = TrieNode('*', word_freq)
            saveModel(root, rootName)

        # Load the new corpus file
        fileName = self.demoDir
        data = self.loadData(fileName, self.stopwords)
        # Insert the new text into the trie rooted at root
        self.loadData2Root(root, data)

        # Take the top N = 5 candidate words
        N = 5
        result, add_word = root.wordFind(N)
        # To debug or pick a different threshold, print result and inspect the scores
        print("\n----\n", 'Added %d new words; the words and their scores are: \n' % len(add_word))
        print('#############################')
        for word, score in add_word.items():
            print(word + ' ---->  ', score)
        print('#############################\n')

        for word, score in add_word.items():
            jieba.add_word(word)

        print("Mutual information / entropy:")
        print("".join([(x + '/ ') for x in jieba.cut(self.test_text, cut_all=False) if x not in self.stopwords]))
        endtime = time.time()
        print('time cost:' + str(round((endtime - starttime), 4)) + ' seconds.\n')
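The helper generate_ngram used in Code Example #2 below (and presumably hidden inside loadData2Root above) is not shown in either snippet. A minimal sketch of what it might look like, assuming it should return every contiguous 1- to n-gram of a segmented line as a tuple:

def generate_ngram(words, n):
    # Collect every contiguous 1- to n-gram from a list of segmented words.
    # Illustrative stand-in only; the project's own helper may differ.
    ngrams = []
    for length in range(1, n + 1):
        for start in range(len(words) - length + 1):
            ngrams.append(tuple(words[start:start + length]))
    return ngrams

# Example: generate_ngram(['民进党', '当局'], 3)
# -> [('民进党',), ('当局',), ('民进党', '当局')]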
Code Example #2
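    # Context assumed from earlier in this script (not shown): an open file
    # handle f, an empty list data, a stopword set, and a word_freq dict
    # loaded from the base dictionary.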
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        line = [x for x in jieba.cut(line, cut_all=False) if x not in stopword]
        data.append(line)

print('------> Initializing the trie')
root = TrieNode('*', word_freq)

print('------> Inserting nodes')
for i in data:
    tmp = generate_ngram(i, 3)
    for d in tmp:
        root.add(d)

result, add_word = root.wordFind(5)

print('Added %d new words; the words and their scores are:' % len(add_word))
print('#############################')
for word, score in add_word.items():
    print(word + ' ---->  ', score)
print('#############################')

# To debug or choose a different threshold, print result and inspect the scores
# print(result)

# Chinese test sentence, kept as-is since it is the input to jieba
test = '蔡英文在昨天应民进党当局的邀请,准备和陈时中一道前往世界卫生大会,和谈有关九二共识问题'
print('Before adding:')
print("".join([(x + '/ ') for x in jieba.cut(test, cut_all=False)
               if x not in stopword]))
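Code Example #2 stops at the segmentation before the discovered words are registered. To mirror the end of Code Example #1, a short follow-up (a sketch; the exact output depends on the corpus and scores) would add the new words to jieba and segment the test sentence again:

# Register each discovered word with jieba, then re-segment the test sentence.
for word, score in add_word.items():
    jieba.add_word(word)

print('After adding:')
print("".join([(x + '/ ') for x in jieba.cut(test, cut_all=False)
               if x not in stopword]))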