def get_fen_result(zz):
    # root_name, stop_word, jieba_dict and l_zre are module-level values defined elsewhere in the project.
    all_sen = []
    data = []
    sentences = cut_sent(zz)
    # Clean each sentence and keep the non-empty ones.
    for sent in sentences:
        sent = sent.replace("\n", "")
        sent = sent.replace("\t", "")
        sent = sent.replace(" ", "")
        if sent:
            all_sen.append(sent)
    # Segment every sentence with jieba.
    for line in all_sen:
        word_list = [x for x in jieba.cut(line.strip())]
        data.append(word_list)
    # Load a cached trie if it exists, otherwise build it from the dictionary.
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        dict_name = FLAGS.data_path + 'dict.txt'
        word_freq = load_dictionary(dict_name)
        root = TrieNode('*', word_freq)
        save_model(root, root_name)
    # Insert all n-grams of the new text into the trie.
    for word_list in data:
        ngrams = generate_ngram(word_list, FLAGS.ngram)
        for d in ngrams:
            root.add(d)
    te_re, add_word = root.find_word(FLAGS.topN, stop_word, jieba_dict, l_zre)
    del root
    return te_re
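# The snippet above relies on a generate_ngram helper that is not shown here.
# A minimal sketch of what such a helper could look like (assumption: the real
# project version may differ in ordering or in the container type it returns):
def generate_ngram_sketch(word_list, n):
    """Emit every contiguous n-gram of length 1..n from a segmented sentence."""
    result = []
    for size in range(1, n + 1):                      # n-gram sizes 1..n
        for start in range(len(word_list) - size + 1):
            result.append(tuple(word_list[start:start + size]))
    return result

# Example: generate_ngram_sketch(['新', '词', '发现'], 2)
# -> [('新',), ('词',), ('发现',), ('新', '词'), ('词', '发现')]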
def run(self):
    starttime = time.time()
    rootName = self.rootDir
    # Load a cached trie if it exists, otherwise build it from the dictionary.
    if os.path.exists(rootName):
        root = loadModel(rootName)
    else:
        dictName = self.dictDir
        word_freq = loadWords(dictName)
        root = TrieNode('*', word_freq)
        saveModel(root, rootName)
    # Load the new document
    fileName = self.demoDir
    data = self.loadData(fileName, self.stopwords)
    # Insert the new document into the trie root
    self.loadData2Root(root, data)
    # Keep the top 5 candidates
    N = 5
    result, add_word = root.wordFind(N)
    # To tune the threshold, print `result` and inspect the scores.
    print("\n----\n", 'Added %d new words; words and scores:\n' % len(add_word))
    print('#############################')
    for word, score in add_word.items():
        print(word + ' ----> ', score)
    print('#############################\n')
    # Register the discovered words with jieba before re-segmenting the test text.
    for word, score in add_word.items():
        jieba.add_word(word)
    print("Segmentation after PMI / entropy based word discovery:")
    print("".join([(x + '/ ') for x in jieba.cut(self.test_text, cut_all=False) if x not in self.stopwords]))
    endtime = time.time()
    print('time cost:' + str(round((endtime - starttime), 4)) + ' seconds.\n')
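# wordFind scores candidates with pointwise mutual information (how strongly the
# parts of a candidate co-occur) and left/right neighbour entropy (how freely the
# candidate combines with its context).  A hedged sketch of the two quantities,
# assuming plain frequency counts (the project's TrieNode computes these
# internally and may normalise differently):
import math
from collections import Counter

def pmi(p_xy, p_x, p_y):
    """log p(xy) / (p(x) * p(y)); larger means the pair sticks together."""
    return math.log(p_xy / (p_x * p_y))

def neighbour_entropy(neighbour_counts):
    """Shannon entropy of the left (or right) neighbour distribution."""
    total = sum(neighbour_counts.values())
    return -sum((c / total) * math.log(c / total) for c in neighbour_counts.values())

# Example: a candidate whose neighbours are evenly spread scores higher.
print(neighbour_entropy(Counter({'的': 3, '是': 3, '了': 3})))   # ~1.10
print(neighbour_entropy(Counter({'的': 9})))                     # 0.0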
def load_dic_tree(jieba_dic_path, PMI, is_save=True):
    '''
    Load the dictionary trie.
    :param jieba_dic_path: path to the jieba dictionary
    :param PMI: mutual-information (PMI) threshold
    :param is_save: whether to build and save the trie; loading a previously
                    saved tree directly saves time
    :return: the dictionary trie
    '''
    Logger.log_DEBUG.debug('-----> start loading the dictionary trie')
    s_time = time.time()
    if is_save:
        try:
            word_freq = data_read.Load_word_freq(jieba_dic_path)
            root = TrieNode('*', PMI, word_freq)
            joblib.dump(root, 'tree.bin')
            time_elapse = time.time() - s_time
            Logger.log_DEBUG.debug("finished building the trie in {}s".format(time_elapse))
        except Exception as e:
            s = "exception while building the trie in load_dic_tree: " + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)
    else:
        try:
            root = joblib.load('tree.bin')
            time_elapse = time.time() - s_time
            Logger.log_DEBUG.debug("finished loading the trie in {}s".format(time_elapse))
        except Exception as e:
            s = "exception while reading the trie in load_dic_tree: " + str(e)
            Logger.log_ERROR.error(s)
            Logger.log_ERROR.exception(sys.exc_info())
            raise TypeError(s)
    return root
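# Hedged usage example for load_dic_tree; the dictionary path and the PMI
# threshold below are illustrative assumptions, not values from the project:
# root = load_dic_tree('data/jieba_dict.txt', PMI=6, is_save=True)   # build and cache tree.bin
# root = load_dic_tree('data/jieba_dict.txt', PMI=6, is_save=False)  # reuse the cached tree.bin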
def create_root(rootName, dictName):
    if os.path.exists(rootName):
        root = loadModel(rootName)
        return root
    else:
        word_freq = loadWords(dictName)
        root = TrieNode('*', word_freq)
        saveModel(root, rootName)
        return root
print('------> insertion finished')

if __name__ == "__main__":
    # root_name = basedir + "/data/root.pkl"
    root_name = basedir + "/data/jianzhu.pkl"
    stopwords = get_stopwords()
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        # The corpus alone does not reflect single-word frequencies well,
        # so we bring in jieba's bundled external dictionary.
        dict_name = basedir + '/data/dict.txt'
        # Read the dictionary file and build {word: frequency} for words
        # whose frequency is greater than 2.
        word_freq = load_dictionary(dict_name)
        # Build the trie
        root = TrieNode('*', word_freq)
        save_model(root, root_name)
    # Load the new document
    # filename = 'data/demo.txt'
    filename = 'data/jianzhu.txt'
    # data is a 2-D list: [[tokens of line 1], [tokens of line 2], ...]
    data = load_data(filename, stopwords)
    # Insert the new document into the trie root
    load_data_2_root(data)
    # Keep the top 5 candidates
    topN = 5
    result, add_word = root.find_word(topN)
    # To tune the threshold, print `result` and inspect the scores.
    # print("\n----\n", result)
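# load_data_2_root is not shown in this snippet.  Based on how the other scripts
# feed the trie, a minimal sketch could look like this (assumption: a module-level
# `root` and a generate_ngram helper, as in the snippets above):
def load_data_2_root_sketch(data, ngram=3):
    print('------> inserting nodes')
    for word_list in data:
        for d in generate_ngram(word_list, ngram):   # all 1..ngram grams
            root.add(d)
    print('------> insertion finished')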
import jieba

# TrieNode, generate_ngram, word_freq and stopword are assumed to be provided
# by the project's model / utility modules.

# Keep the top 5 candidates
N = 5

# Load the dataset
data = []
with open('../data/demo.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        line = [x for x in jieba.cut(line, cut_all=False) if x not in stopword]
        data.append(line)

print('------> initializing the trie')
root = TrieNode('*', word_freq)

print('------> inserting nodes')
for i in data:
    tmp = generate_ngram(i, 3)
    for d in tmp:
        root.add(d)

result, add_word = root.wordFind(5)
print('Added %d new words; words and scores:' % len(add_word))
print('#############################')
for word, score in add_word.items():
    print(word + ' ----> ', score)
print('#############################')
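# The discovered words can also be written to a plain-text user dictionary and
# reloaded later with jieba.load_userdict.  The file name and the default
# frequency below are illustrative assumptions:
with open('new_words_userdict.txt', 'w', encoding='utf-8') as fw:
    for word in add_word:
        fw.write(word + ' 100\n')      # "<word> <freq>" per line, jieba's userdict format
jieba.load_userdict('new_words_userdict.txt')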
    fw2 = open(wordFreq_sorted_path, 'w', encoding='utf-8')
    for name, freq in word_freq_sorted.items():
        fw2.write(name + ": " + str(freq) + '\n')
    fw1.close()
    fw2.close()

if __name__ == "__main__":
    root_name = basedir + "/data/root.pkl"
    stopwords = get_stopwords()
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        dict_name = basedir + '/data/dict.txt'
        word_freq = load_dictionary(dict_name)
        root = TrieNode('*', word_freq)
        save_model(root, root_name)
    # Choose and modify the paths below
    func('data/demo_bid_data.txt', 'data/add_word_bid_data.txt',
         'data/wordFreq_bid_data.txt', 'data/wordFreq_sorted_bid_data.txt')
    print("finished 1st run...")
    func('data/demo_bid_data.txt', 'data/add_word_bid_data_3To5.txt',
         'data/wordFreq_bid_data_3To5.txt', 'data/wordFreq_sorted_bid_data_3To5.txt')
    # The 2nd run helps find new words composed of 3-5 words.

    # import cProfile
    # cProfile.run("func()", filename="cpresult.out", sort="cumulative")
    #
    # import pstats
    # p = pstats.Stats("cpresult.out")
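    # If the cProfile lines above are uncommented, the collected stats can be
    # inspected with the standard pstats API (the line count of 20 is an
    # arbitrary choice for illustration):
    # p.strip_dirs().sort_stats("cumulative").print_stats(20)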