import os

import jieba

# Repo helpers such as cut_sent, load_model, save_model, load_dictionary, TrieNode,
# generate_ngram, FLAGS, root_name, stop_word, jieba_dict and l_zre are assumed to be
# defined elsewhere in this repo.
def get_fen_result(zz):
    all_sen = []
    data = []
    # Split the raw text into sentences and strip whitespace characters.
    sentences = cut_sent(zz)
    for sent in sentences:
        sent = sent.replace("\n", "")
        sent = sent.replace("\t", "")
        sent = sent.replace(" ", "")
        if sent:
            all_sen.append(sent)
    # Coarsely segment every sentence with jieba.
    for line in all_sen:
        word_list = [x for x in jieba.cut(line.strip())]
        data.append(word_list)
    # Load a cached trie if one exists, otherwise build it from the dictionary and cache it.
    if os.path.exists(root_name):
        root = load_model(root_name)
    else:
        dict_name = FLAGS.data_path + 'dict.txt'
        word_freq = load_dictionary(dict_name)
        root = TrieNode('*', word_freq)
        save_model(root, root_name)
    # Insert every n-gram of every segmented sentence into the trie.
    for word_list in data:
        ngrams = generate_ngram(word_list, FLAGS.ngram)
        for d in ngrams:
            root.add(d)
    # Score the candidates and return the top results.
    te_re, add_word = root.find_word(FLAGS.topN, stop_word, jieba_dict, l_zre)
    del root
    return te_re
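# NOTE: cut_sent is not shown in this excerpt. The sketch below is only an assumption of what
# such a sentence splitter could look like (split on common Chinese end-of-sentence punctuation);
# the name cut_sent_sketch and its exact rules are hypothetical.
import re

def cut_sent_sketch(text):
    # Break after 。!?? and ellipses, then drop empty fragments.
    text = re.sub(r'([。!?\?!])', r'\1\n', text)
    text = re.sub(r'(\.{6}|…{2})', r'\1\n', text)
    return [s for s in text.split('\n') if s.strip()]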
def load_data_2_root(data):
    print('------> inserting nodes')
    for word_list in data:
        # ngrams holds every contiguous combination (up to 3-grams) of one line, e.g.
        # [['它'], ['是'], ['小'], ['狗'], ['它', '是'], ['是', '小'], ['小', '狗'], ['它', '是', '小'], ['是', '小', '狗']]
        ngrams = generate_ngram(word_list, 3)
        for d in ngrams:
            root.add(d)
    print('------> insertion finished')
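# NOTE: generate_ngram is a repo helper that is not reproduced here. The sketch below is an
# assumption that matches the commented example above (all 1-grams, then 2-grams, then 3-grams);
# the real implementation may differ.
def generate_ngram_sketch(word_list, n):
    result = []
    for length in range(1, n + 1):
        for i in range(len(word_list) - length + 1):
            result.append(word_list[i:i + length])
    return result

# generate_ngram_sketch(['它', '是', '小', '狗'], 3) returns
# [['它'], ['是'], ['小'], ['狗'], ['它', '是'], ['是', '小'], ['小', '狗'], ['它', '是', '小'], ['是', '小', '狗']]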
import numpy as np

def handel_data(word_lists):
    _ngrams = []
    for word_list in word_lists:
        _ngrams.append(generate_ngram(word_list, 3))
    # print('_ngrams=====', _ngrams)
    if _ngrams:
        # Flatten the per-line n-gram lists into a single sequence.
        return np.concatenate(_ngrams)
    else:
        print('_ngrams=======null', word_lists)
        return [[]]
def loadDate2Root(data):
    print('------> inserting nodes')
    for i in data:
        # tmp holds every contiguous combination (up to 3-grams) of one line, e.g.
        # [['它'], ['是'], ['小'], ['狗'], ['它', '是'], ['是', '小'], ['小', '狗'], ['它', '是', '小'], ['是', '小', '狗']]
        tmp = generate_ngram(i, 3)
        # print(tmp)
        for d in tmp:
            root.add(d)
    print('------> insertion finished')
def load_data_2_root(data):
    print('------> insert nodes')
    print("total data list:", len(data))
    for i, word_list in enumerate(data):
        if i != 0 and i % 1000 == 0:
            print(i)
        ngrams = generate_ngram(word_list, 3)
        for d in ngrams:
            root.add(d)
    print('------> insert successfully')
def load_data_2_root(data):  # data is the corpus after coarse segmentation
    print('------> inserting nodes')
    # Build the n-gram combinations for every sentence line.
    for word_list in data:
        # ngrams holds every contiguous combination (up to 3-grams) of one line, e.g.
        # [['它'], ['是'], ['小'], ['狗'], ['它', '是'], ['是', '小'], ['小', '狗'], ['它', '是', '小'], ['是', '小', '狗']]
        ngrams = generate_ngram(word_list, 3)
        # Insert them into the trie, which stores how often each candidate occurs.
        for d in ngrams:
            root.add(d)
    print('------> insertion finished')
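# NOTE: TrieNode is defined elsewhere in the repo; only its usage is visible here. The sketch
# below is an assumption of what root.add(d) does: walk one trie level per token of the n-gram
# and count how often the full n-gram occurs. The real class also keeps whatever statistics
# find_word / wordFind need for scoring, which are omitted here.
class TrieNodeSketch:
    def __init__(self, char, word_freq=None):
        self.char = char                   # token stored at this node
        self.count = 0                     # times the n-gram ending at this node was inserted
        self.children = {}                 # token -> child node
        self.word_freq = word_freq or {}   # external dictionary frequencies (root only)

    def add(self, ngram):
        node = self
        for token in ngram:
            if token not in node.children:
                node.children[token] = TrieNodeSketch(token)
            node = node.children[token]
        node.count += 1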
def load_data_2_root(data):
    print('------> inserting nodes')
    for word_list in data:
        # ngrams holds every contiguous combination (up to 3-grams) of one line, e.g. for
        # 雪落/ 山庄/ 不是/ 一座/ 山庄/ 只是/ 一个/ 客栈 it is
        # [['雪落'], ['山庄'], ['不是'], ['一座'], ['山庄'], ['只是'], ['一个'], ['客栈'],
        #  ['雪落', '山庄'], ['山庄', '不是'], ['不是', '一座'], ['一座', '山庄'], ['山庄', '只是'], ['只是', '一个'], ['一个', '客栈'],
        #  ['雪落', '山庄', '不是'], ['山庄', '不是', '一座'], ['不是', '一座', '山庄'], ...]
        ngrams = generate_ngram(word_list, 3)
        # print(ngrams)
        for d in ngrams:
            root.add(d)
    print('------> insertion finished')
# Load the corpus.
data = []
with open('../data/demo.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        line = [x for x in jieba.cut(line, cut_all=False) if x not in stopword]
        data.append(line)

print('------> initializing the trie')
root = TrieNode('*', word_freq)

print('------> inserting nodes')
for i in data:
    tmp = generate_ngram(i, 3)
    for d in tmp:
        root.add(d)

result, add_word = root.wordFind(5)
print('Discovered %d new words; the words and their scores are:' % len(add_word))
print('#############################')
for word, score in add_word.items():
    print(word + ' ----> ', score)
print('#############################')

# To debug or choose a different threshold, print `result` and inspect the scores.
# print(result)

test = '蔡英文在昨天应民进党当局的邀请,准备和陈时中一道前往世界卫生大会,和谈有关九二共识问题'
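# NOTE: the script above stops after printing the candidates; the `test` sentence suggests that
# the next step is to re-segment with the discovered words. This is a hedged sketch of that step,
# not part of the original code; jieba.add_word is a real jieba API.
for word in add_word:
    jieba.add_word(word)      # register each discovered word in jieba's user dictionary
print(jieba.lcut(test))       # segmenting `test` now keeps the new words intact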