def siphon_best_match_from_set(sentence, candidates):
    """Score a sentence against a candidate set with both TF-IDF and LSI similarity."""
    corpora_documents = []
    for item_text in candidates:
        item_seg = list(jieba_nlp.generate_jieba_cut(item_text))
        corpora_documents.append(item_seg)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    similarity = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-tfidf-index',
        corpus_tfidf, num_features=600)
    test_cut_raw_1 = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    similarity.num_best = 2
    test_corpus_tfidf_1 = tfidf_model[test_corpus_1]
    tfidf_simi = similarity[test_corpus_tfidf_1]
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-LSI-index',
        corpus_lsi, num_features=400, num_best=2)
    test_cut_raw_3 = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus_3 = dictionary.doc2bow(test_cut_raw_3)
    test_corpus_tfidf_3 = tfidf_model[test_corpus_3]
    test_corpus_lsi_3 = lsi[test_corpus_tfidf_3]
    # lsi.add_documents(test_corpus_lsi_3)  # update the LSI model
    lsi_simi = similarity_lsi[test_corpus_lsi_3]
    return {'tfidf': tfidf_simi, 'lsi_simi': lsi_simi}

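# Hedged usage sketch for siphon_best_match_from_set: the candidate texts below are made
# up for illustration, and it assumes jieba_nlp, gensim (corpora/models/similarities) and
# BASE_DIR are available exactly as the function above uses them.
def _example_siphon_best_match():
    candidates = ['如何申请退款', '怎样修改收货地址', '发票如何开具']
    result = siphon_best_match_from_set('我想要退款', candidates)
    # Each entry holds up to num_best (document index, similarity score) pairs.
    print(result['tfidf'], result['lsi_simi'])
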
def gensim_tfidf_simi(sentence, features=400, best=2):
    """
    Text similarity matching; a data-misalignment exception may occur.
    :param sentence:
    :param features:
    :param best:
    :return:
    """
    rows = decode_rows_pickle()
    corpora_documents = []
    for row in rows:
        item = list(jieba_nlp.generate_jieba_cut(row))
        corpora_documents.append(item)
    dictionary = corpora.Dictionary(corpora_documents)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    similarity = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-tfidf-index',
        corpus_tfidf, num_features=features, num_best=best)
    cut_raw = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus = dictionary.doc2bow(cut_raw)
    test_corpus_tfidf = tfidf_model[test_corpus]
    return similarity[test_corpus_tfidf]

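# Hedged usage note for gensim_tfidf_simi: it rebuilds the dictionary and TF-IDF model from
# decode_rows_pickle() on every call and, per its docstring, may raise when the data are not
# aligned with num_features, so the illustrative call below is wrapped defensively.
def _example_tfidf_simi():
    try:
        pairs = gensim_tfidf_simi('如何申请退款', features=400, best=2)
        print(pairs)  # list of (document index, similarity score) pairs
    except Exception as err:
        print('tfidf similarity failed:', err)
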
def generate_answer_by_mode(question, keywords, mode='jieba'):
    if keywords == question and len(question) > 5:
        question_auto = generate_qa_data_by_question(question)
    else:
        question_auto = generate_qa_data_by_question(keywords, True)
    # Fall back to a fuzzy search on the full question when nothing was found
    if question_auto.count() < 1:
        question_auto = generate_qa_data_by_question(question, True)
    # Question-question similarity computation
    question_origin = question
    if mode == 'ltp':
        ques = ltp_nlp.generate_segment_after_remove_stop_words(question)
        ques = remove_special_tags(ques)
    elif mode == 'jieba':
        ques = jieba_nlp.generate_jieba_cut(question)
        ques = remove_special_tags(ques)
    for i in range(question_auto.count()):
        if question_auto[i].question:
            if mode == 'ltp':
                ques_ = ltp_nlp.generate_segment_after_remove_stop_words(
                    question_auto[i].question)
                ques_ = remove_special_tags(ques_)
            elif mode == 'jieba':
                ques_ = jieba_nlp.generate_jieba_cut(question_auto[i].question)
                ques_ = remove_special_tags(ques_)
            try:
                simi = agwv.generate_sets_simi(ques, ques_)
            except Exception:
                # If similarity fails, score exact matches 1 and everything else 0
                if question_origin == question_auto[i].question:
                    question_auto[i].simi = '1'
                else:
                    question_auto[i].simi = '0'
                continue
            else:
                question_auto[i].simi = str(simi)[0:5]
                del simi
                del ques_
    agwv.clear_mem()
    question_auto = sorted(question_auto, key=lambda qa: qa.simi, reverse=True)
    return question_auto

def simlarity_tfidf(sentence):
    fenci = list(jieba_nlp.generate_jieba_cut(sentence))
    dictionary = corpora.Dictionary.load(BASE_DIR + '/vendor/dataset/gensim/dict.txt')
    corpus = corpora.MmCorpus(BASE_DIR + '/vendor/dataset/gensim/corpuse.mm')  # load saved corpus
    tfidf = models.TfidfModel.load(BASE_DIR + "/vendor/dataset/gensim/data.tfidf")
    corpus_tfidf = tfidf[corpus]
    vec_bow = dictionary.doc2bow(fenci)
    vec_tfidf = tfidf[vec_bow]
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]
    similarity = list(sims)
    simi_sets = {}
    if similarity:
        rows = decode_rows_pickle()
        temp_simi = 0
        for i, simi in enumerate(similarity):
            # Keep the highest-scoring match seen so far
            if simi > temp_simi:
                simi_sets = {'index': i, 'simi': simi, 'text': rows[i]}
                temp_simi = simi
    return simi_sets

def siphon_sentence_patial(sent):
    """
    Sentence constituent extraction.
    :param sent:
    :return:
    """
    if sent:
        segment = jieba_model.generate_jieba_cut(sent)
        print(segment)
        segment = ' '.join(segment)
        ner = stanford_model.generate_ner(segment)
        return ner

def siphon_words_with_tags(sentence=False, keywords=None, tags=None):
    """
    Part-of-speech tagging; returns a dict mapping words to tags.
    :param sentence:
    :param keywords:
    :param tags:
    :return:
    """
    word_dict = {}
    if sentence is False and keywords:
        if isinstance(keywords, list):
            keywords = ' '.join(keywords)
        if tags is None:
            tags = generate_postag(keywords)
        if tags:
            word_tags = tags.split()
            i = 0
            for word_tag in word_tags:
                if word_tag.find("/") != -1:
                    temp = word_tag.split("/")
                    if temp[0] in word_dict:
                        # Duplicate word: append the tag (repeated) to keep keys unique
                        i += 1
                        if temp[0] + temp[1] in word_dict:
                            word_dict[temp[0] + temp[1] * i] = temp[1]
                        else:
                            word_dict[temp[0] + temp[1]] = temp[1]
                    else:
                        word_dict[temp[0]] = temp[1]
    elif sentence and not keywords:
        keywords = jieba_nlp.generate_jieba_cut(sentence, False, True)
        keywords = ' '.join(keywords)
        tags = generate_postag(keywords)
        if tags:
            word_tags = tags.split()
            i = 0
            for word_tag in word_tags:
                if word_tag.find("/") != -1:
                    temp = word_tag.split("/")
                    if temp[0] in word_dict:
                        i += 1
                        if temp[0] + temp[1] in word_dict:
                            word_dict[temp[0] + temp[1] * i] = temp[1]
                        else:
                            word_dict[temp[0] + temp[1]] = temp[1]
                    else:
                        word_dict[temp[0]] = temp[1]
    return word_dict

def init_corpus_model(rows):
    corpora_documents = []
    for row in rows:
        item = list(jieba_nlp.generate_jieba_cut(row))
        corpora_documents.append(item)
    dictionary = corpora.Dictionary(corpora_documents)
    # save dictionary
    dictionary.save(BASE_DIR + '/vendor/dataset/gensim/dict.txt')
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    # save corpus
    corpora.MmCorpus.serialize(BASE_DIR + '/vendor/dataset/gensim/corpuse.mm', corpus)
    tfidf_model = models.TfidfModel(corpus)
    # save model
    tfidf_model.save(BASE_DIR + "/vendor/dataset/gensim/data.tfidf")
    corpus_tfidf = tfidf_model[corpus]
    return corpus_tfidf

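# Hedged sketch of the intended build-then-query flow: init_corpus_model persists the
# dictionary (dict.txt), corpus (corpuse.mm) and TF-IDF model (data.tfidf) that
# simlarity_tfidf and gensim_lsi_simi later load; it assumes decode_rows_pickle() returns
# the same rows the query functions index into.
def _example_build_and_query():
    rows = decode_rows_pickle()
    init_corpus_model(rows)                 # build and save the gensim artifacts
    best = simlarity_tfidf('如何申请退款')    # query against the freshly built index
    print(best)
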
def batch_split_with_jieba(file_path, output=None):
    f1 = open(file_path, encoding="utf8")
    if output is None:
        output = file_path + '.jieba.txt'
    f2 = open(output, 'a', encoding="utf8")
    lines = f1.readlines()
    for line in lines:
        if line.strip() == '':
            continue
        line = line.replace('\t', '').replace('\n', '').replace(' ', '')
        seg_list = jieba_nlp.generate_jieba_cut(line, False, False)
        str_ = " ".join(seg_list)
        # Strip punctuation from the segmented output
        str_ = str_.replace(':', '').replace(':', '').replace('、', '').replace(
            '。', '').replace(',', '').replace('(', '').replace(')', '').replace(
            ';', '').replace('《', '').replace('》', '').replace('/', '')
        f2.write(str_ + '\n')
    f1.close()
    f2.close()

def siphon_words_with_tags(sentence, keywords=None):
    word_dict = {}
    if keywords is None:
        keywords_ = jieba_nlp.generate_jieba_cut(sentence, False, True)
        keywords_ = " ".join(keywords_)
        keywords = keywords_.split()
    postag = generate_postag(sentence, keywords)
    i = 0
    for keyword, pos in zip(keywords, postag):
        if keyword and pos:
            keyword = chinese_grammar.predicate_transfer(keyword)
            if keyword in word_dict:
                # Duplicate word: append the tag (repeated) to keep keys unique
                i += 1
                if keyword + pos in word_dict:
                    word_dict[keyword + pos * i] = pos
                else:
                    word_dict[keyword + pos] = pos
            else:
                word_dict[keyword] = pos
    return word_dict

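# Hedged example of the word/POS mapping built above; the sentence and the exact tags are
# illustrative only, since they depend on the segmenter and POS tagger actually loaded.
def _example_words_with_tags():
    tags = siphon_words_with_tags('我想查询订单状态')
    # Roughly of the form {'我': 'r', '查询': 'v', '订单': 'n', ...}; repeated words get the
    # POS tag appended to the key so dictionary keys stay unique.
    print(tags)
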
def gensim_lsi_simi(sentence, features=600, best=2):
    """
    Text similarity matching; scales to larger corpora.
    :param sentence:
    :param features:
    :param best:
    :return:
    """
    dictionary = corpora.Dictionary.load(BASE_DIR + '/vendor/dataset/gensim/dict.txt')
    corpus = corpora.MmCorpus(BASE_DIR + '/vendor/dataset/gensim/corpuse.mm')
    tfidf_model = models.TfidfModel.load(BASE_DIR + "/vendor/dataset/gensim/data.tfidf")
    corpus_tfidf = tfidf_model[corpus]
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-LSI-index',
        corpus_lsi, num_features=features, num_best=best)
    test_cut_raw = list(jieba_nlp.generate_jieba_cut(sentence))  # 1. segment the query
    test_corpus = dictionary.doc2bow(test_cut_raw)               # 2. convert to a bag-of-words vector
    test_corpus_tfidf = tfidf_model[test_corpus]                 # 3. compute TF-IDF weights
    test_corpus_lsi = lsi[test_corpus_tfidf]                     # 4. project into LSI space
    # lsi.add_documents(test_corpus_lsi)  # update the LSI model
    simi_sets = similarity_lsi[test_corpus_lsi]
    response = {}
    if simi_sets:
        rows = decode_rows_pickle()
        temp_simi = 0
        for simi_set in simi_sets:
            # Keep the highest-scoring (index, score) pair
            if simi_set[1] > temp_simi:
                response['index'] = simi_set[0]
                response['simi'] = simi_set[1]
                response['text'] = rows[simi_set[0]]
                temp_simi = simi_set[1]
    return response

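# Hedged sketch of gensim_lsi_simi's return shape: a dict with the best-matching row's
# index, score and text, or {} when nothing matches. It assumes init_corpus_model has
# already been run so dict.txt, corpuse.mm and data.tfidf exist on disk.
def _example_lsi_query():
    match = gensim_lsi_simi('怎样修改收货地址', features=600, best=2)
    if match:
        print(match['index'], match['simi'], match['text'])
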
def generate_response(main_sent, keywords, ners, relations, type):
    """
    Answer extraction.
    :param main_sent:
    :param keywords:
    :param ners:
    :param relations:
    :param type:
    :return:
    Step 1: exact matching against FAQ topic words.
    Step 2: score the matched candidate question-answer pairs.
    If ...
    """
    main_keywords_set = ''
    best_answer = None
    if main_sent:
        main_keywords_set = jieba_nlp.generate_jieba_cut(main_sent)
        main_keywords_set = ' '.join(main_keywords_set)
        main_keywords_set = main_keywords_set.split()
    synonyms_words = mgaics.siphon_synonyms_words(main_keywords_set)

    def score_candidates(sets):
        """Score each candidate question against the main sentence keywords."""
        for i in range(len(list(sets))):
            if sets[i].question:
                ques_ = mgaics.remove_partial_and_special(sets[i].question)
                try:
                    ques_keywords_set = jieba_nlp.generate_jieba_cut(ques_)
                    ques_keywords_set = ' '.join(ques_keywords_set)
                    ques_keywords_set = ques_keywords_set.split()
                    ques_keywords_set = mgaics.replace_synonyms_words(
                        main_keywords_set, ques_keywords_set, synonyms_words)
                    simi = agwv.generate_sets_simi(ques_keywords_set,
                                                   main_keywords_set)
                except Exception:
                    # If similarity fails, score exact matches 1 and everything else 0
                    sets[i].simi = '1' if main_sent == ques_ else '0'
                    continue
                else:
                    sets[i].simi = str(simi)[0:5]
                    del simi
                    del ques_
                agwv.clear_mem()
        agwv.clear_mem()
        return sorted(sets, key=lambda qa: qa.simi, reverse=True)

    if keywords:
        # Exact match when the keywords equal the main sentence, fuzzy (LIKE) otherwise
        like_is = main_sent != keywords
        sets = fetch_sets_by_words(keywords, like_is)
        if sets:
            sets = score_candidates(sets)
            best_answer = generate_best_answer(sets, main_sent)
    # Fall back to a named-entity search when no answer was found via keywords
    if best_answer is None and ners:
        sets = fetch_sets_by_words(ners, False)
        if sets:
            sets = score_candidates(sets)
            best_answer = generate_best_answer(sets, main_sent)
    return best_answer