# The gensim names below come from this import; jieba_nlp, ltp_nlp, jieba_model,
# stanford_model, agwv, mgaics, BASE_DIR, decode_rows_pickle and the other
# project helpers referenced in these examples are assumed to be defined
# elsewhere in the source repository.
from gensim import corpora, models, similarities


def siphon_best_match_from_set(sentence, text_set):
    corpora_documents = []
    for item_text in text_set:
        item_seg = list(jieba_nlp.generate_jieba_cut(item_text))
        corpora_documents.append(item_seg)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    similarity = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-tfidf-index',
        corpus_tfidf,
        num_features=600)
    test_cut_raw_1 = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    similarity.num_best = 2
    test_corpus_tfidf_1 = tfidf_model[test_corpus_1]
    tfidf_simi = similarity[test_corpus_tfidf_1]
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-LSI-index',
        corpus_lsi,
        num_features=400,
        num_best=2)
    test_cut_raw_3 = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus_3 = dictionary.doc2bow(test_cut_raw_3)
    test_corpus_tfidf_3 = tfidf_model[test_corpus_3]
    test_corpus_lsi_3 = lsi[test_corpus_tfidf_3]
    # lsi.add_documents(test_corpus_lsi_3)  # update the LSI model with new documents
    lsi_simi = similarity_lsi[test_corpus_lsi_3]
    return {'tfidf': tfidf_simi, 'lsi_simi': lsi_simi}
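# Usage sketch (illustrative, not from the original source): because num_best=2
# is set on both indexes, each returned value is a list of (document_index,
# similarity) pairs, possibly empty when the query shares no terms with the set.
#
#     result = siphon_best_match_from_set(query_sentence, candidate_texts)
#     if result['tfidf']:
#         best_index, best_score = result['tfidf'][0]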
def gensim_tfidf_simi(sentence, features=400, best=2):
    """
    文本相似度匹配,存在未数据未对齐异常
    :param sentence:
    :param features:
    :param best:
    :return:
    """
    rows = decode_rows_pickle()
    corpora_documents = []
    for row in rows:
        item = list(jieba_nlp.generate_jieba_cut(row))
        corpora_documents.append(item)
    dictionary = corpora.Dictionary(corpora_documents)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    similarity = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-tfidf-index',
        corpus_tfidf,
        num_features=features,
        num_best=best)
    cut_raw = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus = dictionary.doc2bow(cut_raw)
    test_corpus_tfidf = tfidf_model[test_corpus]
    return similarity[test_corpus_tfidf]
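# A minimal, self-contained sketch (added for illustration, not from the original
# source) of the same TF-IDF matching pipeline on toy data. Assumptions: gensim
# is installed, plain whitespace tokenization stands in for jieba, and an
# in-memory MatrixSimilarity replaces the on-disk Similarity index.
def _tfidf_simi_demo(query='reset password'):
    from gensim import corpora, models, similarities
    docs = ['open a new account', 'reset my password', 'close the account']
    tokens = [doc.split() for doc in docs]
    dictionary = corpora.Dictionary(tokens)  # token -> id mapping
    bow_corpus = [dictionary.doc2bow(t) for t in tokens]  # bag-of-words vectors
    tfidf = models.TfidfModel(bow_corpus)  # learn IDF weights from the corpus
    index = similarities.MatrixSimilarity(tfidf[bow_corpus],
                                          num_features=len(dictionary))
    query_tfidf = tfidf[dictionary.doc2bow(query.split())]
    # One cosine similarity per corpus document; the argmax is the best match.
    return list(index[query_tfidf])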
# Example #3
def generate_answer_by_mode(question, keywords, mode='jieba'):
    if keywords == question and len(question) > 5:
        question_auto = generate_qa_data_by_question(question)
    else:
        question_auto = generate_qa_data_by_question(keywords, True)
    # If the query returns no results
    if question_auto.count() < 1:
        question_auto = generate_qa_data_by_question(question, True)
    # Question-answer similarity computation
    question_origin = question
    if mode == 'ltp':
        ques = ltp_nlp.generate_segment_after_remove_stop_words(question)
        ques = remove_special_tags(ques)
    elif mode == 'jieba':
        ques = jieba_nlp.generate_jieba_cut(question)
        ques = remove_special_tags(ques)
    # ques = model.nlp_jieba_model.generate_jieba_cut(question)
    # ques = remove_special_tags(ques)
    for i in range(question_auto.count()):
        if question_auto[i].question:
            if mode == 'ltp':
                ques_ = ltp_nlp.generate_segment_after_remove_stop_words(
                    question_auto[i].question)
                ques_ = remove_special_tags(ques_)
            elif mode == 'jieba':
                ques_ = jieba_nlp.generate_jieba_cut(question_auto[i].question)
                ques_ = remove_special_tags(ques_)
            # ques_ = model.nlp_jieba_model.generate_jieba_cut(question_auto[i].question)
            # ques_ = remove_special_tags(ques_)
            # print(ques_)
            try:
                simi = agwv.generate_sets_simi(ques, ques_)
            # simi = model.generate_word_vector.generate_sets_simi_by_self(ques, ques_)
            except Exception as error:
                # print(error)
                # words = str(error).split('\'')
                if question_origin == question_auto[i].question:
                    question_auto[i].simi = '1'
                else:
                    question_auto[i].simi = '0'
                continue
            # raise Exception("Exception:", error)
            else:
                question_auto[i].simi = str(simi)[0:5]
                del simi
                del ques_
    agwv.clear_mem()
    question_auto = sorted(question_auto, key=lambda qa: qa.simi, reverse=True)
    return question_auto
def simlarity_tfidf(sentence):
    fenci = list(jieba_nlp.generate_jieba_cut(sentence))
    dictionary = corpora.Dictionary.load(BASE_DIR +
                                         '/vendor/dataset/gensim/dict.txt')
    corpus = corpora.MmCorpus(BASE_DIR +
                              '/vendor/dataset/gensim/corpuse.mm')  # load the saved corpus
    tfidf = models.TfidfModel.load(BASE_DIR +
                                   "/vendor/dataset/gensim/data.tfidf")
    corpus_tfidf = tfidf[corpus]
    vec_bow = dictionary.doc2bow(fenci)
    vec_tfidf = tfidf[vec_bow]
    index = similarities.MatrixSimilarity(corpus_tfidf)
    sims = index[vec_tfidf]
    similarity = list(sims)
    simi_sets = {}
    if similarity:
        rows = decode_rows_pickle()
        temp_simi = 0
        # keep only the highest-scoring document seen so far
        for i, simi in enumerate(similarity):
            if simi > temp_simi:
                simi_sets = {'index': i, 'simi': simi, 'text': rows[i]}
                temp_simi = simi
    return simi_sets
# Example #5
def siphon_sentence_patial(sent):
    """
    Sentence constituent extraction
    :param sent:
    :return:
    """
    ner = None  # avoid an unbound reference when sent is empty
    if sent:
        segment = jieba_model.generate_jieba_cut(sent)
        print(segment)
        segment = ' '.join(segment)
        ner = stanford_model.generate_ner(segment)
    return ner
def siphon_words_with_tags(sentence=False, keywords=None, tags=None):
    """
    词性标注,输出dict
    :param sentence:
    :param keywords:
    :param tags:
    :return:
    """
    word_dict = {}
    if sentence is False and keywords:
        if isinstance(keywords, list):
            keywords = ' '.join(keywords)
        if tags is None:
            tags = generate_postag(keywords)
        if tags:
            word_tags = tags.split()
            i = 0
            for word_tag in word_tags:
                if word_tag.find("/") != -1:
                    temp = word_tag.split("/")
                    if temp[0] in word_dict:
                        i += 1
                        if temp[0] + temp[1] in word_dict:
                            word_dict[temp[0] + temp[1] * i] = temp[1]
                        else:
                            word_dict[temp[0] + temp[1]] = temp[1]
                    else:
                        word_dict[temp[0]] = temp[1]
    elif sentence and keywords is False:
        keywords = jieba_nlp.generate_jieba_cut(sentence, False, True)
        keywords = ' '.join(keywords)
        tags = generate_postag(keywords)
        if tags:
            word_tags = tags.split()
            i = 0
            for word_tag in word_tags:
                if word_tag.find("/") != -1:
                    temp = word_tag.split("/")
                    if temp[0] in word_dict:
                        i += 1
                        if temp[0] + temp[1] in word_dict:
                            word_dict[temp[0] + temp[1] * i] = temp[1]
                        else:
                            word_dict[temp[0] + temp[1]] = temp[1]
                    else:
                        word_dict[temp[0]] = temp[1]
    return word_dict
def init_corpus_model(rows):
    corpora_documents = []
    for row in rows:
        item = list(jieba_nlp.generate_jieba_cut(row))
        corpora_documents.append(item)
    dictionary = corpora.Dictionary(corpora_documents)
    # save dictionary
    dictionary.save(BASE_DIR + '/vendor/dataset/gensim/dict.txt')
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    # save corpus
    corpora.MmCorpus.serialize(BASE_DIR + '/vendor/dataset/gensim/corpuse.mm',
                               corpus)
    tfidf_model = models.TfidfModel(corpus)
    # save model
    tfidf_model.save(BASE_DIR + "/vendor/dataset/gensim/data.tfidf")
    corpus_tfidf = tfidf_model[corpus]
    return corpus_tfidf
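# Usage note (added, not from the original source): init_corpus_model() is the
# offline step that persists dict.txt, corpuse.mm and data.tfidf, which
# simlarity_tfidf() and gensim_lsi_simi() in this module reload at query time.
# Assuming decode_rows_pickle() yields the same rows used elsewhere:
#
#     init_corpus_model(decode_rows_pickle())  # rebuild after the FAQ data changes
#     simlarity_tfidf(sentence)                # then query against the saved models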
def batch_split_with_jieba(file_path, output=None):
    f1 = open(file_path, encoding="utf8")
    if output is None:
        output = file_path + '.jieba.txt'
    f2 = open(output, 'a', encoding="utf8")
    lines = f1.readlines()
    for line in lines:
        if line.strip() == '':
            continue
        line = line.replace('\t', '').replace('\n', '').replace(' ', '')
        seg_list = jieba_nlp.generate_jieba_cut(line, False, False)
        str_ = " ".join(seg_list)
        str_ = str_.replace(':', '').replace(':', '').replace('、', '').replace(
            '。',
            '').replace(',', '').replace('(', '').replace(')', '').replace(
                ';', '').replace('《', '').replace('》', '').replace('/', '')
        f2.write(str_)
    f1.close()
    f2.close()
# Example #9
def siphon_words_with_tags(sentence, keywords=None):
    """POS-tag the sentence (or the given keywords) and return a word -> tag dict."""
    word_dict = {}
    if keywords is None:
        keywords_ = jieba_nlp.generate_jieba_cut(sentence, False, True)
        keywords_ = " ".join(keywords_)
        keywords = keywords_.split()
    postag = generate_postag(sentence, keywords)
    i = 0
    for keyword, pos in zip(keywords, postag):
        if keyword and pos:
            keyword = chinese_grammar.predicate_transfer(keyword)
            if keyword in word_dict:
                i += 1
                if keyword + pos in word_dict:
                    word_dict[keyword + pos * i] = pos
                else:
                    word_dict[keyword + pos] = pos
            else:
                word_dict[keyword] = pos
    return word_dict
def gensim_lsi_simi(sentence, features=600, best=2):
    """
    文本相似度匹配,支持大数据
    :param sentence:
    :param features:
    :param best:
    :return:
    """
    dictionary = corpora.Dictionary.load(BASE_DIR +
                                         '/vendor/dataset/gensim/dict.txt')
    corpus = corpora.MmCorpus(BASE_DIR + '/vendor/dataset/gensim/corpuse.mm')
    tfidf_model = models.TfidfModel.load(BASE_DIR +
                                         "/vendor/dataset/gensim/data.tfidf")
    corpus_tfidf = tfidf_model[corpus]
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-LSI-index',
        corpus_lsi,
        num_features=features,
        num_best=best)
    test_cut_raw = list(jieba_nlp.generate_jieba_cut(sentence))  # 1. segment the query
    test_corpus = dictionary.doc2bow(test_cut_raw)  # 2. convert to a bag-of-words vector
    test_corpus_tfidf = tfidf_model[test_corpus]  # 3. compute TF-IDF weights
    test_corpus_lsi = lsi[test_corpus_tfidf]  # 4. project into the LSI space
    # lsi.add_documents(test_corpus_lsi_3)  # update the LSI model with new documents
    simi_sets = similarity_lsi[test_corpus_lsi]
    response = {}
    if simi_sets:
        rows = decode_rows_pickle()
        temp_simi = 0
        for simi_set in simi_sets:
            if simi_set[1] > temp_simi:
                response['index'] = simi_set[0]
                response['simi'] = simi_set[1]
                response['text'] = rows[simi_set[0]]
                temp_simi = simi_set[1]
    return response
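# Return-shape note (added): gensim_lsi_simi() returns an empty dict when no
# document scores above zero, otherwise a dict describing the single best match,
# e.g. {'index': <row number>, 'simi': <score>, 'text': <matching row>}.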
# Example #11
def generate_response(main_sent, keywords, ners, relations, type):
    """
    答案抽取
    :param main_sent:
    :param keywords:
    :param ners:
    :param relations:
    :param type:
    :return:
    第一步:FAQ主题词精确匹配
    第二步:匹配中的候选问答评分
    若
    """
    main_keywords_set = ''
    best_answer = ''
    if main_sent:
        main_keywords_set = jieba_nlp.generate_jieba_cut(main_sent)
        main_keywords_set = ' '.join(main_keywords_set)
        main_keywords_set = main_keywords_set.split()
        synonyms_words = mgaics.siphon_synonyms_words(main_keywords_set)
    if keywords:
        like_is = True
        if main_sent == keywords:
            like_is = False
        sets = fetch_sets_by_words(keywords, like_is)
        if sets:
            for i in range(len(list(sets))):
                if sets[i].question:
                    ques_ = mgaics.remove_partial_and_special(sets[i].question)
                    try:
                        ques_keywords_set = jieba_nlp.generate_jieba_cut(ques_)
                        ques_keywords_set = ' '.join(ques_keywords_set)
                        ques_keywords_set = ques_keywords_set.split()
                        ques_keywords_set = mgaics.replace_synonyms_words(
                            main_keywords_set, ques_keywords_set,
                            synonyms_words)
                        simi = agwv.generate_sets_simi(ques_keywords_set,
                                                       main_keywords_set)
                    except Exception as error:
                        if main_sent == ques_:
                            sets[i].simi = '1'
                        else:
                            sets[i].simi = '0'
                        continue
                    else:
                        sets[i].simi = str(simi)[0:5]
                        del simi
                        del ques_
                        agwv.clear_mem()
                        # print(sets[i].question, sets[i].simi)
            agwv.clear_mem()
            sets = sorted(sets, key=lambda qa: qa.simi, reverse=True)
            best_answer = generate_best_answer(sets, main_sent)
    if not best_answer and ners:
        sets = fetch_sets_by_words(ners, False)
        if sets:
            for i in range(len(list(sets))):
                if sets[i].question:
                    ques_ = mgaics.remove_partial_and_special(sets[i].question)
                    try:
                        ques_keywords_set = jieba_nlp.generate_jieba_cut(ques_)
                        ques_keywords_set = ' '.join(ques_keywords_set)
                        ques_keywords_set = ques_keywords_set.split()
                        ques_keywords_set = mgaics.replace_synonyms_words(
                            main_keywords_set, ques_keywords_set,
                            synonyms_words)
                        simi = agwv.generate_sets_simi(ques_keywords_set,
                                                       main_keywords_set)
                    except Exception as error:
                        if main_sent == ques_:
                            sets[i].simi = '1'
                        else:
                            sets[i].simi = '0'
                        continue
                    else:
                        sets[i].simi = str(simi)[0:5]
                        del simi
                        del ques_
                        agwv.clear_mem()
                        # print(sets[i].question, sets[i].simi)
            agwv.clear_mem()
            sets = sorted(sets, key=lambda qa: qa.simi, reverse=True)
            best_answer = generate_best_answer(sets, main_sent)
    return best_answer
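# Flow summary (added comment, not from the original source): generate_response()
# retrieves FAQ candidates by keyword match, scores each candidate question
# against main_sent with word-vector similarity (falling back to '1'/'0' when the
# similarity computation raises), sorts the candidates and picks a best answer,
# and only if that fails repeats the same scoring over candidates retrieved by
# the named entities in `ners`.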