def answer_selection_by_attextract_TFIDF(seg_list, attri_value):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching. Using the segmented question as the vocabulary, build a TF-IDF
        vector for every candidate sentence, sum each sentence vector into a single score,
        rank the sentences by that score, and return the largest.
    Known issues:
        1) a similar-word matching mechanism should be added.
    Observed effect: unsatisfactory, because seg_list contains too few tokens, so every
        matching sentence ends up with a weight of 1.0.
    :param seg_list: word segmentation result of the question
    :param attri_value: the full text of all attributes in which to look for an answer
    :return: the highest-scoring sentence and its score
    '''
    logging.info("seg_list: " + '/'.join([str(seg) for seg in seg_list]))
    seg_corpus = [' '.join(seg_list)]
    sentences = [str(sen.strip()) for sen in attri_value.split('。')]
    attri_corpus = []
    for sentence in sentences:
        if sentence != '':
            all_attri_wordlist = serviceQA.segment(sentence)
            attri_wordlist = [str(word.word) for word in all_attri_wordlist]
            attri_corpus.append(' '.join(attri_wordlist))
            # Log the segmentation and matching results.
            logging.info("sentence: " + sentence + ", attri_wordlist: " +
                         '/'.join([str(seg) for seg in attri_wordlist]))
    counter = CountVectorizer()
    counter.fit(seg_corpus)
    logging.info("CountVectorizer vocabulary: ")
    logging.info(json.dumps(counter.vocabulary_, ensure_ascii=False))
    counts = counter.transform(attri_corpus)
    logging.info("count vectors: ")
    logging.info(counts.toarray())
    tfidfer = TfidfTransformer()
    tfidf = tfidfer.fit_transform(counts)
    logging.info("TF-IDF vectors: ")
    logging.info(tfidf.toarray())
    max_sentence = ''
    max_score = 0.0
    for index, vector in enumerate(tfidf.toarray()):
        score = 0.0  # a local name; the original `sum` shadowed the builtin
        for num in vector:
            score += float('%.5f' % num)  # round to 5 decimals before summing
        if max_score < score:
            max_sentence = sentences[index]
            max_score = score
    for head_word in delete_head_words:
        if head_word in max_sentence:
            max_sentence = max_sentence.replace(head_word, '')
    logging.info('answer:' + max_sentence)
    logging.info('point:')
    logging.info(max_score)
    return {'answer': max_sentence.encode('unicode-escape'), 'point': max_score}
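# Illustrative sketch (not part of the original pipeline): the scoring scheme above in
# isolation. Toy English tokens stand in for serviceQA.segment output. With the question
# tokens as the whole vocabulary, every matching sentence collapses to a similar TF-IDF
# sum, which is the degeneracy noted in the docstring above.
def _demo_tfidf_sum_scoring():
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    question_tokens = ['rate', 'deposit']            # plays the role of seg_list
    candidates = ['deposit rate is announced monthly',
                  'the branch opens at nine']        # pre-segmented candidate sentences
    counter = CountVectorizer()
    counter.fit([' '.join(question_tokens)])         # vocabulary = question tokens only
    tfidf = TfidfTransformer().fit_transform(counter.transform(candidates))
    scores = tfidf.toarray().sum(axis=1)             # one score per sentence
    return candidates[scores.argmax()], scores.max()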
def answer_selection_by_strmatch_set(seg_list, attri_value):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching. Sentences are ranked by the number of shared tokens they contain;
        repeated occurrences of the same token count once.
        (An illustrative comparison of this score and the count-based one appears after
        answer_selection_by_strmatch below.)
    Rule: each attribute is split into sentences on '。', and one answer is extracted per attribute.
    Known issue: punctuation marks are also matched.
    :param seg_list: word segmentation result of the question
    :param attri_value: the full attribute text in which to look for an answer
    :return: the highest-scoring sentence and its score
    '''
    logging.info("seg_list: " + '/'.join([str(seg) for seg in seg_list]))
    sentences = [str(sen.strip()) for sen in attri_value.split('。')]
    max_sentence = ''
    max_score = 0
    for sentence in sentences:
        common_words = set()
        if sentence != '':
            all_attri_wordlist = serviceQA.segment(sentence)
            attri_wordlist = [str(word.word) for word in all_attri_wordlist]
            for q_word in seg_list:
                if q_word in attri_wordlist:
                    common_words.add(q_word)
            # Log the segmentation and matching results.
            logging.info("sentence: " + sentence + ", attri_wordlist: " +
                         '/'.join([str(seg) for seg in attri_wordlist]))
            logging.info("common words num: " + str(len(common_words)))
        if max_score < len(common_words):
            max_score = len(common_words)
            max_sentence = sentence
    logging.info("max_sentence: " + max_sentence + ", max_score: " + str(max_score))
    return {'answer': max_sentence.encode('unicode-escape'), 'point': max_score * 0.1}
def answer_selection_by_strmatch(seg_list, attri_value):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching. Sentences are ranked by the total number of matching token
        occurrences; repeated occurrences of the same token are counted every time.
    Rule: each attribute is split into sentences on '。', and one answer is extracted per attribute.
    Known issue: unimportant tokens may occur many times and hurt precision.
    :param seg_list: word segmentation result of the question
    :param attri_value: the full attribute text in which to look for an answer
    :return: the highest-scoring sentence and its score
    '''
    # logging.info("seg_list: " + '/'.join([str(seg) for seg in seg_list]))
    sentences = [str(sen.strip()) for sen in attri_value.split('。')]
    max_sentence = ''
    max_score = 0
    for sentence in sentences:
        if sentence != '':
            all_attri_wordlist = serviceQA.segment(sentence)
            attri_wordlist = [str(word.word) for word in all_attri_wordlist]
            count = 0
            for q_word in seg_list:
                count += attri_wordlist.count(q_word)  # every occurrence counts
            # Log the segmentation and matching results.
            # logging.info("sentence: " + sentence + ", attri_wordlist: " +
            #              '/'.join([str(seg) for seg in attri_wordlist]))
            # logging.info("common words num: " + str(count))
            if max_score < count:
                max_score = count
                max_sentence = sentence
    # logging.info("max_sentence: " + max_sentence + ", max_score: " + str(max_score))
    return {'answer': max_sentence.encode('unicode-escape'), 'point': max_score * 0.1}
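# Illustrative sketch (not part of the original pipeline): the difference between the two
# overlap scores above, on toy token lists. The set variant (answer_selection_by_strmatch_set)
# counts each shared token once; the count variant (answer_selection_by_strmatch) counts
# every matching (question token, sentence token) pair.
def _demo_overlap_scores():
    question_tokens = ['rate', 'rate', 'deposit']
    sentence_tokens = ['the', 'rate', 'of', 'rate', 'change']
    distinct = len(set(question_tokens) & set(sentence_tokens))     # 1 ('rate')
    pairs = sum(sentence_tokens.count(q) for q in question_tokens)  # 2 + 2 + 0 = 4
    return distinct * 0.1, pairs * 0.1  # both functions scale the raw count by 0.1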
def answer_selection_by_attextract_TFIDF_allAttribute(seg_list, attri_value_list, answer_num=3, threshold=0.5):
    '''
    description: extract the sentences most similar to the question from the attributes as the answer.
    Approach: string matching. Every sentence of every attribute forms the corpus for the
        vocabulary; a TF-IDF vector is built for each sentence, and the weights of the entries
        that correspond to tokens in seg_list are summed into the sentence's final score.
    Threshold: the top `answer_num` sentences are returned.
    Known issues:
        1) a similar-word matching mechanism should be added.
    Observed effect: still some errors, but better than the methods above even without
        similar-word matching.
    :param seg_list: word segmentation result of the question
    :param attri_value_list: the full attribute texts in which to look for an answer
    :param answer_num: number of answers to return, default 3
    :param threshold: minimum weight for a sentence to be selected, default 0.5
    :return: the highest-scoring sentences and their score
    '''
    id2sentences = {}
    sentence2id = {}
    sentences = []
    original_sentence = ''.join([str(seg) for seg in seg_list])
    # t encodes the temporal direction of the question: t == 0 asks about what comes after,
    # t == 1 about what comes before, t == 2 about the current step; -1 means no temporal cue.
    t = -1
    for m in range(len(time_list)):
        for cue in time_list[m]:
            if original_sentence.find(cue) != -1:
                t = m
                break
    for i in range(len(attri_value_list)):
        id2sentences[i] = attri_value_list[i]
    for i in range(len(id2sentences)):
        id2sentences[i] = sequence_extract(id2sentences[i])
    for i in id2sentences:
        for sentence in id2sentences[i]:
            sentence2id[sentence] = i
            sentences.append(sentence)
    logging.info("seg_list: " + '/'.join([str(seg) for seg in seg_list]))
    attri_corpus = []
    for sentence in sentences:
        if sentence != '':
            all_attri_wordlist = serviceQA.segment(sentence)
            attri_wordlist = [str(word.word) for word in all_attri_wordlist]
            attri_corpus.append(' '.join(attri_wordlist))
    counter = CountVectorizer(lowercase=False)
    counter.fit(attri_corpus)
    counts = counter.transform(attri_corpus)
    tfidfer = TfidfTransformer()
    tfidf = tfidfer.fit_transform(counts)
    word = counter.get_feature_names()  # all words in the bag-of-words vocabulary
    weight = tfidf.toarray()  # weight[i][j] is the TF-IDF weight of word j in sentence i
    sentence_and_weight = []
    for i in range(len(weight)):
        sentence_weight = 0.0
        for j in range(len(word)):
            if word[j] in seg_list:  # a token we care about
                sentence_weight += weight[i][j]
        sentence_and_weight.append((i, sentence_weight))
    sentence_and_weight.sort(key=lambda x: x[1], reverse=True)  # sort by weight, descending
    max_score = 0.0
    result_sentence = ''
    if len(sentence_and_weight) >= answer_num:
        for i in range(answer_num):
            sentence = sentences[sentence_and_weight[i][0]]
            if t == 2:
                max_sentence = sentence
            elif t == 0:
                sent_id = sentence2id[sentence]
                sentences_list = id2sentences[sent_id]
                no = sentences_list.index(sentence)
                max_sentence = ''
                if no < len(sentences_list) - 1:
                    # Use a separate inner index so the outer loop counter is preserved.
                    for k in range(no + 1, len(sentences_list)):
                        max_sentence += sentences_list[k]
                else:
                    max_sentence = u'已经是办理该业务的最后一步!'  # "already the last step of this procedure"
            elif t == 1:
                sent_id = sentence2id[sentence]
                sentences_list = id2sentences[sent_id]
                no = sentences_list.index(sentence)
                if no != 0:
                    max_sentence = ''
                    for k in range(0, no):
                        max_sentence += sentences_list[k]
                else:
                    max_sentence = u'已经是办理该业务的第一步!'  # "already the first step of this procedure"
            else:
                max_sentence = sentence
            if sentence_and_weight[i][1] > threshold:
                result_sentence += "候选答案" + ": \t" + max_sentence + '\t\n'  # "candidate answer"
    elif len(sentence_and_weight) > 0:
        for i in range(len(sentence_and_weight)):
            sentence = sentences[sentence_and_weight[i][0]]
            if t == 2:
                max_sentence = sentence
            elif t == 0:
                sent_id = sentence2id[sentence]
                sentences_list = id2sentences[sent_id]
                no = sentences_list.index(sentence)
                if no < len(sentences_list) - 1:
                    max_sentence = ''
                    for k in range(no + 1, len(sentences_list)):
                        max_sentence += sentences_list[k]
                else:
                    max_sentence = u'已经是办理该业务的最后一步!'
            elif t == 1:
                sent_id = sentence2id[sentence]
                sentences_list = id2sentences[sent_id]
                no = sentences_list.index(sentence)
                if no != 0:
                    max_sentence = ''
                    for k in range(0, no):
                        max_sentence += sentences_list[k]
                else:
                    max_sentence = u'已经是办理该业务的第一步!'
            else:
                sent_id = sentence2id[sentence]
                sentences_list = id2sentences[sent_id]
                max_sentence = ''  # must be reset before concatenating the whole sequence
                for sen in sentences_list:
                    max_sentence += sen
            if sentence_and_weight[i][1] > threshold:
                result_sentence += "候选答案" + ": \t" + max_sentence + '\t\n'
    if result_sentence != '':
        for head_word in delete_head_words:
            if head_word in result_sentence:
                result_sentence = result_sentence.replace(head_word, '')
        max_score = sentence_and_weight[0][1]
    logging.info('answer:' + result_sentence)
    logging.info('point:')
    logging.info(max_score)
    return {'answer': result_sentence.encode('unicode-escape'), 'point': max_score}
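# Illustrative sketch (not part of the original pipeline): the t-based slicing used above.
# `steps` stands in for one id2sentences entry, i.e. an ordered step list produced by
# sequence_extract.
def _demo_time_direction_slicing():
    steps = ['step A。', 'step B。', 'step C。']
    no = steps.index('step B。')
    answer_after = ''.join(steps[no + 1:])   # t == 0: the steps after the match
    answer_before = ''.join(steps[:no])      # t == 1: the steps before the match
    return answer_after, answer_before       # ('step C。', 'step A。')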
def knowledge_graph(question, neoid=None, autopick=False):  # autopick: enable automatic entity selection
    # If an entity has already been chosen, return the entity-retrieval result directly.
    if neoid is not None:
        return decorate(neoid, style='BASIC')
    question = question.strip()  # strip() returns a new string; the result must be reassigned
    switch = any(num in question for num in num_list)
    for queryword in queryword_list:
        if queryword in question:
            question = question.replace(queryword, '')
    # Comparative questions, shaped like "X比Y高/低" ("is X higher/lower than Y").
    pattern = r'^.+比.+(高|低).*$'
    if re.search(pattern, question) is not None:
        seg_list = serviceQA.segment(question)
        seg_list_complete = [seg.word for seg in seg_list]
        relatedwords = [u'利率', u'产品利率', u'存款利率', u'贷款利率']  # rate-related words
        word_1, word_2 = '', ''
        for seg in seg_list_complete:
            if seg in namelist and seg_list_complete.index(seg) < seg_list_complete.index('比'):
                word_1 = seg
                continue
            if seg in namelist and seg_list_complete.index(seg) > seg_list_complete.index('比'):
                word_2 = seg
                break
        if len(owlNeo4j.get_entity_list_by_name(word_1)) > 0 and \
                len(owlNeo4j.get_entity_list_by_name(word_2)) > 0:
            word_1 = owlNeo4j.get_entity_list_by_name(word_1)[0]
            word_2 = owlNeo4j.get_entity_list_by_name(word_2)[0]
            for word in relatedwords:
                if word in word_1 and word in word_2:
                    return decorate(data='1', style='COM', question=question)
    # Query by category.
    if 'c::' in question:
        category = question.split('c::')[1].strip()
        for node in kb:
            for tag in node['taglist'].split(','):
                score = owlNeo4j.entity_similarity(category, tag)
                if category == tag or score >= 0.5:
                    return decorate('2', 'CAT', question=question)
    # Query by relation.
    if 'r::' in question:
        relation = question.split('r::')[1].strip()
        if relation.find('<') == -1:
            for link in links:
                score = serviceWord2vec.get_similarity(
                    list(jieba.cut(relation)), list(jieba.cut(link['name'])))
                if relation == link['name'] or score >= 0.6:
                    return decorate('3', 'LIN', question=question)
        else:
            return decorate('3', 'LIN', question=question)
    # Inductive questions: a rate word, a category word and a number word all appear.
    seg_list = serviceQA.segment(question)
    seg_words = [seg.word for seg in seg_list]
    rate_words = [u'利率', u'产品利率', u'存款利率', u'贷款利率']
    if any(w in rate_words for w in seg_words) and \
            any(w in catelist for w in seg_words) and \
            any(w in num_dict for w in seg_words):
        return decorate('4', 'IND', question=question)
    # Retrieval questions: a rate word, a category word, a comparison word
    # (higher/lower/equal) and a numeral all appear.
    if any(w in rate_words for w in seg_words) and \
            any(w in catelist for w in seg_words) and \
            any(w in [u'高于', u'低于', u'等于'] for w in seg_words) and \
            any(seg.flag == 'm' for seg in seg_list):  # 'm' is the numeral POS tag
        return decorate('5', 'RET', question=question)
    # Procedural (sequence) questions.
    pre = sequence_class.question_class(question)
    if pre == 1:
        result = serviceQA.autoseq(question)
        if result != 0:
            return decorate(result, style='QA')
    # Chinese question answering.
    qa_result = serviceQA.chinese_qa(question, switch)
    logging.info("qa_result:" + json.dumps(qa_result, encoding='utf-8', ensure_ascii=False))
    if qa_result is None:
        return None
    # Entity retrieval: a similar question exists for the (entity, relation) pair.
    if 'question' in qa_result:
        return decorate(qa_result['question'], style='QUE')
    if len(qa_result['path']) == 0:  # path is empty, i.e. no relation was found
        if autopick or (len(qa_result['ents']) == 1):  # autopick enabled, or only one entity
            return decorate(qa_result['ents'][0]['neoId'], style='BASIC')
        else:  # multiple entities and autopick disabled
            return decorate(qa_result['ents'], style='SNET')
    else:
        if qa_result['ents'][0]['neoId'] is None:
            return decorate(qa_result, style='TS')
        # Answer drawn from the attributes, or a matching (entity, attribute, entity)
        # triple: full-text retrieval.
        return decorate(qa_result, style='QA')
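# Illustrative sketch (not part of the original pipeline): the comparative-question gate
# above. The sample questions are invented; the pattern fires on questions shaped like
# "X比Y高/低" ("is X higher/lower than Y").
def _demo_comparative_gate():
    import re
    pattern = u'^.+比.+(高|低).*$'
    hit = re.search(pattern, u'甲产品利率比乙产品利率高吗') is not None   # True
    miss = re.search(pattern, u'甲产品的利率是多少') is not None          # False
    return hit, miss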
def answer_selection_by_TFIDF_allAttribute_word2vec_hasmostword(
        seg_list, attri_value_list, answer_num=3, tfidf_threshold=0.5,
        w2v_threshold=0.7, w2v_sub_threshold=0.85):
    '''
    description: extract the sentences most similar to the question from the attributes as the answer.
    Approach: string matching. Every sentence of every attribute forms the corpus for the
        vocabulary, and a TF-IDF vector is built for each sentence. For each question token the
        matching vocabulary word's weight is collected; tokens missing from the vocabulary are
        replaced by their most similar word above w2v_threshold, and tokens present in the
        vocabulary may additionally pick up their most similar word above w2v_sub_threshold.
        Each sentence's TF-IDF weights are multiplied by the similarity and summed into the
        sentence's final score. When ranking, sentences containing more question tokens come first.
    Difference from answer_selection_by_attextract_TFIDF_allAttribute: improved ranking that
        prefers sentences containing more of the question's tokens.
    Threshold: the top `answer_num` sentences are returned.
    :param seg_list: word segmentation result of the question
    :param attri_value_list: the full attribute texts in which to look for an answer
    :param answer_num: number of answers to return, default 3
    :param tfidf_threshold: minimum TF-IDF score for a sentence to be selected, default 0.5
    :param w2v_threshold: minimum word2vec similarity for substituting a question token that is
        missing from the attribute text, default 0.7
    :param w2v_sub_threshold: minimum word2vec similarity for an extra match of a question token
        that already exists in the attribute text, default 0.85
    :return: the highest-scoring sentences and their score
    '''
    # logging.info("seg_list: " + '/'.join([str(seg) for seg in seg_list]))
    sentences = []
    for attri_value in attri_value_list:
        sentences += [str(sen.strip()) for sen in attri_value.split('。')]
    # Filter instead of remove-while-iterating, which skips elements.
    sentences = [sentence for sentence in sentences if sentence != '']
    attri_corpus = []
    for sentence in sentences:
        all_attri_wordlist = serviceQA.segment(sentence)
        attri_wordlist = [str(word.word) for word in all_attri_wordlist]
        attri_corpus.append(' '.join(attri_wordlist))
        # Log the segmentation and matching results.
        logging.info("sentence: " + sentence + ", attri_wordlist: " +
                     '/'.join([str(seg) for seg in attri_wordlist]))
    try:
        counter = CountVectorizer(lowercase=False)
        counter.fit(attri_corpus)
        counts = counter.transform(attri_corpus)
        tfidfer = TfidfTransformer()
        tfidf = tfidfer.fit_transform(counts)
        words = counter.get_feature_names()  # all words in the bag-of-words vocabulary
        # [word, similarity, vocabulary index]: exact matches plus the best word2vec matches.
        all_fit_words = []
        all_fit_segs = []  # question tokens present in the vocabulary, with their index
        for seg in seg_list:
            if seg in words:  # the vocabulary contains this question token
                all_fit_words.append([seg, 1, words.index(seg)])
                all_fit_segs.append([seg, words.index(seg)])
                sim_threshold = w2v_sub_threshold
            else:
                sim_threshold = w2v_threshold
            # Best word2vec match: [0] word, [1] similarity, [2] vocabulary index.
            max_fit_word = ['', sim_threshold, 0]
            for index, word in enumerate(words):
                if word == seg:
                    continue
                try:
                    if seg not in config.w2v_model:
                        break
                    if word not in config.w2v_model:
                        continue
                    word_similarity = config.w2v_model.similarity(seg, word)
                except KeyError:
                    continue
                if word_similarity > max_fit_word[1]:
                    max_fit_word = [word, word_similarity, index]
            if max_fit_word[1] > sim_threshold:
                logging.info('seg: ' + seg + ' ;max_fit_word: ' + max_fit_word[0] +
                             ' ,similarity:')
                logging.info(max_fit_word)
                all_fit_words.append(max_fit_word)
        weight = tfidf.toarray()  # weight[i][j] is the TF-IDF weight of word j in sentence i
        sentence_and_weight = []
        for i in range(len(weight)):
            sentence_weight = 0.0
            seg_exist_num = 0
            for fit in all_fit_words:  # TF-IDF weight scaled by word2vec similarity
                sentence_weight += weight[i][fit[2]] * fit[1]
            if sentence_weight > tfidf_threshold:
                for seg_info in all_fit_segs:
                    if weight[i][seg_info[1]] > 0:
                        seg_exist_num += 1
                # 0: sentence index, 1: number of question tokens in the sentence, 2: TF-IDF score.
                sentence_and_weight.append((i, seg_exist_num, sentence_weight))
        # Rank by token count first, then by score, both descending. A tuple key is required:
        # a key like `x[1] or x[2]` ignores the score whenever the count is non-zero.
        sentence_and_weight.sort(key=lambda x: (x[1], x[2]), reverse=True)
        max_score = 0.0
        result_sentence = ''
        result_answer_num = min(answer_num, len(sentence_and_weight))
        for i in range(result_answer_num):
            result_sentence += "候选答案" + str(i + 1) + ": \t" + \
                sentences[sentence_and_weight[i][0]] + ';\t\n'  # "candidate answer"
        if result_sentence != '':
            for head_word in delete_head_words:
                if head_word in result_sentence:
                    result_sentence = result_sentence.replace(head_word, '')
            max_score = sentence_and_weight[0][2]  # the best sentence's TF-IDF score
        else:
            return None
    except Exception as e:
        logging.exception(u'answer selection failed: %s', e)
        return None
    return {'answer': result_sentence.encode('unicode-escape'), 'point': max_score}
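# Illustrative sketch (not part of the original pipeline): the ranking rule used above, on
# toy rows. It shows why the tuple sort key matters: (1, 0.9) must outrank (1, 0.1), which
# a non-tuple key cannot guarantee.
def _demo_tuple_sort_key():
    rows = [(0, 1, 0.1), (1, 1, 0.9), (2, 2, 0.2)]  # (index, token count, score)
    rows.sort(key=lambda x: (x[1], x[2]), reverse=True)
    return rows  # [(2, 2, 0.2), (1, 1, 0.9), (0, 1, 0.1)]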
def answer_selection_by_attextract_TFIDF_allAttribute_word2vec(
        seg_list, attri_value_list, answer_num=3, tfidf_threshold=0.5, w2v_threshold=0.7):
    '''
    description: extract the sentences most similar to the question from the attributes as the answer.
    Approach: string matching. Every sentence of every attribute forms the corpus for the
        vocabulary; a TF-IDF vector is built for each sentence, and the weights of the entries
        that correspond to tokens in seg_list are summed into the sentence's final score.
    Difference from answer_selection_by_attextract_TFIDF_allAttribute: improved matching that
        uses pre-trained word2vec vectors. A question token missing from the attribute corpus
        is replaced by the word with the closest word2vec vector; w2v_threshold controls the
        matching precision.
    Threshold: the top `answer_num` sentences are returned.
    Known issues:
        1) only question tokens missing from the corpus get word2vec matching; tokens already
           in the corpus could also be matched, with a lower weight. Learning these weights
           from data would beat setting them by hand.
    Observed effect: still some errors, but an improvement over the method above even without
        similar-word matching.
    :param seg_list: word segmentation result of the question
    :param attri_value_list: the full attribute texts in which to look for an answer
    :param answer_num: number of answers to return, default 3
    :param tfidf_threshold: minimum TF-IDF score for a sentence to be selected, default 0.5
    :param w2v_threshold: minimum word2vec similarity for substituting a question token that is
        missing from the attribute text, default 0.7
    :return: the highest-scoring sentences and their score
    '''
    # logging.info("seg_list: " + '/'.join([str(seg) for seg in seg_list]))
    sentences = []
    for attri_value in attri_value_list:
        sentences += [str(sen.strip()) for sen in attri_value.split('。')]
    # Filter instead of remove-while-iterating, which skips elements.
    sentences = [sentence for sentence in sentences if sentence != '']
    attri_corpus = []
    for sentence in sentences:
        all_attri_wordlist = serviceQA.segment(sentence)
        attri_wordlist = [str(word.word) for word in all_attri_wordlist]
        attri_corpus.append(' '.join(attri_wordlist))
        # logging.info("sentence: " + sentence + ", attri_wordlist: " +
        #              '/'.join([str(seg) for seg in attri_wordlist]))
    counter = CountVectorizer(lowercase=False)
    counter.fit(attri_corpus)
    counts = counter.transform(attri_corpus)
    tfidfer = TfidfTransformer()
    tfidf = tfidfer.fit_transform(counts)
    words = counter.get_feature_names()  # all words in the bag-of-words vocabulary
    # Words whose weights will be summed: exact question-token matches plus the best
    # word2vec match for every question token missing from the vocabulary.
    all_fit_words = []
    for seg in seg_list:
        if seg in words:
            all_fit_words.append(seg)
        else:
            max_fit_word = ['', w2v_threshold]  # [0] word, [1] similarity
            for word in words:
                if word not in all_fit_words:
                    try:
                        word_similarity = config.w2v_model.similarity(seg, word)
                    except KeyError:
                        # One of the two words is missing from the word2vec vocabulary.
                        continue
                    if word_similarity > max_fit_word[1]:
                        max_fit_word = [word, word_similarity]
            if max_fit_word[1] > w2v_threshold:
                logging.info('not fit seg: ' + seg + ' ;max_fit_word: ' +
                             max_fit_word[0] + ' ,similarity:')
                logging.info(max_fit_word)
                all_fit_words.append(max_fit_word[0])
    fit_words_positions = []  # vocabulary indices of the matched words
    for index, word in enumerate(words):
        if len(fit_words_positions) == len(all_fit_words):
            break
        if word in all_fit_words:
            fit_words_positions.append(index)
    weight = tfidf.toarray()  # weight[i][j] is the TF-IDF weight of word j in sentence i
    sentence_and_weight = []
    for i in range(len(weight)):
        sentence_weight = 0.0
        for j in fit_words_positions:
            sentence_weight += weight[i][j]
        if sentence_weight > tfidf_threshold:
            sentence_and_weight.append((i, sentence_weight))
    sentence_and_weight.sort(key=lambda x: x[1], reverse=True)  # sort by score, descending
    max_score = 0.0
    result_sentence = ''
    result_answer_num = min(answer_num, len(sentence_and_weight))
    for i in range(result_answer_num):
        result_sentence += "候选答案" + str(i + 1) + ": \t" + \
            sentences[sentence_and_weight[i][0]] + ';\t\n'  # "candidate answer"
    if result_sentence != '':
        for head_word in delete_head_words:
            if head_word in result_sentence:
                result_sentence = result_sentence.replace(head_word, '')
        max_score = sentence_and_weight[0][1]
    # logging.info('answer:' + result_sentence)
    # logging.info('point:' + str(max_score))
    return {'answer': result_sentence.encode('unicode-escape'), 'point': max_score}
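# Illustrative sketch (not part of the original pipeline): the word2vec fallback above,
# condensed. `w2v_model` is assumed to behave like a gensim KeyedVectors (supports `in`
# and .similarity); `vocabulary` plays the role of the CountVectorizer feature names.
# `_best_substitute` is a hypothetical helper, not a function this module defines.
def _best_substitute(seg, vocabulary, w2v_model, threshold=0.7):
    # Return the corpus word most similar to `seg`, or None if nothing clears the
    # threshold or `seg` is out of the embedding vocabulary.
    best_word, best_score = None, threshold
    if seg not in w2v_model:
        return None
    for word in vocabulary:
        if word == seg or word not in w2v_model:
            continue
        score = w2v_model.similarity(seg, word)
        if score > best_score:
            best_word, best_score = word, score
    return best_word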
def answer_selection_by_attextract_TFIDF_allAttribute(seg_list, attri_value_list,
                                                      answer_num=3, threshold=0.1):
    '''
    description: extract the sentences most similar to the question from the attributes as the answer.
    Approach: string matching. All sentences of all attributes together form the corpus for the
        vocabulary; a TF-IDF vector is built for each sentence, and the weights of the entries
        that correspond to tokens in seg_list are summed into the sentence's final score.
    Threshold: the top `answer_num` sentences are returned.
    Known issues:
        1) a similar-word matching mechanism should be added.
    Observed effect: still some errors, but better than the methods above even without
        similar-word matching.
    :param seg_list: word segmentation result of the question
    :param attri_value_list: the full attribute texts in which to look for an answer
    :param answer_num: number of answers to return, default 3
    :param threshold: minimum weight for a sentence to be selected, default 0.1
    :return: the highest-scoring sentences and their score
    '''
    logging.info("seg_list: " + '/'.join([str(seg) for seg in seg_list]))
    sentences = []
    for attri_value in attri_value_list:
        sentences += [str(sen.strip()) for sen in attri_value.split('。')]
    # Filter instead of remove-while-iterating, which skips elements.
    sentences = [sentence for sentence in sentences if sentence != '']
    attri_corpus = []
    for sentence in sentences:
        all_attri_wordlist = serviceQA.segment(sentence)
        attri_wordlist = [str(word.word) for word in all_attri_wordlist]
        attri_corpus.append(' '.join(attri_wordlist))
        # logging.info("sentence: " + sentence + ", attri_wordlist: " +
        #              '/'.join([str(seg) for seg in attri_wordlist]))
    counter = CountVectorizer(lowercase=False)
    counter.fit(attri_corpus)
    counts = counter.transform(attri_corpus)
    tfidfer = TfidfTransformer()
    tfidf = tfidfer.fit_transform(counts)
    word = counter.get_feature_names()  # all words in the bag-of-words vocabulary
    weight = tfidf.toarray()  # weight[i][j] is the TF-IDF weight of word j in sentence i
    sentence_and_weight = []
    for i in range(len(weight)):
        sentence_weight = 0.0
        for j in range(len(word)):
            if word[j] in seg_list:  # a token we care about
                sentence_weight += weight[i][j]
        sentence_and_weight.append((i, sentence_weight))
    sentence_and_weight.sort(key=lambda x: x[1], reverse=True)  # sort by weight, descending
    max_score = 0.0
    result_sentence = ''
    result_answer_num = min(answer_num, len(sentence_and_weight))
    for i in range(result_answer_num):
        if sentence_and_weight[i][1] > threshold:
            result_sentence += "候选答案" + str(i + 1) + ": \t" + \
                sentences[sentence_and_weight[i][0]] + ';\t\n'  # "candidate answer"
    if result_sentence != '':
        for head_word in delete_head_words:
            if head_word in result_sentence:
                result_sentence = result_sentence.replace(head_word, '')
        max_score = sentence_and_weight[0][1]
    logging.info('answer:' + result_sentence)
    logging.info('point:' + str(max_score))
    return {'answer': result_sentence.encode('unicode-escape'), 'point': max_score}
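# Illustrative sketch (not part of the original pipeline): the selection step above,
# condensed into a hypothetical helper. `weight`, `word`, `sentences` and `seg_list`
# follow the shapes used in the function: weight[i][j] is the TF-IDF weight of
# vocabulary word j in sentence i.
def _top_n_sentences(weight, word, seg_list, sentences, answer_num=3, threshold=0.1):
    scored = []
    for i in range(len(weight)):
        score = sum(weight[i][j] for j in range(len(word)) if word[j] in seg_list)
        if score > threshold:
            scored.append((i, score))
    scored.sort(key=lambda x: x[1], reverse=True)
    return [(sentences[i], score) for i, score in scored[:answer_num]]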