Example #1
# Assumed module-level imports for these snippets: json, logging, re, jieba,
# and sklearn's CountVectorizer / TfidfTransformer. Project-internal helpers
# (serviceQA, serviceWord2vec, owlNeo4j, config, sequence_class,
# sequence_extract, decorate) and module globals (delete_head_words, time_list,
# num_list, num_dict, queryword_list, namelist, catelist, kb, links) are
# defined elsewhere in the repository.
def answer_selection_by_attextract_TFIDF(seg_list, attri_value):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching. Use the question's segmented words as the vocabulary, build each candidate
    sentence's TF-IDF vector over that vocabulary, sum each vector into a single score, then sort and take the maximum.
    Returns: the highest-scoring sentence
    Threshold: none
    Issues:
    1) a similar-word matching mechanism should be added
    Result: unsatisfactory, because seg_list contains too few tokens, so every matching sentence scores 1.0
    :param seg_list: segmented words of the question
    :param attri_value: full text of all attributes to search for the answer
    :return: the highest-scoring sentence and its score
    '''
    logging.info("seg_list: " + str('/'.join([str(seg) for seg in seg_list])))
    seg_corpus = [' '.join(seg_list)]
    sentences = [str(sen.strip()) for sen in attri_value.split('。')]
    # filter out empty strings so corpus rows stay aligned with sentences
    sentences = [sen for sen in sentences if sen]
    attri_corpus = []
    for sentence in sentences:
        all_attri_wordlist = serviceQA.segment(sentence)
        attri_wordlist = [str(word.word) for word in all_attri_wordlist]
        attri_corpus.append(' '.join(attri_wordlist))
        # log segmentation and matching results
        logging.info("sentence: " + sentence + ", attri_wordlist: " +
                     str('/'.join([str(seg) for seg in attri_wordlist])))
    counter = CountVectorizer()
    counter.fit(seg_corpus)
    logging.info("countvectorizer words dict: ")
    logging.info(json.dumps(counter.vocabulary_, ensure_ascii=False))
    counts = counter.transform(attri_corpus)
    logging.info("count vector: ")
    logging.info(counts.toarray())
    tfidfer = TfidfTransformer()
    tfidf = tfidfer.fit_transform(counts)
    logging.info("TFIDF vector: ")
    logging.info(tfidf.toarray())
    max_sentence = ''
    max_score = 0.0
    for index, vector in enumerate(tfidf.toarray()):
        score = round(float(vector.sum()), 5)  # sum of the sentence's TF-IDF weights
        if max_score < score:
            max_sentence = sentences[index]
            max_score = score
    for head_word in delete_head_words:
        if head_word in max_sentence:
            max_sentence = max_sentence.replace(head_word, '')
    logging.info('answer:' + max_sentence)
    logging.info('point:')
    logging.info(max_score)
    return {
        'answer': max_sentence.encode('unicode-escape'),
        'point': max_score
    }
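
A minimal sketch of the weakness noted in the docstring above, using hypothetical English tokens in place of the segmented Chinese words: because the vocabulary is fit on the question alone, every candidate sentence matching exactly one question word gets an L2-normalized TF-IDF row that sums to exactly 1.0, so those candidates all tie.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

question_corpus = ['rate deposit']  # stands in for [' '.join(seg_list)]
candidates = ['deposit needs id', 'the rate is low', 'branch hours vary']
counter = CountVectorizer().fit(question_corpus)
tfidf = TfidfTransformer().fit_transform(counter.transform(candidates))
print(tfidf.toarray().sum(axis=1))  # [1. 1. 0.]: the two matching sentences tie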
Example #2
def answer_selection_by_strmatch_set(seg_list, attri_value):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching; score each sentence by how many distinct question words it shares, and pick the highest.
    Rules: split each attribute into sentences on '。' and extract one answer per attribute; when ranking,
    compare the number of shared words, counting repeated words only once.
    Threshold: none
    Issue: punctuation marks are matched as well

    :param seg_list: segmented words of the question
    :param attri_value: full text of the attribute to search for the answer
    :return: the highest-scoring sentence and its score
    '''
    logging.info("seg_list: " + str('/'.join([str(seg) for seg in seg_list])))
    sentences = [str(sen.strip()) for sen in attri_value.split('。')]
    max_sentence = ''
    max_score = 0
    for sentence in sentences:
        common_words = set()
        if sentence:
            all_attri_wordlist = serviceQA.segment(sentence)
            attri_wordlist = [str(word.word) for word in all_attri_wordlist]
            for q_word in seg_list:
                for a_word in attri_wordlist:
                    if q_word == a_word:
                        common_words.add(q_word)
            # log segmentation and matching results
            logging.info("sentence: " + sentence + ", attri_wordlist: " +
                         str('/'.join([str(seg) for seg in attri_wordlist])))
            logging.info("common words num: " + str(len(common_words)))
            if max_score < len(common_words):
                max_score = len(common_words)
                max_sentence = sentence
    logging.info("max_sentence: " + max_sentence + ", max_score: " +
                 str(max_score))
    return {
        'answer': max_sentence.encode('unicode-escape'),
        'point': max_score * 0.1
    }
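
The scoring core of this variant in isolation, with hypothetical token lists: a set intersection counts each shared word once, no matter how often it repeats.

q_words = ['rate', 'deposit', 'rate']
a_words = ['deposit', 'rate', 'rate', 'today']
print(len(set(q_words) & set(a_words)))  # 2: 'rate' and 'deposit' each count once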
Example #3
def answer_selection_by_strmatch(seg_list, attri_value):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching; score each sentence by how many question words it matches, and pick the highest.
    Rules: split each attribute into sentences on '。' and extract one answer per attribute; when ranking,
    count every occurrence, so a word repeated in a sentence scores multiple times.
    Threshold: none
    Issue: unimportant tokens may occur many times and hurt precision

    :param seg_list: segmented words of the question
    :param attri_value: full text of the attribute to search for the answer
    :return: the highest-scoring sentence and its score
    '''
    # logging.info("seg_list: " + str('/'.join([str(seg) for seg in seg_list])))
    sentences = [str(sen.strip()) for sen in attri_value.split('。')]
    max_sentence = ''
    max_score = 0
    for sentence in sentences:
        if sentence:
            all_attri_wordlist = serviceQA.segment(sentence)
            attri_wordlist = [str(word.word) for word in all_attri_wordlist]
            count = 0
            for q_word in seg_list:
                for a_word in attri_wordlist:
                    if q_word == a_word:
                        count += 1
            # log segmentation and matching results
            # logging.info("sentence: " + sentence + ", attri_wordlist: " + str('/'.join([str(seg) for seg in attri_wordlist])))
            # logging.info("common words num: " + str(count))
            if max_score < count:
                max_score = count
                max_sentence = sentence
    # logging.info("max_sentence: " + max_sentence + ", max_score: " + str(max_score))
    return {
        'answer': max_sentence.encode('unicode-escape'),
        'point': max_score * 0.1
    }
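
The counted variant, again with hypothetical tokens: every repeated match scores again, which is exactly how an unimportant filler token can dominate, as the docstring warns.

q_words = ['rate', 'of']
a_words = ['of', 'of', 'of', 'rate']
print(sum(1 for q in q_words for a in a_words if q == a))  # 4, inflated by 'of'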
Example #4
def answer_selection_by_attextract_TFIDF_allAttribute(seg_list, attri_value_list, answer_num=3, threshold=0.5):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching. Use every sentence across all attributes as the corpus for the vocabulary,
    build each sentence's TF-IDF vector, then sum the weights of the seg_list keywords contained in the
    vector as that sentence's final score.
    Threshold: keep the answer_num highest-scoring sentences
    Issues:
    1) a similar-word matching mechanism should be added
    Result: some error remains, but even without similar-word matching this improves on the methods above
    :param seg_list: segmented words of the question
    :param attri_value_list: full attribute texts to search for the answer
    :param answer_num: number of answers to return, default 3
    :param threshold: minimum score for a candidate answer, default 0.5
    :return: the highest-scoring sentences and the top score
    '''
    id2sentences = {}
    sentence2id = {}
    sentences = []
    original_sentence = ''.join([str(seg) for seg in seg_list])
    t = -1
    for m in range(len(time_list)):
        for time in time_list[m]:
            if original_sentence.find(time) != -1:
                t = m  # time direction: t = 0 asks about following steps, t = 1 preceding steps, t = 2 the current step
                break
    # print 'value of t:', t
    id = 0
    for i in range(len(attri_value_list)):
        id2sentences[id] = attri_value_list[i]
        id += 1
    for i in range(len(id2sentences)):
        result = sequence_extract(id2sentences[i])
        id2sentences[i] = result
    for i in id2sentences:
        for sentence in id2sentences[i]:
            sentence2id[sentence] = i
            sentences.append(sentence)
    logging.info("seg_list: " + str('/'.join([str(seg) for seg in seg_list])))
    # filter out empty strings so corpus rows stay aligned with sentences
    sentences = [sen for sen in sentences if sen]
    attri_corpus = []
    for sentence in sentences:
        all_attri_wordlist = serviceQA.segment(sentence)
        attri_wordlist = [str(word.word) for word in all_attri_wordlist]
        attri_corpus.append(' '.join(attri_wordlist))
        # log segmentation and matching results
    counter = CountVectorizer(lowercase=False)
    counter.fit(attri_corpus)
    counts = counter.transform(attri_corpus)
    tfidfer = TfidfTransformer()
    tfidf = tfidfer.fit_transform(counts)
    word = counter.get_feature_names()  # all words in the bag-of-words vocabulary
    weight = tfidf.toarray()  # tf-idf matrix; weight[i][j] is the tf-idf weight of word j in sentence i
    sentence_and_weight = []
    for i in range(len(weight)):  # outer loop iterates over sentences, inner loop over vocabulary words
        sentence_weight = 0.0
        for j in range(len(word)):
            if word[j] in seg_list:  # a word we care about
                sentence_weight += weight[i][j]
        sentence_and_weight.append((i, sentence_weight))
    sentence_and_weight.sort(key=lambda x: x[1], reverse=True)  # sort by weight, descending
    max_score = 0.0
    result_sentence = ''
    if len(sentence_and_weight) >= answer_num:
        for i in range(answer_num):
            sentence = sentences[sentence_and_weight[i][0]]
            if t == 2:
                max_sentence = sentence
            elif t == 0:
                id = sentence2id[sentence]
                sentences_list = id2sentences[id]
                no = sentences_list.index(sentence)
                max_sentence = ''
                if no < (len(sentences_list) - 1):
                    # join every step after the matched one; the inner index k
                    # must not shadow the outer loop variable i
                    for k in range(no + 1, len(sentences_list)):
                        max_sentence += sentences_list[k]
                else:
                    max_sentence = u'已经是办理该业务的最后一步!'
            elif t == 1:
                id = sentence2id[sentence]
                sentences_list = id2sentences[id]
                no = sentences_list.index(sentence)
                if no != 0:
                    max_sentence = ''
                    for k in range(0, no):
                        max_sentence += sentences_list[k]
                else:
                    max_sentence = u'已经是办理该业务的第一步!'
            else:
                max_sentence = sentence
            if sentence_and_weight[i][1] > threshold:
                result_sentence += "候选答案" + ": \t" + max_sentence + '\t\n'
    elif len(sentence_and_weight) > 0:
        for i in range(len(sentence_and_weight)):
            sentence = sentences[sentence_and_weight[i][0]]
            if t == 2:
                max_sentence = sentence
            elif t == 0:
                id = sentence2id[sentence]
                sentences_list = id2sentences[id]
                no = sentences_list.index(sentence)
                if no < (len(sentences_list) - 1):
                    max_sentence = ''
                    for k in range(no + 1, len(sentences_list)):
                        max_sentence += sentences_list[k]
                else:
                    max_sentence = u'已经是办理该业务的最后一步!'
            elif t == 1:
                id = sentence2id[sentence]
                sentences_list = id2sentences[id]
                no = sentences_list.index(sentence)
                if no != 0:
                    max_sentence = ''
                    for k in range(0, no):
                        max_sentence += sentences_list[k]
                else:
                    max_sentence = u'已经是办理该业务的第一步!'
            else:
                # t == -1: no time cue, so return the whole step list
                id = sentence2id[sentence]
                sentences_list = id2sentences[id]
                max_sentence = ''
                for sen in sentences_list:
                    max_sentence += sen
            if sentence_and_weight[i][1] > threshold:
                result_sentence += "候选答案" + ": \t" + max_sentence + '\t\n'
    if result_sentence:
        for head_word in delete_head_words:
            if head_word in result_sentence:
                result_sentence = result_sentence.replace(head_word, '')
        max_score = sentence_and_weight[0][1]
    logging.info('answer:'+result_sentence)
    logging.info('point:')
    logging.info(max_score)
    # print 'answer:',result_sentence
    return {'answer':result_sentence.encode('unicode-escape'), 'point':max_score}
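
The before/after slicing that drives the t == 0 and t == 1 branches, reduced to a sketch over a hypothetical step list:

steps = ['insert card. ', 'enter PIN. ', 'withdraw cash. ']
no = steps.index('enter PIN. ')
print(''.join(steps[no + 1:]))  # t == 0: every step after the matched one
print(''.join(steps[:no]))      # t == 1: every step before it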
Example #5
def knowledge_graph(question, neoid=None, autopick=False):  # autopick: whether to auto-select an entity
    # if an entity has already been chosen, return its retrieval result directly
    if neoid is not None:
        return decorate(neoid, style='BASIC')
    question = question.strip()  # strip() returns a new string, so the result must be rebound
    switch = any(num in question for num in num_list)
    for queryword in queryword_list:
        if queryword in question:
            question = question.replace(queryword, '')
    # comparative questions
    pattern = r'^.+比.+(高|低).*$'
    if re.search(pattern, question) is not None:
        seg_list = serviceQA.segment(question)
        seg_list_complete = []
        for seg in seg_list:
            seg_list_complete.append(seg.word)
        relatedwords = [u'利率', u'产品利率', u'存款利率', u'贷款利率']
        word_1, word_2 = '', ''
        for seg in seg_list_complete:
            if seg in namelist and seg_list_complete.index(
                    seg) < seg_list_complete.index('比'):
                word_1 = seg
                continue
            if seg in namelist and seg_list_complete.index(
                    seg) > seg_list_complete.index('比'):
                word_2 = seg
                break
        if len(owlNeo4j.get_entity_list_by_name(word_1)) > 0 and len(
                owlNeo4j.get_entity_list_by_name(word_2)) > 0:
            word_1 = owlNeo4j.get_entity_list_by_name(word_1)[0]
            word_2 = owlNeo4j.get_entity_list_by_name(word_2)[0]
            for word in relatedwords:
                if word in word_1 and word in word_2:
                    return decorate(data='1', style='COM', question=question)
    # query by category
    if 'c::' in question:
        category = question.split('c::')[1].strip()
        for node in kb:
            for tag in node['taglist'].split(','):
                score = owlNeo4j.entity_similarity(category, tag)
                if category == tag or score >= 0.5:
                    return decorate('2', 'CAT', question=question)
    # query by relation
    if 'r::' in question:
        relation = question.split('r::')[1].strip()
        if relation.find('<') == -1:
            for link in links:
                score = serviceWord2vec.get_similarity(
                    list(jieba.cut(relation)), list(jieba.cut(link['name'])))
                if relation == link['name'] or score >= 0.6:
                    return decorate('3', 'LIN', question=question)
        else:
            return decorate('3', 'LIN', question=question)
    # aggregation questions
    seg_list = serviceQA.segment(question)
    rate_words = [u'利率', u'产品利率', u'存款利率', u'贷款利率']
    if any(seg.word in rate_words for seg in seg_list) \
            and any(seg.word in catelist for seg in seg_list) \
            and any(seg.word in num_dict for seg in seg_list):
        return decorate('4', 'IND', question=question)
    # retrieval questions
    if any(seg.word in rate_words for seg in seg_list) \
            and any(seg.word in catelist for seg in seg_list) \
            and any(seg.word in [u'高于', u'低于', u'等于'] for seg in seg_list) \
            and any(seg.flag == 'm' for seg in seg_list):
        return decorate('5', 'RET', question=question)
    # procedural (sequence) questions
    pre = sequence_class.question_class(question)
    if pre == 1:
        result = serviceQA.autoseq(question)
        if result != 0:
            return decorate(result, style='QA')
    # fall back to general Chinese question answering
    qa_result = serviceQA.chinese_qa(question, switch)
    logging.info("qa_result:" +
                 json.dumps(qa_result, encoding='utf-8', ensure_ascii=False))
    if qa_result is None:
        return None
    # entity retrieval
    if 'question' in qa_result:  # a similar question exists for the (entity, relation) pair
        return decorate(qa_result['question'], style='QUE')
    if len(qa_result['path']) == 0:  # path is empty, i.e. no relation found
        if autopick or (len(qa_result['ents']) == 1):  # auto-select enabled, or only one entity
            return decorate(qa_result['ents'][0]['neoId'], style='BASIC')
        else:  # multiple entities and auto-select disabled
            return decorate(qa_result['ents'], style='SNET')
    else:
        if qa_result['ents'][0]['neoId'] is None:
            return decorate(qa_result, style='TS')  # full-text retrieval
        return decorate(qa_result, style='QA')  # answer from an attribute, or a matching (entity, attribute, entity) triple
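
The comparative-question gate at the top of knowledge_graph, tested in isolation (hypothetical questions; Python 3 unicode strings assumed for simplicity):

import re

pattern = u'^.+比.+(高|低).*$'
print(bool(re.search(pattern, u'产品A比产品B的利率高')))  # True: a comparison
print(bool(re.search(pattern, u'产品A的利率是多少')))      # False: a plain lookup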
Example #6
def answer_selection_by_TFIDF_allAttribute_word2vec_hasmostword(
        seg_list,
        attri_value_list,
        answer_num=3,
        tfidf_threshold=0.5,
        w2v_threshold=0.7,
        w2v_sub_threshold=0.85):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching. Use every sentence across all attributes as the corpus for the vocabulary
    and build each sentence's TF-IDF vector. For each seg_list keyword, take its weight from the vector;
    keywords missing from the corpus are replaced by their most similar word above w2v_threshold, and
    keywords already present also contribute their most similar word above w2v_sub_threshold. Each token's
    tf-idf value is multiplied by its similarity weight, and the products are summed as the sentence's
    final score. When ranking, sentences containing more of the question's words come first.
    Difference from answer_selection_by_attextract_TFIDF_allAttribute: improved so that ranking prefers
    sentences containing more of the question's words.
    Threshold: keep the answer_num highest-scoring sentences
    Issues:
    :param seg_list: segmented words of the question
    :param attri_value_list: full attribute texts to search for the answer
    :param answer_num: number of answers to return, default 3
    :param tfidf_threshold: minimum TF-IDF score for a candidate answer, default 0.5
    :param w2v_threshold: minimum word2vec similarity when substituting question words missing from the attribute text, default 0.7
    :param w2v_sub_threshold: minimum word2vec similarity when augmenting question words already present in the attribute text, default 0.85
    :return: the highest-scoring sentences and the top score
    '''
    # logging.info("seg_list: " + str('/'.join([str(seg) for seg in seg_list])))
    sentences = []
    for attri_value in attri_value_list:
        sentences += [str(sen.strip()) for sen in attri_value.split('。')]
    # filter out empty strings (removing items while iterating would skip elements)
    sentences = [sen for sen in sentences if sen]
    attri_corpus = []
    for sentence in sentences:
        all_attri_wordlist = serviceQA.segment(sentence)
        attri_wordlist = [str(word.word) for word in all_attri_wordlist]
        attri_corpus.append(' '.join(attri_wordlist))
        # log segmentation and matching results
        logging.info("sentence: " + sentence + ", attri_wordlist: " +
                     str('/'.join([str(seg) for seg in attri_wordlist])))
    try:
        counter = CountVectorizer(lowercase=False)
        counter.fit(attri_corpus)
        # logging.info("countvectorizer words dict: ")
        # logging.info(json.dumps(counter.vocabulary_, ensure_ascii=False))
        counts = counter.transform(attri_corpus)
        # logging.info("count vector: ")
        # logging.info(counts.toarray())
        tfidfer = TfidfTransformer()
        tfidf = tfidfer.fit_transform(counts)
        # logging.info("TFIDF vector: ")
        # logging.info(tfidf.toarray())
        words = counter.get_feature_names()  # all words in the bag-of-words vocabulary
        all_fit_words = []  # words whose weights will be summed: exact question-word matches plus best word2vec matches
        all_fit_segs = []  # question words that appear in the vocabulary
        for seg in seg_list:
            if seg in words:  # the corpus vocabulary contains this question word
                all_fit_words.append([seg, 1, words.index(seg)])
                all_fit_segs.append([seg, words.index(seg)])
                max_fit_word = ['', w2v_sub_threshold, 0]  # best match: [0] word, [1] similarity, [2] index in the vocabulary
                for index, word in enumerate(words):
                    if word != seg:
                        try:
                            if seg not in config.w2v_model:
                                break
                            if word not in config.w2v_model:
                                continue
                            word_similarity = config.w2v_model.similarity(
                                seg, word)
                        except KeyError:
                            # logging.info('words:'+seg+','+word+" not in word2vec coupus bank!")
                            continue
                        if word_similarity > max_fit_word[1]:
                            max_fit_word[0] = word
                            max_fit_word[1] = word_similarity
                            max_fit_word[2] = index
                if max_fit_word[1] > w2v_sub_threshold:
                    logging.info('seg: ' + seg + ' ; max_fit_word: ' +
                                 max_fit_word[0] + " , similarity:")
                    logging.info(max_fit_word)
                    all_fit_words.append(max_fit_word)
            else:
                max_fit_word = ['', w2v_threshold, 0]  # best match: [0] word, [1] similarity, [2] index in the vocabulary
                for index, word in enumerate(words):
                    try:
                        if seg not in config.w2v_model:
                            break
                        if word not in config.w2v_model:
                            continue
                        word_similarity = config.w2v_model.similarity(
                            seg, word)
                    except KeyError:
                        # logging.info('words:'+seg+','+word+" not in word2vec coupus bank!")
                        continue
                    if word_similarity > max_fit_word[1]:
                        max_fit_word[0] = word
                        max_fit_word[1] = word_similarity
                        max_fit_word[2] = index
                if max_fit_word[1] > w2v_threshold:
                    logging.info('not fit seg: ' + seg + ' ; max_fit_word: ' +
                                 max_fit_word[0] + " , similarity:")
                    logging.info(max_fit_word)
                    all_fit_words.append(max_fit_word)
        weight = tfidf.toarray()  # tf-idf matrix; weight[i][j] is the tf-idf weight of word j in sentence i
        sentence_and_weight = []
        for i in range(len(weight)):  # outer loop iterates over sentences, inner loop over matched words
            sentence_weight = 0.0
            seg_exist_num = 0
            for j in all_fit_words:  # accumulate TF-IDF weight scaled by similarity
                sentence_weight += weight[i][j[2]] * j[1]
            if sentence_weight > tfidf_threshold:
                for seg_info in all_fit_segs:
                    if weight[i][seg_info[1]] > 0:
                        seg_exist_num += 1
                sentence_and_weight.append(
                    (i, seg_exist_num, sentence_weight))  # (sentence index, question words contained, TF-IDF score)
        # sort by contained question-word count, then by score, descending; the
        # key must be a tuple, since "x[1] or x[2]" would ignore x[2] whenever x[1] is non-zero
        sentence_and_weight.sort(key=lambda x: (x[1], x[2]), reverse=True)
        # logging.info(sentence_and_weight)
        max_score = 0.0
        result_sentence = ''
        result_answer_num = answer_num if len(
            sentence_and_weight) > answer_num else len(sentence_and_weight)
        for i in range(result_answer_num):
            result_sentence += "候选答案" + str(i + 1) + ": \t" + sentences[
                sentence_and_weight[i][0]] + ';\t\n'
        if result_sentence:
            for head_word in delete_head_words:
                if head_word in result_sentence:
                    result_sentence = result_sentence.replace(head_word, '')
            max_score = sentence_and_weight[0][1]  # contained-word count of the top sentence ([0][2] holds its TF-IDF score)
        else:
            return None
        # logging.info('answer:'+result_sentence)
        # logging.info('point:' + str(max_score))
    except Exception as e:
        logging.exception(u'answer selection failed: %s', e)
        return None
    return {
        'answer': result_sentence.encode('unicode-escape'),
        'point': max_score
    }
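
Why the tuple sort key above matters: with "x[1] or x[2]", any row whose word count x[1] is non-zero is sorted by the count alone, so the TF-IDF score never breaks ties between sentences containing the same number of question words. A hypothetical three-row example:

rows = [(0, 2, 0.3), (1, 2, 0.9), (2, 1, 0.99)]
rows.sort(key=lambda x: (x[1], x[2]), reverse=True)
print(rows)  # [(1, 2, 0.9), (0, 2, 0.3), (2, 1, 0.99)]: count first, score as tie-break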
Example #7
def answer_selection_by_attextract_TFIDF_allAttribute_word2vec(
        seg_list,
        attri_value_list,
        answer_num=3,
        tfidf_threshold=0.5,
        w2v_threshold=0.7):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching. Use every sentence across all attributes as the corpus for the vocabulary,
    build each sentence's TF-IDF vector, then sum the weights of the seg_list keywords contained in the
    vector as that sentence's final score.
    Difference from answer_selection_by_attextract_TFIDF_allAttribute: improved to use a pre-trained
    word2vec model for keyword matching; question keywords absent from the corpus are replaced by their
    closest word2vec neighbour, with w2v_threshold controlling the match precision.
    Threshold: keep the answer_num highest-scoring sentences
    Issues:
    1) only question words missing from the corpus get word2vec matching; words already present could be
    matched too, with a lower weight. Parameters learned by training would be smarter and more accurate
    than values set by hand.
    Result: some error remains, but even without similar-word matching this improves on the methods above
    :param seg_list: segmented words of the question
    :param attri_value_list: full attribute texts to search for the answer
    :param answer_num: number of answers to return, default 3
    :param tfidf_threshold: minimum TF-IDF score for a candidate answer, default 0.5
    :param w2v_threshold: minimum word2vec similarity when substituting question words missing from the attribute text, default 0.7
    :return: the highest-scoring sentences and the top score
    '''
    # logging.info("seg_list: " + str('/'.join([str(seg) for seg in seg_list])))
    sentences = []
    for attri_value in attri_value_list:
        sentences += [str(sen.strip()) for sen in attri_value.split('。')]
    # filter out empty strings (removing items while iterating would skip elements)
    sentences = [sen for sen in sentences if sen]
    attri_corpus = []
    for sentence in sentences:
        all_attri_wordlist = serviceQA.segment(sentence)
        attri_wordlist = [str(word.word) for word in all_attri_wordlist]
        attri_corpus.append(' '.join(attri_wordlist))
        # log segmentation and matching results
        # logging.info("sentence: " + sentence + ", attri_wordlist: " + str('/'.join([str(seg) for seg in attri_wordlist])))
    counter = CountVectorizer(lowercase=False)
    counter.fit(attri_corpus)
    # logging.info("countvectorizer words dict: ")
    # logging.info(json.dumps(counter.vocabulary_, ensure_ascii=False))
    counts = counter.transform(attri_corpus)
    # logging.info("count vector: ")
    # logging.info(counts.toarray())
    tfidfer = TfidfTransformer()
    tfidf = tfidfer.fit_transform(counts)
    # logging.info("TFIDF vector: ")
    # logging.info(tfidf.toarray())
    words = counter.get_feature_names()  # all words in the bag-of-words vocabulary
    all_fit_words = []  # words whose weights will be summed: exact question-word matches plus best word2vec matches
    for seg in seg_list:
        if seg in words:
            all_fit_words.append(seg)
        else:
            max_fit_word = ['', w2v_threshold]  # best match: [0] word, [1] similarity
            for word in words:
                if word not in all_fit_words:
                    try:
                        word_similarity = config.w2v_model.similarity(
                            seg, word)
                    except KeyError:
                        # logging.info('words:' + seg + ',' + word + " not in the word2vec vocabulary!")
                        continue
                    if word_similarity > max_fit_word[1]:
                        max_fit_word[0] = word
                        max_fit_word[1] = word_similarity
            if max_fit_word[1] > w2v_threshold:
                logging.info('not fit seg: ' + seg + ' ; max_fit_word: ' +
                             max_fit_word[0] + " , similarity:")
                logging.info(max_fit_word)
                all_fit_words.append(max_fit_word[0])
    fit_words_positions = []  # vocabulary indices of the matched words
    for index, word in enumerate(words):
        if len(fit_words_positions) == len(all_fit_words):
            break
        if word in all_fit_words:
            fit_words_positions.append(index)
    weight = tfidf.toarray()  # tf-idf matrix; weight[i][j] is the tf-idf weight of word j in sentence i
    sentence_and_weight = []
    for i in range(len(weight)):  # outer loop iterates over sentences, inner loop over matched word positions
        sentence_weight = 0.0
        for j in fit_words_positions:
            sentence_weight += weight[i][j]
        if sentence_weight > tfidf_threshold:
            sentence_and_weight.append((i, sentence_weight))

    sentence_and_weight.sort(key=lambda x: x[1], reverse=True)  # sort by weight, descending
    # logging.info(sentence_and_weight)
    max_score = 0.0
    result_sentence = ''
    result_answer_num = answer_num if len(
        sentence_and_weight) > answer_num else len(sentence_and_weight)
    for i in range(result_answer_num):
        result_sentence += "候选答案" + str(i + 1) + ": \t" + sentences[
            sentence_and_weight[i][0]] + ';\t\n'
    if result_sentence:
        for head_word in delete_head_words:
            if head_word in result_sentence:
                result_sentence = result_sentence.replace(head_word, '')
        max_score = sentence_and_weight[0][1]
    # logging.info('answer:'+result_sentence)
    # logging.info('point:' + str(max_score))
    return {
        'answer': result_sentence.encode('unicode-escape'),
        'point': max_score
    }
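
The word2vec fallback distilled into a standalone helper (a sketch, assuming a gensim-style model exposing similarity(), like the project's config.w2v_model; best_substitute is a hypothetical name):

def best_substitute(seg, vocabulary, model, threshold=0.7):
    # return the vocabulary word most similar to seg, or None if nothing
    # clears the threshold; tokens missing from the embedding are skipped
    best, best_sim = None, threshold
    for word in vocabulary:
        try:
            sim = model.similarity(seg, word)
        except KeyError:
            continue
        if sim > best_sim:
            best, best_sim = word, sim
    return best, best_sim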
Example #8
def answer_selection_by_attextract_TFIDF_allAttribute(seg_list,
                                                      attri_value_list,
                                                      answer_num=3,
                                                      threshold=0.1):
    '''
    description: extract the sentence most similar to the question from the attribute text as the answer.
    Approach: string matching. Use every sentence across all attributes, all at once, as the corpus for the
    vocabulary, build each sentence's TF-IDF vector, then sum the weights of the seg_list keywords contained
    in the vector as that sentence's final score.
    Threshold: keep the answer_num highest-scoring sentences
    Issues:
    1) a similar-word matching mechanism should be added
    Result: some error remains, but even without similar-word matching this improves on the methods above
    :param seg_list: segmented words of the question
    :param attri_value_list: full attribute texts to search for the answer
    :param answer_num: number of answers to return, default 3
    :param threshold: minimum score for a candidate answer, default 0.1
    :return: the highest-scoring sentences and the top score
    '''
    logging.info("seg_list: " + str('/'.join([str(seg) for seg in seg_list])))
    sentences = []
    for attri_value in attri_value_list:
        sentences += [str(sen.strip()) for sen in attri_value.split('。')]
    # filter out empty strings (removing items while iterating would skip elements)
    sentences = [sen for sen in sentences if sen]
    attri_corpus = []
    for sentence in sentences:
        all_attri_wordlist = serviceQA.segment(sentence)
        attri_wordlist = [str(word.word) for word in all_attri_wordlist]
        attri_corpus.append(' '.join(attri_wordlist))
        # log segmentation and matching results
        # logging.info("sentence: " + sentence + ", attri_wordlist: " + str('/'.join([str(seg) for seg in attri_wordlist])))
    counter = CountVectorizer(lowercase=False)
    counter.fit(attri_corpus)
    # logging.info("countvectorizer words dict: ")
    # logging.info(json.dumps(counter.vocabulary_, ensure_ascii=False))
    counts = counter.transform(attri_corpus)
    # logging.info("one-hot vector: ")
    # logging.info(counts.toarray())
    tfidfer = TfidfTransformer()
    tfidf = tfidfer.fit_transform(counts)
    # logging.info("TFIDF vector: ")
    # logging.info(tfidf.toarray())
    word = counter.get_feature_names()  # all words in the bag-of-words vocabulary
    weight = tfidf.toarray()  # tf-idf matrix; weight[i][j] is the tf-idf weight of word j in sentence i
    sentence_and_weight = []
    for i in range(len(weight)):  # outer loop iterates over sentences, inner loop over vocabulary words
        sentence_weight = 0.0
        for j in range(len(word)):
            if word[j] in seg_list:  # a word we care about
                sentence_weight += weight[i][j]
        sentence_and_weight.append((i, sentence_weight))
    sentence_and_weight.sort(key=lambda x: x[1], reverse=True)  # sort by weight, descending
    # logging.info(sentence_and_weight)
    max_score = 0.0
    result_sentence = ''
    for i in range(min(answer_num, len(sentence_and_weight))):
        if sentence_and_weight[i][1] > threshold:
            result_sentence += "候选答案" + str(i + 1) + ": \t" + sentences[
                sentence_and_weight[i][0]] + ';\t\n'
    if result_sentence:
        for head_word in delete_head_words:
            if head_word in result_sentence:
                result_sentence = result_sentence.replace(head_word, '')
        max_score = sentence_and_weight[0][1]
    logging.info('answer:' + result_sentence)
    logging.info('point:' + str(max_score))
    return {
        'answer': result_sentence.encode('unicode-escape'),
        'point': max_score
    }
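
The scoring loop of this variant expressed in vectorized form (a sketch with hypothetical numbers; numpy assumed): mask the TF-IDF matrix down to the question words, then sum each row.

import numpy as np

weight = np.array([[0.2, 0.0, 0.8],   # stands in for tfidf.toarray()
                   [0.5, 0.5, 0.0]])
word = ['deposit', 'open', 'rate']    # stands in for counter.get_feature_names()
seg_list = ['rate', 'deposit']
mask = np.array([w in seg_list for w in word])
print(weight[:, mask].sum(axis=1))    # [1.  0.5]: per-sentence scores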