Example #1
def entityRecognize(word_list, question):
    # Generate candidate entity mentions by concatenating consecutive words
    # of the segmented question and looking each span up in the KB.
    entity_list = []
    for i, word in enumerate(word_list):
        entity = ""
        for temp_entity in word_list[i:]:
            entity = entity + temp_entity
            all_entity = [entity]
            if len(entity) > 1:
                if entity in mention2entity_dic:  # the mention maps to known entities
                    for alias in mention2entity_dic[entity]:
                        all_entity.append(alias)
                for en in all_entity:
                    same_name_entity_list = ccksNeo.get_entity_list_by_name(en)
                    extra_name = ccksNeo.get_entity_info_by_name(en)
                    for name in extra_name:
                        # Collect alias values from name-like properties (ending in
                        # '名' or '称'), but skip English/foreign-name properties.
                        if name[0][-1] == '名' or name[0][-1] == '称':
                            if len(name[1]) > 1:
                                if name[0] not in ('英文名', '英文名称', '外文名', '外文名称'):
                                    entity_list.append(name[1])
                    if len(same_name_entity_list) >= 1:
                        entity_list.append(en)
    # If a shorter mention is contained in a longer one, keep the shorter one
    # only when one of its one-hop relation names still matches the question.
    for entity1 in list(entity_list):  # iterate over a copy: items are removed below
        temp = question
        for ch in entity1:  # the question with the mention's characters removed
            if ch in question:
                temp = temp.replace(ch, "")
        for entity2 in entity_list:
            if entity1 != entity2 and entity1 in entity2:
                same_name_entity_list = ccksNeo.get_entity_list_by_name(entity1)
                flag = 0
                for entitydict in same_name_entity_list:
                    relations = ccksNeo.get_related_entities_by_id(entitydict['id'])
                    for relation in relations:
                        score = serviceWord2vec.get_similarity(list(jieba.cut(temp)), list(jieba.cut(relation['name'])))
                        if score > 0.2:
                            flag = 1
                if flag == 0 and entity1 in entity_list:
                    entity_list.remove(entity1)

    print("entity_list", entity_list)
    return entity_list
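
A minimal usage sketch, assuming `mention2entity_dic`, `ccksNeo`, and `serviceWord2vec` are initialized at module level as in the rest of this file; the question is segmented with jieba first:

import jieba

question = "三体的作者是谁"
word_list = list(jieba.cut(question))
mentions = entityRecognize(word_list, question)
print(mentions)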
Example #2
def entityRecognize(word_list):
    # Earlier variant: `sentence` (the original question) is assumed to be a
    # module-level global in the source this snippet was taken from.
    entity_list = []
    for i, word in enumerate(word_list):
        entity = ""
        for temp_entity in word_list[i:]:
            entity = entity + temp_entity
            if len(entity) > 1:
                same_name_entity_list = ccksNeo.get_entity_list_by_name(entity)
                if len(same_name_entity_list) >= 1:
                    entity_list.append(entity)
    # If a shorter mention is contained in a longer one, keep the shorter one
    # only when one of its one-hop relation names still matches the question.
    for entity1 in list(entity_list):  # iterate over a copy: items are removed below
        for entity2 in entity_list:
            if entity1 != entity2 and entity1 in entity2:
                temp_list = sentence.replace(entity1, "")  # the sentence minus the mention
                segmentor1 = Segmentor()
                segmentor1.load("./ltpdata/ltp_data_v3.4.0/cws.model")
                temp_list = list(segmentor1.segment(temp_list))
                segmentor1.release()
                same_name_entity_list = ccksNeo.get_entity_list_by_name(entity1)
                flag = 0
                for entitydict in same_name_entity_list:
                    relations = ccksNeo.get_related_entities_by_neoid(entitydict['id'])
                    for relation in relations:
                        score = serviceWord2vec.get_similarity(temp_list, list(jieba.cut(relation['name'])))
                        if score > 0.2:
                            flag = 1
                if flag == 0 and entity1 in entity_list:
                    entity_list.remove(entity1)
    print("entity_list", entity_list)
    return entity_list
Example #3
def get_realtion_info(relation_candidate, remain_sentence):
    # Each candidate row: [name, relation, target_entity, target_entity_keyid].
    # `question` and the word2vec `model` are module-level globals here.
    relation_info = []
    for candidate in relation_candidate:
        segmentor1 = Segmentor()
        segmentor1.load("./ltpdata/ltp_data_v3.4.0/cws.model")
        temp = list(segmentor1.segment(remain_sentence))
        segmentor1.release()
        # Strip the candidate's words out of the remaining sentence.
        guanxideci = jieba.cut(candidate[0])
        for word in guanxideci:
            if word in model and word in temp:
                temp.remove(word)
        # Jaccard distance between the leftover words and the relation name.
        temp2 = [candidate[1]]
        set1 = set(temp)
        set2 = set(temp2)
        jaccard = jaccard_distance(set1, set2)
        # Edit-distance ratio between the full question and the relation name.
        edit = difflib.SequenceMatcher(None, question, candidate[1]).ratio()
        # Word2vec similarity between the leftover words and the relation name.
        w2v = serviceWord2vec.get_similarity(temp, list(jieba.cut(candidate[1])))
        # Row: entity, relation name, target entity, target keyid,
        # Jaccard distance, edit ratio, vector similarity.
        relation_info.append([candidate[0], candidate[1], candidate[2], candidate[3], jaccard, edit, w2v])
    return relation_info
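
A hedged usage sketch, assuming the module-level `question` and word2vec `model` globals are set, and that `jaccard_distance` is the set-based `nltk.metrics.jaccard_distance` (presumably what this module imports):

from nltk.metrics import jaccard_distance

# Hypothetical candidate row: [name, relation, target_entity, target_entity_keyid].
candidates = [["姚明", "身高", "226厘米", "e001"]]
rows = get_realtion_info(candidates, "的身高是多少")
for name, relation, target, keyid, jac, edit, w2v in rows:
    print(relation, jac, edit, w2v)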
Example #4
def automata(seg_list):
    threshold_1 = 0.6  # state-transition threshold for vector-similarity matching
    threshold_2 = 0.15  # threshold for relation prediction (unused in this variant)
    threshold_3 = 0.4  # threshold for answer selection from text (unused in this variant)
    states = [{
        'header': None,
        'tailer': None,
        'available_words': [],
        'path': [],
        'score': 0
    }]
    caches = {}
    for word in seg_list:
        new_states = []
        for state in states:
            state['available_words'].append(word)
            # START state: try to match an entity name.
            if state['header'] is None:
                entity_name = "".join(state['available_words'])
                same_name_entity_list = owlNeo4j.get_entity_list_by_name(entity_name)
                for entity in same_name_entity_list:
                    new_states.append({
                        'header': entity,
                        'tailer': None,
                        'available_words': [],
                        'path': [],
                        'score': 1
                    })
            # Non-START states: try to follow a relation or property.
            else:
                if state['tailer'] is None:
                    source = {
                        'name': state['header']['name'],
                        'label': state['header']['label'],
                        'neoId': state['header']['neoId']
                    }
                else:
                    source = state['tailer']
                if source['neoId'] is None:  # neoId is None: no outgoing transitions from here
                    continue
                if source['neoId'] not in caches:  # cache this entity's relations and properties
                    caches[source['neoId']] = []
                    relations = owlNeo4j.get_related_entities_by_id(source['neoId'])
                    for relation in relations:  # add relations
                        caches[source['neoId']].append(relation)
                    props = owlNeo4j.get_entity_info_by_id(source['neoId'])
                    for prop in props:  # add properties, unless a same-named relation exists
                        if any(prop == relation['name'] for relation in caches[source['neoId']]):
                            continue
                        caches[source['neoId']].append({
                            'name': prop,
                            'target_label': '属性值',  # "property value"
                            'target_name': props[prop],
                            'target_neoId': None
                        })
                # Match every relation/property by similarity; transition when
                # the score exceeds the threshold.
                link2state_map = {}
                for link in caches[source['neoId']]:
                    score = serviceWord2vec.get_similarity(
                        state['available_words'], list(jieba.cut(link['name'])))
                    if score > threshold_1:
                        # No same-named relation seen yet: transition and record the hop.
                        if link['name'] not in link2state_map:
                            new_path = [step for step in state['path']]
                            target = {
                                'name': link['target_name'],
                                'label': link['target_label'],
                                'neoId': link['target_neoId']
                            }
                            new_path.append([source, link['name'], target])
                            new_score = state['score'] * (1 + score - threshold_1)
                            new_states.append({
                                'header': state['header'],
                                'tailer': target,
                                'available_words': [],
                                'path': new_path,
                                'score': new_score
                            })
                            link2state_map[link['name']] = len(new_states) - 1
                        # A same-named relation was already added, so this is a
                        # multi-valued relation (e.g. 知名校友, "notable alumni");
                        # append this hop to the existing state.
                        else:
                            state_num = link2state_map[link['name']]
                            new_tailer = new_states[state_num]['tailer'].copy()
                            # A multi-valued relation cannot transition further,
                            # so mark the tailer's neoId as None.
                            new_tailer['neoId'] = None
                            new_states[state_num]['tailer'] = new_tailer
                            target = {
                                'name': link['target_name'],
                                'label': link['target_label'],
                                'neoId': link['target_neoId']
                            }
                            new_states[state_num]['path'].append([source, link['name'], target])
        states += new_states

    # Keep only the highest-scoring states.
    max_states = []
    for state in states:
        if state['header'] is not None:
            if (max_states == []) or (state['score'] > max_states[0]['score']):
                max_states = [state]
            elif state['score'] == max_states[0]['score']:
                # Among entity-only states (score stays 1), prefer the longest match.
                if (state['score'] == 1) and (len(state['available_words']) < len(max_states[0]['available_words'])):
                    max_states = [state]
                else:
                    max_states.append(state)
    # Sort the head entities by entity popularity.
    entities = [state['header'] for state in max_states if state['header'] is not None]
    entities = serviceKG.eneities_sort(entities)
    # If only entities were recognized (score stayed at 1), return the entity
    # list; otherwise return the best path.
    if (max_states == []) or (max_states[0]['score'] == 1):
        return {'ents': entities, 'path': []}
    else:
        paths = [state['path'] for state in max_states if state['header'] == entities[0]]
        return {'ents': [entities[0]], 'path': paths[0]}
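
A minimal driving sketch, assuming `owlNeo4j`, `serviceWord2vec`, `serviceKG`, and `jieba` are set up as elsewhere in this file:

import jieba

result = automata(list(jieba.cut("姚明的妻子是谁")))
if result['path']:
    for source, relation, target in result['path']:  # each hop: [source, relation, target]
        print(source['name'], '--', relation, '-->', target['name'])
else:
    print([ent['name'] for ent in result['ents']])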
Example #5
def entityLink(entity_list, question):
    # Look up each mention's candidate entities in the KB and compute the
    # features used to score them and pick the head entity.
    allentity_info = []
    for name in entity_list:
        simple_name = name
        if '_(' in name:
            simple_name = name[:name.find('_(')]
        elif '_（' in name:  # assumed full-width variant of the disambiguation suffix
            simple_name = name[:name.find('_（')]

        name_simi_score = serviceWord2vec.get_similarity(list(jieba.cut(question)), list(jieba.cut(simple_name)))
        entity_total = ccksNeo.get_entity_list_by_name(name)  # all entities matching the mention
        # Remove the mention's characters from the question, counting how many occur.
        in_question_word = 0
        temp = question
        for ch in simple_name:
            if ch in question:
                temp = temp.replace(ch, "")
                in_question_word = in_question_word + 1
        temp0 = temp  # the question minus the mention

        for entity in entity_total:
            relation_list = []
            entity_Id = entity['id']
            relations = ccksNeo.get_related_entities_by_id(entity['id'])
            max_relation_score = 0
            relation_in_question = 0
            for relation in relations:  # distinct edges; names may repeat
                relation_list.append(relation['name'])
                # Best similarity between any relation name and the leftover sentence.
                score = serviceWord2vec.get_similarity(list(jieba.cut(temp0)),
                                                       list(jieba.cut(relation['name'])))
                if score > max_relation_score:
                    max_relation_score = score
                if relation['name'] in temp0:
                    relation_in_question = 1
            link_relation_num = len(relation_list)

            # Is the mention quoted in the question (book-title or quote marks)?
            if "《" + simple_name + "》" in question or "\"" + simple_name + "\"" in question or "“" + simple_name + "”" in question:
                be_included = 1
            else:
                be_included = 0
            relative_position = question.find(simple_name) / len(question)
            have_question_word = 0
            min_distance = 100
            for question_word in question_words:
                if question_word in question:
                    have_question_word = 1
                    if min_distance > abs(question.find(question_word) - question.find(simple_name)):
                        min_distance = abs(question.find(question_word) - question.find(simple_name))
            # Does the mention contain digits or Latin letters?
            have_alpha_or_digit = 0
            if re.findall('[0-9a-zA-Z]+', simple_name):
                have_alpha_or_digit = 1
            entity_length = len(simple_name)

            if simple_name in question:
                name_in_question = 1
            else:
                name_in_question = 0

            levenshtein_score = Levenshtein.distance(simple_name, question)
            # Feature row: mention, entity id, name similarity, mention chars in
            # question, best relation similarity, relation literally in question,
            # relation count, quoted, relative position, has question word,
            # min distance to question word, has alpha/digit, mention length,
            # mention in question, Levenshtein distance to the question.
            entity_info = [name, entity_Id, name_simi_score, in_question_word, max_relation_score,
                           relation_in_question,
                           link_relation_num, be_included, relative_position, have_question_word, min_distance,
                           have_alpha_or_digit, entity_length, name_in_question, levenshtein_score]
            allentity_info.append(entity_info)

    print(allentity_info)
    return allentity_info
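
A hedged sketch of chaining Example #1 and this linker; the feature rows would normally feed a trained ranker, but picking the highest name-similarity column works as a naive baseline:

question = "三体的作者是谁"
mentions = entityRecognize(list(jieba.cut(question)), question)
features = entityLink(mentions, question)
best = max(features, key=lambda row: row[2])  # row[2] is name_simi_score
print(best[0], best[1])  # mention and entity id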
Example #6
def automata(seg_list):
    threshold_1 = 0.5  # state-transition threshold for vector-similarity matching
    threshold_2 = 0.15  # threshold for accepting a predicted relation
    threshold_3 = 0.4  # threshold for answer selection from descriptive text
    states = [{
        'header': None,
        'tailer': None,
        'available_words': [],
        'path': [],
        'score': 0
    }]
    caches = {}
    for word in seg_list:
        new_states = []
        for state in states:
            state['available_words'].append(word)
            # START state: try to match an entity name.
            if state['header'] is None:
                entity_name = "".join(state['available_words'])
                same_name_entity_list = owlNeo4j.get_entity_list_by_name(entity_name)
                for entity in same_name_entity_list:
                    new_states.append({
                        'header': entity,
                        'tailer': None,
                        'available_words': [],
                        'path': [],
                        'score': 1
                    })
            # Non-START states: try to follow a relation or property.
            else:
                if state['tailer'] is None:
                    source = {
                        'name': state['header']['name'],
                        'label': state['header']['label'],
                        'neoId': state['header']['neoId']
                    }
                else:
                    source = state['tailer']
                if source['neoId'] is None:  # neoId is None: no outgoing transitions from here
                    continue
                if source['neoId'] not in caches:  # cache this entity's relations and properties
                    caches[source['neoId']] = []
                    relations = owlNeo4j.get_related_entities_by_id(source['neoId'])
                    for relation in relations:  # add relations
                        caches[source['neoId']].append(relation)
                    props = owlNeo4j.get_entity_info_by_id(source['neoId'])
                    for prop in props:  # add properties, unless a same-named relation exists
                        if any(prop == relation['name'] for relation in caches[source['neoId']]):
                            continue
                        caches[source['neoId']].append({
                            'name': prop,
                            'target_label': '属性值',  # "property value"
                            'target_name': props[prop],
                            'target_neoId': None
                        })
                # Match every relation/property by similarity; transition when
                # the score exceeds the threshold.
                link2state_map = {}
                for link in caches[source['neoId']]:
                    score = serviceWord2vec.get_similarity(
                        state['available_words'], list(jieba.cut(link['name'])))
                    if score > threshold_1:
                        # No same-named relation seen yet: transition and record the hop.
                        if link['name'] not in link2state_map:
                            new_path = [step for step in state['path']]
                            target = {
                                'name': link['target_name'],
                                'label': link['target_label'],
                                'neoId': link['target_neoId']
                            }
                            new_path.append([source, link['name'], target])
                            new_score = state['score'] * (1 + score - threshold_1)
                            new_states.append({
                                'header': state['header'],
                                'tailer': target,
                                'available_words': [],
                                'path': new_path,
                                'score': new_score
                            })
                            link2state_map[link['name']] = len(new_states) - 1
                        # A same-named relation was already added, so this is a
                        # multi-valued relation (e.g. 知名校友, "notable alumni");
                        # append this hop to the existing state.
                        else:
                            state_num = link2state_map[link['name']]
                            new_tailer = new_states[state_num]['tailer'].copy()
                            # A multi-valued relation cannot transition further,
                            # so mark the tailer's neoId as None.
                            new_tailer['neoId'] = None
                            new_states[state_num]['tailer'] = new_tailer
                            target = {
                                'name': link['target_name'],
                                'label': link['target_label'],
                                'neoId': link['target_neoId']
                            }
                            new_states[state_num]['path'].append([source, link['name'], target])
        states += new_states

    # If no path was found, fall back to relation prediction.
    if all(state['path'] == [] for state in states):
        relation_p = None
        for state in states:
            if (state['header'] is not None) and (state['available_words'] != []):
                source = {
                    'name': state['header']['name'],
                    'label': state['header']['label'],
                    'neoId': state['header']['neoId']
                }
                if relation_p is None:
                    question = '_' + ''.join(state['available_words'])
                    res = owlSubServers.relation_predict(question)
                    if res is None:
                        break
                    relation_p = res['answer']
                    point_predicted = res['point']
                    if point_predicted < threshold_2:
                        break
                if source['neoId'] not in caches:  # guard: this entity was never expanded above
                    continue
                # Match every cached relation/property against the predicted
                # relation; transition when the score exceeds the threshold.
                for link in caches[source['neoId']]:
                    score = serviceWord2vec.get_similarity(
                        list(jieba.cut(relation_p)), list(jieba.cut(link['name'])))
                    if score > threshold_1:
                        new_path = [step for step in state['path']]
                        target = {
                            'name': link['target_name'],
                            'label': link['target_label'],
                            'neoId': link['target_neoId']
                        }
                        new_path.append([source, link['name'], target])
                        new_score = state['score'] * (1 + score - threshold_1)
                        states.append({
                            'header': state['header'],
                            'tailer': target,
                            'available_words': [],
                            'path': new_path,
                            'score': new_score
                        })

    # Still no path: take the head entity's description and select an answer
    # from that text.
    if all(state['path'] == [] for state in states):
        for state in states:
            if (state['header'] is not None) and (state['available_words'] != []):
                description = state['header']['description']
                res = owlSubServers.answer_selection(str(''.join(seg_list)), str(description))
                if res is None:
                    break
                answer = res['answer']
                point = float(res['point'])
                if point > threshold_3:
                    abstract = answer if len(answer) < 10 else answer[:8] + '...'
                    new_path = [step for step in state['path']]
                    source = {
                        'name': state['header']['name'],
                        'label': state['header']['label'],
                        'neoId': state['header']['neoId']
                    }
                    target = {
                        'name': abstract,
                        'label': '实体描述文本',  # "entity description text"
                        'neoId': None,
                        'ans_from_desc': answer
                    }
                    new_path.append([source, 'description', target])
                    new_score = state['score'] + 0.00001
                    states.append({
                        'header': state['header'],
                        'tailer': target,
                        'available_words': [],
                        'path': new_path,
                        'score': new_score
                    })

    # Keep only the highest-scoring states.
    max_states = []
    for state in states:
        if state['header'] is not None:
            if (max_states == []) or (state['score'] > max_states[0]['score']):
                max_states = [state]
            elif state['score'] == max_states[0]['score']:
                # Among entity-only states (score stays 1), prefer the longest match.
                if (state['score'] == 1) and (len(state['available_words']) < len(max_states[0]['available_words'])):
                    max_states = [state]
                else:
                    max_states.append(state)
    # Sort the head entities by entity popularity.
    entities = [state['header'] for state in max_states if state['header'] is not None]
    entities = serviceKG.eneities_sort(entities)
    # If only entities were recognized, return the entity list; otherwise the best path.
    if (max_states == []) or (max_states[0]['score'] == 1):
        return {'ents': entities, 'path': []}
    else:
        paths = [state['path'] for state in max_states if state['header'] == entities[0]]
        return {'ents': [entities[0]], 'path': paths[0]}
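
Both automata variants grow a state's score multiplicatively: each accepted hop multiplies the running score by (1 + similarity - threshold_1). A quick worked example with this variant's threshold_1 = 0.5:

threshold_1 = 0.5
score = 1.0                    # an entity match always starts the score at 1
for sim in (0.8, 0.6):         # two hops accepted with these similarities
    score *= 1 + sim - threshold_1
print(round(score, 2))         # 1.3 * 1.1 = 1.43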
Example #7
def knowledge_graph(question, neoid=None, autopick=False):  # autopick: choose among candidate entities automatically
    # If an entity has already been chosen, return its lookup result directly.
    if neoid is not None:
        return decorate(neoid, style='BASIC')
    question = question.strip()
    if any(num in question for num in num_list):
        switch = True
    else:
        switch = False
    for queryword in queryword_list:
        if queryword in question:
            question = question.replace(queryword, '')
    # Comparison questions ("is A higher/lower than B").
    pattern = r'^.+比.+(高|低).*$'
    if re.search(pattern, question) is not None:
        seg_list = serviceQA.segment(question)
        seg_list_complete = []
        for seg in seg_list:
            seg_list_complete.append(seg.word)
        relatedwords = [u'利率', u'产品利率', u'存款利率', u'贷款利率']
        word_1, word_2 = '', ''
        for seg in seg_list_complete:
            if seg in namelist and seg_list_complete.index(seg) < seg_list_complete.index('比'):
                word_1 = seg
                continue
            if seg in namelist and seg_list_complete.index(seg) > seg_list_complete.index('比'):
                word_2 = seg
                break
        if len(owlNeo4j.get_entity_list_by_name(word_1)) > 0 and len(
                owlNeo4j.get_entity_list_by_name(word_2)) > 0:
            word_1 = owlNeo4j.get_entity_list_by_name(word_1)[0]
            word_2 = owlNeo4j.get_entity_list_by_name(word_2)[0]
            for word in relatedwords:
                if word in word_1 and word in word_2:
                    return decorate(data='1', style='COM', question=question)
    # Query by category.
    if 'c::' in question:
        category = question.split('c::')[1].strip()
        for node in kb:
            for tag in node['taglist'].split(','):
                score = owlNeo4j.entity_similarity(category, tag)
                if category == tag or score >= 0.5:
                    return decorate('2', 'CAT', question=question)
    # Query by relation.
    if 'r::' in question:
        relation = question.split('r::')[1].strip()
        if relation.find('<') == -1:
            for link in links:
                score = serviceWord2vec.get_similarity(
                    list(jieba.cut(relation)), list(jieba.cut(link['name'])))
                if relation == link['name'] or score >= 0.6:
                    return decorate('3', 'LIN', question=question)
        else:
            return decorate('3', 'LIN', question=question)
    # Aggregation questions: a rate word, a category word, and a number word.
    seg_list = serviceQA.segment(question)
    for seg1 in seg_list:
        if seg1.word in [u'利率', u'产品利率', u'存款利率', u'贷款利率']:
            for seg2 in seg_list:
                if seg2.word in catelist:
                    for seg3 in seg_list:
                        if seg3.word in num_dict:
                            return decorate('4', 'IND', question=question)
    # Retrieval questions: a rate word, a category word, a comparator, and a numeral.
    for seg1 in seg_list:
        if seg1.word in [u'利率', u'产品利率', u'存款利率', u'贷款利率']:
            for seg2 in seg_list:
                if seg2.word in catelist:
                    for seg3 in seg_list:
                        if seg3.word in [u'高于', u'低于', u'等于']:
                            for seg4 in seg_list:
                                if seg4.flag == 'm':
                                    return decorate('5', 'RET', question=question)
    # Procedural questions.
    pre = sequence_class.question_class(question)
    if pre == 1:
        result = serviceQA.autoseq(question)
        if result != 0:
            return decorate(result, style='QA')
    # Fall through to Chinese QA.
    qa_result = serviceQA.chinese_qa(question, switch)
    logging.info("qa_result:" +
                 json.dumps(qa_result, encoding='utf-8', ensure_ascii=False))
    if qa_result is None:
        return None
    # Entity lookup results.
    if 'question' in qa_result:  # a similar question with the same (entity, relation) pair exists
        return decorate(qa_result['question'], style='QUE')
    if len(qa_result['path']) == 0:  # empty path: no relation matched
        if autopick or (len(qa_result['ents']) == 1):  # autopick enabled, or a single entity
            return decorate(qa_result['ents'][0]['neoId'], style='BASIC')
        else:  # several entities and autopick disabled
            return decorate(qa_result['ents'], style='SNET')
    else:
        if qa_result['ents'][0]['neoId'] is None:
            return decorate(qa_result, style='TS')  # full-text retrieval
        return decorate(qa_result, style='QA')  # answer from a property or a matched (entity, property, entity) triple
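
A hedged call sketch, assuming the module-level services (serviceQA, owlNeo4j, decorate, num_list, queryword_list, and friends) are configured as in this file:

res = knowledge_graph("姚明的身高是多少", autopick=True)
if res is not None:
    print(res)  # a decorate(...) payload: entity card, QA path, or candidate list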
Example #8
def entityLink(entity_list, question):
    # Earlier variant: look up each mention's candidate entities and compute
    # the features used to score them and pick the head entity.
    allentity_info = []

    for name in entity_list:
        entity_total = ccksNeo.get_entity_list_by_name(name)  # all entities matching the mention
        temp = question.replace(name, "")  # the question minus the mention
        segmentor1 = Segmentor()
        segmentor1.load("./ltpdata/ltp_data_v3.4.0/cws.model")
        temp = list(segmentor1.segment(temp))  # leftover sentence, segmented
        segmentor1.release()

        for entity in entity_total:
            relation_list = []
            entity_Id = entity['id']
            relations = ccksNeo.get_related_entities_by_neoid(entity['id'])
            max_relation_score = 0
            for relation in relations:  # distinct edges; names may repeat
                relation_list.append(relation['name'])
                # Best similarity between any relation name and the leftover sentence.
                score = serviceWord2vec.get_similarity(temp, list(jieba.cut(relation['name'])))
                if score > max_relation_score:
                    max_relation_score = score

            link_relation_num = len(relation_list)
            relation_list_type = set(relation_list)
            link_relation_type_num = len(relation_list_type)

            # Is the mention quoted in the question (book-title or quote marks)?
            if "《" + name + "》" in question or "\"" + name + "\"" in question or "“" + name + "”" in question:
                be_included = 1
            else:
                be_included = 0
            relative_position = question.find(name) / len(question)
            have_question_word = 0
            min_distance = 100
            for question_word in question_words:
                if question_word in question:
                    have_question_word = 1
                    if min_distance > abs(question.find(question_word) - question.find(name)):
                        min_distance = abs(question.find(question_word) - question.find(name))
            # Does the mention contain digits or Latin letters?
            have_alpha_or_digit = 0
            if re.findall('[0-9a-zA-Z]+', name):
                have_alpha_or_digit = 1
            entity_length = len(name)
            # Feature row: mention, entity id, best relation similarity,
            # relation count, distinct relation count, quoted, relative position,
            # has question word, min distance to question word, has alpha/digit,
            # mention length.
            entity_info = [name, entity_Id, max_relation_score, link_relation_num, link_relation_type_num, be_included,
                           relative_position, have_question_word, min_distance,
                           have_alpha_or_digit, entity_length]
            allentity_info.append(entity_info)

    return allentity_info
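
A small sketch, assuming only this variant's 11-column row layout, that labels the feature columns for inspection:

FEATURE_NAMES = [
    'mention', 'entity_id', 'max_relation_score', 'link_relation_num',
    'link_relation_type_num', 'be_included', 'relative_position',
    'have_question_word', 'min_distance', 'have_alpha_or_digit', 'entity_length',
]

def rows_to_dicts(allentity_info):
    # Zip each feature row with its column names for easier debugging.
    return [dict(zip(FEATURE_NAMES, row)) for row in allentity_info]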