Example #1
0
def entityRecognize(word_list, question):
    """Recognize candidate entity mentions for *question*.

    Builds every contiguous multi-character span of *word_list*, looks up the
    span (and its aliases from ``mention2entity_dic``) in the knowledge graph,
    and keeps names that match at least one graph entity.  A short mention
    contained in a longer one is then dropped unless one of its one-hop
    relation names is similar (> 0.2) to the rest of the question.

    :param word_list: segmented words of the question, in original order
    :param question: the original question string
    :return: list of candidate entity names (may contain duplicates)
    """
    entity_list = []
    # Bug fix: the original used word_list.index(word), which always finds the
    # FIRST occurrence of a word, so duplicate words produced wrong spans.
    for start in range(len(word_list)):
        entity = ""
        for temp_entity in word_list[start:]:
            entity = entity + temp_entity
            if len(entity) <= 1:
                continue
            all_entity = [entity]
            if entity in mention2entity_dic:  # the mention has known aliases
                all_entity.extend(mention2entity_dic[entity])
            for en in all_entity:
                same_name_entity_list = ccksNeo.get_entity_list_by_name(en)
                extra_name = ccksNeo.get_entity_info_by_name(en)
                for name in extra_name:
                    # Attributes ending in 名/称 ("name") supply extra surface
                    # forms — except foreign-language name attributes.
                    if name[0][-1] in ('名', '称') and len(name[1]) > 1:
                        if name[0] not in ('英文名', '英文名称', '外文名', '外文名称'):
                            entity_list.append(name[1])
                if len(same_name_entity_list) >= 1:
                    entity_list.append(en)
    # If a short mention is contained in a longer one, check the short
    # mention's one-hop relation names against the question remainder.
    # Bug fix: iterate a snapshot — the original removed elements from
    # entity_list while iterating it, which silently skips entries.
    for entity1 in list(entity_list):
        # Remove the mention's characters from the question.
        temp = question
        for ch in entity1:
            if ch in question:
                temp = temp.replace(ch, "")
        for entity2 in entity_list:
            if entity1 != entity2 and entity1 in entity2:
                same_name_entity_list = ccksNeo.get_entity_list_by_name(entity1)
                flag = 0
                for entitydict in same_name_entity_list:
                    relations = ccksNeo.get_related_entities_by_id(entitydict['id'])
                    for relation in relations:  # compare against the leftover sentence
                        score = serviceWord2vec.get_similarity(
                            list(jieba.cut(temp)), list(jieba.cut(relation['name'])))
                        if score > 0.2:
                            flag = 1
                if flag == 0 and entity1 in entity_list:
                    entity_list.remove(entity1)

    print("entity_list", entity_list)
    return entity_list
Example #2
0
def entityRecognize(word_list):
    """Recognize candidate entity mentions from segmented words.

    Earlier variant of the two-argument ``entityRecognize``: builds every
    contiguous multi-character span of *word_list*, keeps spans that match at
    least one graph entity, then prunes short mentions contained in longer
    ones unless a one-hop relation name is similar to the leftover sentence.

    NOTE(review): relies on a module-level ``sentence`` (the raw question
    text) — confirm it is set before calling.

    :param word_list: segmented words of the question, in original order
    :return: list of candidate entity names
    """
    # Bug fix: entity_list was never initialized — first append raised NameError.
    entity_list = []
    # Bug fix: word_list.index(word) always found the first occurrence,
    # producing wrong spans when a word repeats.
    for start in range(len(word_list)):
        entity = ""
        for temp_entity in word_list[start:]:
            entity = entity + temp_entity
            if len(entity) > 1:
                same_name_entity_list = ccksNeo.get_entity_list_by_name(entity)
                if len(same_name_entity_list) >= 1:
                    entity_list.append(entity)

    # Load the segmenter once instead of once per (entity1, entity2) pair.
    segmentor1 = Segmentor()
    segmentor1.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    try:
        # Bug fix: iterate a snapshot — the original removed elements from
        # entity_list while iterating it, which silently skips entries.
        for entity1 in list(entity_list):
            # Segment the sentence with the mention removed; depends only on
            # entity1, so compute it once per outer iteration.
            temp_list = segmentor1.segment(sentence.replace(entity1, ""))
            for entity2 in entity_list:
                if entity1 != entity2 and entity1 in entity2:
                    same_name_entity_list = ccksNeo.get_entity_list_by_name(entity1)
                    flag = 0
                    for entitydict in same_name_entity_list:
                        relations = ccksNeo.get_related_entities_by_neoid(entitydict['id'])
                        for relation in relations:  # compare against the leftover sentence
                            score = serviceWord2vec.get_similarity(
                                temp_list, list(jieba.cut(relation['name'])))
                            if score > 0.2:
                                flag = 1
                    if flag == 0 and entity1 in entity_list:
                        entity_list.remove(entity1)
    finally:
        segmentor1.release()
    print("entity_list", entity_list)
    # Return the result for consistency with the two-argument variant.
    return entity_list
Example #3
0
def entityLink(entity_list, question):  # find DB entities for each mention and build scoring features
    """Collect candidate graph entities for every mention and compute link features.

    For each mention name, queries all entities sharing that name and builds a
    feature vector per entity (name similarity, best relation similarity,
    position in the question, distance to a question word, etc.) for a
    downstream ranking model.

    :param entity_list: candidate mention names from ``entityRecognize``
    :param question: the original question string
    :return: list of per-entity feature lists, one entry per candidate entity
    """
    allentity_info = []
    for name in entity_list:
        # Strip a disambiguation suffix like "xxx_(yyy)".
        # Bug fix: the original's elif repeated the ASCII '_(' test, making it
        # unreachable — presumably the full-width '_（' form was intended
        # (TODO confirm against the knowledge-base naming scheme).
        simple_name = name
        if '_(' in name:
            simple_name = name[:name.find('_(')]
        elif '_（' in name:
            simple_name = name[:name.find('_（')]

        name_simi_score = serviceWord2vec.get_similarity(
            list(jieba.cut(question)), list(jieba.cut(simple_name)))
        entity_total = ccksNeo.get_entity_list_by_name(name)  # all entities sharing the name

        # Remove the mention's characters from the question, counting how many
        # actually occurred.  (The original ran this identical loop twice.)
        in_question_word = 0
        temp0 = question
        for ch in simple_name:
            if ch in question:
                temp0 = temp0.replace(ch, "")
                in_question_word = in_question_word + 1

        for entity in entity_total:
            relation_list = []
            entity_Id = entity['id']
            relations = ccksNeo.get_related_entities_by_id(entity['id'])
            max_relation_score = 0
            relation_in_question = 0
            for relation in relations:  # distinct relations may share a type
                relation_list.append(relation['name'])
                # Best similarity of any relation name to the leftover sentence.
                score = serviceWord2vec.get_similarity(list(jieba.cut(temp0)),
                                                       list(jieba.cut(relation['name'])))
                if score > max_relation_score:
                    max_relation_score = score
                if relation['name'] in temp0:
                    relation_in_question = 1
            link_relation_num = len(relation_list)

            # Is the mention quoted / bracketed in the question?
            if "《" + simple_name + "》" in question or "\"" + simple_name + "\"" in question or "“" + simple_name + "”" in question:
                be_included = 1
            else:
                be_included = 0
            relative_position = question.find(simple_name) / len(question)
            have_quesition_word = 0
            min_distance = 100  # sentinel: no question word found
            for question_word in question_words:
                if question_word in question:
                    have_quesition_word = 1
                    if min_distance > abs(question.find(question_word) - question.find(simple_name)):
                        min_distance = abs(question.find(question_word) - question.find(simple_name))
            # Does the mention contain digits or Latin letters?
            have_alpha_or_digit = 0
            pattern1 = re.compile('[0-9]+')
            pattern2 = re.compile('[a-z]+')
            pattern3 = re.compile('[A-Z]+')
            match1 = pattern1.findall(simple_name)
            match2 = pattern2.findall(simple_name)
            match3 = pattern3.findall(simple_name)
            if match1 or match2 or match3:
                have_alpha_or_digit = 1
            entity_length = len(simple_name)

            if simple_name in question:
                name_in_question = 1
            else:
                name_in_question = 0

            levenshtein_score = Levenshtein.distance(simple_name, question)

            entity_info = [name, entity_Id, name_simi_score, in_question_word, max_relation_score,
                           relation_in_question,
                           link_relation_num, be_included, relative_position, have_quesition_word, min_distance,
                           have_alpha_or_digit, entity_length, name_in_question, levenshtein_score]

            allentity_info.append(entity_info)

    print(allentity_info)
    return allentity_info
Example #4
0
def entityLink(entity_list, question):  # find DB entities for each mention and build scoring features
    """Collect candidate graph entities for every mention and compute link features.

    Earlier variant of the 15-feature ``entityLink``: for each mention name,
    queries all entities sharing that name and builds an 11-element feature
    vector per entity for a downstream ranking model.

    :param entity_list: candidate mention names from ``entityRecognize``
    :param question: the original question string
    :return: list of per-entity feature lists, one entry per candidate entity
    """
    allentity_info = []

    # Load the cws model once (the original reloaded it for every mention)
    # and guarantee release even if a lookup raises.
    segmentor1 = None
    if entity_list:
        segmentor1 = Segmentor()
        segmentor1.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    try:
        for name in entity_list:
            entity_total = ccksNeo.get_entity_list_by_name(name)  # all entities sharing the name
            # Segment the question with the mention removed.
            temp = list(segmentor1.segment(question.replace(name, "")))

            for entity in entity_total:
                relation_list = []
                entity_Id = entity['id']
                relations = ccksNeo.get_related_entities_by_neoid(entity['id'])
                max_relation_score = 0
                for relation in relations:  # distinct relations may share a type
                    relation_list.append(relation['name'])
                    # Best similarity of any relation name to the leftover sentence.
                    score = serviceWord2vec.get_similarity(temp, list(jieba.cut(relation['name'])))
                    if score > max_relation_score:
                        max_relation_score = score

                link_relation_num = len(relation_list)
                relation_list_type = set(relation_list)
                link_relation_type_num = len(relation_list_type)

                # Is the mention quoted / bracketed in the question?
                if "《" + name + "》" in question or "\"" + name + "\"" in question or "“" + name + "”" in question:
                    be_included = 1
                else:
                    be_included = 0
                relative_position = question.find(name) / len(question)
                have_quesition_word = 0
                min_distance = 100  # sentinel: no question word found
                for question_word in question_words:
                    if question_word in question:
                        have_quesition_word = 1
                        if min_distance > abs(question.find(question_word) - question.find(name)):
                            min_distance = abs(question.find(question_word) - question.find(name))
                # Does the mention contain digits or Latin letters?
                have_alpha_or_digit = 0
                pattern1 = re.compile('[0-9]+')
                pattern2 = re.compile('[a-z]+')
                pattern3 = re.compile('[A-Z]+')
                match1 = pattern1.findall(name)
                match2 = pattern2.findall(name)
                match3 = pattern3.findall(name)
                if match1 or match2 or match3:
                    have_alpha_or_digit = 1
                entity_length = len(name)

                entity_info = [name, entity_Id, max_relation_score, link_relation_num, link_relation_type_num, be_included,
                               relative_position, have_quesition_word, min_distance,
                               have_alpha_or_digit, entity_length]
                allentity_info.append(entity_info)
    finally:
        if segmentor1 is not None:
            segmentor1.release()

    return allentity_info