def entityRecognize(word_list, question):
    """Recognize candidate entity mentions in *question*.

    Builds every contiguous concatenation of the segmented words, expands each
    via the mention->entity alias dictionary, keeps names confirmed by the KB,
    then prunes short mentions that are contained in longer ones unless one of
    their one-hop relations is similar to the remaining question text.

    Args:
        word_list: list of segmented words of the question (order preserved).
        question:  the original question string.

    Returns:
        list of candidate entity-name strings (may contain duplicates).
    """
    entity_list = []
    for word in word_list:
        entity = ""
        # Concatenate word spans starting at `word` to form candidate mentions.
        for temp_entity in word_list[word_list.index(word):]:
            entity = entity + temp_entity
            all_entity = [entity]
            if len(entity) > 1:
                if entity in mention2entity_dic:  # mention has linked entities
                    for alias in mention2entity_dic[entity]:
                        all_entity.append(alias)
                for en in all_entity:
                    same_name_entity_list = ccksNeo.get_entity_list_by_name(en)
                    extra_name = ccksNeo.get_entity_info_by_name(en)
                    # Collect alternative names from attributes ending in 名/称,
                    # excluding English/foreign-name attributes.
                    for name in extra_name:
                        if name[0][-1] == '名' or name[0][-1] == '称':
                            if len(name[1]) > 1:
                                if name[0] != '英文名' and name[0] != '英文名称' and name[0] != '外文名' and name[0] != '外文名称':
                                    entity_list.append(name[1])
                    if len(same_name_entity_list) >= 1:
                        entity_list.append(en)
    # Prune: if a short mention is contained in a longer one, keep it only when
    # one of its one-hop relation names is similar to the rest of the question.
    # Iterate over a snapshot — removing from the list being iterated would
    # silently skip elements.
    for entity1 in list(entity_list):
        # Remove the mention's characters from the question to get the residue.
        temp = question
        for ch in entity1:
            if ch in question:
                temp = temp.replace(ch, "")
        for entity2 in entity_list:
            if entity1 != entity2 and entity1 in entity2:
                same_name_entity_list = ccksNeo.get_entity_list_by_name(entity1)
                flag = 0
                for entitydict in same_name_entity_list:
                    relations = ccksNeo.get_related_entities_by_id(entitydict['id'])
                    for relation in relations:
                        # Compare relation name against the residual sentence.
                        score = serviceWord2vec.get_similarity(list(jieba.cut(temp)), list(jieba.cut(relation['name'])))
                        if score > 0.2:
                            flag = 1
                            break  # one matching relation is enough
                    if flag == 1:
                        break
                if flag == 0 and entity1 in entity_list:
                    entity_list.remove(entity1)
    print("entity_list", entity_list)
    return entity_list
def entityLink(entity_list, question): # (通过实体名找到数据库中的各实体并通过评分策略找到中心实体) scores = [] allentity_info = [] for name in entity_list: simple_name = name if '_(' in name: simple_name = name[:name.find('_(')] elif '_(' in name: simple_name = name[:name.find('_(')] # print(4) name_simi_score = serviceWord2vec.get_similarity(list(jieba.cut(question)), list(jieba.cut(simple_name))) entity_total = ccksNeo.get_entity_list_by_name(name) # 指称的所有实体 # print(entity_total) in_question_word = 0 temp = question for j in simple_name: if j in question: temp = temp.replace(j, "") in_question_word = in_question_word + 1 temp = question for i in simple_name: if i in question: temp = temp.replace(i, "") # print("temp", temp) temp0 = temp # temp = question.replace(name, "") # 去掉指称的剩余句子 # print(temp) #剩余句子分词 for entity in entity_total: relation_list = [] entity_Id = entity['id'] # print(5) relations = ccksNeo.get_related_entities_by_id(entity['id']) # print(relations) max_relation_score = 0 relation_in_question = 0 for relation in relations: # 不同的关系,可能有类别相同的关系 relation_list.append(relation['name']) score = serviceWord2vec.get_similarity(list(jieba.cut(temp0)), list(jieba.cut(relation['name']))) # 只要实体关系和句子沾边 if score > max_relation_score: max_relation_score = score if relation['name'] in temp0: relation_in_question = 1 link_relation_num = len(relation_list) # relation_list_type = set(relation_list) # link_relation_type_num = len(relation_list_type) # print(question) if "《" + simple_name + "》" in question or "\"" + simple_name + "\"" in question or "“" + simple_name + "”" in question: be_included = 1 else: be_included = 0 relative_position = question.find(simple_name) / len(question) have_quesition_word = 0 # question_word_num = 0 min_distance = 100 for question_word in question_words: if question_word in question: have_quesition_word = 1 # question_word_num = question_word_num+1 if min_distance > abs(question.find(question_word) - question.find(simple_name)): min_distance = 
abs(question.find(question_word) - question.find(simple_name)) have_alpha_or_digit = 0 pattern1 = re.compile('[0-9]+') pattern2 = re.compile('[a-z]+') pattern3 = re.compile('[A-Z]+') match1 = pattern1.findall(simple_name) match2 = pattern2.findall(simple_name) match3 = pattern3.findall(simple_name) if match1 or match2 or match3: have_alpha_or_digit = 1 entity_length = len(simple_name) if simple_name in question: name_in_question = 1 else: name_in_question = 0 levenshtein_score = Levenshtein.distance(simple_name, question) ''' if name == c_name: is_correct_name =1 else: is_correct_name =0 if entity['keyId'] == c_keyid: is_correct_entity = 1 else: is_correct_entity = 0 print(q_id, entity_keyId, one_relation, link_relation_num, link_relation_type_num, be_included, relative_position, have_quesition_word, min_distance, have_alpha_or_digit, entity_length, is_correct_entity) sentence = q_id+' '+entity_keyId+' '+str(one_relation)+' '+str(link_relation_num)+' '+str(link_relation_type_num)+' '+str(be_included)+' '+str(relative_position)+' '+str(have_quesition_word)+' '+str(min_distance)+' '+str(have_alpha_or_digit)+' '+str(entity_length)+' '+str(is_correct_entity)+'\n' p = open("../NLPCC_KBQA/nlpcc-iccpol-2016.kbqa.training-data_processtry2.txt", 'a', encoding="utf-8") p.writelines(sentence) p.close() ''' entity_info = [name, entity_Id, name_simi_score, in_question_word, max_relation_score, relation_in_question, link_relation_num, be_included, relative_position, have_quesition_word, min_distance, have_alpha_or_digit, entity_length, name_in_question, levenshtein_score] allentity_info.append(entity_info) print(allentity_info) # time.sleep(10) return allentity_info '''