def check(sent):
    """Fuzzy-classify the longest matching span of *sent* as an item or a place.

    Tokenizes the sentence, then repeatedly tries the joined remaining tokens
    against the module-level ``items`` and ``places`` lists (UQRatio > 80),
    dropping the last token after each failed attempt.

    Returns:
        (text, code): the matched span and 'item'/'place', or ('', '') when
        nothing matched.
    """
    words = word_tokenize(sent)
    while words:
        candidate = " ".join(words)
        if any(fuzz.UQRatio(candidate, it) > 80 for it in items):
            return candidate, 'item'
        if any(fuzz.UQRatio(candidate, pl) > 80 for pl in places):
            return candidate, 'place'
        # no match for this span: retry without the trailing token
        words.pop()
    return '', ''
def match_constraint(self, qa_res, constraint, id2linked_ent):
    """Score each QA candidate in *qa_res* against the extracted constraints.

    For every answer, looks up its linked entity and asks
    ``self.constr_extractor`` which constraints the entity satisfies.
    Mutates each ``ans`` dict in place:
      * sets ``ans['constr_score']`` (starts at 0; +0.3 per satisfied
        constraint, -0.2 per violated one),
      * on a satisfied constraint also records ``constr_name`` /
        ``constr_val`` from the entity,
      * when the extractor yields nothing, falls back to fuzzy-matching each
        non-empty constraint value against the answer's entity name and
        penalizes ``link_score`` by 0.3 on a близкое match (>= 60), else
        ``constr_score`` by 0.2.

    Args:
        qa_res: list of answer dicts (must contain 'id' and 'entity').
        constraint: mapping constr_name -> constr_val (list-like; [0] used).
        id2linked_ent: mapping answer id -> linked-entity dict with an 'ent' key.

    Returns:
        None; results are written into the ``qa_res`` dicts.
    """
    non_empty_constr = [(constr_name, constr_val)
                        for constr_name, constr_val in constraint.items()
                        if constr_val != '' and constr_val is not None]
    for ans in qa_res:
        linked_ent = id2linked_ent[ans['id']]
        match_res = None
        try:
            match_res = self.constr_extractor.match_constraint(
                constraint, linked_ent)
        except Exception:
            # Was a bare `except:`; narrowed so Ctrl-C / SystemExit still
            # propagate. Extraction failure is treated as "no match info".
            traceback.print_exc()
        ans['constr_score'] = 0
        if match_res is not None and len(match_res) != 0:
            # logger.info('限制匹配结果: '+str(match_res))
            for constr, is_match in match_res.items():
                if is_match:
                    ans['constr_score'] += 0.3
                    ans['constr_name'] = constr
                    ans['constr_val'] = linked_ent['ent'][constr]
                else:
                    ans['constr_score'] += -0.2
        else:
            # Fallback: no structured match result — penalize either the link
            # score (constraint value resembles the entity name) or the
            # constraint score.
            for constr_name, constr_val in non_empty_constr:
                if fuzz.UQRatio(constr_val[0], ans['entity']) >= 60:
                    ans['link_score'] -= 0.3
                else:
                    ans['constr_score'] += -0.2
def match_constraint(self, constraint: dict, linked_ent):
    """Decide which constraints the linked entity's properties satisfy.

    Phase 1: for each non-empty constraint whose value does not already
    resemble the entity name (UQRatio < 50), scan the entity's properties
    for a name/value containing the constraint name or value. If any such
    constraint matches no property at all, returns None (entity cannot
    satisfy the query).

    Phase 2: re-checks each provisionally matched property against the
    concrete constraint values (place, time window, currency/bank/airline/
    price), flipping its flag to False on a mismatch.

    Returns:
        dict mapping property name -> bool (satisfied), or None when some
        constraint found no candidate property.
    """
    # check whether limit is available
    exist_constr = []
    res_constr = {}
    ent = linked_ent['ent']
    for constr_name, constr_val in constraint.items():
        # skip empty constraints and ones that just restate the entity name
        if constr_val != '' and constr_val is not None \
                and fuzz.UQRatio(constr_val[0], ent['name']) < 50:
            match_constr = False
            for rel, rel_val in ent.items():
                if rel in self.remove_prop:
                    continue
                rel_val = str(rel_val)
                # substring match in either direction: constraint name/value
                # against property name/value
                if (constr_name in rel or constr_name in rel_val
                        or constr_val[0] in rel or constr_val[0] in rel_val):
                    match_constr = True
                    res_constr[rel] = True
                    exist_constr.append((rel, constr_name))
            if not match_constr:
                return None
    # filter result
    # NOTE(review): the class matches "HH:" style prefixes of opening hours;
    # presumably property values look like "9:00-18:00" — confirm upstream.
    time_pattern = re.compile(r'\d+[:, :]')
    for rel, constr_name in exist_constr:
        # NOTE(review): assumes ent[rel] is a string here (Phase 1 only
        # stringified a local copy) — confirm property values are str.
        rel_val = ent[rel].lower()
        if '地点' in constr_name:
            for item in constraint['地点']:
                if item not in rel_val:
                    res_constr[rel] = False
        elif '时间' in constr_name:
            for item in constraint['时间']:
                # ''' or item == '最早' or item == '最晚''''
                # literal markers ("24小时"/earliest/latest) must appear verbatim
                if (item == '24小时' or item == '最早'
                        or item == '最晚') and item not in rel_val:
                    res_constr[rel] = False
                    continue
                bg_ed = time_pattern.findall(rel_val)
                bg_ed = [int(x[:-1]) for x in bg_ed]  # strip trailing separator
                if '时' not in item:
                    # bare number: require it to fall strictly inside the
                    # [open, close) pair when exactly two hours were parsed
                    # NOTE(review): int(item) raises if item is non-numeric —
                    # callers presumably pre-filter; confirm.
                    if len(bg_ed) == 2:
                        if not (bg_ed[0] < int(item) < bg_ed[1]):
                            res_constr[rel] = False
        elif '币种' in constr_name or '银行' in constr_name \
                or '航空公司' in constr_name or '价格' in constr_name:
            for item in constraint[constr_name]:
                if item not in rel_val:
                    res_constr[rel] = False
    return res_constr
def link(self, sent, sent_cut, pos_tag, limits=None):
    """Link mentions in the sentence to knowledge-graph entities.

    Retrieves mentions from the cut/POS-tagged sentence, expands each to
    candidate entity names, and scores every candidate against every
    rewritten entity name with a 50/50 blend of word-embedding cosine
    similarity and fuzzy string ratio. Keeps at most three candidates per
    mention that clear ``config.simi_ths``.

    Returns:
        list of result dicts ('ent', 'mention', 'id', 'score', 'source'),
        sorted by score descending.
    """
    mentions = retrieve_mention(sent_cut, pos_tag)
    if mentions == []:
        return []
    logger.debug('指称: ' + str(mentions))
    blend = 0.5  # weight of embedding similarity vs. fuzzy-string similarity
    linked = []
    for raw_mention in mentions:
        mention = raw_mention.lower()
        if self.is_not_entity(mention):
            continue
        candidates = []
        for cand in self.convert_mention2ent(mention):
            for ent in self.id2ent.values():
                rewritten = self.rewrite_ent_name(ent['name'])
                if rewritten == '':
                    continue
                # skip pairs where exactly one side is Chinese (or English):
                # cross-script comparisons produce unreliable scores
                only_one_cn = contain_chinese(cand) and not contain_chinese(ent['name'])
                only_one_en = contain_english(cand) and not contain_english(ent['name'])
                if only_one_cn or only_one_en:
                    continue
                emb_sim = cosine_word_similarity(cand, rewritten)
                fuzz_sim = fuzz.UQRatio(cand, rewritten) / 100
                candidates.append({
                    'ent': ent,
                    'mention': mention,
                    'id': ent['neoId'],
                    'score': blend * emb_sim + (1 - blend) * fuzz_sim,
                    'source': 'rule'
                })
        candidates.sort(key=lambda c: c['score'], reverse=True)
        linked.extend(c for c in candidates[:3] if c['score'] > config.simi_ths)
    linked.sort(key=lambda c: c['score'], reverse=True)
    return linked
def link(self, sent, sent_cut, pos_tag, limits=None):
    """Link mentions to entities, with special handling for list questions
    ("哪些") about Chinese/foreign airlines and for IATA/ICAO code lookups.

    Falls through to the generic embedding + fuzzy-string scoring used by
    the базовый linker for ordinary mentions.

    Returns:
        list of result dicts ('ent', 'mention', 'id', 'score', 'source'),
        sorted by score descending (list-question branch returns early,
        unsorted, with fixed score 1.5; code matches score 2.5).
    """
    # use bert embedding to fuzzy match entities
    # mention_list = recognize_entity(sent)
    mention_list = retrieve_mention(sent_cut, pos_tag)
    #mention_list.append("中国航空公司")
    is_list = False
    if '哪些' in mention_list:
        # "which ..." marker: treat the query as a list question
        is_list = True
    logger.debug('指称: ' + str(mention_list))
    if mention_list == []:
        return []
    res = []
    country_list = ['俄罗斯', '挪威', '美国', '蒙古', '泰国']
    if is_list:
        # List question, case 1: "which Chinese airlines" — return every
        # entity categorized as a domestic airline.
        if all([word in mention_list for word in ['中国', '航空公司']]):
            for ent in self.id2ent.values():
                if '类别' not in ent:
                    continue
                if ent['类别'] == '国内航空公司':
                    res.append({
                        'ent': ent,
                        'mention': ''.join(['中国', '航空公司']),
                        'id': ent['neoId'],
                        'score': 1.5,
                        'source': 'rule'
                    })
        # List question, case 2: "<country> airlines" — foreign airlines whose
        # alias or company name mentions the country.
        elif any([word in mention_list for word in country_list]) and '航空公司' in mention_list:
            word = [word for word in mention_list if word in country_list][0]
            #print(word)
            for ent in self.id2ent.values():
                flag = False
                if '类别' not in ent:
                    continue
                if ent['类别'] == '国外航空公司':
                    # HACK: aliases are stored as a Python-literal string and
                    # parsed with eval() — unsafe if the KG data is untrusted;
                    # consider ast.literal_eval instead.
                    if '别名' in ent and any(
                            [word in name for name in eval(ent['别名'])]):
                        flag = True
                    if '公司名称' in ent and word in ent['公司名称']:
                        flag = True
                if flag:
                    print(ent['name'], ent)
                    res.append({
                        'ent': ent,
                        'mention': '国外航空公司',
                        'id': ent['neoId'],
                        'score': 1.5,
                        'source': 'rule'
                    })
        return res
    for mention in mention_list:
        mention = mention.lower()
        one_res = []
        if not contain_chinese(mention):
            # Non-Chinese mention: try an exact airport/airline code lookup
            # (IATA/ICAO), keyed by whether the question talks about airports
            # or airlines.
            search_list = []
            if '机场' in mention_list:
                search_list = ['机场三字码', 'ICAO机场代码']
            elif '航空公司' in mention_list:
                search_list = ['IATA代码', 'ICAO代码']
            for ent in self.id2ent.values():
                if (len(search_list) == 0) or (not any(
                        [key in ent for key in search_list])):
                    continue
                ent_iata = ''
                ent_icao = ''
                ent_three = ''
                ent_icao_a = ''
                if 'IATA代码' in ent:
                    ent_iata = ent['IATA代码']
                if 'ICAO代码' in ent:
                    ent_icao = ent['ICAO代码']
                if '机场三字码' in ent:
                    ent_three = ent['机场三字码']
                if 'ICAO机场代码' in ent:
                    ent_icao_a = ent['ICAO机场代码']
                # codes are uppercase in the KG; compare case-insensitively
                if mention.upper() == ent_iata or mention.upper() == ent_icao or \
                        mention.upper() == ent_three or mention.upper() == ent_icao_a:
                    res.append({
                        'ent': ent,
                        'mention': mention,
                        'id': ent['neoId'],
                        'score': 2.5,
                        'source': 'rule'
                    })
            continue
        if self.is_not_entity(mention):
            continue
        # cand_name = self.convert_abstract_verb(
        #     mention, sent, limits)
        cand_names = self.convert_mention2ent(mention)
        # entity alias expansion
        for ent in self.id2ent.values():
            # for ent_name in self.ent_names:
            if 'name' not in ent:
                continue
            # don't link airport entities unless the mention mentions airports
            # NOTE(review): assumes every named entity carries a '类别' key —
            # confirm, otherwise this raises KeyError.
            if '机场' not in mention and (ent['类别'] == '国外机场'
                                        or ent['类别'] == '国内机场'):
                continue
            ent_name = ent['name']
            ent_name_rewrite = self.rewrite_ent_name(ent_name)
            if ent_name_rewrite == '':
                continue
            for cand_name in cand_names:
                # Skip pairs where the candidate is Chinese but the entity is
                # not (or candidate is English but the entity is not).
                # Reason: bert embeddings sometimes give spuriously high
                # similarity when only one side contains English, so such
                # cross-script pairs are filtered out.
                if contain_chinese(cand_name) and not contain_chinese(
                        ent_name) or contain_english(
                            cand_name) and not contain_english(ent_name):
                    continue
                # blend embedding cosine similarity with fuzzy string ratio
                RATIO = 0.5
                score = cosine_word_similarity(cand_name, ent_name_rewrite)
                score1 = fuzz.UQRatio(cand_name, ent_name_rewrite) / 100
                score = RATIO * score + (1 - RATIO) * score1
                one_res.append({
                    'ent': ent,
                    'mention': mention,
                    'id': ent['neoId'],
                    'score': score,
                    'source': 'rule'
                })
        one_res.sort(key=lambda x: x['score'], reverse=True)
        # keep at most 3 candidates per mention above the similarity threshold
        for a_res in one_res[:3]:
            if a_res['score'] > config.simi_ths:
                res.append(a_res)
    res.sort(key=lambda x: x['score'], reverse=True)
    return res
top_ratio = 0 top_index = 0 for index in xrange(len(broker_data)): ratio = fuzz.partial_token_sort_ratio(item, broker_data[index]) if top_ratio < ratio: top_ratio = ratio top_index = index print "Partial Token Sort Ratio: {0} : {1} - {2}%".format( item, broker_data[top_index], top_ratio) # NOT BAD!!! for item in raw_data: top_ratio = 0 top_index = 0 for index in xrange(len(broker_data)): ratio = fuzz.UQRatio(item, broker_data[index]) if top_ratio < ratio: top_ratio = ratio top_index = index print "UQRatio: {0} : {1} - {2}%".format(item, broker_data[top_index], top_ratio) for item in raw_data: top_ratio = 0 top_index = 0 for index in xrange(len(broker_data)): ratio = fuzz.UWRatio(item, broker_data[index]) if top_ratio < ratio: top_ratio = ratio top_index = index print "UWRatio: {0} : {1} - {2}%".format(item, broker_data[top_index],
def extract_rel(self, sent_cut, linked_ent, limits=None, thresh=config.prop_ths):
    """Extract the most likely relation (property) of a linked entity that the
    question asks about.

    Scores every (property, remaining-question-word) pair with a blend of
    embedding cosine similarity (0.6) and fuzzy string ratio (0.4), with a
    rule-based override and a 1.2x boost for substring matches, then returns
    the top one or two scoring relations above *thresh*.

    Note: ``thresh`` defaults to ``config.prop_ths`` evaluated at import time.

    Returns:
        list of at most two result dicts (id, mention, entity, rel_name,
        rel_val, link_score, rel_score, rel_source), or [] when nothing
        clears the threshold.
    """
    ent = linked_ent['ent']
    mention = linked_ent.get('mention', ent['name'])
    # extract all properties; only a single constraint is supported
    props_dict = {}
    for prop, value in ent.items():
        if prop not in self.remove_prop:
            props_dict[prop] = str(value)
    # compute which limits are satisfied (dead code kept for reference)
    '''
    try:
        res_limit = self.cal_limit(limits, props_dict)
    except:
        pdb.set_trace()
    wrong_restriction = ''
    accepted_limit = {}
    if res_limit is None:
        return None
    for limit in res_limit.keys():
        if not res_limit[limit]:
            wrong_restriction += ', ' + limit + ' 限制错误'
        else:
            accepted_limit[limit] = props_dict[limit]
    # cut
    limit_list = list(map(lambda x: x[1], list(limits.items())))
    rest_words = list(filter(
        lambda x: x not in cand_name and '机场' not in x and x not in limit_list,
        cut_words))
    '''
    # question words left over after removing the mention itself and any
    # airport words; note `w not in mention` is a substring test
    rest_words = [
        w for w in sent_cut if w not in mention and '机场' not in w
    ]
    props_set = list(props_dict.keys())
    # NOTE(review): raises ValueError when the entity has no 'name' property —
    # presumably every KG entity carries one; confirm.
    props_set.remove('name')
    # cal prop rel similarity
    res = []
    used_pairs = set()
    for prop in props_set:
        old_prop = prop
        for word in rest_words:
            # NOTE(review): this strips '服务' from `prop` inside the word
            # loop, so the stripped form persists for subsequent words while
            # `old_prop` keeps the original key for the output.
            prop = prop.replace('服务', '')
            cos_score = cosine_word_similarity(word, self.normalize_prop(prop))
            text_score = fuzz.UQRatio(word, prop) / 100
            ratio = 0.6
            score = ratio * cos_score + (1 - ratio) * text_score
            # rule-based score overrides the blend when it exceeds 1
            rule_score = self.normalize_ratio(word, prop)
            score = rule_score if rule_score > 1 else score
            # substring containment (multi-char words only) boosts the score
            if word in prop and len(word) > 1:
                score *= 1.2
            if score > thresh and (word, prop) not in used_pairs:
                used_pairs.add((word, prop))
                # res.append([neoId, cand_name, ent_name, {
                #     prop: props_dict[prop]}, accepted_limit, score, ent_score])
                res.append({
                    'id': ent['neoId'],
                    'mention': mention,
                    'entity': ent['name'],
                    'rel_name': old_prop,
                    'rel_val': props_dict[old_prop],
                    'link_score': linked_ent['score'],
                    'rel_score': score,
                    'rel_source': 'match'
                })
    if len(res) == 0:
        return []
    res.sort(key=lambda x: x['rel_score'], reverse=True)
    # res_lang = []
    # for item in res:
    #     rel = list(item[3].keys())[0]
    #     val = item[3][rel]
    #     ans = item[2] + '的' + rel + '是' + val
    #     if wrong_restriction != '':
    #         ans += ' ' + wrong_restriction
    #     res_lang.append([ans] + item)
    # if the top two properties both score high, return two answers
    sel_num = 1
    if len(res) > 1 and res[1]['rel_score'] > 0.91:
        sel_num += 1
    # sel_num = 2 if res_lang[0][res] > 0.91 and res_lang[0][6] > 0.91 else 1
    # return res_lang[:min(len(res), sel_num)]
    return res[:sel_num]
def testFuzzy(self):
    """Print every fuzzywuzzy scorer applied to one hospital-name pair.

    Each scorer is evaluated in both argument orders so asymmetric scorers
    (e.g. partial_ratio) are visible side by side.
    """
    short_name = 'MISSION HOSPITAL'
    long_name = 'MISSION HOSPITAL REGIONAL MEDICAL CENTER'
    scorers = [
        ('ratio', fuzz.ratio),
        ('partial_ratio', fuzz.partial_ratio),
        ('token_sort_ratio', fuzz.token_sort_ratio),
        ('partial_token_sort_ratio', fuzz.partial_token_sort_ratio),
        ('token_set_ratio', fuzz.token_set_ratio),
        ('partial_token_set_ratio', fuzz.partial_token_set_ratio),
        ('QRatio', fuzz.QRatio),
        ('UQRatio', fuzz.UQRatio),
        ('WRatio', fuzz.WRatio),
        ('UWRatio', fuzz.UWRatio),
    ]
    for label, scorer in scorers:
        print(label, scorer(short_name, long_name))
        print(label, scorer(long_name, short_name))
ele1 = data_line[0].split('||') ele2 = data_line[1].split('||') kod1 = ele1[0] firm1 = ele1[1] kod2 = ele2[0] firm2 = ele2[1] # score = fuzz.token_set_ratio(firm1, firm2) score_r = fuzz.ratio(firm1, firm2) score_pr = fuzz.partial_ratio(firm1, firm2) score_tsor = fuzz.token_sort_ratio(firm1, firm2) score_tser = fuzz.token_set_ratio(firm1, firm2) score_ptsor = fuzz.partial_token_sort_ratio(firm1, firm2) score_ptser = fuzz.partial_token_set_ratio(firm1, firm2) score_qr = fuzz.QRatio(firm1, firm2) score_uqr = fuzz.UQRatio(firm1, firm2) score_wr = fuzz.WRatio(firm1, firm2) score_uwr = fuzz.UWRatio(firm1, firm2) # print('kod1:' + kod1) # print('firm1:' + firm1) # print('kod2:' + kod2) # print('firm2:' + firm2) # print('score:' + str(score)) # if score_r > 90 or score_pr > 90 or score_tsor > 90 or score_tser > 90 or score_ptsor > 90 or score_ptser > 90 \ # or score_qr > 90 or score_uqr > 90 or score_wr > 90 or score_uwr > 90: if score_tser > 90: temp3 = ( kod1, firm1, kod2, firm2, score_r, score_pr, score_tsor, score_tser, score_ptsor, score_ptser, score_qr,