def create_fasttext_sup_train_data(index, train_data_file, kb_dict_file, kb_alia_file, stopword_file, out_file,
                                   mode=fasttextConfig.create_data_word):
    print("create {} sup train data".format(index))
    kb_alias_df = pd.read_csv(kb_alia_file)
    stopwords = data_utils.get_stopword_list(stopword_file)
    train_datas = open(train_data_file, 'r', encoding='utf-8').readlines()
    kb_dict = com_utils.pickle_load(kb_dict_file)
    train_out_file = open(out_file, 'w', encoding='utf-8')
    text_ids = {}
    max_extend_count = 3
    for line in tqdm(train_datas, desc='deal {} train file'.format(index)):
        jstr = ujson.loads(line)
        text = jstr['text']
        text_id = jstr['text_id']
        # skip texts that have already been extended the maximum number of times
        if text_ids.get(text_id) == max_extend_count:
            continue
        mentions = jstr['mention_data']
        for mention in mentions:
            mention_id = mention['kb_id']
            mention_text = mention['mention']
            neighbor_text = com_utils.get_neighbor_sentence(text, mention_text)
            # true values
            kb_entity = kb_dict.get(mention_id)
            if kb_entity is not None:
                out_str = com_utils.get_entity_mention_pair_text(kb_entity['text'], neighbor_text, stopwords,
                                                                 cut_client, fasttextConfig.label_true, mode)
                train_out_file.write(out_str)
            # false values
            alia_ids = []
            alia_count = 0
            alias_df = kb_alias_df[kb_alias_df['subject'] == com_utils.cht_to_chs(mention_text)]
            for _, item in alias_df.iterrows():
                a_id = str(item['subject_id'])
                if a_id != mention_id:
                    alia_ids.append(a_id)
                    alia_count += 1
                if alia_count == max_extend_count:
                    break
            if len(alia_ids) > 0:
                for alia_id in alia_ids:
                    alia_entity = kb_dict.get(alia_id)
                    if alia_entity is not None:
                        out_str = com_utils.get_entity_mention_pair_text(alia_entity['text'], neighbor_text,
                                                                         stopwords, cut_client,
                                                                         fasttextConfig.label_false, mode)
                        train_out_file.write(out_str)
        # add text
        text_ids = com_utils.dict_add(text_ids, text_id)
    # clean up resources
    train_out_file.close()
    train_datas = None
    train_out_file = None
    kb_alias_df = None
    stopwords = None
    kb_dict = None
def analysis_test_ner_result():
    print("start analysis test ner result...")
    ner_test_datas = com_utils.pickle_load(fileConfig.dir_ner + fileConfig.file_ner_test_predict_tag)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_jieba_kb)
    out_file = open(fileConfig.dir_result + fileConfig.file_ner_test_result_analysis, 'w', encoding='utf-8')
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    gen_more_words = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_analysis_gen_more)
    text_id = 1
    for data in tqdm(ner_test_datas, 'find entity'):
        text = ''.join(data['text'])
        tag_list = data['tag']
        start_index = 0
        mention_length = 0
        is_find = False
        mentions = []
        type_dict = {}
        # use tag find
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or (tag.find(nerConfig.I_seg) > -1 and not is_find):
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                is_find = True
            elif tag.find(nerConfig.E_seg) > -1 and not is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                # mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'T': 'NER', 'mention': mention, 'offset': str(start_index),
                                 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            # elif tag == nerConfig.I_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.I_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
            # elif tag == nerConfig.E_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.E_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                # mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'T': 'NER', 'mention': mention, 'offset': str(start_index),
                                 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag == nerConfig.O_seg:
                is_find = False
                mention_length = 0
                type_dict = {}
        # use jieba find
        jieba_entities = cut_client.cut_text(text)
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg or tag == nerConfig.I_seg + nerConfig.KB_seg or tag == nerConfig.E_seg + nerConfig.KB_seg:
            # if tag.find(nerConfig.B_seg) > -1 or tag.find(nerConfig.I_seg) > -1 or tag.find(nerConfig.E_seg) > -1:
            jieba_offset = i
            jieba_char = text[i]
            jieba_text = get_jieba_mention(jieba_entities, jieba_char)
            if jieba_text is None:
                continue
            elif jieba_text == '_' or jieba_text == '-':
                continue
            elif len(jieba_text) == 1:
                continue
            elif stopwords.get(jieba_text) is not None:
                continue
            elif gen_more_words.get(jieba_text) is not None:
                continue
            jieba_offset = jieba_offset - jieba_text.find(jieba_char)
            if len(jieba_text) <= comConfig.max_jieba_cut_len and (jieba_dict.get(jieba_text) is not None):
                type_str = tag.split('_')[1] if tag.find('_') > -1 else 'O'
                if jieba_text is None:
                    continue
                if not is_already_find_mention(mentions, jieba_text, jieba_offset):
                    # jieba_offset = text.find(jieba_text)
                    mentions.append({'T': 'JIEBA', 'mention': jieba_text, 'offset': str(jieba_offset),
                                     'type': type_str})
        # find inner brackets mentions
        bracket_mentions = data_utils.get_mention_inner_brackets(text, tag_list)
        for mention in bracket_mentions:
            mention['T'] = 'bracket'
        if len(bracket_mentions) > 0:
            mentions += bracket_mentions
        # completion mentions
        # mentions_com = []
        # for mention in mentions:
        #     mention_str = mention['mention']
        #     try:
        #         for find in re.finditer(mention_str, text):
        #             find_offset = find.span()[0]
        #             if find_offset != int(mention['offset']):
        #                 mentions_com.append(
        #                     {'T': 'COM', 'mention': mention['mention'], 'offset': str(find_offset),
        #                      'type': mention['type']})
        #     except BaseException:
        #         # print("occur error when match mention str in completion mentions, error value:{} text:{}".format(
        #         #     mention_str, text))
        #         pass
        #     mentions_com.append(mention)
        # mentions = mentions_com
        # completion mentions
        out_file.write('\n')
        result_str = ''
        for i in range(len(text)):
            result_str += text[i] + '-' + tag_list[i] + ' '
        out_file.write(' text_id:{}, text:{} '.format(text_id, result_str))
        out_file.write('\n')
        out_file.write(' gen_mentions:{} '.format(ujson.dumps(mentions, ensure_ascii=False)))
        out_file.write('\n')
        text_id += 1
def create_dev_mention_data(mode, ner_datas, out_file):
    ner_datas = com_utils.pickle_load(ner_datas)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_jieba_kb)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    gen_more_words = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_analysis_gen_more)
    text_id = 1
    dev_mention_data = []
    # count = 0
    for data in tqdm(ner_datas, 'find entity'):
        # count += 1
        # if count < 1496:
        #     continue
        text = ''.join(data['text'])
        tag_list = data['tag']
        start_index = 0
        mention_length = 0
        is_find = False
        mentions = []
        type_dict = {}
        # use tag find
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or (tag.find(nerConfig.I_seg) > -1 and not is_find):
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length = 1
                is_find = True
            elif tag.find(nerConfig.E_seg) > -1 and not is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'mention': mention, 'offset': str(start_index), 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            # elif tag == nerConfig.I_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.I_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
            # elif tag == nerConfig.E_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.E_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'mention': mention, 'offset': str(start_index), 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag == nerConfig.O_seg:
                is_find = False
                mention_length = 0
                type_dict = {}
        # use jieba find
        jieba_entities = cut_client.cut_text(text)
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg or tag == nerConfig.I_seg + nerConfig.KB_seg or tag == nerConfig.E_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or tag.find(nerConfig.I_seg) > -1 or tag.find(nerConfig.E_seg) > -1:
                jieba_offset = i
                jieba_char = text[i]
                jieba_text = get_jieba_mention(jieba_entities, jieba_char, jieba_offset)
                if jieba_text is None:
                    continue
                elif jieba_text == '_' or jieba_text == '-':
                    continue
                elif data_utils.is_punctuation(jieba_text):
                    continue
                elif len(jieba_text) == 1:
                    continue
                elif stopwords.get(jieba_text) is not None:
                    continue
                # elif gen_more_words.get(jieba_text) is not None:
                #     continue
                jieba_offset = jieba_offset - jieba_text.find(jieba_char)
                if len(jieba_text) <= comConfig.max_jieba_cut_len and (jieba_dict.get(jieba_text) is not None):
                    type_str = tag.split('_')[1] if tag.find('_') > -1 else 'O'
                    if jieba_text is None:
                        continue
                    if not is_already_find_mention(mentions, jieba_text, jieba_offset):
                        mentions.append({'mention': jieba_text, 'offset': str(jieba_offset), 'type': type_str})
        # find inner brackets mentions
        bracket_mentions = data_utils.get_mention_inner_brackets(text, tag_list)
        if len(bracket_mentions) > 0:
            mentions += bracket_mentions
        # completion mentions
        # mentions_com = []
        # for mention in mentions:
        #     mention_str = mention['mention']
        #     try:
        #         for find in re.finditer(mention_str, text):
        #             find_offset = find.span()[0]
        #             if find_offset != int(mention['offset']):
        #                 mentions_com.append(
        #                     {'mention': mention['mention'], 'offset': str(find_offset), 'type': mention['type']})
        #     except BaseException:
        #         # print("occur error when match mention str in completion mentions, error value:{} text:{}".format(
        #         #     mention_str, text))
        #         pass
        #     mentions_com.append(mention)
        # mentions = mentions_com
        # optim mentions
        delete_mentions = []
        mentions.sort(key=get_mention_len)
        for mention in mentions:
            mention_offset = int(mention['offset'])
            mention_len = len(mention['mention'])
            for sub_mention in mentions:
                if mention_offset != int(sub_mention['offset']) and int(sub_mention['offset']) in range(
                        mention_offset, mention_offset + mention_len):
                    if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
                if mention_offset == int(sub_mention['offset']) and len(mention['mention']) > len(
                        sub_mention['mention']):
                    if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
        if len(delete_mentions) > 0:
            change_mentions = []
            for mention in mentions:
                if not data_utils.is_mention_already_in_list(delete_mentions, mention):
                    change_mentions.append(mention)
            mentions = change_mentions
        change_mentions = []
        for mention in mentions:
            if not data_utils.is_mention_already_in_list(change_mentions, mention) and \
                    mention['mention'] not in comConfig.punctuation:
                change_mentions.append(mention)
        mentions = change_mentions
        # optim mentions
        # sort mentions
        mentions.sort(key=get_offset)
        # optimize the mention data
        mentions_optim = []
        for mention in mentions:
            mentions_optim.append({'mention': get_optim_mention_text(jieba_entities, mention['mention']),
                                   'offset': mention['offset'], 'type': mention['type']})
        if mode == 1:
            dev_mention_data.append({'text_id': str(text_id), 'text': text, 'mention_data': mentions_optim})
        elif mode == 2:
            dev_mention_data.append({'text_id': str(text_id), 'text': text, 'mention_data': mentions_optim,
                                     'mention_data_original': data['mention_data_original']})
        elif mode == 3:
            dev_mention_data.append({'text_id': str(text_id), 'text': text, 'mention_data': mentions_optim})
        text_id += 1
    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create dev data with mentions, mode:{}".format(mode))
def eval_sup(mode=fasttextConfig.create_data_word):
    print("start use the fasttext/supervised model to predict eval data")
    if not os.path.exists(fileConfig.dir_result):
        os.mkdir(fileConfig.dir_result)
    # unsup_model = fastText.load_model(
    #     fileConfig.dir_fasttext + fileConfig.file_fasttext_model.format(fasttextConfig.model_skipgram))
    unsup_model = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    sup_model = fastText.load_model(fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_word_model)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    dev_file = open(fileConfig.dir_ner + fileConfig.file_ner_eval_cands_data, 'r', encoding='utf-8')
    out_file = open(fileConfig.dir_result + fileConfig.file_result_eval_data, 'w', encoding='utf-8')
    # entity disambiguation
    for line in tqdm(dev_file, 'entity disambiguation'):
        if len(line.strip('\n')) == 0:
            continue
        jstr = ujson.loads(line)
        dev_entity = {}
        text = com_utils.cht_to_chs(jstr['text'].lower())
        dev_entity['text_id'] = jstr['text_id']
        dev_entity['text'] = jstr['text']
        mention_data = jstr['mention_data']
        mentions = []
        for mention in mention_data:
            mention_text = mention['mention']
            if mention_text is None:
                continue
            cands = mention['cands']
            if len(cands) == 0:
                continue
            # use supervised model to choose candidates
            supervise_cands = []
            for cand in cands:
                neighbor_text = com_utils.get_neighbor_sentence(text, com_utils.cht_to_chs(mention_text.lower()))
                cand_entity = kb_dict.get(cand['cand_id'])
                if cand_entity is not None:
                    out_str = com_utils.get_entity_mention_pair_text(
                        com_utils.cht_to_chs(cand_entity['text'].lower()), neighbor_text, stopwords, cut_client,
                        mode=mode)
                    result = sup_model.predict(out_str.strip('\n'))[0][0]
                    if result == fasttextConfig.label_true:
                        supervise_cands.append(cand)
            if len(supervise_cands) == 0:
                supervise_cands = cands
            # unsupervised model chooses the item
            max_cand = None
            # score list
            score_list = []
            mention_neighbor_sentence = text
            for i, cand in enumerate(supervise_cands):
                # score = fasttext_get_sim(unsup_model, mention_neighbor_sentence,
                #                          com_utils.cht_to_chs(cand['cand_text'].lower()), stopwords)
                score = gensim_get_sim(unsup_model, mention_neighbor_sentence,
                                       com_utils.cht_to_chs(cand['cand_text'].lower()), stopwords)
                if score < fasttextConfig.min_entity_similarity_threshold:
                    continue
                score_list.append({'cand_id': cand['cand_id'], 'cand_score': score,
                                   'cand_type': cand['cand_type']})
            score_list.sort(key=get_socre_key, reverse=True)
            if len(score_list) > 0:
                max_cand = score_list[0]
            # find the best cand
            if max_cand is not None:
                mentions.append({'kb_id': max_cand['cand_id'], 'mention': mention['mention'],
                                 'offset': mention['offset']})
        # optim mentions
        delete_mentions = []
        mentions.sort(key=get_mention_len)
        for optim_mention in mentions:
            mention_offset = int(optim_mention['offset'])
            mention_len = len(optim_mention['mention'])
            for sub_mention in mentions:
                if mention_offset != int(sub_mention['offset']) and int(sub_mention['offset']) in range(
                        mention_offset, mention_offset + mention_len):
                    if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
        if len(delete_mentions) > 0:
            change_mentions = []
            for optim_mention in mentions:
                if not data_utils.is_mention_already_in_list(delete_mentions, optim_mention):
                    change_mentions.append(optim_mention)
            mentions = change_mentions
        change_mentions = []
        for optim_mention in mentions:
            if not data_utils.is_mention_already_in_list(change_mentions, optim_mention) and \
                    optim_mention['mention'] not in comConfig.punctuation:
                change_mentions.append(optim_mention)
        mentions = change_mentions
        mentions.sort(key=get_mention_offset)
        dev_entity['mention_data'] = mentions
        out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
        out_file.write('\n')
    print("success create supervised eval result")
def predict():
    print("start use the fasttext model to predict dev data")
    if not os.path.exists(fileConfig.dir_result):
        os.mkdir(fileConfig.dir_result)
    model = fastText.load_model(
        fileConfig.dir_fasttext + fileConfig.file_fasttext_model.format(fasttextConfig.choose_model))
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    dev_file = open(fileConfig.dir_ner + fileConfig.file_ner_dev_cands_data, 'r', encoding='utf-8')
    out_file = open(fileConfig.dir_result + fileConfig.file_result_fasttext_predict, 'w', encoding='utf-8')
    # entity disambiguation
    for line in tqdm(dev_file, 'entity disambiguation'):
        jstr = ujson.loads(line)
        dev_entity = {}
        text = jstr['text']
        dev_entity['text_id'] = jstr['text_id']
        dev_entity['text'] = jstr['text']
        mention_data = jstr['mention_data']
        mentions = []
        for mention in mention_data:
            mention_text = mention['mention']
            cands = mention['cands']
            if len(cands) == 0:
                continue
            if len(cands) == 1:
                mentions.append({'kb_id': str(cands[0]['cand_id']), 'mention': mention['mention'],
                                 'offset': str(mention['offset'])})
                continue
            max_index = 0
            max_score = 0.0
            # mention_neighbor_sentence = get_neighbor_sentence(text, mention_text)
            mention_neighbor_sentence = text
            for i, cand in enumerate(cands):
                score = fasttext_get_sim(model, mention_neighbor_sentence, cand['cand_text'], stopwords)
                if score > max_score:
                    max_score = score
                    max_index = i
            if max_score < fasttextConfig.min_entity_similarity_threshold:
                continue
            mentions.append({'kb_id': cands[max_index]['cand_id'], 'mention': mention['mention'],
                             'offset': mention['offset']})
        # filter mentions: keep only the longest mention at each offset
        choose_offset = {}
        for i, mention in enumerate(mentions):
            mention_offset = mention['offset']
            if choose_offset.get(mention_offset) is not None:
                if len(choose_offset.get(mention_offset).split('-')[1]) < len(mention['mention']):
                    choose_offset[mention_offset] = str(i) + '-' + mention['mention']
            else:
                choose_offset[mention_offset] = str(i) + '-' + mention['mention']
        choose_mentions = []
        for key, value in choose_offset.items():
            choose_mentions.append(mentions[int(value.split('-')[0])])
        dev_entity['mention_data'] = choose_mentions
        out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
        out_file.write('\n')
    print("success create predict result")
def test_sup(mode=fasttextConfig.create_data_word):
    print("start use the fasttext model/supervised model to predict test data")
    if not os.path.exists(fileConfig.dir_result):
        os.mkdir(fileConfig.dir_result)
    unsup_model_fasttext = fastText.load_model(
        fileConfig.dir_fasttext + fileConfig.file_fasttext_model.format(fasttextConfig.choose_model))
    unsup_model_gensim = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    sup_model = fastText.load_model(fileConfig.dir_fasttext + fileConfig.file_fasttext_sup_word_model)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    dev_file = open(fileConfig.dir_ner + fileConfig.file_ner_test_cands_data, 'r', encoding='utf-8')
    out_file = open(fileConfig.dir_result + fileConfig.file_result_fasttext_test, 'w', encoding='utf-8')
    # f1 params
    gen_mention_count = 0
    original_mention_count = 0
    correct_mention_count = 0
    # count = 0
    # entity disambiguation
    for line in tqdm(dev_file, 'entity disambiguation'):
        # count += 1
        # if count < 3456:
        #     continue
        jstr = ujson.loads(line)
        dev_entity = {}
        text = com_utils.cht_to_chs(jstr['text'].lower())
        dev_entity['text_id'] = jstr['text_id']
        dev_entity['text'] = jstr['text']
        mention_data = jstr['mention_data']
        original_mention_data = jstr['mention_data_original']
        mentions = []
        for mention in mention_data:
            mention_text = mention['mention']
            if mention_text is None:
                continue
            cands = mention['cands']
            if len(cands) == 0:
                continue
            # use supervised model to choose candidates
            supervise_cands = []
            for cand in cands:
                neighbor_text = com_utils.get_neighbor_sentence(text, com_utils.cht_to_chs(mention_text.lower()))
                cand_entity = kb_dict.get(cand['cand_id'])
                if cand_entity is not None:
                    out_str = com_utils.get_entity_mention_pair_text(
                        com_utils.cht_to_chs(cand_entity['text'].lower()), neighbor_text, stopwords, cut_client,
                        mode=mode)
                    # print(out_str)
                    result = sup_model.predict(out_str.replace('\n', ' '))[0][0]
                    if result == fasttextConfig.label_true:
                        supervise_cands.append(cand)
            # unsupervised model chooses the item
            max_cand = None
            if len(supervise_cands) == 0:
                supervise_cands = cands
            # score list
            score_list = []
            mention_neighbor_sentence = text
            for i, cand in enumerate(supervise_cands):
                # score_fasttext = fasttext_get_sim(unsup_model_fasttext, mention_neighbor_sentence,
                #                                   com_utils.cht_to_chs(cand['cand_text'].lower()), stopwords)
                score_gensim = gensim_get_sim(unsup_model_gensim, mention_neighbor_sentence,
                                              com_utils.cht_to_chs(cand['cand_text'].lower()), stopwords)
                # score = (0.8 * score_gensim) + (0.2 * score_fasttext)
                score = score_gensim
                # if score > max_score:
                #     max_score = score
                #     max_index = score
                if score < fasttextConfig.min_entity_similarity_threshold:
                    continue
                score_list.append({'cand_id': cand['cand_id'], 'cand_score': score,
                                   'cand_type': cand['cand_type']})
            # if max_score < fasttextConfig.min_entity_similarity_threshold:
            #     continue
            # find the best cand
            # find_type = False
            score_list.sort(key=get_socre_key, reverse=True)
            # for item in score_list:
            #     if item['cand_type'] == mention['type']:
            #         find_type = True
            # if find_type:
            #     for item in score_list:
            #         if item['cand_score'] > fasttextConfig.choose_entity_similarity_threshold:
            #             max_cand = item
            if max_cand is None:
                if len(score_list) > 0:
                    max_cand = score_list[0]
            # find the best cand
            if max_cand is not None:
                mentions.append({'kb_id': max_cand['cand_id'], 'mention': mention['mention'],
                                 'offset': mention['offset']})
        # optim mentions
        delete_mentions = []
        mentions.sort(key=get_mention_len)
        for mention in mentions:
            mention_offset = int(mention['offset'])
            mention_len = len(mention['mention'])
            for sub_mention in mentions:
                if mention_offset != int(sub_mention['offset']) and int(sub_mention['offset']) in range(
                        mention_offset, mention_offset + mention_len):
                    if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
                if mention_offset == int(sub_mention['offset']) and len(mention['mention']) > len(
                        sub_mention['mention']):
                    if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
        if len(delete_mentions) > 0:
            change_mentions = []
            for mention in mentions:
                if not data_utils.is_mention_already_in_list(delete_mentions, mention):
                    change_mentions.append(mention)
            mentions = change_mentions
        change_mentions = []
        for mention in mentions:
            if not data_utils.is_mention_already_in_list(change_mentions, mention) and \
                    mention['mention'] not in comConfig.punctuation:
                change_mentions.append(mention)
        mentions = change_mentions
        mentions.sort(key=get_mention_offset)
        # optim mentions
        # calc f1
        for mention in mentions:
            if is_find_correct_entity(mention['kb_id'], original_mention_data):
                correct_mention_count += 1
        gen_mention_count += len(mentions)
        for original_mention in original_mention_data:
            if original_mention['kb_id'] != 'NIL':
                original_mention_count += 1
        # out result
        dev_entity['mention_data'] = mentions
        dev_entity['mention_data_original'] = original_mention_data
        out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
        out_file.write('\n')
    precision = correct_mention_count / gen_mention_count
    recall = correct_mention_count / original_mention_count
    f1 = 2 * precision * recall / (precision + recall)
    print("success create test result, p:{:.4f} r:{:.4f} f1:{:.4f}".format(precision, recall, f1))
def test():
    print("start use the fasttext model to predict test data")
    if not os.path.exists(fileConfig.dir_result):
        os.mkdir(fileConfig.dir_result)
    model = fastText.load_model(
        fileConfig.dir_fasttext + fileConfig.file_fasttext_model.format(fasttextConfig.choose_model))
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    dev_file = open(fileConfig.dir_ner + fileConfig.file_ner_test_cands_data, 'r', encoding='utf-8')
    out_file = open(fileConfig.dir_result + fileConfig.file_result_fasttext_test, 'w', encoding='utf-8')
    # f1 params
    gen_mention_count = 0
    original_mention_count = 0
    correct_mention_count = 0
    # entity disambiguation
    for line in tqdm(dev_file, 'entity disambiguation'):
        jstr = ujson.loads(line)
        dev_entity = {}
        text = jstr['text']
        dev_entity['text_id'] = jstr['text_id']
        dev_entity['text'] = jstr['text']
        mention_data = jstr['mention_data']
        original_mention_data = jstr['mention_data_original']
        mentions = []
        for mention in mention_data:
            mention_text = mention['mention']
            cands = mention['cands']
            if len(cands) == 0:
                continue
            # if len(cands) == 1:
            #     mentions.append(
            #         {'kb_id': str(cands[0]['cand_id']), 'mention': mention['mention'],
            #          'offset': str(mention['offset'])})
            #     continue
            max_index = 0
            max_score = 0.0
            max_cand = None
            # mention_neighbor_sentence = get_neighbor_sentence(text, mention_text)
            # score list
            score_list = []
            mention_neighbor_sentence = text
            for i, cand in enumerate(cands):
                score = fasttext_get_sim(model, mention_neighbor_sentence, cand['cand_text'], stopwords)
                # if score > max_score:
                #     max_score = score
                #     max_index = i
                if score < fasttextConfig.min_entity_similarity_threshold:
                    continue
                score_list.append({'cand_id': cand['cand_id'], 'cand_score': score,
                                   'cand_type': cand['cand_type']})
            # if max_score < fasttextConfig.min_entity_similarity_threshold:
            #     continue
            # find the best cand
            find_type = False
            score_list.sort(key=get_socre_key, reverse=True)
            for item in score_list:
                if item['cand_type'] == mention['type']:
                    find_type = True
            if find_type:
                for item in score_list:
                    if item['cand_score'] > fasttextConfig.choose_entity_similarity_threshold:
                        max_cand = item
            if max_cand is None:
                if len(score_list) > 0:
                    max_cand = score_list[0]
            # find the best cand
            if max_cand is not None:
                if is_find_correct_entity(max_cand['cand_id'], original_mention_data):
                    correct_mention_count += 1
                mentions.append({'kb_id': max_cand['cand_id'], 'mention': mention['mention'],
                                 'offset': mention['offset']})
        # calc f1 params
        gen_mention_count += len(mentions)
        original_mention_count += len(original_mention_data)
        dev_entity['mention_data'] = mentions
        dev_entity['mention_data_original'] = original_mention_data
        out_file.write('-' * 20)
        out_file.write('\n')
        out_file.write("text_id:{}--text:{}".format(dev_entity['text_id'], dev_entity['text']))
        out_file.write('\n')
        out_file.write("mention_data:")
        out_file.write('\n')
        # generated mentions
        for mention in dev_entity['mention_data']:
            kb_mention = ''
            if mention['kb_id'] != 'NIL':
                kb_mention = ujson.dumps(kb_dict[mention['kb_id']], ensure_ascii=False)
            out_file.write('*' * 20)
            out_file.write('\n')
            out_file.write('mention_original: {}'.format(mention))
            out_file.write('\n')
            out_file.write("kb: {}".format(kb_mention))
            out_file.write('\n')
            out_file.write('*' * 20)
            out_file.write('\n')
        # original mentions
        out_file.write("kb_data:")
        out_file.write('\n')
        for mention in dev_entity['mention_data_original']:
            kb_mention = ''
            if mention['kb_id'] != 'NIL':
                kb_mention = ujson.dumps(kb_dict[mention['kb_id']], ensure_ascii=False)
            out_file.write('*' * 20)
            out_file.write('\n')
            out_file.write('kb_original: {}'.format(mention))
            out_file.write('\n')
            out_file.write("kb: {}".format(kb_mention))
            out_file.write('\n')
            out_file.write('*' * 20)
            out_file.write('\n')
        out_file.write('-' * 20)
        out_file.write('\n')
    precision = correct_mention_count / gen_mention_count
    recall = correct_mention_count / original_mention_count
    f1 = 2 * precision * recall / (precision + recall)
    print("success create test result, p:{:.4f} r:{:.4f} f1:{:.4f}".format(precision, recall, f1))
def create_fasttext_unsup_train_data():
    print("start create unsup fasttext data...")
    if not os.path.exists(fileConfig.dir_fasttext):
        os.mkdir(fileConfig.dir_fasttext)
    kb_datas = open(fileConfig.dir_data + fileConfig.file_kb_data, 'r', encoding='utf-8')
    train_datas = open(fileConfig.dir_data + fileConfig.file_train_data, 'r', encoding='utf-8')
    dev_datas = open(fileConfig.dir_data + fileConfig.file_dev_data, 'r', encoding='utf-8')
    out_file = open(fileConfig.dir_fasttext + fileConfig.file_fasttext_unsup_train_data, 'w', encoding='utf-8')
    stopword_list = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    print("prepare train data")
    train_sentence = []
    # kb data
    for line in tqdm(kb_datas, desc='deal kb data'):
        jstr = ujson.loads(line)
        train_sentence.append(data_utils.get_kb_text(jstr, cut_client, stopword_list))
    # train data
    for line in tqdm(train_datas, desc='deal train data'):
        jstr = ujson.loads(line)
        text = jstr['text']
        text_len = len(text)
        save_str = ''
        str_point = 0
        mention_datas = jstr['mention_data']
        for mention in mention_datas:
            mention_offset = int(mention['offset'])
            mention_text = mention['mention']
            sub_text = text[str_point:mention_offset]
            cut_texts = cut_client.cut_text(sub_text)
            for s_text in cut_texts:
                if s_text != ' ':
                    save_str += com_utils.cht_to_chs(s_text)
                    if not s_text.isdigit():
                        save_str += ' '
            if len(sub_text) > 0 and not sub_text.isdigit():
                save_str += ' '
            str_point += mention_offset - str_point
            save_str += mention_text
            mention_text_len = len(mention_text)
            if mention_text_len > 0 and not mention_text.isdigit():
                save_str += ' '
            str_point += mention_text_len
        if str_point < text_len:
            sub_text = text[str_point:text_len]
            cut_texts = cut_client.cut_text(sub_text)
            for s_text in cut_texts:
                if s_text != ' ':
                    save_str += com_utils.cht_to_chs(s_text)
                    if not s_text.isdigit():
                        save_str += ' '
        train_sentence.append(save_str)
    # dev data
    for line in tqdm(dev_datas, desc='deal dev data'):
        jstr = ujson.loads(line)
        text_list = cut_client.cut_text(jstr['text'].lower())
        save_str = ''
        for dev_text in text_list:
            if dev_text != ' ':
                save_str += com_utils.cht_to_chs(dev_text)
                if not dev_text.isdigit():
                    save_str += ' '
        train_sentence.append(save_str)
    line_len = len(train_sentence)
    print("save train data, data len:{}".format(line_len))
    for i, line in enumerate(train_sentence):
        if i < line_len - 1:
            out_file.writelines(line)
            out_file.write('\n')
        else:
            out_file.writelines(line)
    print("success save fasttext train file")
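
# A minimal usage sketch, not part of the original pipeline: it assumes the module is run
# directly, that cut_client and the fileConfig/fasttextConfig paths above point at existing
# data, and that the unsupervised corpus is built before any candidate ranking. The call
# order below is illustrative; uncomment only the step you need.
if __name__ == '__main__':
    # build the unsupervised fastText training corpus from kb/train/dev data
    create_fasttext_unsup_train_data()
    # predict()   # rank dev candidates with the unsupervised fastText model
    # test_sup()  # score the supervised + gensim pipeline on the test split and report p/r/f1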