import os
import random
from collections import Counter

import pandas as pd
import ujson
from tqdm import tqdm

# NOTE: fileConfig, comConfig, nerConfig, fasttextConfig, com_utils, data_utils
# and cut_client are project-local modules/objects assumed to be imported and
# initialised elsewhere in this file.


def create_kb_dict():
    if not os.path.exists(fileConfig.dir_kb_info):
        os.mkdir(fileConfig.dir_kb_info)
    with open(fileConfig.dir_data + fileConfig.file_kb_data, mode='r', encoding='utf-8') as f:
        kb_datas = f.readlines()
    kb_dict = {}
    for kb_data in tqdm(kb_datas, desc='init kb dict'):
        kb_data = ujson.loads(kb_data)
        subject_id = kb_data['subject_id']
        if subject_id in kb_dict:
            raise Exception('key : {} exist'.format(subject_id))
        # text = data_utils.get_text(kb_data['data'], kb_data['subject'])
        # collect the subject plus all of its distinct aliases
        all_alias = {}
        subject = kb_data['subject']
        all_alias = com_utils.dict_add(all_alias, subject)
        for alias_text in kb_data['alias']:
            if all_alias.get(alias_text) is not None:
                continue
            all_alias = com_utils.dict_add(all_alias, alias_text)
        text = data_utils.get_all_text(kb_data['subject'], kb_data['data'])
        kb_dict[subject_id] = {'type': kb_data['type'], 'subject': subject,
                               'alias': list(all_alias), 'text': text}
    com_utils.pickle_save(kb_dict, fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    print("create kb dict success")
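# `com_utils.dict_add` is used throughout this module as a frequency counter
# that also returns the dict it updated. A minimal sketch of the assumed
# behaviour (hypothetical helper; the project's actual implementation may differ):
def _dict_add_sketch(d, key, step=1):
    """Increment d[key] by step, creating the key if absent, and return d."""
    d[key] = d.get(key, 0) + step
    return d

# Usage, mirroring how create_kb_dict builds the alias dict:
# _aliases = {}
# _aliases = _dict_add_sketch(_dict_add_sketch(_aliases, '比特币'), 'BTC')
# list(_aliases)  ->  ['比特币', 'BTC']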
def create_jieba_dict():
    com_utils.check_dir(fileConfig.dir_jieba)
    com_utils.check_dir(fileConfig.dir_kb_info)
    words = {}
    with open(fileConfig.dir_data + fileConfig.file_kb_data, mode='r', encoding='utf-8') as data_file:
        for line in tqdm(data_file, desc='read file'):
            jstr = ujson.loads(line)
            words = com_utils.dict_add(words, jstr['subject'].strip())
            for item in jstr['alias']:
                words = com_utils.dict_add(words, item.strip())
    # save jieba kb
    com_utils.pickle_save(words, fileConfig.dir_kb_info + fileConfig.file_jieba_kb)
    # sort by frequency, most common first
    words = Counter(words).most_common()
    # save file, flushing in batches of 100 words
    out_file = open(fileConfig.dir_jieba + fileConfig.file_jieba_dict, 'w', encoding='utf-8')
    save_str = ''
    count = 0
    for word in tqdm(words):
        save_str += word[0] + '\n'
        count += 1
        if count % 100 == 0:
            out_file.write(save_str)
            save_str = ''
    if len(save_str) > 0:
        print("write remaining str")
        out_file.write(save_str)
    out_file.close()
    print("success build jieba dict")
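# The batched-write pattern above, in a self-contained form with only stdlib
# objects (the file name is illustrative only):
def _write_jieba_dict_sketch(word_freq, path='jieba_dict.txt', batch=100):
    """Write words to a jieba user-dict file, most frequent first, in batches."""
    buf = []
    with open(path, 'w', encoding='utf-8') as out:
        for word, _freq in Counter(word_freq).most_common():
            buf.append(word + '\n')
            if len(buf) == batch:
                out.write(''.join(buf))
                buf = []
        if buf:  # flush the remainder
            out.write(''.join(buf))

# _write_jieba_dict_sketch({'刘德华': 3, '歌手': 1})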
def create_fasttext_sup_train_data(index, train_data_file, kb_dict_file, kb_alia_file,
                                   stopword_file, out_file,
                                   mode=fasttextConfig.create_data_word):
    print("create {} sup train data".format(index))
    kb_alias_df = pd.read_csv(kb_alia_file)
    stopwords = data_utils.get_stopword_list(stopword_file)
    train_datas = open(train_data_file, 'r', encoding='utf-8').readlines()
    kb_dict = com_utils.pickle_load(kb_dict_file)
    train_out_file = open(out_file, 'w', encoding='utf-8')
    text_ids = {}
    max_extend_count = 3
    for line in tqdm(train_datas, desc='deal {} train file'.format(index)):
        jstr = ujson.loads(line)
        text = jstr['text']
        text_id = jstr['text_id']
        # limit how many times one text may contribute training pairs
        if text_ids.get(text_id) == max_extend_count:
            continue
        mentions = jstr['mention_data']
        for mention in mentions:
            mention_id = mention['kb_id']
            mention_text = mention['mention']
            neighbor_text = com_utils.get_neighbor_sentence(text, mention_text)
            # positive sample: the mention paired with its gold KB entity
            kb_entity = kb_dict.get(mention_id)
            if kb_entity is not None:
                out_str = com_utils.get_entity_mention_pair_text(
                    kb_entity['text'], neighbor_text, stopwords, cut_client,
                    fasttextConfig.label_true, mode)
                train_out_file.write(out_str)
            # negative samples: other KB entities sharing this mention as an alias
            alia_ids = []
            alia_count = 0
            alias_df = kb_alias_df[kb_alias_df['subject'] == com_utils.cht_to_chs(mention_text)]
            for _, item in alias_df.iterrows():
                a_id = str(item['subject_id'])
                if a_id != mention_id:
                    alia_ids.append(a_id)
                    alia_count += 1
                    if alia_count == max_extend_count:
                        break
            for alia_id in alia_ids:
                alia_entity = kb_dict.get(alia_id)
                if alia_entity is not None:
                    out_str = com_utils.get_entity_mention_pair_text(
                        alia_entity['text'], neighbor_text, stopwords, cut_client,
                        fasttextConfig.label_false, mode)
                    train_out_file.write(out_str)
        # record that this text was used once more
        text_ids = com_utils.dict_add(text_ids, text_id)
    # release resources
    train_out_file.close()
    train_datas = None
    train_out_file = None
    kb_alias_df = None
    stopwords = None
    kb_dict = None
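# get_entity_mention_pair_text presumably renders one fastText-supervised
# training line: a "__label__X" prefix followed by the tokenised entity-text /
# mention-context pair. A minimal sketch of that line format (the label values
# and whitespace tokenisation here are assumptions, not the project's output):
def _fasttext_pair_line_sketch(entity_text, mention_context, label, stopwords=()):
    """Build one '__label__<y> tok tok ...' line for fastText supervised mode."""
    tokens = [t for t in (entity_text + ' ' + mention_context).split()
              if t not in stopwords]
    return '__label__{} {}\n'.format(label, ' '.join(tokens))

# _fasttext_pair_line_sketch('Chinese singer and actor', 'Andy Lau in concert', 1)
# -> '__label__1 Chinese singer and actor Andy Lau in concert\n'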
def get_result_error_list(gen_mentions, original_mentions, gen_more_dict):
    result_list = []
    gen_indexs = [0] * len(gen_mentions)
    original_indexs = [0] * len(original_mentions)
    # traverse gen mentions, aligning them with original mentions on offset
    for i, gen_mention in enumerate(gen_mentions):
        for j, original_mention in enumerate(original_mentions):
            if gen_mention['offset'] == original_mention['offset'] \
                    and gen_mention['kb_id'] == original_mention['kb_id']:
                gen_indexs[i] = 1
                original_indexs[j] = 1
            elif gen_mention['offset'] == original_mention['offset'] \
                    and gen_mention['kb_id'] != original_mention['kb_id']:
                # same span, wrong KB entity
                gen_indexs[i] = 1
                original_indexs[j] = 1
                result_list.append({'error_type': comConfig.result_error_type_miss,
                                    'gen_mention': gen_mention,
                                    'original_mention': original_mention})
        if gen_indexs[i] == 0:
            # generated mention has no counterpart in the original data
            gen_indexs[i] = 1
            result_list.append({'error_type': comConfig.result_error_type_gen_more,
                                'gen_mention': gen_mention})
            if len(gen_mention['mention']) > 1:
                com_utils.dict_add(gen_more_dict, gen_mention['mention'])
    # traverse original mentions: anything unmatched (and linkable) was missed
    for i, value in enumerate(original_indexs):
        if value == 0 and original_mentions[i]['kb_id'] != 'NIL':
            result_list.append({'error_type': comConfig.result_error_type_original_more,
                                'original_mention': original_mentions[i]})
    return result_list
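# The classifier above aligns mentions on `offset`. A tiny self-contained
# illustration of that alignment rule with toy data (the error-type strings
# here are placeholders for the comConfig constants):
def _classify_sketch(gen, orig):
    errors = []
    for g in gen:
        match = next((o for o in orig if o['offset'] == g['offset']), None)
        if match is None:
            errors.append(('gen_more', g))            # generated, not in gold
        elif match['kb_id'] != g['kb_id']:
            errors.append(('wrong_kb_id', g, match))  # right span, wrong entity
    return errors

# _classify_sketch([{'offset': '0', 'kb_id': '10001'}],
#                  [{'offset': '0', 'kb_id': '10002'}])
# -> [('wrong_kb_id', {...}, {...})]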
def gen_random_select_list(entity_len_list, list_len):
    select_list = []
    select_dict = {}
    count = 1
    for i in range(len(entity_len_list)):
        count *= entity_len_list[i]
    # sample three passes over the full combination count so the generated
    # index tuples cover the space as completely as possible
    for n in range(3):
        for i in range(count):
            items = []
            for j in range(list_len):
                items.append(random.randint(0, entity_len_list[j] - 1))
            if select_dict.get(str(items)) is None:
                select_list.append(items)
                select_dict = com_utils.dict_add(select_dict, str(items))
    return select_list
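# gen_random_select_list samples index tuples at random and relies on the three
# passes to approach full coverage. When the product of entity_len_list is
# small, the combinations can be enumerated exactly and deterministically with
# itertools.product; a sketch (not a drop-in replacement, ordering differs):
import itertools


def _exhaustive_select_list(entity_len_list):
    """Enumerate every index combination [i0, i1, ...] exactly once."""
    return [list(combo) for combo in
            itertools.product(*(range(n) for n in entity_len_list))]

# _exhaustive_select_list([2, 3])
# -> [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]]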
def analysis_test_ner_result():
    print("start analysis test ner result...")
    ner_test_datas = com_utils.pickle_load(fileConfig.dir_ner + fileConfig.file_ner_test_predict_tag)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_jieba_kb)
    out_file = open(fileConfig.dir_result + fileConfig.file_ner_test_result_analysis, 'w', encoding='utf-8')
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    gen_more_words = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_analysis_gen_more)
    text_id = 1
    for data in tqdm(ner_test_datas, 'find entity'):
        text = ''.join(data['text'])
        tag_list = data['tag']
        start_index = 0
        mention_length = 0
        is_find = False
        mentions = []
        type_dict = {}
        # use tag find
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or (tag.find(nerConfig.I_seg) > -1 and not is_find):
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length = 1  # start a new mention (was `+= 1`, which miscounts consecutive B tags)
                is_find = True
            elif tag.find(nerConfig.E_seg) > -1 and not is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                # mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'T': 'NER', 'mention': mention, 'offset': str(start_index),
                                 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            # elif tag == nerConfig.I_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.I_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
            # elif tag == nerConfig.E_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.E_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                # mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'T': 'NER', 'mention': mention, 'offset': str(start_index),
                                 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag == nerConfig.O_seg:
                is_find = False
                mention_length = 0
                type_dict = {}
        # use jieba find (note: unlike create_dev_mention_data, the tag filter
        # below is intentionally commented out, so every character is checked)
        jieba_entities = cut_client.cut_text(text)
        for i, tag in enumerate(tag_list):
            # if tag.find(nerConfig.B_seg) > -1 or tag.find(nerConfig.I_seg) > -1 or tag.find(nerConfig.E_seg) > -1:
            jieba_offset = i
            jieba_char = text[i]
            jieba_text = get_jieba_mention(jieba_entities, jieba_char)
            if jieba_text is None:
                continue
            elif jieba_text == '_' or jieba_text == '-':
                continue
            elif len(jieba_text) == 1:
                continue
            elif stopwords.get(jieba_text) is not None:
                continue
            elif gen_more_words.get(jieba_text) is not None:
                continue
            # align the offset with the first character of the jieba word
            jieba_offset = jieba_offset - jieba_text.find(jieba_char)
            if len(jieba_text) <= comConfig.max_jieba_cut_len and (jieba_dict.get(jieba_text) is not None):
                type_str = tag.split('_')[1] if tag.find('_') > -1 else 'O'
                if not is_already_find_mention(mentions, jieba_text, jieba_offset):
                    # jieba_offset = text.find(jieba_text)
                    mentions.append({'T': 'JIEBA', 'mention': jieba_text, 'offset': str(jieba_offset),
                                     'type': type_str})
        # find inner brackets mentions
        bracket_mentions = data_utils.get_mention_inner_brackets(text, tag_list)
        for mention in bracket_mentions:
            mention['T'] = 'bracket'
        if len(bracket_mentions) > 0:
            mentions += bracket_mentions
        # completion mentions
        # mentions_com = []
        # for mention in mentions:
        #     mention_str = mention['mention']
        #     try:
        #         for find in re.finditer(mention_str, text):
        #             find_offset = find.span()[0]
        #             if find_offset != int(mention['offset']):
        #                 mentions_com.append(
        #                     {'T': 'COM', 'mention': mention['mention'], 'offset': str(find_offset),
        #                      'type': mention['type']})
        #     except BaseException:
        #         pass
        #     mentions_com.append(mention)
        # mentions = mentions_com
        # completion mentions
        out_file.write('\n')
        result_str = ''
        for i in range(len(text)):
            result_str += text[i] + '-' + tag_list[i] + ' '
        out_file.write(' text_id:{}, text:{} '.format(text_id, result_str))
        out_file.write('\n')
        out_file.write(' gen_mentions:{} '.format(ujson.dumps(mentions, ensure_ascii=False)))
        out_file.write('\n')
        text_id += 1
    out_file.close()
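# The tag-decoding loops above turn B_/I_/E_-prefixed, type-suffixed tags into
# (mention, offset, type) records. A compact, self-contained sketch of the same
# B/I/E span decoding on a toy sentence (tag names mimic the nerConfig
# B_seg/I_seg/E_seg convention; the fallback branches are omitted):
def _decode_bie_sketch(text, tags):
    """Decode 'B_T'/'I_T'/'E_T'/'O' tags into mention dicts."""
    mentions, start = [], None
    for i, tag in enumerate(tags):
        if tag.startswith('B_'):
            start = i
        elif tag.startswith('E_') and start is not None:
            mentions.append({'mention': text[start:i + 1],
                             'offset': str(start),
                             'type': tag.split('_')[1]})
            start = None
        elif tag == 'O':
            start = None
    return mentions

# _decode_bie_sketch('刘德华是歌手', ['B_PER', 'I_PER', 'E_PER', 'O', 'O', 'O'])
# -> [{'mention': '刘德华', 'offset': '0', 'type': 'PER'}]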
def create_dev_mention_data(mode, ner_datas, out_file):
    ner_datas = com_utils.pickle_load(ner_datas)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_jieba_kb)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    gen_more_words = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_analysis_gen_more)
    text_id = 1
    dev_mention_data = []
    for data in tqdm(ner_datas, 'find entity'):
        text = ''.join(data['text'])
        tag_list = data['tag']
        start_index = 0
        mention_length = 0
        is_find = False
        mentions = []
        type_dict = {}
        # use tag find
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or (tag.find(nerConfig.I_seg) > -1 and not is_find):
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length = 1
                is_find = True
            elif tag.find(nerConfig.E_seg) > -1 and not is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'mention': mention, 'offset': str(start_index),
                                 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            # elif tag == nerConfig.I_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.I_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
            # elif tag == nerConfig.E_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.E_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({'mention': mention, 'offset': str(start_index),
                                 'type': type_list[0][0]})
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag == nerConfig.O_seg:
                is_find = False
                mention_length = 0
                type_dict = {}
        # use jieba find
        jieba_entities = cut_client.cut_text(text)
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg or tag == nerConfig.I_seg + nerConfig.KB_seg or tag == nerConfig.E_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or tag.find(nerConfig.I_seg) > -1 or tag.find(nerConfig.E_seg) > -1:
                jieba_offset = i
                jieba_char = text[i]
                jieba_text = get_jieba_mention(jieba_entities, jieba_char, jieba_offset)
                if jieba_text is None:
                    continue
                elif jieba_text == '_' or jieba_text == '-':
                    continue
                elif data_utils.is_punctuation(jieba_text):
                    continue
                elif len(jieba_text) == 1:
                    continue
                elif stopwords.get(jieba_text) is not None:
                    continue
                # elif gen_more_words.get(jieba_text) is not None:
                #     continue
                # align the offset with the first character of the jieba word
                jieba_offset = jieba_offset - jieba_text.find(jieba_char)
                if len(jieba_text) <= comConfig.max_jieba_cut_len and (jieba_dict.get(jieba_text) is not None):
                    type_str = tag.split('_')[1] if tag.find('_') > -1 else 'O'
                    if not is_already_find_mention(mentions, jieba_text, jieba_offset):
                        mentions.append({'mention': jieba_text, 'offset': str(jieba_offset),
                                         'type': type_str})
        # find inner brackets mentions
        bracket_mentions = data_utils.get_mention_inner_brackets(text, tag_list)
        if len(bracket_mentions) > 0:
            mentions += bracket_mentions
        # completion mentions
        # mentions_com = []
        # for mention in mentions:
        #     mention_str = mention['mention']
        #     try:
        #         for find in re.finditer(mention_str, text):
        #             find_offset = find.span()[0]
        #             if find_offset != int(mention['offset']):
        #                 mentions_com.append(
        #                     {'mention': mention['mention'], 'offset': str(find_offset), 'type': mention['type']})
        #     except BaseException:
        #         pass
        #     mentions_com.append(mention)
        # mentions = mentions_com
        # completion mentions
        # optim mentions: for overlapping candidates, keep only the longest one
        delete_mentions = []
        mentions.sort(key=get_mention_len)
        for mention in mentions:
            mention_offset = int(mention['offset'])
            mention_len = len(mention['mention'])
            for sub_mention in mentions:
                if mention_offset != int(sub_mention['offset']) and \
                        int(sub_mention['offset']) in range(mention_offset, mention_offset + mention_len):
                    if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
                if mention_offset == int(sub_mention['offset']) and \
                        len(mention['mention']) > len(sub_mention['mention']):
                    if not data_utils.is_mention_already_in_list(delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
        if len(delete_mentions) > 0:
            change_mentions = []
            for mention in mentions:
                if not data_utils.is_mention_already_in_list(delete_mentions, mention):
                    change_mentions.append(mention)
            mentions = change_mentions
        # drop duplicates and bare punctuation
        change_mentions = []
        for mention in mentions:
            if not data_utils.is_mention_already_in_list(change_mentions, mention) \
                    and mention['mention'] not in comConfig.punctuation:
                change_mentions.append(mention)
        mentions = change_mentions
        # sort mentions by offset
        mentions.sort(key=get_offset)
        # optimize the mention text against the jieba segmentation
        mentions_optim = []
        for mention in mentions:
            mentions_optim.append({'mention': get_optim_mention_text(jieba_entities, mention['mention']),
                                   'offset': mention['offset'], 'type': mention['type']})
        if mode in (1, 3):  # modes 1 and 3 produce the same record structure
            dev_mention_data.append({'text_id': str(text_id), 'text': text,
                                     'mention_data': mentions_optim})
        elif mode == 2:
            dev_mention_data.append({'text_id': str(text_id), 'text': text,
                                     'mention_data': mentions_optim,
                                     'mention_data_original': data['mention_data_original']})
        text_id += 1
    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create dev data with mentions, mode:{}".format(mode))
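# The "optim mentions" block keeps, for any pair of overlapping candidates,
# only the longest one. The same policy in a small self-contained form
# (greedy longest-first selection; simplified relative to the pairwise
# delete-list loop above):
def _prune_overlaps_sketch(mentions):
    """Keep the longest mention of every overlapping group, sorted by offset."""
    kept = []
    for m in sorted(mentions, key=lambda m: len(m['mention']), reverse=True):
        start, end = int(m['offset']), int(m['offset']) + len(m['mention'])
        overlaps = any(
            start < int(k['offset']) + len(k['mention']) and int(k['offset']) < end
            for k in kept)
        if not overlaps:
            kept.append(m)
    return sorted(kept, key=lambda m: int(m['offset']))

# _prune_overlaps_sketch([{'mention': '刘德华', 'offset': '0'},
#                         {'mention': '德华', 'offset': '1'}])
# -> only '刘德华' survives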