def load_normal(dir_path):
    """Load and concatenate every corpus file found directly in ``dir_path``.

    Each directory entry is handed to ``data.read_corpus_preprocess`` and the
    items it returns are appended to one flat list.

    Args:
        dir_path: Directory that holds the corpus files. A trailing path
            separator is optional.

    Returns:
        A single list with the items of all files, in ``os.listdir`` order
        (which is arbitrary).
    """
    lines = []
    for file_name in os.listdir(dir_path):
        # os.path.join instead of plain string concatenation: a directory
        # passed without a trailing separator still resolves to the files
        # inside it, and the result is unchanged when the separator is there.
        lines += data.read_corpus_preprocess(os.path.join(dir_path, file_name))
    return lines
def load_Chinese_Literature_NER_RE_data():
    """Load the Chinese-Literature-NER-RE corpus with tags reduced to PER/LOC/ORG.

    Every file in the dataset directory is read with
    ``data.read_corpus_preprocess``. For each sample, the tag list
    (``sample[1]``) is rewritten in place, position by position over the
    length of ``sample[0]``:

    * ``<x>_Person``       -> ``<x>-PER``
    * ``<x>_Location``     -> ``<x>-LOC``
    * ``<x>_Organization`` -> ``<x>-ORG``
    * anything else        -> ``"O"``

    where ``<x>`` is the single leading character of the tag.

    Returns:
        The list of samples as produced by ``data.read_corpus_preprocess``,
        with their tag lists rewritten; other elements are left untouched.
    """
    # The literal used to read 'corpus./' (stray dot), unlike every other
    # loader in this file, and would not resolve on POSIX file systems.
    data_dir = '../../data/corpus/Chinese-Literature-NER-RE-Dataset/'
    suffix_map = {
        '_Person': '-PER',
        '_Location': '-LOC',
        '_Organization': '-ORG',
    }

    lines = []
    for file_name in os.listdir(data_dir):
        lines += data.read_corpus_preprocess(data_dir + file_name)

    for line in lines:
        tags = line[1]
        for i in range(len(line[0])):
            tag = tags[i]
            suffix = tag[1:]
            if suffix in suffix_map:
                # Keep the one-character prefix, swap the entity suffix.
                tags[i] = tag[0] + suffix_map[suffix]
            else:
                tags[i] = "O"

    # A former second pass (GPE/GEO -> LOC) could never match after the
    # mapping above and fed a list that was never returned; it was removed.
    return lines
def load_weibo_data():
    """Load the Weibo NER corpus, keeping only ``.NAM`` mentions.

    Every file in the Weibo directory is read with
    ``data.read_corpus_preprocess``. Each sample is then rebuilt as
    ``[chars, tags, third]`` where:

    * ``chars`` holds only the first character of each original token
      (presumably dropping a per-token suffix -- confirm against the corpus);
    * a tag containing ``'.NAM'`` has that marker removed, and if the result
      ends in ``GPE`` or ``GEO`` its type is rewritten to ``LOC``;
    * every other tag becomes ``"O"``;
    * ``third`` is passed through unchanged.

    Tag lists are modified in place.

    Returns:
        The list of rebuilt samples.
    """
    data_dir = '../../data/corpus/Weibo/'

    raw_samples = []
    for file_name in os.listdir(data_dir):
        raw_samples += data.read_corpus_preprocess(data_dir + file_name)

    lines = [
        [[token[0] for token in sample[0]], sample[1], sample[2]]
        for sample in raw_samples
    ]

    for chars, tags, _ in lines:
        for i in range(len(chars)):
            tag = tags[i]
            if '.NAM' not in tag:
                tags[i] = "O"
                continue
            tag = tag.replace('.NAM', '')
            if tag[-3:] in ('GPE', 'GEO'):
                # Keep the two-character prefix (e.g. 'B-'), unify the type.
                tag = tag[:2] + 'LOC'
            tags[i] = tag
    return lines
def load_ProHiryu_bert_chinese_ner_data():
    """Load all files of the ProHiryu bert-chinese-ner corpus.

    Returns:
        One flat list with the items that ``data.read_corpus_preprocess``
        yields for each file in the corpus directory, in ``os.listdir`` order.
    """
    corpus_dir = '../../data/corpus/ProHiryu_bert-chinese-ner_data/'
    return [
        sample
        for name in os.listdir(corpus_dir)
        for sample in data.read_corpus_preprocess(corpus_dir + name)
    ]
def load_crownpku_ner_data():
    """Load all files of the crownpku NER corpus.

    Returns:
        One flat list with the items that ``data.read_corpus_preprocess``
        yields for each file in the corpus directory, in ``os.listdir`` order.
    """
    corpus_root = '../../data/corpus/crownpku_ner_data/'
    collected = []
    for entry in os.listdir(corpus_root):
        collected.extend(data.read_corpus_preprocess(corpus_root + entry))
    return collected
def load_crownpku_Small_Chinese_Corpus():
    """Load all files of the crownpku Small-Chinese-Corpus.

    Returns:
        One flat list with the items that ``data.read_corpus_preprocess``
        yields for each file in the corpus directory, in ``os.listdir`` order.
    """
    corpus_dir = '../../data/corpus/crownpku_Small-Chinese-Corpus/'
    samples = []
    for entry in os.listdir(corpus_dir):
        per_file = data.read_corpus_preprocess(corpus_dir + entry)
        samples.extend(per_file)
    return samples
def load_boson_data():
    """Load the Boson NER file, keeping only ORG/LOC/PER tags.

    The single corpus file is read with ``data.read_corpus_preprocess``.
    For each sample, any tag whose last four characters are not ``-ORG``,
    ``-LOC`` or ``-PER`` is replaced by ``"O"`` (in place).

    Returns:
        A list of ``[chars, tags, source_path]`` entries, where
        ``source_path`` is the corpus file path used by this loader.
    """
    corpus_file = '../../data/corpus/boson_data/boson_ner_format.txt'
    kept_suffixes = ('-ORG', '-LOC', '-PER')

    result = []
    for chars, tags, _ in data.read_corpus_preprocess(corpus_file):
        for pos in range(len(chars)):
            if tags[pos][-4:] not in kept_suffixes:
                tags[pos] = "O"
        result.append([chars, tags, corpus_file])
    return result
def load_ner_data_LatticeLSTM():
    """Load the LatticeLSTM NER corpus and normalise its tag names.

    The named-entity tags in this dataset differ from the common ones and
    need to be replaced: GPE -> LOC, GEO -> LOC
    (https://github.com/yanqiangmiffy/ner-english).

    For each tag (modified in place):

    * a tag ending in ``GPE`` or ``GEO`` keeps its first two characters and
      gets the type ``LOC``;
    * a tag whose first character is ``M`` has it replaced by ``I``.

    Returns:
        A list of ``[chars, tags, corpus_dir]`` entries, where ``corpus_dir``
        is the corpus directory path used by this loader.
    """
    corpus_dir = '../../data/corpus/ner_data_LatticeLSTM/'

    raw_samples = []
    for name in os.listdir(corpus_dir):
        raw_samples.extend(data.read_corpus_preprocess(corpus_dir + name))

    normalised = []
    for chars, tags, _ in raw_samples:
        for pos in range(len(chars)):
            if tags[pos][-3:] in ('GPE', 'GEO'):
                tags[pos] = tags[pos][:2] + 'LOC'
            # Applied to every tag, not only the ones rewritten above.
            if tags[pos][0] == "M":
                tags[pos] = 'I' + tags[pos][1:]
        normalised.append([chars, tags, corpus_dir])
    return normalised
def load_mhcao916_ner_data():
    """Load the mhcao916 NER corpus and map its tags onto PER/LOC/ORG.

    Tags are rewritten in place according to their last four characters:

    * ``-SCE`` / ``-DLO`` -> first character of the tag + ``-LOC``
    * ``-HOT``            -> first character of the tag + ``-ORG``
    * ``-PER`` / ``-ORG`` -> left unchanged
    * anything else       -> ``"O"``

    NOTE(review): the original condition chain listed ``-SCE`` for both the
    LOC and the ORG branch; only the LOC branch could ever fire, which is
    what this table reproduces. Confirm which mapping is intended.

    Returns:
        A list of ``[chars, tags, corpus_dir]`` entries, where ``corpus_dir``
        is the corpus directory path used by this loader.
    """
    corpus_dir = '../../data/corpus/mhcao916_ner_data/'
    suffix_to_type = {'-SCE': '-LOC', '-DLO': '-LOC', '-HOT': '-ORG'}
    untouched = ('-PER', '-ORG')

    samples = [
        sample
        for name in os.listdir(corpus_dir)
        for sample in data.read_corpus_preprocess(corpus_dir + name)
    ]

    relabelled = []
    for chars, tags, _ in samples:
        for pos in range(len(chars)):
            suffix = tags[pos][-4:]
            if suffix in suffix_to_type:
                tags[pos] = tags[pos][0] + suffix_to_type[suffix]
            elif suffix not in untouched:
                tags[pos] = "O"
        relabelled.append([chars, tags, corpus_dir])
    return relabelled