def load_normal(dir_path):
    """Load and concatenate every corpus file found directly in a directory.

    Args:
        dir_path: Directory whose entries are each passed to
            ``data.read_corpus_preprocess``. A trailing path separator is
            optional.

    Returns:
        One list made by concatenating the per-file results, in the order
        the entries are returned by ``os.listdir``.
    """
    lines = []
    for file_name in os.listdir(dir_path):
        # os.path.join (instead of ``dir_path + file_name``) keeps the path
        # valid when dir_path is given without a trailing separator.
        lines += data.read_corpus_preprocess(os.path.join(dir_path, file_name))
    return lines
def load_Chinese_Literature_NER_RE_data(
        data_dir='../../data/corpus/Chinese-Literature-NER-RE-Dataset/'):
    """Load the Chinese-Literature-NER-RE corpus, keeping PER/LOC/ORG tags.

    Every entry of ``data_dir`` is read with ``data.read_corpus_preprocess``.
    In each record the tag list (element 1) is rewritten in place: a tag made
    of one prefix character followed by ``_Person``, ``_Location`` or
    ``_Organization`` becomes ``<prefix>-PER``, ``<prefix>-LOC`` or
    ``<prefix>-ORG``; every other tag becomes ``"O"``.

    Args:
        data_dir: Directory holding the corpus files. The default previously
            read ``.../corpus./...``; the stray dot has been removed so the
            path matches the ``corpus/`` directory used by the other loaders.

    Returns:
        The records produced by the reader, with their tag lists rewritten.
        Elements other than the tag list are returned untouched.
    """
    suffix_map = {
        '_Person': '-PER',
        '_Location': '-LOC',
        '_Organization': '-ORG',
    }

    lines = []
    for file_name in os.listdir(data_dir):
        lines += data.read_corpus_preprocess(os.path.join(data_dir, file_name))

    for line in lines:
        tags = line[1]
        for i, tag in enumerate(tags):
            new_suffix = suffix_map.get(tag[1:])
            # Anything that is not one of the three mapped entity types is
            # treated as outside any entity.
            tags[i] = tag[:1] + new_suffix if new_suffix else "O"

    # NOTE(review): the previous version also built a copy of the records
    # tagged with data_dir but never returned it, and ran a GPE/GEO rewrite
    # that could not match after the mapping above. Both were dead code and
    # have been dropped; the return value is unchanged.
    return lines
def load_weibo_data(data_dir='../../data/corpus/Weibo/'):
    """Load the Weibo corpus and normalise its tags.

    Every entry of ``data_dir`` is read with ``data.read_corpus_preprocess``.
    Each record is rebuilt as ``[chars, tags, third]`` where ``chars`` holds
    only the first character of every original token.

    Tags are then rewritten in place:
      * a tag containing ``.NAM`` has that marker removed, and if the result
        ends in ``GPE`` or ``GEO`` its first two characters are kept and
        ``LOC`` is appended;
      * every other tag becomes ``"O"``.

    Args:
        data_dir: Directory holding the corpus files.

    Returns:
        The list of rebuilt records.
    """
    lines = []
    for file_name in os.listdir(data_dir):
        lines += data.read_corpus_preprocess(os.path.join(data_dir, file_name))

    # Keep only the first character of each token.
    # NOTE(review): presumably the tokens carry a trailing marker after the
    # character itself - confirm against the corpus format.
    lines = [
        [[token[0] for token in record[0]], record[1], record[2]]
        for record in lines
    ]

    for _, tags, _ in lines:
        for i, tag in enumerate(tags):
            if '.NAM' not in tag:
                tags[i] = "O"
                continue
            tag = tag.replace('.NAM', '')
            if tag[-3:] in ('GPE', 'GEO'):
                tag = tag[:2] + 'LOC'
            tags[i] = tag
    return lines
def load_ProHiryu_bert_chinese_ner_data(
        data_dir='../../data/corpus/ProHiryu_bert-chinese-ner_data/'):
    """Load every file of the ProHiryu bert-chinese-ner corpus.

    Args:
        data_dir: Directory holding the corpus files; each entry is passed
            to ``data.read_corpus_preprocess``. A trailing separator is
            optional.

    Returns:
        The concatenation of the per-file results, unmodified.
    """
    data_list = []
    for file_name in os.listdir(data_dir):
        data_list += data.read_corpus_preprocess(
            os.path.join(data_dir, file_name))
    return data_list
def load_crownpku_ner_data(data_dir='../../data/corpus/crownpku_ner_data/'):
    """Load every file of the crownpku NER corpus.

    Args:
        data_dir: Directory holding the corpus files; each entry is passed
            to ``data.read_corpus_preprocess``. A trailing separator is
            optional.

    Returns:
        The concatenation of the per-file results, unmodified.
    """
    lines = []
    for file_name in os.listdir(data_dir):
        lines += data.read_corpus_preprocess(os.path.join(data_dir, file_name))
    return lines
def load_crownpku_Small_Chinese_Corpus(
        data_dir='../../data/corpus/crownpku_Small-Chinese-Corpus/'):
    """Load every file of the crownpku Small-Chinese-Corpus.

    Args:
        data_dir: Directory holding the corpus files; each entry is passed
            to ``data.read_corpus_preprocess``. A trailing separator is
            optional.

    Returns:
        The concatenation of the per-file results, unmodified.
    """
    data_list = []
    for file_name in os.listdir(data_dir):
        data_list += data.read_corpus_preprocess(
            os.path.join(data_dir, file_name))
    return data_list
def load_boson_data():
    """Load the Boson NER file, keeping only ORG / LOC / PER tags.

    The file is read with ``data.read_corpus_preprocess``. For each record,
    any tag (at a position covered by ``chars``) whose last four characters
    are not ``-ORG``, ``-LOC`` or ``-PER`` is overwritten with ``"O"`` in
    place.

    Returns:
        A list of ``[chars, nertags, source_path]`` records, where
        ``source_path`` is the path of the Boson file.
    """
    source_path = '../../data/corpus/boson_data/boson_ner_format.txt'
    kept_suffixes = ('-ORG', '-LOC', '-PER')

    records = []
    for chars, tags, _ in data.read_corpus_preprocess(source_path):
        for pos, _char in enumerate(chars):
            if tags[pos][-4:] in kept_suffixes:
                continue
            tags[pos] = "O"
        # The reader's third field is replaced by the source path.
        records.append([chars, tags, source_path])
    return records
def load_ner_data_LatticeLSTM(
        data_dir='../../data/corpus/ner_data_LatticeLSTM/'):
    """Load the LatticeLSTM NER corpus and normalise its tags.

    The entity tags in this data differ from the common ones and are
    replaced: GPE -> LOC and GEO -> LOC
    (https://github.com/yanqiangmiffy/ner-english). In addition, a tag that
    starts with ``M`` has that first character replaced by ``I``.

    Args:
        data_dir: Directory holding the corpus files; each entry is passed
            to ``data.read_corpus_preprocess``. A trailing separator is
            optional.

    Returns:
        A list of ``[chars, nertags, data_dir]`` records; the tag lists are
        the reader's own lists, rewritten in place.
    """
    data_list = []
    for file_name in os.listdir(data_dir):
        data_list += data.read_corpus_preprocess(
            os.path.join(data_dir, file_name))

    new_data_list = []
    for chars, nertags, _ in data_list:
        for i, tag in enumerate(nertags):
            if tag[-3:] in ('GPE', 'GEO'):
                tag = tag[:2] + 'LOC'
            # startswith (rather than tag[0]) tolerates an empty tag.
            if tag.startswith('M'):
                tag = 'I' + tag[1:]
            nertags[i] = tag
        new_data_list.append([chars, nertags, data_dir])
    return new_data_list
def load_mhcao916_ner_data(data_dir='../../data/corpus/mhcao916_ner_data/'):
    """Load the mhcao916 NER corpus, mapping its tags onto PER/LOC/ORG.

    Tags are rewritten in place according to their last four characters:
      * ``-SCE`` / ``-DLO`` -> first character of the tag + ``-LOC``
      * ``-HOT``            -> first character of the tag + ``-ORG``
      * ``-PER`` / ``-ORG`` -> kept unchanged
      * anything else       -> ``"O"``

    Args:
        data_dir: Directory holding the corpus files; each entry is passed
            to ``data.read_corpus_preprocess``. A trailing separator is
            optional.

    Returns:
        A list of ``[chars, nertags, data_dir]`` records.
    """
    lines = []
    for file_name in os.listdir(data_dir):
        lines += data.read_corpus_preprocess(os.path.join(data_dir, file_name))

    new_data_list = []
    for chars, nertags, _ in lines:
        for i, tag in enumerate(nertags):
            suffix = tag[-4:]
            if suffix in ('-SCE', '-DLO'):
                nertags[i] = tag[0] + '-LOC'
            # The original also listed '-SCE' here, but that case is always
            # taken by the branch above, so only '-HOT' can reach this one.
            elif suffix == '-HOT':
                nertags[i] = tag[0] + '-ORG'
            elif suffix not in ('-PER', '-ORG'):
                nertags[i] = "O"
        new_data_list.append([chars, nertags, data_dir])
    return new_data_list