def get_radical(word):
    radical = Radical(RunOption.Radical)
    rs = [radical.trans_ch(ele) for ele in word]
    result = ''
    for r in rs:
        if r is not None:
            result += r
    return result
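
# Usage sketch (added for illustration, not part of the original snippet): assumes
# cnradical is installed; the import matches the cnradical example further below and
# the input string is hypothetical.
from cnradical import Radical, RunOption

print(get_radical('你好吗'))  # concatenated radicals of the characters that have one
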
Example #2
def process_text(idx, split_method=None, split_name='train'):
    '''
    Read the text, split it, tag it, and extract word-boundary,
    POS, radical, pinyin and other text features.'''
    data = {}  # one big dict that will hold every feature
    # //////////////// Get sentences ///////////////////////////
    if split_method is None:
        with open(f'{train_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.readlines()
    else:
        with open(f'{train_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.read()  # read the whole document
            texts = split_method(texts)
    data['word'] = texts

    # ///////////////// Get labels /////////////////////////////

    tag_list = ['O' for s in texts for x in s]  # double loop: every character of every sentence
    # print(tag_list)
    tag = pd.read_csv(f'{train_dir}/{idx}.ann', header=None, sep='\t')
    for i in range(tag.shape[0]):
        tag_item = tag.iloc[i][1].split(' ')
        cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        tag_list[start] = 'B-' + cls  # start rewriting tag_list at the entity start index
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + cls
    # return tag_list  # tags cover the whole document; they still have to be split per sentence

    assert len([x for s in texts for x in s]) == len(tag_list)  # make sure the two sequences have the same length

    # text_list =''
    # for t in texts:
    #     text_list += t
    # textes = []
    # tags = []
    # start = 0
    # end = 0
    # max = len(tag_list)
    # for s in texts:
    #     l = len(s)
    #     end += l
    #     tags.append(tag_list[start:end])
    #     start +=l
    # data['label'] = tags  # labels done
    # # print(tags,texts)  # labels now aligned with the text

    # /////////////// Extract POS and word-boundary features /////////////////
    word_bounds = ['M' for item in tag_list]  # word segmentation boundary for every character
    word_flags = []
    for text in texts:
        for word, flag in psg.cut(text):  # e.g. word: 中国, flag: ns
            if len(word) == 1:  # single-character word
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'
                word_flags += [flag] * len(word)
                end = len(word_flags) - 1
                word_bounds[end] = 'E'

    # //////// Slice everything per sentence ///////////////

    bounds = []
    flags = []
    tags = []
    start = 0
    end = 0
    for s in texts:
        l = len(s)
        end += l
        bounds.append(word_bounds[start:end])
        flags.append(word_flags[start:end])
        tags.append(tag_list[start:end])
        start += l
    data['bound'] = bounds
    data['flag'] = flags
    data['label'] = tags
    # return texts[0], tags[0], bounds[0], flags[0]  # the four features above are complete at this point

    # ///////////////// Get pinyin and radical features /////////////////////

    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)
    # characters without a radical get UNK
    data['radical'] = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    data['pinyin'] = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    # return texts[0], tags[0], bounds[0], flags[0], data['radical'][0], data['pinyin'][0]
    # all features are aligned with each other now.

    # ////////////////////////// Save the data ///////////////////////////////////

    num_samples = len(texts)
    num_col = len(data.keys())
    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))
        # the * unpacks the per-feature lists so zip transposes them into per-character records
        dataset += records + [['sep'] * num_col]  # after each sentence add a 'sep' row as a separator
    dataset = dataset[:-1]  # drop the trailing 'sep' row
    dataset = pd.DataFrame(dataset, columns=data.keys())
    save_path = f'data/{split_name}/{idx}.csv'

    def clean_word(w):  # spaces etc. can be normalised now that the tags are already assigned
        if w == '\n':
            return 'LB'
        if w in [' ', '\t', '\u2003']:
            return 'SPACE'
        if w.isdigit():
            return 'NUM'  # map every digit to one uniform token
        return w

    dataset['word'] = dataset['word'].apply(clean_word)

    dataset.to_csv(save_path, index=False, encoding='utf-8')
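
# Usage sketch (added, not part of the original snippet): assumes train_dir points at
# the folder holding the matching 0.txt / 0.ann pair; the file id '0' is hypothetical.
process_text('0', split_method=None, split_name='train')  # writes data/train/0.csv
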
Example #3
from cnradical import Radical, RunOption

radical = Radical(RunOption.Radical)
pinyin = Radical(RunOption.Pinyin)

text = '你好,今天早上吃饭了吗?Eastmount'
radical_out = [radical.trans_ch(ele) for ele in text]
pinyin_out = [pinyin.trans_ch(ele) for ele in text]
print(radical_out)
print(pinyin_out)

radical_out = radical.trans_str(text)
pinyin_out = pinyin.trans_str(text)
print(radical_out)
print(pinyin_out)
Example #4
def input_from_line_with_feature(line):
    """
    此函数将单一输入句子进行实体识别,构造为具体如下形式
    [[[raw_text]], [[word]], [[bound]], [[flag]], [[label]], [[radical]], [[pinyin]]]
    这里多一列,到时候输入为[1:]
    :param line:输入的单一句子
    :param char_to_id:词典转索引
    :return:
    """
    with open(f'datas/prepare_data/dict.pkl', 'rb') as f:
        map_dict = pickle.load(f)

    def item2id(data, w2i):
        return [w2i[x] if x in w2i else w2i['UNK'] for x in data]

    inputs = list()
    feature_names = ['word', 'bound', 'flag', 'radical', 'pinyin', 'label']
    line = full_to_half(line)
    line = replace_html(line)
    chars = [[char for char in line]]
    # labels: initialise everything to O
    tag_list = ['O' for _ in line]
    # extract POS and word-boundary features
    word_bounds = ['M' for _ in tag_list]  # boundary tag for every character
    word_flags = []  # POS flag for every character
    # iterate over the POS-tagged segmentation
    for word, flag in psg.cut(line):
        # single-character word
        if len(word) == 1:
            start = len(word_flags)
            word_bounds[start] = 'S'
            word_flags.append(flag)
        else:
            start = len(word_flags)
            word_bounds[start] = 'B'
            word_flags += [flag] * len(word)
            # end needs the -1 here
            end = len(word_flags) - 1
            word_bounds[end] = 'E'
    bounds = [word_bounds]
    flags = [word_flags]
    # inference mode, so the labels are left empty
    targets = [[]]
    # get radical and pinyin features
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)
    # extract character by character; None is replaced with UNK
    radicals = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in line
    ]]
    pinyins = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in line
    ]]
    inputs.append(chars)
    inputs.append(bounds)
    inputs.append(flags)
    inputs.append(radicals)
    inputs.append(pinyins)
    inputs.append(targets)
    # convert every feature to numeric indices
    id_inputs = [[line]]
    for i, feature in enumerate(feature_names):
        id_inputs.append([item2id(inputs[i][0], map_dict[feature][2])])
    return id_inputs[0][0], id_inputs[1:]
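
# Usage sketch (added, not part of the original snippet): assumes datas/prepare_data/dict.pkl
# exists and that full_to_half, replace_html and psg are imported as in the project;
# the example sentence is hypothetical.
if __name__ == '__main__':
    raw_text, features = input_from_line_with_feature('患者患有2型糖尿病十余年。')
    # features holds the six id-encoded lists (word, bound, flag, radical, pinyin, label),
    # each wrapped as a batch of size 1; the label list stays empty at inference time.
    print(raw_text)
    print([len(f[0]) for f in features])
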
def process_text(idx, split_method=None, split_name='train'):
    """
    功能: 读取文本并切割,接着打上标记及提取词边界、词性、偏旁部首、拼音等特征
    param idx: 文件的名字 不含扩展名
    param split_method: 切割文本方法
    param split_name: 存储数据集 默认训练集, 还有测试集
    return
    """

    #定义字典 保存所有字的标记、边界、词性、偏旁部首、拼音等特征
    data = {}

    #--------------------------------------------------------------------
    #                            Get sentences
    #--------------------------------------------------------------------
    if split_method is None:
        # no split function given -> read line by line
        with open(f'data/{train_dir}/{idx}.txt',
                  encoding='utf8') as f:  # f is the open file
            texts = f.readlines()
    else:
        # split function given -> split with it
        with open(f'data/{train_dir}/{idx}.txt', encoding='utf8') as f:
            outfile = f'data/train_data_pro/{idx}_pro.txt'
            print(outfile)
            texts = f.read()
            texts = split_method(texts, outfile)

    # store the sentences
    data['word'] = texts
    print(texts)

    #--------------------------------------------------------------------
    #                             Get labels (entity class, start/end position)
    #--------------------------------------------------------------------
    # initially tag every character as O
    tag_list = ['O' for s in texts for x in s]  # double loop over every character of every sentence

    # read the .ann file for each entity's type, start position and end position
    tag = pd.read_csv(f'data/{train_dir}/{idx}.ann', header=None,
                      sep='\t')  # read with pandas, tab separated
    # example row: 0  T1  Disease 1845 1850  1型糖尿病

    for i in range(tag.shape[0]):  # tag.shape[0] is the number of rows
        tag_item = tag.iloc[i][1].split(' ')  # second column of each row, split on spaces
        #print(tag_item)
        # some entities cover two position spans; take only the first start and the last end
        cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        #print(cls,start,end)

        # update tag_list
        tag_list[start] = 'B-' + cls
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + cls

    # assert: raise if the two lengths differ
    assert len([x for s in texts for x in s]) == len(tag_list)
    #print(len([x for s in texts for x in s]))
    #print(len(tag_list))

    #--------------------------------------------------------------------
    #                       Match labels to the split sentences
    #--------------------------------------------------------------------
    tags = []
    start = 0
    end = 0
    # iterate over the sentences
    for s in texts:
        length = len(s)
        end += length
        tags.append(tag_list[start:end])
        start += length
    print(len(tags))
    # store the labels in the dict
    data['label'] = tags

    #--------------------------------------------------------------------
    #                       Extract POS and word boundaries
    #--------------------------------------------------------------------
    # initial boundary tag is M
    word_bounds = ['M' for item in tag_list]  # boundary, M means middle of a word
    word_flags = []  # POS flags

    # word segmentation
    for text in texts:
        # jieba segmentation with POS tags
        for word, flag in psg.cut(text):
            if len(word) == 1:  # single-character word
                start = len(word_flags)
                word_bounds[start] = 'S'  # single character
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'  # beginning boundary
                word_flags += [flag] * len(word)  # keep POS flags aligned one-to-one with characters
                end = len(word_flags) - 1
                word_bounds[end] = 'E'  # end boundary
    # split per sentence and store
    bounds = []
    flags = []
    start = 0
    end = 0
    for s in texts:
        length = len(s)
        end += length
        bounds.append(word_bounds[start:end])
        flags.append(word_flags[start:end])
        start += length
    data['bound'] = bounds
    data['flag'] = flags

    #--------------------------------------------------------------------
    #                         Get pinyin and radical features
    #--------------------------------------------------------------------
    radical = Radical(RunOption.Radical)  # radical extractor
    pinyin = Radical(RunOption.Pinyin)  # pinyin extractor

    # extract pinyin and radicals; None is replaced with the special token UNK
    radical_out = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    pinyin_out = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]

    # store
    data['radical'] = radical_out
    data['pinyin'] = pinyin_out

    #--------------------------------------------------------------------
    #                              Save the data
    #--------------------------------------------------------------------
    # number of samples
    num_samples = len(texts)  # number of sentences
    num_col = len(data.keys())  # number of columns (feature keys), 6 here
    print(num_samples)
    print(num_col)

    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))  # zip(*) transposes per-feature lists into per-character records
        dataset += records + [['sep'] * num_col]  # after each sentence add a 'sep' row
    #records = list(zip(*[list(v[0]) for v in data.values()]))
    #for r in records:
    #    print(r)

    # drop the trailing sep row
    dataset = dataset[:-1]
    # convert to a DataFrame with a header row
    dataset = pd.DataFrame(dataset, columns=data.keys())
    # save path (train or test set)
    save_path = f'data/prepare/{split_name}/{idx}.csv'
    dataset.to_csv(save_path, index=False, encoding='utf-8')

    #--------------------------------------------------------------------
    #                       Normalise line breaks etc. (w is a single character)
    #--------------------------------------------------------------------
    def clean_word(w):
        if w == '\n':
            return 'LB'
        if w in [' ', '\t', '\u2003']:  # \u2003 is a full-width (Chinese) space
            return 'SPACE'
        if w.isdigit():  # map every digit to one token; raw digits only add noise during training
            return 'NUM'
        return w

    # apply to the DataFrame column
    dataset['word'] = dataset['word'].apply(clean_word)

    # save the data
    dataset.to_csv(save_path, index=False, encoding='utf-8')
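
# Minimal illustration (added, not from the original code) of the BIO tagging step above:
# a hypothetical .ann row such as "T1\tDisease 5 10\t1型糖尿病" marks characters 5..9 as one entity.
text = '患者确诊为1型糖尿病。'
tag_list = ['O'] * len(text)
cls, start, end = 'Disease', 5, 10  # parsed from the second column of the .ann row
tag_list[start] = 'B-' + cls
for j in range(start + 1, end):
    tag_list[j] = 'I-' + cls
print(list(zip(text, tag_list)))
# ... ('1', 'B-Disease'), ('型', 'I-Disease'), ('糖', 'I-Disease'), ('尿', 'I-Disease'), ('病', 'I-Disease'), ('。', 'O')
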
Example #6
def process_text(idx, split_method=None, split_name='train'):  # receives the split sentences
    """
    Read the text --- split it --- tag it --- and extract word-boundary, POS, radical, pinyin and other text features
    :param idx: file name, without extension
    :param split_method: function that splits the text --- from the previous script
    split_name: name of the folder the result is finally saved to
    :return:

    """
    data = {}
    #------------------------------ Split sentences ----------------------------------------

    if split_method is None:  # no split function was passed in
        with open(f'../datas/{train_dir}/{idx}.txt', 'r',
                  encoding='utf8') as f:
            # f-string path, one file per idx
            texts = f.readlines()  # simple split: one line per sentence
    else:
        with open(f'../datas/{train_dir}/{idx}.txt', 'r',
                  encoding='utf8') as f:
            texts = f.read()  # read the whole document
            texts = split_method(texts)  # call the split function from the previous step
    data['word'] = texts
    # store the split sentences under the 'word' key of the data dict

    #------------------------------ Tag every character according to the manual .ann annotation ----------------------------------------
    # 1. for the .txt, tag every character as 'O'
    tag_list = ['O' for s in texts
                for x in s]  # s iterates over every sentence, x over every character --- double loop
    # return tag_list  # (checkpoint main-1: inspect the all-'O' tags)

    tag = pd.read_csv(f'../datas/{train_dir}/{idx}.ann', header=None,
                      sep='\t')  # read the matching .ann file

    # 2. get the entity classes and boundaries from the .ann annotation
    for i in range(tag.shape[0]):  # one annotation row at a time
        tag_item = tag.iloc[i][1].split(' ')
        # second column of each row: entity class plus start/end position, e.g. Disease 1845 1850
        # print(tag_item)  # (checkpoint)
        cls, start, end = tag_item[0], int(tag_item[1]), int(
            tag_item[-1])  # strings converted to ints
        # entity class, start position and end position saved in cls, start, end

        # 3. overwrite the 'O' tags with B/I tags according to the annotation
        tag_list[start] = 'B-' + cls  # the start position gets B- plus the class name
        for j in range(start + 1, end):  # from just after the start up to the end
            tag_list[j] = 'I-' + cls  # the following positions get I- plus the class name
    assert len([x for s in texts for x in s]) == len(tag_list)
    # make sure the two sequences have the same length: number of characters in the split sentences == number of tags
    # texts are the split sentences; s iterates over the sentences, x over the characters of each sentence
    # tag_list holds the document-level BIO tags built from the annotated start and end positions
    # return tag_list  # (checkpoint: inspect with a breakpoint)

    #----------------------------- Extract POS and word-boundary features ----------------------------------------
    word_bounds = ['M' for item in tag_list
                   ]  # word boundaries, all initialised to M (same length as tag_list)
    word_flags = []  # POS flags; their running length also gives the start offset of the next word
    for text in texts:
        for word, flag in psg.cut(text):  # POS-tagged segmentation of each sentence
            if len(word) == 1:  # single-character word
                start = len(word_flags)  # start position (0 for the first word)
                word_bounds[start] = 'S'  # a single-character word gets S
                word_flags.append(flag)  # append its POS flag
            else:  # multi-character word
                start = len(word_flags)  # start position
                word_bounds[start] = 'B'  # first character of the word gets B
                word_flags += [flag] * len(word)  # every character of the word gets the same POS flag
                end = len(word_flags) - 1  # end position
                word_bounds[end] = 'E'  # last character gets E

    #--------- Split everything per sentence in one shared loop -------------------------------------------------
    # tag_list, word_bounds and word_flags are sliced to the same sentence lengths as texts,
    # turning document-level sequences into per-sentence sequences
    tags = []
    bounds = []
    flags = []
    start = 0
    end = 0
    for s in texts:  # iterate sentence by sentence over the whole .txt
        l = len(s)  # length of this split sentence
        end += l  # end position
        bounds.append(
            word_bounds[start:end])  # slice the document-level boundary tags to this sentence
        flags.append(word_flags[start:end])  # slice the document-level POS flags to this sentence
        tags.append(tag_list[start:end])
        start += l  # start position of the next sentence

    data['bound'] = bounds  # word-boundary feature
    data['flag'] = flags  # POS feature
    data['label'] = tags

    # ----------------------------- Extract radical and pinyin features ----------------------------------------
    radical = Radical(RunOption.Radical)  # radical extractor
    pinyin = Radical(RunOption.Pinyin)  # pinyin extractor
    # radicals via the library; characters without a radical get UNK
    data['radical'] = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]  # nested list comprehension
    # pinyin feature; characters without a pinyin get UNK
    data['pinyin'] = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]  # nested list comprehension

    # return texts[1],tags[1],bounds[1],flags[1],data['radical'][1],data['pinyin'][1]  # (checkpoint)
    # return (len(data['label']))  # (checkpoint)

    #---------------------------------- Merge and save the split data ---------------------------------------
    num_samples = len(texts)  # number of sentences
    num_col = len(data.keys())  # number of columns

    dataset = []
    for i in range(num_samples):
        records = list(
            zip(*[list(v[i])
                  for v in data.values()]))  # transpose the 6 per-sentence feature lists into per-character records; * unpacks the arguments to zip
        dataset += records + [['sep'] * num_col]  # after each sentence add a sep row as a separator
    dataset = dataset[:-1]  # drop the trailing sep row

    dataset = pd.DataFrame(dataset, columns=data.keys())  # convert to a DataFrame
    save_path = f'../data/prepare/{split_name}/{idx}.csv'

    #print(dataset)  # (checkpoint)

    def clean_word(w):  # tagging is done, so line breaks, spaces etc. can be normalised now
        if w == '\n':  # line break
            return 'LB'
        if w in [' ', '\t', '\u2003']:  # space, tab, full-width space
            return 'SPACE'
        if w.isdigit():  # NER does not care which digit it is, so map all digits to one class
            return 'num'
        return w

    dataset['word'] = dataset['word'].apply(clean_word)  # clean before saving
    dataset.to_csv(save_path, index=False, encoding='utf-8')
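
# Small illustration (added, not from the original code) of the zip(*) transpose used above:
# per-feature lists for one sentence become per-character records.
features = {'word': ['中', '国'], 'bound': ['B', 'E'], 'flag': ['ns', 'ns']}
records = list(zip(*[list(v) for v in features.values()]))
print(records)  # [('中', 'B', 'ns'), ('国', 'E', 'ns')]
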
def process_text(file_id, split_method=None, split_name='train_small'):
    """
    本函数作用:用于读取文本、切割、打标记  并提取词边界、词性、偏旁部首和拼音等文本特征。
    :param file_name: 文件的名字,不含扩展名(.txt 和 .ann)
    :param split_method:  切割文本的方法(是一个函数)
    :param split_name:  最终保存的文件夹的名字
    :return:
    """
    data = {}  # 用来存所有的字,标记,边界,偏旁部首,拼音等。

    # ****************************************获取句子********************************************

    if split_method == None:  # 如果没有切割文本的方法,就一行当做一个句子处理。
        with open(f'{train_dir}/{file_id}.txt', encoding='utf-8') as f:
            texts = f.readlines()
    else:
        with open(f'{train_dir}/{file_id}.txt', encoding='utf-8') as f:
            texts = f.read()
            texts = split_method(texts)
            # print(texts)
    data['word'] = texts

    # **************************************** Get labels *********************************************
    tag_list = ['O' for sent in texts for word in sent]
    tag = pd.read_csv(f'{train_dir}/{file_id}.ann', header=None, sep='\t')
    for row in range(tag.shape[0]):
        tag_item = tag.iloc[row][1].split(
            ' ')  # entity class plus its start/end position; iloc selects a row by position
        entity_cls, tag_begin, tag_end = tag_item[0], int(tag_item[1]), int(
            tag_item[-1])
        tag_list[tag_begin] = 'B-' + entity_cls  # the start position gets 'B-'
        for i in range(tag_begin + 1, tag_end):  # the rest of the entity gets 'I-'
            tag_list[i] = 'I-' + entity_cls
    assert len([word for sent in texts
                for word in sent]) == len(tag_list)  # make sure every character has exactly one tag

    tag_split_list = []
    sen_b = 0
    sen_e = 0
    for s in texts:
        leng = len(s)
        sen_e += leng
        tag_split_list.append(tag_list[sen_b:sen_e])
        sen_b += leng
    data['label'] = tag_split_list

    #************************************** Extract word-boundary and POS features *************************************
    word_bounds = ['M' for s_tag in tag_list
                   ]  # word boundaries (B, E, M, S); initialised to 'M' for every character
    word_flags = []  # POS feature for every character
    for sent in texts:
        for word, flag in psg.cut(
                sent):  # e.g. 中国/ns  成人/n  2/m  型/k  糖尿病/n  HBA1C/eng
            if len(word) == 1:
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'
                word_flags += [flag] * len(word)
                end = len(word_flags) - 1
                word_bounds[end] = 'E'

    bounds = []
    flags = []
    sen_b = 0
    sen_e = 0
    for s in texts:
        leng = len(s)
        sen_e += leng
        bounds.append(word_bounds[sen_b:sen_e])
        flags.append(word_flags[sen_b:sen_e])
        sen_b += leng
    data['bound'] = bounds
    data['flag'] = flags
    # return texts[0], tag_split_list[0], bounds[0], flags[0]

    # ************************************** Get radical and pinyin features *************************************
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)

    # radical of every character; characters without a radical get UNK
    data['radical'] = radical_out = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    # pinyin of every character; characters without a pinyin get UNK
    data['pinyin'] = pinyin_out = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    # return texts[0], tag_split_list[0], bounds[0], flags[0], data['radical'][0], data['pinyin'][0]

    # **************************************** Save the data ****************************************
    num_samples = len(texts)  # total number of sentences
    num_col = len(data.keys())  # number of feature columns
    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))
        dataset += records + [['sep'] * num_col]  # after each sentence add a sep row as a separator
    dataset = dataset[:-1]  # drop the trailing sep row
    dataset = pd.DataFrame(dataset,
                           columns=data.keys())  # convert to a DataFrame (a tabular data structure)

    # the braces {} in the f-string are replaced with the variable values:
    save_path = f'./wt_pytorch_Medical_BiLSTM_CRF_NER/data/data_preparation/{split_name}/{file_id}.csv'

    def clean_word(word):
        if word == '\n':
            return 'LB'
        if word in [' ', '\t', '\u2003']:
            return 'SPACE'
        # map every digit to one symbol (the NER task does not care which digit it is,
        # knowing it is a digit is enough); improves generalisation.
        if word.isdigit():
            return 'num'
        return word

    dataset['word'] = dataset['word'].apply(clean_word)
    dataset.to_csv(save_path, index=False, encoding='utf-8')
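
# Sketch (added, not part of the original snippet): read one of the saved CSVs back and
# regroup the per-character rows into sentences at the 'sep' separator rows.
import pandas as pd

def read_sentences(csv_path):
    df = pd.read_csv(csv_path)
    sentences, current = [], []
    for _, row in df.iterrows():
        if row['word'] == 'sep':
            sentences.append(current)
            current = []
        else:
            current.append(tuple(row))
    if current:
        sentences.append(current)
    return sentences
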
Example #8
def process_text(idx, target_dir='train', split_method=None):

    assert target_dir in ['train', 'test'], "data is split only into train and test sets"

    data = {}

    # -------------------  get word ---------------------------------------------
    if not split_method:
        with open(f'../data/{target_dir}/{idx}.txt', 'r',
                  encoding='utf-8') as f:
            texts = f.readlines()  # split by line
    else:
        with open(f'../data/{target_dir}/{idx}.txt', 'r',
                  encoding='utf-8') as f:
            texts = f.read()
            texts = split_method(texts)  # custom split method
    data['word'] = texts

    # -------------------  get label ----------------------------------------------
    tag_list = ['PAD' for s in texts for x in s]
    if target_dir == 'train':
        tag = pd.read_csv(f'../data/{target_dir}/{idx}.ann',
                          header=None,
                          sep='\t')
        for i in range(tag.shape[0]):
            tag_item = tag.iloc[i][1].split(' ')
            cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])

            tag_list[start] = 'B-' + cls
            for j in range(start + 1, end):
                tag_list[j] = 'I-' + cls
    assert len([x for s in texts for x in s]) == len(tag_list)
    tags = []
    start = 0
    end = 0
    for item in texts:
        l = len(item)
        end += l
        tags.append(tag_list[start:end])
        start += l

    data['label'] = tags
    # ----------------------------- POS and word boundaries --------------------------------------------
    word_bounds = ['I' for s in texts for x in s]
    word_flags = []
    # add POS flags and word boundaries

    for text in texts:
        for word, flag in jieba.posseg.cut(text):
            if len(word) == 1:
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'
                word_flags += [flag] * len(word)
                end = len(word_flags) - 1
                word_bounds[end] = 'E'
    data['bound'] = []
    data['pos_tag'] = []
    start = 0
    end = 0
    for item in texts:
        l = len(item)
        end += l
        data['pos_tag'].append(word_flags[start:end])
        data['bound'].append(word_bounds[start:end])
        start += l
    assert len(word_bounds) == len([x for s in texts for x in s])
    assert len(word_flags) == len(word_bounds)
    # --------------------------------  Radicals  --------------------------------
    from cnradical import Radical, RunOption
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)
    radical_out = [[
        radical.trans_ch(ele) if radical.trans_ch(ele) is not None else 'PAD'
        for ele in text
    ] for text in texts]
    data['pinyin'] = [[
        pinyin.trans_ch(ele) if pinyin.trans_ch(ele) is not None else 'PAD'
        for ele in text
    ] for text in texts]
    data['radical'] = radical_out

    # --------------------------------------------------------------------------
    num_samples = len(texts)
    num_col = len(data.keys())
    train_file = f'../data/working/{target_dir}/{idx}.csv'

    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))
        dataset += records + [['sep'] * num_col]
    dataset = dataset[:-1]
    dataset = pd.DataFrame(dataset, columns=data.keys())

    def clean_word(w):
        if w == '\n':
            return 'LB'
        elif w in [' ', '\t', '\u2003']:
            return 'SPACE'
        elif w.isdigit():
            return 'num'
        else:
            return w

    dataset['word'] = dataset['word'].apply(clean_word)
    dataset.to_csv(train_file, sep='\t', index=False, encoding='utf8')
def process_text(idx, split_method=None):
    """
    功能: 读取文本并切割,接着打上标记及提取词边界、词性、偏旁部首、拼音等特征
    param idx: 文件的名字 不含扩展名
    param split_method: 切割文本方法
    return
    """

    #定义字典 保存所有字的标记、边界、词性、偏旁部首、拼音等特征
    data = {}

    #--------------------------------------------------------------------
    # Get sentences
    #--------------------------------------------------------------------
    if split_method is None:
        # no split function given -> read line by line
        with open(f'data/{train_dir}/{idx}.txt',
                  encoding='utf8') as f:  # f is the open file
            texts = f.readlines()
    else:
        # split function given -> split with it
        with open(f'data/{train_dir}/{idx}.txt', encoding='utf8') as f:
            outfile = f'data/train_data_pro/{idx}_pro.txt'
            print(outfile)
            texts = f.read()
            texts = split_method(texts, outfile)

    # store the sentences
    data['word'] = texts
    print(texts)

    #--------------------------------------------------------------------
    #                             Get labels
    #--------------------------------------------------------------------
    # initially tag every character as O
    tag_list = ['O' for s in texts for x in s]  # double loop over every character of every sentence

    # read the .ann file for each entity's type, start position and end position
    tag = pd.read_csv(f'data/{train_dir}/{idx}.ann', header=None,
                      sep='\t')  # read with pandas, tab separated
    # example row: 0  T1  Disease 1845 1850  1型糖尿病

    for i in range(tag.shape[0]):  # tag.shape[0] is the number of rows
        tag_item = tag.iloc[i][1].split(' ')  # second column of each row, split on spaces
        #print(tag_item)
        # some entities cover two position spans; take only the first start and the last end
        cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        #print(cls,start,end)

        # update tag_list
        tag_list[start] = 'B-' + cls
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + cls

    # assert: raise if the two lengths differ
    assert len([x for s in texts for x in s]) == len(tag_list)
    #print(len([x for s in texts for x in s]))
    #print(len(tag_list))

    #--------------------------------------------------------------------
    #                       Match labels to the split sentences
    #--------------------------------------------------------------------
    tags = []
    start = 0
    end = 0
    # iterate over the sentences
    for s in texts:
        length = len(s)
        end += length
        tags.append(tag_list[start:end])
        start += length
    print(len(tags))
    # store the labels in the dict
    data['label'] = tags

    #--------------------------------------------------------------------
    #                       Extract POS and word boundaries
    #--------------------------------------------------------------------
    # initial boundary tag is M
    word_bounds = ['M' for item in tag_list]  # boundary, M means middle of a word
    word_flags = []  # POS flags

    # word segmentation
    for text in texts:
        # jieba segmentation with POS tags
        for word, flag in psg.cut(text):
            if len(word) == 1:  # single-character word
                start = len(word_flags)
                word_bounds[start] = 'S'  # single character
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'  # beginning boundary
                word_flags += [flag] * len(word)  # keep POS flags aligned one-to-one with characters
                end = len(word_flags) - 1
                word_bounds[end] = 'E'  # end boundary
    # split per sentence and store
    bounds = []
    flags = []
    start = 0
    end = 0
    for s in texts:
        length = len(s)
        end += length
        bounds.append(word_bounds[start:end])
        flags.append(word_flags[start:end])
        start += length
    data['bound'] = bounds
    data['flag'] = flags

    #--------------------------------------------------------------------
    #                             Get radical and pinyin features
    #--------------------------------------------------------------------
    radical = Radical(RunOption.Radical)  # radical extractor
    pinyin = Radical(RunOption.Pinyin)  # pinyin extractor

    # extract pinyin and radicals; None is replaced with the special token PAD
    radical_out = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'PAD'
        for x in s
    ] for s in texts]
    pinyin_out = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'PAD'
        for x in s
    ] for s in texts]

    # store
    data['radical'] = radical_out
    data['pinyin'] = pinyin_out

    #return texts, tags, bounds, flags
    return texts[0], tags[0], bounds[0], flags[0], radical_out[0], pinyin_out[
        0]
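
# Usage sketch (added, not part of the original snippet): assumes train_dir is defined and
# split_text is the project's splitter matching the split_method(texts, outfile) call above;
# the file id '0' is hypothetical. Prints the aligned features of the first sentence.
if __name__ == '__main__':
    words, labels, bound_f, flag_f, radical_f, pinyin_f = process_text('0', split_method=split_text)
    for w, t, b, fl, r, p in zip(words, labels, bound_f, flag_f, radical_f, pinyin_f):
        print(w, t, b, fl, r, p)
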
def process_text(idx, split_method=None, split_name='train'):
    """
    读取文本,切割,然后打上标记并提取词边界、词性、偏旁部首、拼音等文本特征
    :param idx:文件的名字 不含扩展名
    :param split_method:切割文本的方法
    :param split_name:判断是保存训练集文件还是测试集
    :return:
    """
    data = {}
    # 获取句子
    if split_method is None:
        with open(f'./datas/{train_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.readlines()
    else:
        with open(f'./datas/{train_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.read()
            texts = split_method(texts)
    data['word'] = texts
    # labels: initialise everything to O
    tag_list = ['O' for s in texts for _ in s]
    # read the matching .ann file
    tag = pd.read_csv(f'./datas/{train_dir}/{idx}.ann', header=None, sep='\t')
    for i in range(tag.shape[0]):
        # get the entity class and its start/end positions
        tag_item = tag.iloc[i][1].split(' ')
        # start tagging
        cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        # the entity start gets B
        tag_list[start] = 'B-' + cls
        # the rest gets I
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + cls
    # check that the lengths match
    assert len([x for s in texts for x in s]) == len(tag_list)
    # extract POS and word-boundary features
    word_bounds = ['M' for _ in tag_list]  # boundary tag for every character
    word_flags = []  # POS flags
    for text in texts:
        # iterate over the POS-tagged segmentation
        for word, flag in psg.cut(text):
            # single-character word
            if len(word) == 1:
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'
                word_flags += [flag] * len(word)
                # end needs the -1 here
                end = len(word_flags) - 1
                word_bounds[end] = 'E'
    # slice bounds, flags and tags per sentence
    bounds = []
    flags = []
    tags = []
    start = 0
    end = 0
    for s in texts:
        ldx = len(s)
        end += ldx
        # per-sentence slices
        bounds.append(word_bounds[start:end])
        flags.append(word_flags[start:end])
        tags.append(tag_list[start:end])
        start += ldx
    data['bound'] = bounds
    data['flag'] = flags
    data['label'] = tags
    # get radical and pinyin features
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)
    # extract character by character; None is replaced with UNK
    data['radical'] = [[radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK' for x in s] for s in texts]
    data['pinyin'] = [[pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK' for x in s] for s in texts]
    # save the data
    num_samples = len(texts)
    num_col = len(data.keys())
    dataset = []
    # records look like ('中', 'B', 'ns', 'O', '丨', 'zhōng'), ('国', 'E', 'ns', 'O', '囗', 'guó')
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))  # the * unpacks so zip transposes the lists
        # add a separator row between sentences
        dataset += records + [['sep'] * num_col]
    # drop the trailing sep
    dataset = dataset[:-1]
    # convert to a DataFrame
    dataset = pd.DataFrame(dataset, columns=data.keys())
    # CSV save path
    save_path = f'datas/prepare_data/{split_name}/{idx}.csv'

    # line breaks etc. can be normalised now
    def clean_word(w):
        if w == '\n':
            return 'LB'
        if w in [' ', '\t', '\u2003']:
            return 'SPACE'
        # treat all digits uniformly
        if w.isdigit():
            return 'num'
        return w

    dataset['word'] = dataset['word'].apply(clean_word)
    dataset.to_csv(save_path, index=False, encoding='utf-8')
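
# Driver sketch (added, not part of the original snippet): assumes the annotated file ids
# live under ./datas/{train_dir} and that split_text is the project's sentence splitter;
# the 80/20 train/test split is an arbitrary choice for illustration.
import os
from glob import glob

if __name__ == '__main__':
    idxs = sorted(os.path.splitext(os.path.basename(p))[0]
                  for p in glob(f'./datas/{train_dir}/*.ann'))
    cut = int(len(idxs) * 0.8)
    for i, idx in enumerate(idxs):
        process_text(idx, split_method=split_text,
                     split_name='train' if i < cut else 'test')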