from cnradical import Radical, RunOption


def get_radical(word):
    """Return the radicals of the characters in `word`, skipping characters without one."""
    radical = Radical(RunOption.Radical)
    rs = [radical.trans_ch(ele) for ele in word]
    out = ''  # avoid shadowing the built-in `str`
    for r in rs:
        if r is not None:
            out += r
    return out
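# A minimal usage sketch (the word below is made up; the exact output depends on the
# cnradical data files):
print(get_radical('糖尿病'))  # expected to print something like '米尸疒'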
import pandas as pd
import jieba.posseg as psg
from cnradical import Radical, RunOption

# `train_dir` is assumed to be defined at module level (the folder holding the .txt/.ann files).


def process_text(idx, split_method=None, split_name='train'):
    """Read a text file, split it into sentences, attach BIO tags, and extract
    word-boundary, POS, radical and pinyin features."""
    data = {}  # one large dict holding every feature column

    # ---------------- get sentences ----------------
    if split_method is None:
        with open(f'{train_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.readlines()
    else:
        with open(f'{train_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.read()            # read the whole article
            texts = split_method(texts)
    data['word'] = texts

    # ---------------- get labels ----------------
    tag_list = ['O' for s in texts for x in s]  # one 'O' per character (double loop)
    tag = pd.read_csv(f'{train_dir}/{idx}.ann', header=None, sep='\t')
    for i in range(tag.shape[0]):
        tag_item = tag.iloc[i][1].split(' ')
        cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        tag_list[start] = 'B-' + cls            # start rewriting tag_list in place
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + cls
    # So far the tags cover the whole article; re-splitting them per sentence is done
    # below, together with the boundary and POS features.
    assert len([x for s in texts for x in s]) == len(tag_list)  # both sequences must match

    # ---------------- word-boundary and POS features ----------------
    word_bounds = ['M' for item in tag_list]  # word-boundary tag for every character
    word_flags = []
    for text in texts:
        for word, flag in psg.cut(text):      # e.g. word '中国', flag 'ns'
            if len(word) == 1:                # single-character word
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'
                word_flags += [flag] * len(word)
                end = len(word_flags) - 1
                word_bounds[end] = 'E'

    # ---------------- cut everything back into sentences ----------------
    bounds = []
    flags = []
    tags = []
    start = 0
    end = 0
    for s in texts:
        l = len(s)
        end += l
        bounds.append(word_bounds[start:end])
        flags.append(word_flags[start:end])
        tags.append(tag_list[start:end])
        start += l
    data['bound'] = bounds
    data['flag'] = flags
    data['label'] = tags

    # ---------------- pinyin and radical features ----------------
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)
    # characters without a radical/pinyin are marked 'UNK'
    data['radical'] = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    data['pinyin'] = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]

    # ---------------- store the data ----------------
    num_samples = len(texts)
    num_col = len(data.keys())
    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))  # * unpacks, zip transposes
        dataset += records + [['sep'] * num_col]  # a 'sep' row after every sentence
    dataset = dataset[:-1]                        # drop the trailing 'sep' row
    dataset = pd.DataFrame(dataset, columns=data.keys())
    save_path = f'data/{split_name}/{idx}.csv'

    def clean_word(w):  # whitespace can be cleaned now that the tags are in place
        if w == '\n':
            return 'LB'
        if w in [' ', '\t', '\u2003']:
            return 'SPACE'
        if w.isdigit():
            return 'NUM'  # map every digit to one shared symbol
        return w

    dataset['word'] = dataset['word'].apply(clean_word)
    dataset.to_csv(save_path, index=False, encoding='utf-8')
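# A possible driver sketch (not from the original source): iterate over every .txt file
# in `train_dir` and preprocess it. `process_all_files` is a hypothetical helper, and the
# sentence splitter passed as `split_method` is assumed to be defined elsewhere.
import os


def process_all_files(split_method=None, split_name='train'):
    idxs = [f[:-4] for f in os.listdir(train_dir) if f.endswith('.txt')]
    for idx in idxs:
        process_text(idx, split_method=split_method, split_name=split_name)

# process_all_files(split_method=split_text, split_name='train')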
from cnradical import Radical, RunOption

radical = Radical(RunOption.Radical)
pinyin = Radical(RunOption.Pinyin)

text = '你好,今天早上吃饭了吗?Eastmount'
radical_out = [radical.trans_ch(ele) for ele in text]
pinyin_out = [pinyin.trans_ch(ele) for ele in text]
print(radical_out)
print(pinyin_out)

radical_out = radical.trans_str(text)
pinyin_out = pinyin.trans_str(text)
print(radical_out)
print(pinyin_out)
import pickle

import jieba.posseg as psg
from cnradical import Radical, RunOption

# full_to_half and replace_html are assumed to be helpers defined elsewhere in the project.


def input_from_line_with_feature(line):
    """
    Run entity recognition on a single input sentence, building the structure
    [[[raw_text]], [[word]], [[bound]], [[flag]], [[label]], [[radical]], [[pinyin]]].
    There is one extra column here; at prediction time the input is taken from [1:].
    :param line: the single input sentence
    :return:
    """
    with open(f'datas/prepare_data/dict.pkl', 'rb') as f:
        map_dict = pickle.load(f)

    def item2id(data, w2i):
        # map each item to its index, falling back to the 'UNK' index
        return [w2i[x] if x in w2i else w2i['UNK'] for x in data]

    inputs = list()
    feature_names = ['word', 'bound', 'flag', 'radical', 'pinyin', 'label']
    line = full_to_half(line)
    line = replace_html(line)
    chars = [[char for char in line]]

    # labels: everything starts as 'O'
    tag_list = ['O' for _ in line]

    # word-boundary and POS features
    word_bounds = ['M' for _ in tag_list]  # boundary tag for every character
    word_flags = []                        # POS tag for every character
    for word, flag in psg.cut(line):       # POS-tagged segmentation
        if len(word) == 1:                 # single-character word
            start = len(word_flags)
            word_bounds[start] = 'S'
            word_flags.append(flag)
        else:
            start = len(word_flags)
            word_bounds[start] = 'B'
            word_flags += [flag] * len(word)
            end = len(word_flags) - 1      # the end index needs the -1
            word_bounds[end] = 'E'
    bounds = [word_bounds]
    flags = [word_flags]
    targets = [[]]                         # no labels at test time

    # radical and pinyin features; characters without one are filled with 'UNK'
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)
    radicals = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in line
    ]]
    pinyins = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in line
    ]]

    inputs.append(chars)
    inputs.append(bounds)
    inputs.append(flags)
    inputs.append(radicals)
    inputs.append(pinyins)
    inputs.append(targets)

    # convert everything to numeric ids
    id_inputs = [[line]]
    for i, feature in enumerate(feature_names):
        id_inputs.append([item2id(inputs[i][0], map_dict[feature][2])])
    return id_inputs[0][0], id_inputs[1:]
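# A minimal usage sketch, assuming dict.pkl exists at the path above and that the
# full_to_half / replace_html helpers are importable; the example sentence is made up.
raw_text, feature_ids = input_from_line_with_feature('患者有2型糖尿病病史。')
print(raw_text)
print(len(feature_ids))  # 6 id lists: word, bound, flag, radical, pinyin, label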
def process_text(idx, split_method=None, split_name='train'):
    """
    Read a text file and split it, then attach labels and extract word-boundary,
    POS, radical and pinyin features.
    param idx: file name without extension
    param split_method: sentence-splitting function
    param split_name: which split to store (training set by default, test set also possible)
    return
    """
    # dict holding the label, boundary, POS, radical and pinyin features of every character
    data = {}

    #--------------------------------------------------------------------
    # get sentences
    #--------------------------------------------------------------------
    if split_method is None:
        # no splitting function given -> read line by line
        with open(f'data/{train_dir}/{idx}.txt', encoding='utf8') as f:
            texts = f.readlines()
    else:
        # splitting function given -> split with it
        with open(f'data/{train_dir}/{idx}.txt', encoding='utf8') as f:
            outfile = f'data/train_data_pro/{idx}_pro.txt'
            print(outfile)
            texts = f.read()
            texts = split_method(texts, outfile)

    # extracted sentences
    data['word'] = texts
    print(texts)

    #--------------------------------------------------------------------
    # get labels (entity class, start/end positions)
    #--------------------------------------------------------------------
    # initially mark every character as 'O'
    tag_list = ['O' for s in texts for x in s]  # double loop over the characters of every sentence

    # read the .ann file: type, start and end position of every entity (tab-separated)
    tag = pd.read_csv(f'data/{train_dir}/{idx}.ann', header=None, sep='\t')
    # e.g.  0  T1  Disease 1845 1850  1型糖尿病
    for i in range(tag.shape[0]):               # tag.shape[0] is the number of rows
        tag_item = tag.iloc[i][1].split(' ')    # second column, split on spaces
        # some entities span two position ranges; keep only the first start and the last end
        cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        # rewrite tag_list
        tag_list[start] = 'B-' + cls
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + cls

    # the two sequences must have the same length
    assert len([x for s in texts for x in s]) == len(tag_list)

    #--------------------------------------------------------------------
    # match labels to the split sentences
    #--------------------------------------------------------------------
    tags = []
    start = 0
    end = 0
    for s in texts:
        length = len(s)
        end += length
        tags.append(tag_list[start:end])
        start += length
    print(len(tags))
    # store the labels in the dict
    data['label'] = tags

    #--------------------------------------------------------------------
    # POS and word-boundary features
    #--------------------------------------------------------------------
    word_bounds = ['M' for item in tag_list]  # boundary, 'M' means middle of a word
    word_flags = []                           # POS tags
    for text in texts:
        # POS-tagged jieba segmentation
        for word, flag in psg.cut(text):
            if len(word) == 1:                # single-character word
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'      # word start
                word_flags += [flag] * len(word)  # one POS tag per character
                end = len(word_flags) - 1
                word_bounds[end] = 'E'        # word end

    # store per sentence
    bounds = []
    flags = []
    start = 0
    end = 0
    for s in texts:
        length = len(s)
        end += length
        bounds.append(word_bounds[start:end])
        flags.append(word_flags[start:end])
        start += length
    data['bound'] = bounds
    data['flag'] = flags

    #--------------------------------------------------------------------
    # pinyin and radical features
    #--------------------------------------------------------------------
    radical = Radical(RunOption.Radical)  # radical extractor
    pinyin = Radical(RunOption.Pinyin)    # pinyin extractor
    # characters without a radical/pinyin get the special token 'UNK'
    radical_out = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    pinyin_out = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    data['radical'] = radical_out
    data['pinyin'] = pinyin_out

    #--------------------------------------------------------------------
    # store the data
    #--------------------------------------------------------------------
    num_samples = len(texts)      # number of rows (sentences)
    num_col = len(data.keys())    # number of columns (6 feature categories)
    print(num_samples)
    print(num_col)

    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))  # transpose
        dataset += records + [['sep'] * num_col]                   # a 'sep' row after every sentence
    dataset = dataset[:-1]        # drop the trailing 'sep' row
    # convert to a DataFrame with a header row
    dataset = pd.DataFrame(dataset, columns=data.keys())
    # save to the train/test folder
    save_path = f'data/prepare/{split_name}/{idx}.csv'
    dataset.to_csv(save_path, index=False, encoding='utf-8')

    #--------------------------------------------------------------------
    # handle line breaks, spaces and digits (w is a single character)
    #--------------------------------------------------------------------
    def clean_word(w):
        if w == '\n':
            return 'LB'
        if w in [' ', '\t', '\u2003']:       # \u2003 is a full-width space
            return 'SPACE'
        if w.isdigit():                      # raw digits only add noise, map them all to one symbol
            return 'NUM'
        return w

    # apply the function to the DataFrame and save again
    dataset['word'] = dataset['word'].apply(clean_word)
    dataset.to_csv(save_path, index=False, encoding='utf-8')
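# A quick sanity-check sketch (the file id '0' is hypothetical): read one prepared CSV
# back and count sentences via the 'sep' separator rows.
import pandas as pd

df = pd.read_csv('data/prepare/train/0.csv', encoding='utf-8')
print(df.columns.tolist())               # column order follows the insertion order of `data`
print((df['word'] == 'sep').sum() + 1)   # number of sentences in this file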
def process_text(idx, split_method=None, split_name='train'):  # receives the split sentences
    """
    Read the text, split it, attach labels, and extract word-boundary, POS,
    radical, pinyin and other text features.
    :param idx: file name without extension
    :param split_method: text-splitting function from the previous script
    :param split_name: name of the folder the result is saved to
    :return:
    """
    data = {}

    # ------------------------------ split sentences ----------------------------------------
    if split_method is None:  # no splitting function passed in
        with open(f'../datas/{train_dir}/{idx}.txt', 'r', encoding='utf8') as f:
            texts = f.readlines()        # simple line-based split
    else:
        with open(f'../datas/{train_dir}/{idx}.txt', 'r', encoding='utf8') as f:
            texts = f.read()             # read the whole article
            texts = split_method(texts)  # call the splitting function from the previous step
    data['word'] = texts                 # store the split sentences under data['word']

    # --------------- give every character an 'O' label, then apply the .ann annotations ---------------
    # 1. every character of the .txt starts as 'O'
    tag_list = ['O' for s in texts for x in s]  # s iterates sentences, x iterates characters
    # checkpoint: tag_list now holds one 'O' per character

    tag = pd.read_csv(f'../datas/{train_dir}/{idx}.ann', header=None, sep='\t')  # read the matching .ann file

    # 2. read entity class and boundaries from the .ann annotations, row by row
    for i in range(tag.shape[0]):
        tag_item = tag.iloc[i][1].split(' ')  # second column, e.g. 'Disease 1845 1850'
        cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])  # class, start, end as ints

        # 3. overwrite the 'O' labels with B/I tags
        tag_list[start] = 'B-' + cls          # the first character of the entity gets B-<class>
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + cls          # the following characters get I-<class>

    # both sequences must be the same length: every character of every sentence gets exactly one tag
    assert len([x for s in texts for x in s]) == len(tag_list)
    # checkpoint: tag_list now holds the document-level BIO tags

    # ----------------------------- POS and word-boundary features ----------------------------------------
    word_bounds = ['M' for item in tag_list]  # word boundaries, everything starts as 'M'
    word_flags = []                           # POS tags; its length also gives the next start offset
    for text in texts:
        for word, flag in psg.cut(text):      # POS-tagged segmentation of every sentence
            if len(word) == 1:                # single-character word
                start = len(word_flags)       # start position (0 for the first word)
                word_bounds[start] = 'S'      # single-character words are marked 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)       # start position
                word_bounds[start] = 'B'      # first character of the word gets 'B'
                word_flags += [flag] * len(word)  # every character of the word gets the POS tag
                end = len(word_flags) - 1     # end position
                word_bounds[end] = 'E'        # last character gets 'E'

    # --------- cut all document-level sequences into sentence-sized pieces in one loop ---------
    tags = []
    bounds = []
    flags = []
    start = 0
    end = 0
    for s in texts:                  # iterate the split sentences
        l = len(s)                   # sentence length
        end += l                     # end position
        bounds.append(word_bounds[start:end])
        flags.append(word_flags[start:end])
        tags.append(tag_list[start:end])
        start += l                   # start position of the next sentence
    data['bound'] = bounds           # word-boundary feature
    data['flag'] = flags             # POS feature
    data['label'] = tags

    # ----------------------------- radical and pinyin features ----------------------------------------
    radical = Radical(RunOption.Radical)  # radical extractor
    pinyin = Radical(RunOption.Pinyin)    # pinyin extractor
    # characters without a radical are marked 'UNK'
    data['radical'] = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    # characters without a pinyin are marked 'UNK'
    data['pinyin'] = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    # checkpoint: texts[1], tags[1], bounds[1], flags[1], data['radical'][1], data['pinyin'][1]

    # ---------------------------------- merge and store the split file ---------------------------------------
    num_samples = len(texts)       # number of sentences
    num_col = len(data.keys())     # number of columns
    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))  # transpose the 6 feature lists per character
        dataset += records + [['sep'] * num_col]  # a 'sep' row separates consecutive sentences
    dataset = dataset[:-1]         # drop the trailing 'sep' row
    dataset = pd.DataFrame(dataset, columns=data.keys())  # convert to a DataFrame
    save_path = f'../data/prepare/{split_name}/{idx}.csv'

    def clean_word(w):  # the tags are already assigned, so line breaks and spaces can be normalized now
        if w == '\n':                       # line break
            return 'LB'
        if w in [' ', '\t', '\u2003']:      # space, tab, full-width space
            return 'SPACE'
        if w.isdigit():                     # NER does not care which digit it is, so map all digits to one class
            return 'num'
        return w

    dataset['word'] = dataset['word'].apply(clean_word)  # clean before saving
    dataset.to_csv(save_path, index=False, encoding='utf-8')
def process_text(file_id, split_method=None, split_name='train_small'):
    """
    Read the text, split it and attach labels, then extract word-boundary,
    POS, radical and pinyin features.
    :param file_id: file name without extension (.txt and .ann)
    :param split_method: text-splitting method (a function)
    :param split_name: name of the folder the result is saved to
    :return:
    """
    data = {}  # holds the characters, labels, boundaries, radicals, pinyin, ...

    # **************************************** get sentences ********************************************
    if split_method is None:
        # no splitting method: treat every line as one sentence
        with open(f'{train_dir}/{file_id}.txt', encoding='utf-8') as f:
            texts = f.readlines()
    else:
        with open(f'{train_dir}/{file_id}.txt', encoding='utf-8') as f:
            texts = f.read()
            texts = split_method(texts)
    data['word'] = texts

    # **************************************** get labels *********************************************
    tag_list = ['O' for sent in texts for word in sent]
    tag = pd.read_csv(f'{train_dir}/{file_id}.ann', header=None, sep='\t')
    for row in range(tag.shape[0]):
        # entity class plus its start/end position in the text; iloc selects by row number
        tag_item = tag.iloc[row][1].split(' ')
        entity_cls, tag_begin, tag_end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        tag_list[tag_begin] = 'B-' + entity_cls       # start position gets 'B-'
        for i in range(tag_begin + 1, tag_end):       # the rest of the entity gets 'I-'
            tag_list[i] = 'I-' + entity_cls
    # every character must have exactly one label
    assert len([word for sent in texts for word in sent]) == len(tag_list)

    tag_split_list = []
    sen_b = 0
    sen_e = 0
    for s in texts:
        leng = len(s)
        sen_e += leng
        tag_split_list.append(tag_list[sen_b:sen_e])
        sen_b += leng
    data['label'] = tag_split_list

    # ************************************** word-boundary and POS features *************************************
    # word boundaries (B, E, M, S); every character starts as 'M'
    word_bounds = ['M' for s_tag in tag_list]
    word_flags = []  # POS tag of every character
    for sent in texts:
        # e.g. 中国/ns 成人/n 2/m 型/k 糖尿病/n HBA1C/eng
        for word, flag in psg.cut(sent):
            if len(word) == 1:
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'
                word_flags += [flag] * len(word)
                end = len(word_flags) - 1
                word_bounds[end] = 'E'

    bounds = []
    flags = []
    sen_b = 0
    sen_e = 0
    for s in texts:
        leng = len(s)
        sen_e += leng
        bounds.append(word_bounds[sen_b:sen_e])
        flags.append(word_flags[sen_b:sen_e])
        sen_b += leng
    data['bound'] = bounds
    data['flag'] = flags

    # ************************************** radical and pinyin features *************************************
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)
    # radical of every character; characters without one are marked 'UNK'
    data['radical'] = radical_out = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    # pinyin of every character; characters without one are marked 'UNK'
    data['pinyin'] = pinyin_out = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]

    # **************************************** store the data ****************************************
    num_samples = len(texts)      # total number of sentences
    num_col = len(data.keys())    # number of feature columns
    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))
        dataset += records + [['sep'] * num_col]   # a 'sep' row after every sentence
    dataset = dataset[:-1]                         # drop the trailing 'sep' row
    dataset = pd.DataFrame(dataset, columns=data.keys())  # tabular structure

    # f-strings substitute the {} fields directly
    save_path = f'./wt_pytorch_Medical_BiLSTM_CRF_NER/data/data_preparation/{split_name}/{file_id}.csv'

    def clean_word(word):
        if word == '\n':
            return 'LB'
        if word in [' ', '\t', '\u2003']:
            return 'SPACE'
        # map all digits to a single symbol: NER does not care which digit it is,
        # and this improves generalization
        if word.isdigit():
            return 'num'
        return word

    dataset['word'] = dataset['word'].apply(clean_word)
    dataset.to_csv(save_path, index=False, encoding='utf-8')
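# A possible speed-up sketch (not part of the original code): preprocess all files in
# parallel with multiprocessing.Pool, assuming `train_dir` holds the .txt/.ann pairs.
import os
from multiprocessing import Pool


def process_all_parallel(split_name='train_small'):
    file_ids = [f[:-4] for f in os.listdir(train_dir) if f.endswith('.txt')]
    with Pool(processes=4) as pool:
        pool.starmap(process_text, [(fid, None, split_name) for fid in file_ids])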
def process_text(idx, target_dir='train', split_method=None):
    assert target_dir in ['train', 'test'], "data is split into train and test sets only"
    data = {}

    # ------------------- get word ---------------------------------------------
    if not split_method:
        with open(f'../data/{target_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.readlines()          # split by line
    else:
        with open(f'../data/{target_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.read()
            texts = split_method(texts)    # custom splitting method
    data['word'] = texts

    # ------------------- get label ----------------------------------------------
    tag_list = ['PAD' for s in texts for x in s]
    if target_dir == 'train':
        tag = pd.read_csv(f'../data/{target_dir}/{idx}.ann', header=None, sep='\t')
        for i in range(tag.shape[0]):
            tag_item = tag.iloc[i][1].split(' ')
            cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
            tag_list[start] = 'B-' + cls
            for j in range(start + 1, end):
                tag_list[j] = 'I-' + cls
    assert len([x for s in texts for x in s]) == len(tag_list)

    tags = []
    start = 0
    end = 0
    for item in texts:
        l = len(item)
        end += l
        tags.append(tag_list[start:end])
        start += l
    data['label'] = tags

    # ----------------------------- POS tags and word boundaries --------------------------------------------
    word_bounds = ['I' for s in texts for x in s]
    word_flags = []
    # add POS tags and word spans
    for text in texts:
        for word, flag in jieba.posseg.cut(text):
            if len(word) == 1:
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'
                word_flags += [flag] * len(word)
                end = len(word_flags) - 1
                word_bounds[end] = 'E'

    data['bound'] = []
    data['pos_tag'] = []
    start = 0
    end = 0
    for item in texts:
        l = len(item)
        end += l
        data['pos_tag'].append(word_flags[start:end])
        data['bound'].append(word_bounds[start:end])
        start += l
    assert len(word_bounds) == len([x for s in texts for x in s])
    assert len(word_flags) == len(word_bounds)

    # -------------------------------- radicals and pinyin --------------------------------
    from cnradical import Radical, RunOption
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)
    radical_out = [[
        radical.trans_ch(ele) if radical.trans_ch(ele) is not None else 'PAD'
        for ele in text
    ] for text in texts]
    data['pinyin'] = [[
        pinyin.trans_ch(ele) if pinyin.trans_ch(ele) is not None else 'PAD'
        for ele in text
    ] for text in texts]
    data['radical'] = radical_out

    # --------------------------------------------------------------------------
    num_samples = len(texts)
    num_col = len(data.keys())
    train_file = f'../data/working/{target_dir}/{idx}.csv'
    dataset = []
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))
        dataset += records + [['sep'] * num_col]
    dataset = dataset[:-1]
    dataset = pd.DataFrame(dataset, columns=data.keys())

    def clean_word(w):
        if w == '\n':
            return 'LB'
        elif w in [' ', '\t', '\u2003']:
            return 'SPACE'
        elif w.isdigit():
            return 'num'
        else:
            return w

    dataset['word'] = dataset['word'].apply(clean_word)
    dataset.to_csv(train_file, sep='\t', index=False, encoding='utf8')
def process_text(idx, split_method=None):
    """
    Read a text file and split it, then attach labels and extract word-boundary,
    POS, radical and pinyin features.
    param idx: file name without extension
    param split_method: sentence-splitting function
    return
    """
    # dict holding the label, boundary, POS, radical and pinyin features of every character
    data = {}

    #--------------------------------------------------------------------
    # get sentences
    #--------------------------------------------------------------------
    if split_method is None:
        # no splitting function given -> read line by line
        with open(f'data/{train_dir}/{idx}.txt', encoding='utf8') as f:
            texts = f.readlines()
    else:
        # splitting function given -> split with it
        with open(f'data/{train_dir}/{idx}.txt', encoding='utf8') as f:
            outfile = f'data/train_data_pro/{idx}_pro.txt'
            print(outfile)
            texts = f.read()
            texts = split_method(texts, outfile)

    # extracted sentences
    data['word'] = texts
    print(texts)

    #--------------------------------------------------------------------
    # get labels
    #--------------------------------------------------------------------
    # initially mark every character as 'O'
    tag_list = ['O' for s in texts for x in s]

    # read the .ann file: type, start and end position of every entity (tab-separated)
    tag = pd.read_csv(f'data/{train_dir}/{idx}.ann', header=None, sep='\t')
    # e.g.  0  T1  Disease 1845 1850  1型糖尿病
    for i in range(tag.shape[0]):               # tag.shape[0] is the number of rows
        tag_item = tag.iloc[i][1].split(' ')    # second column, split on spaces
        # some entities span two position ranges; keep only the first start and the last end
        cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        tag_list[start] = 'B-' + cls
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + cls

    # the two sequences must have the same length
    assert len([x for s in texts for x in s]) == len(tag_list)

    #--------------------------------------------------------------------
    # match labels to the split sentences
    #--------------------------------------------------------------------
    tags = []
    start = 0
    end = 0
    for s in texts:
        length = len(s)
        end += length
        tags.append(tag_list[start:end])
        start += length
    print(len(tags))
    data['label'] = tags

    #--------------------------------------------------------------------
    # POS and word-boundary features
    #--------------------------------------------------------------------
    word_bounds = ['M' for item in tag_list]  # 'M' means middle of a word
    word_flags = []                           # POS tags
    for text in texts:
        # POS-tagged jieba segmentation
        for word, flag in psg.cut(text):
            if len(word) == 1:                # single-character word
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'      # word start
                word_flags += [flag] * len(word)  # one POS tag per character
                end = len(word_flags) - 1
                word_bounds[end] = 'E'        # word end

    # store per sentence
    bounds = []
    flags = []
    start = 0
    end = 0
    for s in texts:
        length = len(s)
        end += length
        bounds.append(word_bounds[start:end])
        flags.append(word_flags[start:end])
        start += length
    data['bound'] = bounds
    data['flag'] = flags

    #--------------------------------------------------------------------
    # pinyin and radical features
    #--------------------------------------------------------------------
    radical = Radical(RunOption.Radical)  # radical extractor
    pinyin = Radical(RunOption.Pinyin)    # pinyin extractor
    # characters without a radical/pinyin get the special token 'PAD'
    radical_out = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'PAD'
        for x in s
    ] for s in texts]
    pinyin_out = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'PAD'
        for x in s
    ] for s in texts]
    data['radical'] = radical_out
    data['pinyin'] = pinyin_out

    # return texts, tags, bounds, flags
    return texts[0], tags[0], bounds[0], flags[0], radical_out[0], pinyin_out[0]
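# A minimal check sketch for this variant, which returns the first sentence's aligned
# features instead of saving a CSV. The file id '0' is hypothetical, and `split_text`
# (taking the raw text and an output path) is assumed to be defined elsewhere.
word, label, bound, flag, radical, pinyin = process_text('0', split_method=split_text)
for row in zip(word, label, bound, flag, radical, pinyin):
    print(row)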
def process_text(idx, split_method=None, split_name='train'):
    """
    Read the text, split it, attach labels, and extract word-boundary, POS,
    radical and pinyin features.
    :param idx: file name without extension
    :param split_method: text-splitting method
    :param split_name: whether to save to the training-set or the test-set folder
    :return:
    """
    data = {}

    # get sentences
    if split_method is None:
        with open(f'./datas/{train_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.readlines()
    else:
        with open(f'./datas/{train_dir}/{idx}.txt', 'r', encoding='utf-8') as f:
            texts = f.read()
            texts = split_text(texts)
    data['word'] = texts

    # labels: everything starts as 'O'
    tag_list = ['O' for s in texts for _ in s]
    # read the matching .ann file
    tag = pd.read_csv(f'./datas/{train_dir}/{idx}.ann', header=None, sep='\t')
    for i in range(tag.shape[0]):
        # entity class plus start/end positions
        tag_item = tag.iloc[i][1].split(' ')
        cls, start, end = tag_item[0], int(tag_item[1]), int(tag_item[-1])
        # the first character of the entity gets 'B-'
        tag_list[start] = 'B-' + cls
        # the remaining characters get 'I-'
        for j in range(start + 1, end):
            tag_list[j] = 'I-' + cls
    # the lengths must match
    assert len([x for s in texts for x in s]) == len(tag_list)

    # word-boundary and POS features
    word_bounds = ['M' for _ in tag_list]  # boundary tag for every character
    word_flags = []                        # POS tags
    for text in texts:
        # POS-tagged segmentation
        for word, flag in psg.cut(text):
            if len(word) == 1:             # single-character word
                start = len(word_flags)
                word_bounds[start] = 'S'
                word_flags.append(flag)
            else:
                start = len(word_flags)
                word_bounds[start] = 'B'
                word_flags += [flag] * len(word)
                end = len(word_flags) - 1  # the end index needs the -1
                word_bounds[end] = 'E'

    # cut everything into sentence-sized pieces
    bounds = []
    flags = []
    tags = []
    start = 0
    end = 0
    for s in texts:
        ldx = len(s)
        end += ldx
        bounds.append(word_bounds[start:end])
        flags.append(word_flags[start:end])
        tags.append(tag_list[start:end])
        start += ldx
    data['bound'] = bounds
    data['flag'] = flags
    data['label'] = tags

    # radical and pinyin features; characters without one are filled with 'UNK'
    radical = Radical(RunOption.Radical)
    pinyin = Radical(RunOption.Pinyin)
    data['radical'] = [[
        radical.trans_ch(x) if radical.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]
    data['pinyin'] = [[
        pinyin.trans_ch(x) if pinyin.trans_ch(x) is not None else 'UNK'
        for x in s
    ] for s in texts]

    # store the data
    num_samples = len(texts)
    num_col = len(data.keys())
    dataset = []
    # rows look like ('中', 'B', 'ns', 'O', '丨', 'zhōng'), ('国', 'E', 'ns', 'O', '囗', 'guó'), ...
    for i in range(num_samples):
        records = list(zip(*[list(v[i]) for v in data.values()]))  # * unpacks the per-sentence columns
        dataset += records + [['sep'] * num_col]  # separator row between sentences
    dataset = dataset[:-1]                        # drop the trailing separator
    dataset = pd.DataFrame(dataset, columns=data.keys())
    # CSV output path
    save_path = f'datas/prepare_data/{split_name}/{idx}.csv'

    # line breaks, whitespace and digits can be normalized now
    def clean_word(w):
        if w == '\n':
            return 'LB'
        if w in [' ', '\t', '\u2003']:
            return 'SPACE'
        if w.isdigit():          # treat all digits uniformly
            return 'num'
        return w

    dataset['word'] = dataset['word'].apply(clean_word)
    dataset.to_csv(save_path, index=False, encoding='utf-8')