def load_sentences(path, lower, zero):
    """Load training samples; one sentence is one sample.

    Each non-empty line holds a character and its tag (for example
    ``长 B-Dur``); sentences are separated by empty lines.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zero: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; per the original note it replaces every digit
        with 0)
    :return: list of sentences, e.g.
        ``[[['无', 'O'], ['长', 'B-Dur'], ['期', 'I-Dur'], ...]]``
    :raises AssertionError: if a non-empty line does not yield exactly two
        fields
    """
    sentences = []
    sentence = []
    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r', encoding='utf8') as reader:
        for num, raw in enumerate(reader, 1):
            line = raw.rstrip()
            if zero:
                line = zero_digits(line)
            if not line:
                # Empty line: the current sentence is complete.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # The line starts with a space: store "<unk>" as the token
                # and take everything from index 2 on as the tag.
                word_pair = ["<unk>", line[2:]]
            else:
                word_pair = line.split()
            if len(word_pair) != 2:
                # Explicit raise instead of ``assert`` so the check is not
                # stripped when Python runs with -O.
                raise AssertionError(
                    "line {}: expected a token and a tag, got {!r}".format(
                        num, word_pair))
            sentence.append(word_pair)
    # The last sentence may not be followed by an empty line.
    if sentence:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load sentences.

    A line must contain at least a word and its tag.
    Sentences are separated by empty lines. Sentences whose first token
    contains ``DOCSTART`` are discarded.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; presumably replaces digits with 0 - confirm)
    :return: list of sentences, each a list of ``[word, tag, ...]`` lists
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []
    # The context manager closes the file even when a malformed line raises.
    with codecs.open(path, 'r', 'utf8') as reader:
        for num, raw in enumerate(reader, 1):
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                # Empty line: flush the sentence collected so far.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # A leading space is replaced by "$" so that split() keeps a
                # placeholder token instead of dropping it.
                line = "$" + line[1:]
            word = line.split()
            if len(word) < 2:
                # Explicit raise instead of ``assert ..., print(...)``: the
                # check survives -O and the message names the bad line.
                raise AssertionError(
                    "line {}: expected a word and a tag, got {!r}".format(
                        num, word))
            sentence.append(word)
    # The file may end without a trailing empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load sentences.

    A line must contain at least a word and its tag.
    Sentences are separated by empty lines. Sentences whose first token
    contains ``DOCSTART`` are discarded.

    :param path: the data path (UTF-8 encoded file)
    :param lower: whether to lower-case (not used here)
    :param zeros: whether to replace digits with zero; when truthy each
        line is passed through ``zero_digits`` (defined elsewhere)
    :return: list of sentences; each sentence is a list of
        ``[word, tag, ...]`` lists
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    current = []
    # ``with`` guarantees the file is closed, including on a parse error.
    with codecs.open(path, 'r', 'utf8') as handle:
        for line_no, raw in enumerate(handle, 1):
            stripped = raw.rstrip()
            line = zero_digits(stripped) if zeros else stripped
            if line:
                if line[0] == " ":
                    # Keep a "$" placeholder for a leading space; split()
                    # would otherwise discard it.
                    line = "$" + line[1:]
                fields = line.split()
                if len(fields) < 2:
                    # Raised explicitly so the validation is not removed
                    # under -O and the message identifies the line.
                    raise AssertionError(
                        "line {}: expected a word and a tag, got {!r}".format(
                            line_no, fields))
                current.append(fields)
            elif current:
                # Empty line ends the sentence; document markers are dropped.
                if 'DOCSTART' not in current[0][0]:
                    sentences.append(current)
                current = []
    # Flush a final sentence that is not followed by an empty line.
    if current and 'DOCSTART' not in current[0][0]:
        sentences.append(current)
    return sentences
def load_sentences(path, lower, zeros):
    """Load tagged sentences from a file with one ``word tag`` pair per line.

    Empty lines separate sentences.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; per the original note it turns digits into 0)
    :return: list of sentences, each a list of ``[word, tag]`` lists
    :raises AssertionError: if a non-empty line does not split into exactly
        two fields
    """
    sentences = []
    sentence = []
    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r', encoding='utf8') as reader:
        for num, raw in enumerate(reader, 1):
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                # Empty line: store the finished sentence.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # A leading space is replaced by "$" so split() keeps a
                # placeholder token.
                line = "$" + line[1:]
            # Splitting the line into word and tag is the core operation.
            word = line.split()
            if len(word) != 2:
                # Explicit raise so the validation is not stripped by -O.
                raise AssertionError(
                    "line {}: expected a word and a tag, got {!r}".format(
                        num, word))
            sentence.append(word)
    # NOTE(review): only this trailing flush filters on DOCSTART; the
    # in-loop flush above does not. Kept as found - confirm the intent.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load tagged sentences from a column-formatted file.

    Each non-empty line is split on whitespace and must yield at least two
    fields; empty lines separate sentences. Sentences whose first token
    contains ``DOCSTART`` are discarded.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; presumably replaces digits with 0 - confirm)
    :return: list of sentences, each a list of field lists
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []
    # ``with`` closes the file even if parsing aborts part-way through.
    with codecs.open(path, 'r', 'utf8') as reader:
        for num, raw in enumerate(reader, 1):
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                if sentence:
                    # Sentence boundary; drop document-start markers.
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # Replace a leading space with "$" so split() keeps a
                # placeholder in the first field.
                line = "$" + line[1:]
            word = line.split()
            if len(word) < 2:
                # The offending fields go into the exception message rather
                # than being printed as an assert side effect.
                raise AssertionError(
                    "line {}: expected at least two fields, got {!r}".format(
                        num, word))
            sentence.append(word)
    # Flush a final sentence that is not followed by an empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load sentences.

    Each non-empty line is split on whitespace into its fields (normally a
    word and its tag). Sentences are separated by empty lines, and
    sentences whose first token contains ``DOCSTART`` are discarded.
    The number of fields per line is not validated.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; presumably replaces digits with 0 - confirm)
    :return: list of sentences, each a list of field lists
    """
    sentences = []
    sentence = []
    # The context manager closes the file once reading is finished.
    with codecs.open(path, 'r', 'utf8') as reader:
        for raw in reader:
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                # Empty line: flush the sentence collected so far.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # A leading space is replaced by "$" so split() keeps a
                # placeholder token.
                line = "$" + line[1:]
            sentence.append(line.split())
    # The file may end without a trailing empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zero):
    """Load the data set; each line holds a word and its tag.

    Empty lines separate sentences. Lines that do not split into exactly
    two fields are silently skipped, and sentences whose first token
    contains ``DOCSTART`` are discarded.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zero: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; per the original note it turns digits into 0 so
        that numbers generalise)
    :return: list of sentences, each a list of ``[word, tag]`` lists
    """
    sentences = []
    sentence = []
    # The context manager closes the file once reading is finished.
    with codecs.open(path, 'r', encoding='utf-8') as reader:
        for raw in reader:
            line = zero_digits(raw.rstrip()) if zero else raw.rstrip()
            if not line:
                # Empty line: flush the sentence collected so far.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # A leading space is replaced by "$" so split() keeps a
                # placeholder token.
                line = "$" + line[1:]
            word = line.split()
            # Best-effort parsing: anything but a word/tag pair is ignored.
            if len(word) == 2:
                sentence.append(word)
    # The file may end without a trailing empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load sentences.

    A line must contain a word and its tag separated by a single space.
    Any other line (in particular an empty one) ends the current sentence.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, the first field of each line is passed
        through ``zero_digits`` (defined elsewhere; presumably replaces
        digits with 0 - confirm)
    :return: list of non-empty sentences, each a list of ``[word, tag]``
        lists
    """
    sentences = []
    sentence = []
    # The context manager closes the file, which the bare open() never did.
    with open(path, encoding='utf-8') as f:
        for line in f:
            line_list = line.strip().split(" ")
            if zeros:
                line_list[0] = zero_digits(line_list[0])
            if len(line_list) == 2:
                sentence.append(line_list)
            elif sentence:
                # Only flush when something was collected, so consecutive
                # separator lines do not produce empty sentences.
                sentences.append(sentence)
                sentence = []
    # Keep the last sentence when the file has no trailing separator line.
    if sentence:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load sentences.

    A line must contain at least a word and its tag.
    Sentences are separated by empty lines. Sentences whose first token
    contains ``DOCSTART`` are discarded.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; presumably replaces digits with 0 - confirm)
    :return: list of sentences, each a list of ``[word, tag, ...]`` lists
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []
    # The context manager closes the file even when a malformed line raises.
    with codecs.open(path, 'r', 'utf8') as reader:
        for num, raw in enumerate(reader, 1):
            # rstrip() removes trailing whitespace including the newline, so
            # a separator line becomes the empty string.
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                # Empty string is falsy: the sentence ends here and is
                # stored in ``sentences``.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # A leading space is replaced by "$" so split() keeps a
                # placeholder token.
                line = "$" + line[1:]
            # split() without arguments cuts on any whitespace run, e.g.
            # '海 O' becomes ['海', 'O'].
            word = line.split()
            if len(word) < 2:
                # Every word needs a tag. Raised explicitly so the check
                # survives -O and reports the offending line.
                raise AssertionError(
                    "line {}: expected a word and a tag, got {!r}".format(
                        num, word))
            sentence.append(word)
    # The file may end without a trailing empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros, data_augment=True):
    """Read training data stored as one sentence per line.

    Each line is a whitespace-separated sequence of ``char/tag`` items,
    for example ``如/O 何/O 演/O``. Items whose second character is not
    ``/`` (including one-character items) are ignored, and lines that
    yield no items are skipped.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; presumably replaces digits with 0 - confirm)
    :param data_augment: whether to pass the loaded sentences through
        ``data_augmentation`` (defined elsewhere) before returning
    :return: list of sentences, each a list of ``[char, tag]`` lists (or
        whatever ``data_augmentation`` returns when it is applied)
    """
    sentences = []
    # The context manager closes the file once reading is finished.
    with codecs.open(path, 'r', 'utf8') as reader:
        for raw in reader:
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            # The length guard keeps a one-character item from raising
            # IndexError on ``word[1]``; it is skipped like any other item
            # that lacks the "/" separator.
            sentence = [
                [word[0], word[2:]]
                for word in line.split()
                if len(word) > 1 and word[1] == '/'
            ]
            if sentence:
                sentences.append(sentence)
    if data_augment:
        sentences = data_augmentation(sentences)
    return sentences
def load_sentences(path, lower, zeros):
    """Load the sentences of a data set.

    Every character and its tag are stored as a list, and the lists of one
    sentence are grouped into their own list. Empty lines separate
    sentences; sentences whose first token contains ``DOCSTART`` are
    discarded. A line holding a single field gets the tag ``"O"``.

    :param path: data set path (UTF-8 encoded file)
    :param lower: whether to lower-case English characters (unused here)
    :param zeros: whether to set all digits to 0; when truthy each line is
        passed through ``zero_digits`` (defined elsewhere)
    :return: list of sentences, each a list of ``[char, tag, ...]`` lists
    """
    sentences = []  # all sentences
    sentence = []  # characters and tags of the sentence being read
    # The context manager closes the file once reading is finished.
    with codecs.open(path, 'r', 'utf8') as reader:
        for raw in reader:
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                # End of a sentence: add it to ``sentences``.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # A leading space is replaced by "$" so split() keeps a
                # placeholder token.
                line = "$" + line[1:]
            word = line.split()
            if len(word) == 1:
                # Untagged line: default the tag to "O" instead of failing.
                word.append("O")
            sentence.append(word)
    # The file may end without a trailing empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load sentences.

    A line must contain exactly a word and its tag (for example
    ``长 B-Dur``). Sentences are separated by empty lines.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, each stripped line is passed through
        ``zero_digits`` (defined elsewhere; presumably replaces digits with
        0 - confirm)
    :return: list of sentences, each a list of ``[word, tag]`` lists
    :raises AssertionError: if a non-empty line does not split into exactly
        two fields
    """
    sentences = []
    sentence = []
    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r', encoding='utf8') as reader:
        for num, raw in enumerate(reader, 1):
            # Strip trailing whitespace in both modes. The previous
            # ``replace(' ', '')`` in the zeros branch deleted the word/tag
            # separator and kept the newline, so no line could parse.
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                # Empty line: save the sentence processed so far.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # Lines such as "  O" start with a space; "$" stands in for
                # it so split() keeps a placeholder token.
                line = "$" + line[1:]
            word = line.split()
            if len(word) != 2:
                # Explicit raise so the validation is not stripped by -O.
                raise AssertionError(
                    "line {}: expected a word and a tag, got {!r}".format(
                        num, word))
            sentence.append(word)
    # Save the last sentence. NOTE(review): only this trailing flush
    # filters on DOCSTART; the in-loop flush does not. Kept as found.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load sentences.

    A line must contain at least a word and its tag.
    Sentences are separated by empty lines. Sentences whose first token
    contains ``DOCSTART`` are discarded.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; presumably replaces digits with 0 - confirm)
    :return: list of sentences, each a list of ``[word, tag, ...]`` lists,
        e.g. ``[[['我', 'O'], ['要', 'O'], ..., ['监', 'B-TYPE'],
        ['控', 'I-TYPE']], ...]``
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []
    # The context manager closes the file even when a malformed line raises.
    with codecs.open(path, 'r', 'utf8') as reader:
        for num, raw in enumerate(reader, 1):
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                # Empty line: flush the sentence collected so far.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # A leading space is replaced by "$" so split() keeps a
                # placeholder token.
                line = "$" + line[1:]
            # For example the line '监 B-TYPE' becomes ['监', 'B-TYPE'].
            word = line.split()
            if len(word) < 2:
                # Explicit raise instead of ``assert ..., print(...)``: the
                # check survives -O and the message names the bad line.
                raise AssertionError(
                    "line {}: expected a word and a tag, got {!r}".format(
                        num, word))
            sentence.append(word)
    # The file may end without a trailing empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower=False, zeros=False):
    """Load sentences.

    A line must contain at least a word and its tag.
    Sentences are separated by empty lines. Sentences whose first token
    contains ``DOCSTART`` are discarded. The path being read is printed to
    stdout.

    :param path: path of the UTF-8 encoded data file
    :param lower: unused by this function
    :param zeros: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; presumably replaces digits with 0 - confirm)
    :return: list of sentences, each a list of ``[word, tag, ...]`` lists
    :raises AssertionError: if a non-empty line has fewer than two fields
    """
    sentences = []
    sentence = []
    with codecs.open(path, 'r', 'utf8') as fread:
        print("Read from {:s}".format(path))
        # enumerate() supplies the line number directly; the separate
        # counter and the assertion comparing it to the index were redundant.
        for line_no, raw in enumerate(fread, 1):
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                # Empty lines only mark the gap between sentences.
                if sentence:
                    if 'DOCSTART' not in sentence[0][0]:
                        sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # A leading space is replaced by "$" so split() keeps a
                # placeholder token (original note: never hit for Chinese
                # NER data).
                line = "$" + line[1:]
            word = line.split()
            if len(word) < 2:
                # Explicit raise so the validation is not stripped by -O.
                raise AssertionError(
                    "line {}: expected a word and a tag, got {!r}".format(
                        line_no, word))
            sentence.append(word)
    # The file may end without a trailing empty line.
    if sentence and 'DOCSTART' not in sentence[0][0]:
        sentences.append(sentence)
    return sentences
def load_sentences(path, lower, zeros):
    """Load sentences.

    A line must contain exactly a word and its tag.
    Sentences are separated by empty lines.

    :param path: path of the UTF-8 encoded, already tagged data file
    :param lower: unused by this function
    :param zeros: when truthy, each line is passed through ``zero_digits``
        (defined elsewhere; presumably replaces digits with 0 - confirm)
    :return: list of sentences, each a list of ``[word, tag]`` lists such
        as ``[['x', 'O'], ['y', 'B-Dur'], ['z', 'I-Dur']]``
    :raises AssertionError: if a non-empty line does not split into exactly
        two fields
    """
    sentences = []  # every sentence of the data set
    sentence = []  # the sentence currently being read
    # The context manager closes the file even when a malformed line raises.
    with open(path, 'r', encoding='utf8') as reader:
        for num, raw in enumerate(reader, 1):
            # Trailing whitespace (including the newline) is removed first.
            line = zero_digits(raw.rstrip()) if zeros else raw.rstrip()
            if not line:
                # Empty line: the sentence is complete; store it and reset.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if line[0] == " ":
                # A leading space is replaced by "$" so split() keeps a
                # placeholder token.
                line = "$" + line[1:]
            word = line.split()
            if len(word) != 2:
                # Explicit raise so the validation is not stripped by -O.
                raise AssertionError(
                    "line {}: expected a word and a tag, got {!r}".format(
                        num, word))
            sentence.append(word)
    # Flush a final sentence that is not followed by an empty line.
    if sentence:
        sentences.append(sentence)
    return sentences