# Shared imports for the snippets below.
import re

import chinese_converter
import conllu
import numpy as np
import torch
from chinese_converter import to_simplified, to_traditional
from transformers import BertModel, BertTokenizer


def create_from_ud(cls, data_file_list, split_chars=True):
    """Initialize corpus from a list of paths to files in CoNLL-U format.

    split_chars: if true, split up multisyllabic words into characters.
    """
    corpus = POSCorpus()
    corpus.sentences = []
    for data_file_path in data_file_list:
        with open(data_file_path, "r", encoding="utf-8") as data_file:
            data = data_file.read()
        data = conllu.parse(data)
        for token_list in data:
            sentence = []
            for token in token_list:
                pos = token['upostag']
                word = token['form']
                if split_chars:
                    # Emit one entry per character, converted to simplified Chinese.
                    for char in word:
                        sentence.append({'char': chinese_converter.to_simplified(char), 'pos': pos})
                else:
                    sentence.append({'word': chinese_converter.to_simplified(word), 'pos': pos})
            if len(sentence) > 0:
                corpus.sentences.append(sentence)
    return corpus
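# A minimal usage sketch for create_from_ud, assuming it is exposed as a
# classmethod on POSCorpus; the .conllu paths below are hypothetical placeholders.
def _demo_create_from_ud():
    corpus = POSCorpus.create_from_ud(
        ["ud/zh_gsd-ud-train.conllu", "ud/zh_gsd-ud-dev.conllu"],
        split_chars=True,
    )
    print("sentences:", len(corpus.sentences))
    print("first few tokens:", corpus.sentences[0][:3])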
def test_two_way(self):
    with self.subTest("should get same result when translated both ways"):
        trad = '皇后與國王在後面共同候車'
        result = to_traditional(to_simplified(trad))
        self.assertEqual(trad, result)
        simp = '皇后与国王在后面共同候车后'
        result = to_simplified(to_traditional(simp))
        self.assertEqual(simp, result)
def process_data(label, text):
    data = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    # Strip Latin characters, digits and punctuation (half- and full-width).
    text = re.sub(
        '[a-zA-Z0-9 ’!"#$%&\'()*+,-./:;<=>?@?★…【】《》?“”‘’![\\]^_`{|}~]',
        '', text)
    # Split on '。' and regroup the pieces into chunks shorter than 511
    # characters so they fit within BERT's 512-token limit.
    text_split = re.split('。', text)
    text_split_join = []
    temp = ''
    for i in text_split:
        if len(temp) + len(i) < 511:
            temp += i
        else:
            if len(temp) < 511:
                text_split_join.append(temp)
            temp = i
    if temp and len(temp) < 511:
        text_split_join.append(temp)
    for text in text_split_join:
        text = chinese_converter.to_simplified(text)
        encoded_text = tokenizer(text, padding=True, return_tensors="pt",
                                 add_special_tokens=True)
        encoded_text = np.array(encoded_text["input_ids"])[0]
        if label[0:2] == '[]':
            # No entities: every token is labelled 0.
            label_cls = [0] * len(encoded_text)
        else:
            if type(label) == type(''):
                label = label.split(', ')
            label_cls = [0] * len(encoded_text)
            for i in label:
                # Each entry is a quoted entity name; take the text between the quotes.
                i = i.split('\'')
                i = chinese_converter.to_simplified(i[1])
                encode_i = tokenizer(i, padding=True, return_tensors="pt",
                                     add_special_tokens=True)
                encode_i = np.array(encode_i['input_ids'][0][1:-1])
                # Mark every occurrence of the entity's token sequence with 1s.
                for j in range(0, len(encoded_text) - len(encode_i)):
                    if (encoded_text[j:j + len(encode_i)] == encode_i).all():
                        for k in range(0, len(encode_i)):
                            label_cls[j + k] = 1
        data.append(
            (torch.tensor(encoded_text), torch.tensor(np.array(label_cls))))
    return data
def make_data(a, b):
    data = []
    model = BertModel.from_pretrained('bert-base-chinese')
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    for i in range(a, b):
        # Read one training file per index; its first line is the label string.
        with open("training_set/" + str(i) + ".txt", "r") as f:
            lines = f.readlines()
        label = lines[0]
        text = lines[1] if len(lines) == 2 else ''.join(lines[2:])
        # Strip Latin characters, digits and punctuation (half- and full-width).
        text = re.sub('[a-zA-Z0-9 ’!"#$%&\'()*+,-./:;<=>?@?★…【】《》?“”‘’![\\]^_`{|}~]', '', text)
        # Split on '。' and regroup into chunks shorter than 511 characters.
        text_split = re.split('。', text)
        text_split_join = []
        temp = ''
        for i in text_split:
            if len(temp) + len(i) < 511:
                temp += i
            else:
                if len(temp) < 511:
                    text_split_join.append(temp)
                temp = i
        if temp and len(temp) < 511:
            text_split_join.append(temp)
        for text in text_split_join:
            text = chinese_converter.to_simplified(text)
            # The tokenizer automatically adds the special tokens ERNIE needs, e.g. [CLS], [SEP].
            encoded_text = tokenizer(text, padding=True, return_tensors="pt", add_special_tokens=True)
            encoded_text = np.array(encoded_text["input_ids"])[0]  # [:MAX_SEQLEN]
            # encoded_text = np.pad(encoded_text, (0, MAX_SEQLEN - len(encoded_text)), mode='constant')
            # Padding every sentence to length 11000 would use quite a lot of GPU memory.
            if label[0:2] == '[]':
                label_cls = [0] * len(encoded_text)
            else:
                if type(label) == type(''):
                    label = label.split(', ')
                label_cls = [0] * len(encoded_text)
                for i in label:
                    i = i.split('\'')
                    i = chinese_converter.to_simplified(i[1])
                    encode_i = tokenizer(i, padding=True, return_tensors="pt", add_special_tokens=True)
                    encode_i = np.array(encode_i['input_ids'][0][1:-1])
                    for j in range(0, len(encoded_text) - len(encode_i)):
                        if (encoded_text[j:j + len(encode_i)] == encode_i).all():
                            for k in range(0, len(encode_i)):
                                label_cls[j + k] = 1
            data.append((torch.tensor(encoded_text), torch.tensor(np.array(label_cls))))
    return data
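# A minimal sketch of consuming the (input_ids, labels) pairs returned by
# make_data / process_data. batch_size=1 sidesteps padding, since the chunks
# have different lengths; the file range below is a hypothetical example.
from torch.utils.data import DataLoader

def _demo_training_loader():
    train_pairs = make_data(0, 100)  # reads training_set/0.txt .. training_set/99.txt
    loader = DataLoader(train_pairs, batch_size=1, shuffle=True)
    for input_ids, labels in loader:
        # input_ids: (1, seq_len) token ids; labels: (1, seq_len) 0/1 entity mask.
        pass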
def inference(text, model):
    data = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    # Split on '。' and regroup into chunks shorter than 511 characters.
    text_split = re.split('。', text)
    text_split_join = []
    temp = ''
    for i in text_split:
        if len(temp) + len(i) < 511:
            temp += i
        else:
            if len(temp) < 511:
                text_split_join.append(temp)
            temp = i
    if temp and len(temp) < 511:
        text_split_join.append(temp)
    for text in text_split_join:
        # Strip Latin characters, digits and punctuation (half- and full-width).
        text = re.sub(
            '[a-zA-Z0-9 ’!"#$%&\'()*+,-./:;<=>?@?★…【】《》?“”‘’![\\]^_`{|}~]',
            '', text)
        text_convert = chinese_converter.to_simplified(text)
        encoded_text = tokenizer(text_convert, padding=True, return_tensors="pt",
                                 add_special_tokens=True)
        encoded_text = np.array(encoded_text["input_ids"])[0]
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        outputs = model(input_ids=torch.tensor([encoded_text]).to(device))
        # Drop the [CLS]/[SEP] logits so that logit position i roughly lines up with text[i].
        logits = outputs[0][0][1:-1]
        index = 0
        for i in range(0, len(logits)):
            if logits[i][1] > logits[i][0]:
                # Consecutive positive positions are merged into one entity span.
                if data and i == index + 1:
                    data[-1] += text[i]
                else:
                    data.append(text[i])
                index = i
    # Keep only unique candidates of length 2-4 characters.
    data = set(data)
    res = []
    for i in data:
        if 2 <= len(i) <= 4:
            res.append(i)
    return res
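# A minimal sketch of running inference with a fine-tuned checkpoint. It assumes
# the model is a two-label token classifier (the loop above compares logits[i][1]
# against logits[i][0]) saved with save_pretrained; "checkpoints/ner-zh" is a
# hypothetical path.
from transformers import BertForTokenClassification

def _demo_inference():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = BertForTokenClassification.from_pretrained("checkpoints/ner-zh", num_labels=2)
    model.to(device)
    model.eval()
    with torch.no_grad():
        entities = inference("皇后與國王在後面共同候車。", model)
    print(entities)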
def testing(text, model):
    data = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    text_split = re.split('。', text)
    text_split_join = []
    temp = ''
    for i in text_split:
        if len(temp) + len(i) < 511:
            temp += i
        else:
            if len(temp) < 511:
                text_split_join.append(temp)
            temp = i
    if temp and len(temp) < 511:
        text_split_join.append(temp)
    for text in text_split_join:
        text = re.sub('[a-zA-Z0-9 ’!"#$%&\'()*+,-./:;<=>?@?★…【】《》?“”‘’![\\]^_`{|}~]', '', text)
        # print(len(text), text)
        text_convert = chinese_converter.to_simplified(text)
        encoded_text = tokenizer(text_convert, padding=True, return_tensors="pt",
                                 add_special_tokens=True)
        encoded_text = np.array(encoded_text["input_ids"])[0]
        # print(encoded_text)
        # predicted_index = encoded_text
        # predicted_token = [tokenizer.convert_ids_to_tokens([predicted_index[x]])[0]
        #                    for x in range(1, (len(encoded_text) - 1))]
        # print(predicted_token)
        outputs = model(input_ids=torch.tensor([encoded_text]).to('cuda:0'))
        logits = outputs[0][0][1:-1]
        # print(len(logits))
        index = 0
        for i in range(0, len(logits)):
            if logits[i][1] > logits[i][0]:
                if data and i == index + 1:
                    data[-1] += text[i]
                else:
                    data.append(text[i])
                index = i
    data = set(data)
    res = []
    for i in data:
        if 2 <= len(i) <= 4:
            res.append(i)
    print(res)
    return set(data)
def _to_simplified(self, x):
    return [cc.to_simplified(proc) for proc in x]
def test_to_simplified(self):
    result = to_simplified('皇后與國王在後面共同候車吃麵')
    self.assertEqual(result, '皇后与国王在后面共同候车吃面')
def test_other_language(self):
    text = "this is a book."
    with self.subTest("should not change text in other languages"):
        self.assertEqual(text, to_simplified(text))
        self.assertEqual(text, to_traditional(text))