class NERDataset(Dataset):
    """Dataset wrapper that defines how single examples are fetched for this dataset."""
    def __init__(self, sents_src, sents_tgt):
        # The init function typically loads all the data.
        super(NERDataset, self).__init__()
        self.sents_src = sents_src
        self.sents_tgt = sents_tgt
        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        # Fetch a single example.
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]
        token_ids, token_type_ids = self.tokenizer.encode(src)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
            "target_id": tgt
        }
        return output

    def __len__(self):
        return len(self.sents_src)
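# The datasets in this file return variable-length id lists, so the DataLoaders
# built later pass a collate_fn that pads each batch. The library ships its own;
# this is only a minimal sketch of such a padding collate_fn, assuming
# NERDataset-style output dicts (the helper name `padding` and pad index 0 are
# illustrative assumptions, not the library's actual code).
import torch

def padding(indices, max_length, pad_idx=0):
    """Right-pad a list of id lists to max_length (illustrative helper)."""
    return torch.tensor([item + [pad_idx] * (max_length - len(item)) for item in indices])

def collate_fn(batch):
    """Pad a batch of NERDataset outputs to the longest sequence in the batch."""
    token_ids = [data["token_ids"] for data in batch]
    max_length = max(len(t) for t in token_ids)
    token_type_ids = [data["token_type_ids"] for data in batch]
    target_ids = [data["target_id"] for data in batch]  # assumes per-token label lists

    token_ids_padded = padding(token_ids, max_length)
    token_type_ids_padded = padding(token_type_ids, max_length)
    target_ids_padded = padding(target_ids, max_length)
    return token_ids_padded, token_type_ids_padded, target_ids_padded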
class BertDataset(Dataset):
    """Dataset wrapper that defines how single examples are fetched for this dataset."""
    def __init__(self, data):
        # The init function typically loads all the data.
        super(BertDataset, self).__init__()
        self.data = data
        print("data size is " + str(len(data)))
        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        # Fetch a single example.
        single_data = self.data[i]
        original_text = single_data[0]
        ans_text = single_data[1]
        token_ids, token_type_ids = self.tokenizer.encode(
            original_text, ans_text, max_length=maxlen
        )
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output

    def __len__(self):
        return len(self.data)
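# Throughout these datasets, Tokenizer.encode packs one or two texts in the
# standard BERT way; the seq2seq loss further down relies on the second
# segment's token_type_ids all being 1. Schematically (assumed packing,
# consistent with the compute_loss docstring below):
#
#   token_ids      = [CLS] a_1 ... a_n [SEP] b_1 ... b_m [SEP]
#   token_type_ids =   0    0  ...  0    0    1  ...  1    1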
class BertDataset(Dataset):
    """Dataset wrapper that defines how single examples are fetched for this dataset."""
    def __init__(self, sents_src, sents_tgt, vocab_path):
        # The init function typically loads all the data.
        super(BertDataset, self).__init__()
        self.sents_src = sents_src
        self.sents_tgt = sents_tgt
        self.word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
        self.idx2word = {k: v for v, k in self.word2idx.items()}
        self.tokenizer = Tokenizer(self.word2idx)

    def __getitem__(self, i):
        # Fetch a single example.
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]
        token_ids, token_type_ids = self.tokenizer.encode(src, tgt)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output

    def __len__(self):
        return len(self.sents_src)
class BertDataset(Dataset):
    def __init__(self):
        super(BertDataset, self).__init__()
        self.sents_src = read_file(
            "/content/drive/My Drive/ColabNotebooks/summary/extra_dict/train.src"
        )
        self.sents_tgt = read_file(
            "/content/drive/My Drive/ColabNotebooks/summary/extra_dict/train.tgt"
        )
        self.sents_src = self.sents_src.split('\n')
        self.sents_tgt = self.sents_tgt.split('\n')
        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        title = self.sents_tgt[i]
        content = self.sents_src[i]
        token_ids, token_type_ids = self.tokenizer.encode(content, title, max_length=maxlen)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output

    def __len__(self):
        return len(self.sents_src)
def ner_print(model, test_data, vocab_path, device="cpu"):
    model.eval()
    word2idx = load_chinese_base_vocab(vocab_path)
    tokenizer = Tokenizer(word2idx)
    trans = model.state_dict()["crf_layer.trans"]
    for text in test_data:
        decode = []
        text_encode, text_ids = tokenizer.encode(text)
        text_tensor = torch.tensor(text_encode, device=device).view(1, -1)
        out = model(text_tensor).squeeze(0)  # emission scores (the CRF "nodes")
        labels = viterbi_decode(out, trans)
        for l in labels:
            if l > 0:
                label = target[l.item()]
                decode.append(label)
            else:
                decode.append("other")
        flag = 0
        res = {}
        for index, each_entity in enumerate(decode):
            if each_entity != "other":
                if flag != each_entity:
                    # A new entity starts here; index - 1 compensates for the [CLS] offset.
                    cur_text = text[index - 1]
                    if each_entity in res.keys():
                        res[each_entity].append(cur_text)
                    else:
                        res[each_entity] = [cur_text]
                    flag = each_entity
                elif flag == each_entity:
                    # The same entity continues; extend the current span.
                    res[each_entity][-1] += text[index - 1]
            else:
                flag = 0
        print(res)
def read_corpus(dir_path, vocab_path):
    """Read the raw poem data."""
    sents_src = []
    sents_tgt = []
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    files = os.listdir(dir_path)  # all file names in the directory
    for file1 in files:  # iterate over the directory
        if not os.path.isdir(file1):  # only open regular files, not directories
            file_path = dir_path + "/" + file1
            print(file_path)
            if file_path[-3:] != "csv":
                continue
            df = pd.read_csv(file_path)
            # First check each poem's form, then decide whether to build an example.
            for index, row in df.iterrows():
                if type(row[0]) is not str or type(row[3]) is not str:
                    continue
                if len(row[0].split(" ")) > 1:
                    # The title contains a space; keep only the part before it.
                    row[0] = row[0].split(" ")[0]
                if len(row[0]) > 10 or len(row[0]) < 1:
                    # Filter out titles that are too long or too short.
                    continue
                encode_text = tokenizer.encode(row[3])[0]
                if word2idx["[UNK]"] in encode_text:
                    # Filter out poems containing [UNK] characters.
                    continue
                if len(row[3]) == 24 and (row[3][5] == "," or row[3][5] == "。"):
                    # five-character quatrain (五言绝句)
                    sents_src.append(row[0] + "##" + "五言绝句")
                    sents_tgt.append(row[3])
                elif len(row[3]) == 32 and (row[3][7] == "," or row[3][7] == "。"):
                    # seven-character quatrain (七言绝句)
                    sents_src.append(row[0] + "##" + "七言绝句")
                    sents_tgt.append(row[3])
                elif len(row[3]) == 48 and (row[3][5] == "," or row[3][5] == "。"):
                    # five-character regulated verse (五言律诗)
                    sents_src.append(row[0] + "##" + "五言律诗")
                    sents_tgt.append(row[3])
                elif len(row[3]) == 64 and (row[3][7] == "," or row[3][7] == "。"):
                    # seven-character regulated verse (七言律诗)
                    sents_src.append(row[0] + "##" + "七言律诗")
                    sents_tgt.append(row[3])
    print("first poem dataset: " + str(len(sents_src)) + " poems in total")
    return sents_src, sents_tgt
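# The magic numbers above encode the fixed poem forms: quatrains have four
# lines, regulated verse eight, and every line is followed by one punctuation
# mark, so e.g. a 五言绝句 occupies 4*5 + 4 = 24 characters with its first comma
# at index 5. A quick sanity check of that arithmetic:
forms = {"五言绝句": (4, 5, 24), "七言绝句": (4, 7, 32),
         "五言律诗": (8, 5, 48), "七言律诗": (8, 7, 64)}
for name, (lines, chars, total) in forms.items():
    assert lines * chars + lines == total
    # The first comma sits right after the first line, i.e. at index == chars.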
def read_corpus_2(dir_path, vocab_path):
    """Read the more recent dataset: Tang and Song poems."""
    sents_src = []
    sents_tgt = []
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    files = os.listdir(dir_path)  # all file names in the directory
    for file1 in files:  # iterate over the directory
        if not os.path.isdir(file1):  # only open regular files, not directories
            file_path = dir_path + "/" + file1
            print(file_path)
            with open(file_path) as f:
                # The files hold JSON arrays; json.load(f) would be the safer choice here.
                poem_list = eval(f.read())
            for each_poem in poem_list:
                string_list = each_poem["paragraphs"]
                poem = ""
                for each_s in string_list:
                    poem += each_s
                cc = opencc.OpenCC('t2s')
                poem = cc.convert(poem)  # convert traditional to simplified characters
                encode_text = tokenizer.encode(poem)[0]
                if word2idx["[UNK]"] in encode_text:
                    # Filter out poems containing [UNK] characters.
                    continue
                title = cc.convert(each_poem["title"])
                if len(title) > 10 or len(title) < 1:
                    # Filter out titles that are too long or too short.
                    continue
                if len(poem) == 24 and (poem[5] == "," or poem[5] == "。"):
                    # five-character quatrain (五言绝句)
                    sents_src.append(title + "##" + "五言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 32 and (poem[7] == "," or poem[7] == "。"):
                    # seven-character quatrain (七言绝句)
                    sents_src.append(title + "##" + "七言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 48 and (poem[5] == "," or poem[5] == "。"):
                    # five-character regulated verse (五言律诗)
                    sents_src.append(title + "##" + "五言律诗")
                    sents_tgt.append(poem)
                elif len(poem) == 64 and (poem[7] == "," or poem[7] == "。"):
                    # seven-character regulated verse (七言律诗)
                    sents_src.append(title + "##" + "七言律诗")
                    sents_tgt.append(poem)
    print("second poem dataset: " + str(len(sents_src)) + " poems in total")
    return sents_src, sents_tgt
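# read_corpus_2 converts the traditional-character source texts to simplified
# with OpenCC, whose 't2s' configuration maps traditional to simplified, e.g.:
import opencc

cc = opencc.OpenCC('t2s')
print(cc.convert("憶江南"))  # -> 忆江南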
def read_corpus_ci(dir_path, vocab_path):
    """Read the Song ci dataset."""
    import json, sys
    import sqlite3
    from collections import OrderedDict

    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    try:  # Python 2
        reload(sys)
        sys.setdefaultencoding('utf-8')
    except NameError:  # Python 3
        pass
    c = sqlite3.connect(dir_path + '/ci.db')
    cursor = c.execute("SELECT rhythmic, author, content from ci;")
    d = {"rhythmic": None, "author": None, "paragraphs": None}
    sents_src = []
    sents_tgt = []
    for row in cursor:
        ci = OrderedDict(sorted(d.items(), key=lambda t: t[0]))
        ci["rhythmic"] = row[0]
        ci["author"] = row[1]
        ci["paragraphs"] = row[2].split('\n')
        string = ""
        for s in ci["paragraphs"]:
            if s == " >> " or s == "词牌介绍":
                continue
            string += s
        encode_text = tokenizer.encode(string)[0]
        if word2idx["[UNK]"] in encode_text:
            # Filter out ci containing [UNK] characters.
            continue
        sents_src.append(row[0] + "##词")
        sents_tgt.append(string)
    print("ci dataset: " + str(len(sents_src)) + " poems in total")
    return sents_src, sents_tgt
def ner_print(model, test_data):
    model.eval()
    idxtword = {v: k for k, v in word2idx.items()}
    tokenizer = Tokenizer(word2idx)
    trans = model.state_dict()["crf_layer.trans"]
    for text in test_data:
        decode = []
        text_encode, text_ids = tokenizer.encode(text)
        text_tensor = torch.tensor(text_encode, device=model.device).view(1, -1)
        out = model(text_tensor).squeeze(0)  # emission scores (the CRF "nodes")
        labels = viterbi_decode(out, trans)
        for l in labels:
            if l > 0:
                label = target[l.item()]
                decode.append(label)
            else:
                decode.append("O")
        flag = 0
        res = {}
        decode_text = [idxtword[i] for i in text_encode]
        for index, each_entity in enumerate(decode):
            if each_entity != "O":
                if flag != each_entity:
                    # A new entity starts here.
                    cur_text = decode_text[index]
                    if each_entity in res.keys():
                        res[each_entity].append(cur_text)
                    else:
                        res[each_entity] = [cur_text]
                    flag = each_entity
                elif flag == each_entity:
                    # The same entity continues; extend the current span.
                    res[each_entity][-1] += decode_text[index]
            else:
                flag = 0
        print(res)
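# viterbi_decode is provided by the library: it decodes the best label path
# from the emission scores and the CRF transition matrix crf_layer.trans.
# A minimal sketch of what such a decoder does, assuming nodes is a
# (seq_len, num_labels) score matrix and trans[i, j] scores a transition
# from label i to label j (the interface here is an assumption):
import torch

def viterbi_decode_sketch(nodes, trans):
    """Return the highest-scoring label sequence as a LongTensor."""
    seq_len, num_labels = nodes.shape
    scores = nodes[0]                        # best score ending in each label
    paths = [[i] for i in range(num_labels)]
    for t in range(1, seq_len):
        # total[i, j] = scores[i] + trans[i, j] + nodes[t, j]
        total = scores.view(-1, 1) + trans + nodes[t].view(1, -1)
        scores, best_prev = total.max(dim=0)
        paths = [paths[best_prev[j].item()] + [j] for j in range(num_labels)]
    return torch.tensor(paths[scores.argmax().item()])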
class BertDataset(Dataset):
    """Dataset wrapper that defines how single examples are fetched for this dataset."""
    def __init__(self):
        # The init function typically loads all the data.
        super(BertDataset, self).__init__()
        # Collect all the file names.
        self.txts = glob.glob('./state_dict/THUCNews/*/*.txt')
        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        # Fetch a single example.
        text_name = self.txts[i]
        with open(text_name, "r", encoding="utf-8") as f:
            text = f.read()
        text = text.split('\n')
        if len(text) > 1:
            title = text[0]
            content = '\n'.join(text[1:])
            token_ids, token_type_ids = self.tokenizer.encode(
                content, title, max_length=maxlen)
            output = {
                "token_ids": token_ids,
                "token_type_ids": token_type_ids,
            }
            return output
        # Malformed file without a title/content split: fall through to the next one.
        return self.__getitem__(i + 1)

    def __len__(self):
        return len(self.txts)
import sys
sys.path.append("/Users/xingzhaohu/Downloads/code/python/ml/ml_code/bert/bert_seq2seq")
import torch
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert

target = ["0", "1"]
cls_model = "./state_dict/bert_semantic_matching.bin"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == "__main__":
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path to the roberta vocab file
    model_name = "roberta"  # which model to use
    # Load the vocabulary.
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=False)
    tokenizer = Tokenizer(word2idx)
    # Define the model.
    bert_model = load_bert(word2idx, model_name=model_name, model_class="cls", target_size=len(target))
    bert_model.set_device(device)
    bert_model.eval()
    # Load the trained model parameters.
    bert_model.load_all_params(model_path=cls_model, device=device)
    test_data = ["你是不是我仇人#你是俺的仇人吗",
                 "这个就没意思了#我没别的意思",
                 "查一下我的家在哪里#家在哪里?"]
    for text in test_data:
        with torch.no_grad():
            text_ids, _ = tokenizer.encode(text)
            text_ids = torch.tensor(text_ids, device=device).view(1, -1)
            print(text + " -> res is " + str(target[torch.argmax(bert_model(text_ids)).item()]))
class Seq2SeqModel(nn.Module):
    """Seq2seq model: a BERT encoder with a UniLM-style attention mask."""
    def __init__(self, vocab_path, model_name="roberta"):
        super(Seq2SeqModel, self).__init__()
        self.word2ix = load_chinese_base_vocab(vocab_path)
        self.tokenizer = Tokenizer(self.word2ix)
        config = ""
        if model_name == "roberta":
            from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertLMPredictionHead
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
        elif model_name == "bert":
            from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertLMPredictionHead
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
        else:
            raise Exception("model_name_err")
        self.hidden_dim = config.hidden_size
        self.vocab_size = config.vocab_size

    def compute_loss(self, predictions, labels, target_mask):
        """
        target_mask: 0 for sentence a and padding, 1 for sentence b.
        """
        predictions = predictions.view(-1, self.vocab_size)
        labels = labels.view(-1)
        target_mask = target_mask.view(-1).float()
        loss = nn.CrossEntropyLoss(ignore_index=0, reduction="none")
        # The mask removes the influence of padding and of sentence a's predictions.
        return (loss(predictions, labels) * target_mask).sum() / target_mask.sum()

    def forward(self, input_tensor, token_type_id, position_enc=None, labels=None, device="cpu"):
        # Inputs, position encodings and token type ids all arrive as one batch;
        # the seq2seq batch iterator can return all of these values.
        input_shape = input_tensor.shape
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        # Build the special seq2seq attention mask.
        ones = torch.ones((1, 1, seq_len, seq_len), dtype=torch.float32, device=device)
        a_mask = ones.tril()  # lower-triangular matrix
        s_ex12 = token_type_id.unsqueeze(1).unsqueeze(2).float()
        s_ex13 = token_type_id.unsqueeze(1).unsqueeze(3).float()
        a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask

        enc_layers, _ = self.bert(input_tensor, position_ids=position_enc,
                                  token_type_ids=token_type_id, attention_mask=a_mask,
                                  output_all_encoded_layers=True)
        squence_out = enc_layers[-1]  # take the last layer's output
        predictions = self.decoder(squence_out)

        if labels is not None:
            # Computing the loss requires a special output mask.
            # The prediction at the final [SEP] is not needed, hence the :-1 slice.
            predictions = predictions[:, :-1].contiguous()
            target_mask = token_type_id[:, 1:].contiguous()
            loss = self.compute_loss(predictions, labels, target_mask)
            return predictions, loss
        else:
            return predictions

    def generate(self, text, out_max_length=80, beam_size=1, device="cpu", is_poem=False):
        # Generate output for a single sentence.
        # The output budget determines the input budget; longer inputs are truncated.
        self.out_max_length = out_max_length
        input_max_length = max_length - out_max_length
        token_ids, token_type_ids = self.tokenizer.encode(text, max_length=input_max_length)
        token_ids = torch.tensor(token_ids, device=device).view(1, -1)
        token_type_ids = torch.tensor(token_type_ids, device=device).view(1, -1)
        if is_poem:  # beam search for poems is slightly different
            out_puts_ids = self.poem_beam_search(token_ids, token_type_ids, self.word2ix, beam_size=beam_size, device=device)
        else:
            out_puts_ids = self.beam_search(token_ids, token_type_ids, self.word2ix, beam_size=beam_size, device=device)
        # Decode the ids into text.
        return self.tokenizer.decode(out_puts_ids)

    def poem_beam_search(self, token_ids, token_type_ids, word2ix, beam_size=1, device="cpu"):
        """
        Beam search specialised for poem generation.
        """
        ix2word = {v: k for k, v in word2ix.items()}
        sep_id = word2ix["[SEP]"]
        douhao_id = word2ix[","]  # comma
        juhao_id = word2ix["。"]  # full stop
        output_ids = [[]]  # holds the output sequences
        word_list = {}  # tracks characters already generated, to penalise repetition
        last_chars = []
        yayun_save = -1  # remembers the rhyme group
        output_scores = torch.zeros(token_ids.shape[0], device=device)  # cumulative scores
        flag = 0  # whether the first comma has been seen
        for step in range(self.out_max_length):
            scores = self.forward(token_ids, token_type_ids, device=device)
            if step == 0:
                # Repeat the input ids beam_size times.
                token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                token_type_ids = token_type_ids.view(1, -1).repeat(beam_size, 1)
            # Log scores, shape (beam_size, vocab_size).
            logit_score = torch.log_softmax(scores, dim=-1)[:, -1]
            logit_score = output_scores.view(-1, 1) + logit_score  # cumulative scores
            # Flatten before taking the top-k.
            logit_score = logit_score.view(-1)
            hype_score, hype_pos = torch.topk(logit_score, beam_size)
            indice1 = hype_pos // scores.shape[-1]  # row index
            indice2 = hype_pos % scores.shape[-1]  # column index
            # Update the outputs.
            new_hype_scores = []
            new_hype_ids = []
            next_chars = []  # newly predicted characters, appended to the inputs for the next step
            index = 0
            for i_1, i_2, score in zip(indice1, indice2, hype_score):
                i_1 = i_1.item()
                i_2 = i_2.item()
                score = score.item()
                if i_2 != douhao_id and i_2 != juhao_id:
                    if i_2 not in word_list.keys():
                        word_list[i_2] = 1
                    else:
                        # Penalise repetition.
                        word_list[i_2] += 1
                        score -= 1 * word_list[i_2]
                        hype_score[index] -= 1 * word_list[i_2]
                if flag == 0 and i_2 == douhao_id:
                    if len(last_chars) - 1 < index:
                        # A comma was predicted right away; there is no previous character stored yet.
                        break
                    flag += 1
                    word = ix2word[last_chars[index]]  # previous character; remember its rhyme group
                    for i, each_yayun in enumerate(yayun_list):
                        if word in each_yayun:
                            yayun_save = i
                            break
                if i_2 == juhao_id:
                    word = ix2word[last_chars[index]]
                    # Reward rhyming line endings, penalise the rest.
                    if word in yayun_list[yayun_save]:
                        score += 5
                        hype_score[index] += 5
                    else:
                        score -= 2
                        hype_score[index] -= 2
                hype_id = output_ids[i_1] + [i_2]  # the full output sequence, not just the new character
                if i_2 == sep_id:
                    # Decoding of this hypothesis has finished.
                    if score == torch.max(hype_score).item():
                        return hype_id[:-1]
                    else:
                        # Finished, but not the best hypothesis, so drop this sequence.
                        beam_size -= 1
                else:
                    new_hype_ids.append(hype_id)
                    new_hype_scores.append(score)
                    next_chars.append(i_2)  # collect, to be appended to the current inputs
                index += 1
            output_ids = new_hype_ids
            last_chars = next_chars.copy()  # remember the previous characters
            output_scores = torch.tensor(new_hype_scores, dtype=torch.float32, device=device)
            # Rebuild the inputs: concatenate the new characters onto the previous inputs
            # and feed them back into BERT to predict the next character.
            token_ids = token_ids[:len(output_ids)].contiguous()  # drop finished sequences
            token_type_ids = token_type_ids[:len(output_ids)].contiguous()
            next_chars = torch.tensor(next_chars, dtype=torch.long, device=device).view(-1, 1)
            next_token_type_ids = torch.ones_like(next_chars, device=device)
            token_ids = torch.cat((token_ids, next_chars), dim=1)
            token_type_ids = torch.cat((token_type_ids, next_token_type_ids), dim=1)
            if beam_size < 1:
                break
        # Maximum length reached: return the highest-scoring output sequence.
        return output_ids[output_scores.argmax().item()]

    def beam_search(self, token_ids, token_type_ids, word2ix, beam_size=1, device="cpu"):
        """
        Plain beam search.
        """
        sep_id = word2ix["[SEP]"]
        output_ids = [[]]  # holds the output sequences
        output_scores = torch.zeros(token_ids.shape[0], device=device)  # cumulative scores
        for step in range(self.out_max_length):
            scores = self.forward(token_ids, token_type_ids, device=device)
            if step == 0:
                # Repeat the input ids beam_size times.
                token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                token_type_ids = token_type_ids.view(1, -1).repeat(beam_size, 1)
            # Log scores, shape (beam_size, vocab_size).
            logit_score = torch.log_softmax(scores, dim=-1)[:, -1]
            logit_score = output_scores.view(-1, 1) + logit_score  # cumulative scores
            # Flatten before taking the top-k.
            logit_score = logit_score.view(-1)
            hype_score, hype_pos = torch.topk(logit_score, beam_size)
            indice1 = hype_pos // scores.shape[-1]  # row index
            indice2 = hype_pos % scores.shape[-1]  # column index
            # Update the outputs; finished sequences are filtered out below.
            new_hype_scores = []
            new_hype_ids = []
            next_chars = []  # newly predicted characters, appended to the inputs for the next step
            for i_1, i_2, score in zip(indice1, indice2, hype_score):
                i_1 = i_1.item()
                i_2 = i_2.item()
                score = score.item()
                hype_id = output_ids[i_1] + [i_2]  # the full output sequence, not just the new character
                if i_2 == sep_id:
                    # Decoding of this hypothesis has finished.
                    if score == torch.max(hype_score).item():
                        # This is the highest-scoring sequence; return it directly.
                        return hype_id[:-1]
                    else:
                        # Finished, but not the best hypothesis, so drop this sequence.
                        beam_size -= 1
                else:
                    new_hype_ids.append(hype_id)
                    new_hype_scores.append(score)
                    next_chars.append(i_2)  # collect, to be appended to the current inputs
            output_ids = new_hype_ids
            output_scores = torch.tensor(new_hype_scores, dtype=torch.float32, device=device)
            # Rebuild the inputs: concatenate the new characters onto the previous inputs
            # and feed them back into BERT to predict the next character.
            token_ids = token_ids[:len(output_ids)].contiguous()  # drop finished sequences
            token_type_ids = token_type_ids[:len(output_ids)].contiguous()
            next_chars = torch.tensor(next_chars, dtype=torch.long, device=device).view(-1, 1)
            next_token_type_ids = torch.ones_like(next_chars, device=device)
            token_ids = torch.cat((token_ids, next_chars), dim=1)
            token_type_ids = torch.cat((token_type_ids, next_token_type_ids), dim=1)
            if beam_size < 1:
                break
        # Maximum length reached: return the highest-scoring output sequence.
        return output_ids[output_scores.argmax().item()]
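# The mask arithmetic in forward() is the UniLM trick: tokens in sentence a
# (token_type_id 0) attend bidirectionally within sentence a, while tokens in
# sentence b attend to all of a plus only the preceding part of b. Evaluating
# the same expressions for a toy token_type_id of [0, 0, 1, 1]:
import torch

token_type_id = torch.tensor([[0, 0, 1, 1]])  # sentence a, then sentence b
seq_len = token_type_id.shape[1]
ones = torch.ones((1, 1, seq_len, seq_len))
a_mask = ones.tril()
s_ex12 = token_type_id.unsqueeze(1).unsqueeze(2).float()
s_ex13 = token_type_id.unsqueeze(1).unsqueeze(3).float()
a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask
print(a_mask.squeeze())
# tensor([[1., 1., 0., 0.],
#         [1., 1., 0., 0.],
#         [1., 1., 1., 0.],
#         [1., 1., 1., 1.]])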
class Trainer:
    def __init__(self):
        # Load the data.
        self.sents_src, self.sents_tgt = read_corpus(data_path)
        self.tokenizer = Tokenizer(word2idx)
        # Check whether a GPU is available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model.
        self.bert_model = load_bert(word2idx, model_name=model_name,
                                    model_class="sequence_labeling", target_size=len(target))
        # Load the pretrained model parameters.
        self.bert_model.load_pretrain_params(model_path)
        # Send the model to the compute device (GPU or CPU).
        self.bert_model.set_device(self.device)
        # Declare the parameters to optimise.
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
        # Declare the custom data loader.
        dataset = NERDataset(self.sents_src, self.sents_tgt)
        self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

    def train(self, epoch):
        # Train for one epoch.
        self.bert_model.train()
        self.iteration(epoch, dataloader=self.dataloader, train=True)

    def save(self, save_path):
        """Save the model."""
        self.bert_model.save_all_params(save_path)
        print("{} saved!".format(save_path))

    def iteration(self, epoch, dataloader, train=True):
        total_loss = 0
        start_time = time.time()  # record the current time
        step = 0
        for token_ids, token_type_ids, target_ids in tqdm(dataloader, position=0, leave=True):
            step += 1
            if step % 500 == 0:
                self.bert_model.eval()
                test_data = ["日寇在京掠夺文物详情。", "以书结缘,把欧美,港台流行的食品类食谱汇集一堂"]
                for text in test_data:
                    text, text_ids = self.tokenizer.encode(text)
                    text = torch.tensor(text, device=self.device).view(1, -1)
                    out = self.bert_model(text).squeeze(0)
                    out_target = torch.argmax(out, dim=-1)
                    decode_target = [target[i.item()] for i in out_target]
                    print(decode_target)
                self.bert_model.train()
            # Passing target labels makes the model compute and return the loss.
            predictions, loss = self.bert_model(token_ids, labels=target_ids)
            if train:
                # Clear the previous gradients.
                self.optimizer.zero_grad()
                # Backpropagate to obtain new gradients.
                loss.backward()
                # Update the model parameters with the gradients.
                self.optimizer.step()
            # Accumulate the loss for this epoch's total.
            total_loss += loss.item()
        end_time = time.time()
        spend_time = end_time - start_time
        # Print training info.
        print("epoch is " + str(epoch) + ". loss is " + str(total_loss) + ". spend time is " + str(spend_time))
        # Save the model.
        self.save(model_save_path)
bert_model = load_bert(word2idx, model_name=model_name,
                       target_size=len(predicate2id))
bert_model.eval()
bert_model.set_device(device)
# Load the trained model parameters.
checkpoint = torch.load(relation_extrac_model, map_location="cpu")
bert_model.load_all_params(model_path=relation_extrac_model, device=device)
text = [
    "查尔斯·阿兰基斯(Charles Aránguiz),1989年4月17日出生于智利圣地亚哥,智利职业足球运动员,司职中场,效力于德国足球甲级联赛勒沃库森足球俱乐部",
    "李治即位后,萧淑妃受宠,王皇后为了排挤萧淑妃,答应李治让身在感业寺的武则天续起头发,重新纳入后宫",
    "《星空黑夜传奇》是连载于起点中文网的网络小说,作者是啤酒的罪孽"
]
for d in text:
    with torch.no_grad():
        token_ids_test, segment_ids = tokenizer.encode(d, max_length=256)
        token_ids_test = torch.tensor(token_ids_test, device=device).view(1, -1)
        # First predict the subject.
        pred_subject = bert_model.predict_subject(token_ids_test, device=device)
        pred_subject = pred_subject.squeeze(0)
        subject_texts, subject_idss = search_subject(token_ids_test[0], pred_subject.cpu())
        if len(subject_texts) == 0:
            print("no subject predicted~")
        for sub_text, sub_ids in zip(subject_texts, subject_idss):
            print("subject is " + str(sub_text))
            sub_ids = torch.tensor(sub_ids, device=device).view(1, -1)
            # Then predict the object and predicate for this subject.
            object_p_pred = bert_model.predict_object_predicate(
class Trainer:
    def __init__(self):
        # Load the data.
        data_path = "./corpus/新闻标题文本分类/Train.txt"
        self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path to the roberta vocab file
        self.sents_src, self.sents_tgt = read_corpus(data_path)
        self.model_name = "roberta"  # which model to use
        self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # path to the roberta weights
        self.recent_model_path = ""  # for continuing training from an already trained model
        self.model_save_path = "./bert_multi_classify_model.bin"
        self.batch_size = 16
        self.lr = 1e-5
        # Load the vocabulary.
        self.word2idx = load_chinese_base_vocab(self.vocab_path)
        self.tokenizer = Tokenizer(self.word2idx)
        # Check whether a GPU is available.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model.
        self.bert_model = load_bert(self.vocab_path, model_name=self.model_name,
                                    model_class="cls", target_size=len(target))
        # Load the pretrained model parameters.
        load_model_params(self.bert_model, self.model_path)
        # Send the model to the compute device (GPU or CPU).
        self.bert_model.to(self.device)
        # Declare the parameters to optimise.
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
        # Declare the custom data loader.
        dataset = NLUDataset(self.sents_src, self.sents_tgt, self.vocab_path)
        self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)

    def train(self, epoch):
        # Train for one epoch.
        self.bert_model.train()
        self.iteration(epoch, dataloader=self.dataloader, train=True)

    def save(self, save_path):
        """Save the model."""
        torch.save(self.bert_model.state_dict(), save_path)
        print("{} saved!".format(save_path))

    def iteration(self, epoch, dataloader, train=True):
        total_loss = 0
        start_time = time.time()  # record the current time
        step = 0
        for token_ids, token_type_ids, target_ids in tqdm(dataloader, position=0, leave=True):
            step += 1
            if step % 2000 == 0:
                self.bert_model.eval()
                test_data = [
                    "编剧梁馨月讨稿酬六六何念助阵 公司称协商解决",
                    "西班牙BBVA第三季度净利降至15.7亿美元",
                    "基金巨亏30亿 欲打开云天系跌停自救"
                ]
                for text in test_data:
                    text, text_ids = self.tokenizer.encode(text)
                    text = torch.tensor(text, device=self.device).view(1, -1)
                    print(target[torch.argmax(self.bert_model(text)).item()])
                self.bert_model.train()
            token_ids = token_ids.to(self.device)
            token_type_ids = token_type_ids.to(self.device)
            target_ids = target_ids.to(self.device)
            # Passing target labels makes the model compute and return the loss.
            predictions, loss = self.bert_model(token_ids, labels=target_ids)
            if train:
                # Clear the previous gradients.
                self.optimizer.zero_grad()
                # Backpropagate to obtain new gradients.
                loss.backward()
                # Update the model parameters with the gradients.
                self.optimizer.step()
            # Accumulate the loss for this epoch's total.
            total_loss += loss.item()
        end_time = time.time()
        spend_time = end_time - start_time
        # Print training info.
        print("epoch is " + str(epoch) + ". loss is " + str(total_loss) + ". spend time is " + str(spend_time))
        # Save the model.
        self.save(self.model_save_path)
class ExtractDataset(Dataset):
    """Dataset wrapper that defines how single examples are fetched for this dataset."""
    def __init__(self, data, vocab_path):
        # The init function typically loads all the data.
        super(ExtractDataset, self).__init__()
        self.data = data
        self.word2idx = load_chinese_base_vocab(vocab_path)
        self.idx2word = {k: v for v, k in self.word2idx.items()}
        self.tokenizer = Tokenizer(self.word2idx)

    def __getitem__(self, i):
        # Fetch a single example.
        d = self.data[i]
        token_ids, segment_ids = self.tokenizer.encode(d["text"], max_length=128)
        spoes = {}
        for s, p, o in d['spo_list']:
            s = self.tokenizer.encode(s)[0][1:-1]  # strip [CLS] and [SEP]
            p = predicate2id[p]
            o = self.tokenizer.encode(o)[0][1:-1]
            s_idx = search(s, token_ids)
            o_idx = search(o, token_ids)
            if s_idx != -1 and o_idx != -1:
                s = (s_idx, s_idx + len(s) - 1)
                o = (o_idx, o_idx + len(o) - 1, p)
                if s not in spoes:
                    spoes[s] = []
                spoes[s].append(o)
        if spoes:
            # Subject labels: start/end positions marked per token.
            subject_labels = np.zeros((len(token_ids), 2))
            for s in spoes:
                subject_labels[s[0], 0] = 1
                subject_labels[s[1], 1] = 1
            # Randomly pick one subject.
            start, end = np.array(list(spoes.keys())).T
            start = np.random.choice(start)
            end = np.random.choice(end[end >= start])
            subject_ids = (start, end)
            # Object labels for the chosen subject: one start/end pair per predicate.
            object_labels = np.zeros((len(token_ids), len(predicate2id), 2))
            for o in spoes.get(subject_ids, []):
                object_labels[o[0], o[2], 0] = 1
                object_labels[o[1], o[2], 1] = 1
            output = {
                "token_ids": token_ids,
                "token_type_ids": segment_ids,
                "subject_labels": subject_labels,
                "subject_ids": subject_ids,
                "object_labels": object_labels,
            }
            return output
        else:
            # No aligned triple found: fall through to the next example.
            return self.__getitem__(i + 1)

    def __len__(self):
        return len(self.data)
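# ExtractDataset relies on a `search` helper to align the encoded subject and
# object ids with the encoded text. The library provides its own; this is a
# minimal sketch with the assumed contract (first index where pattern occurs
# as a contiguous sub-list, else -1):
def search(pattern, sequence):
    """Return the first start index of pattern inside sequence, or -1."""
    n = len(pattern)
    for i in range(len(sequence) - n + 1):
        if sequence[i:i + n] == pattern:
            return i
    return -1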
class SimBertModel(BasicBert):
    """SimBERT: seq2seq generation combined with a sentence-similarity objective."""
    def __init__(self, word2ix, model_name="roberta", tokenizer=None):
        super(SimBertModel, self).__init__()
        self.word2ix = word2ix
        if tokenizer is None:
            self.tokenizer = Tokenizer(word2ix)
        else:
            self.tokenizer = tokenizer
        config = ""
        if model_name == "roberta":
            from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertLMPredictionHead
            config = BertConfig(len(word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
        elif model_name == "bert":
            from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertLMPredictionHead
            config = BertConfig(len(word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
        else:
            raise Exception("model_name_err")
        self.hidden_dim = config.hidden_size
        self.vocab_size = len(word2ix)

    def compute_loss(self, predictions, labels, target_mask):
        loss1 = self.compute_loss_of_seq2seq(predictions, labels, target_mask)
        loss2 = self.compute_loss_of_similarity(predictions[:, 0])  # use the [CLS] vector
        return loss1 + loss2

    def compute_loss_of_seq2seq(self, predictions, labels, target_mask):
        predictions = predictions.view(-1, self.vocab_size)
        labels = labels.view(-1)
        target_mask = target_mask.view(-1).float()
        loss = nn.CrossEntropyLoss(ignore_index=0, reduction="none")
        # The mask removes the influence of padding and of sentence a's predictions.
        return (loss(predictions, labels) * target_mask).sum() / target_mask.sum()

    def compute_loss_of_similarity(self, y_pred):
        y_true = self.get_labels_of_similarity(y_pred)  # construct the labels
        y_true = y_true.to(self.device)
        norm_a = torch.nn.functional.normalize(y_pred, dim=-1, p=2)  # l2-normalise the sentence vectors
        similarities = norm_a.matmul(norm_a.t())  # similarity matrix
        similarities = similarities - (torch.eye(y_pred.shape[0]) * 1e12).to(self.device)  # mask out the diagonal
        similarities = similarities * 30  # scale
        loss_f = nn.CrossEntropyLoss()
        loss = loss_f(similarities, y_true)
        return loss

    def get_labels_of_similarity(self, y_pred):
        idxs = torch.arange(0, y_pred.shape[0])
        idxs_1 = idxs[None, :]
        idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
        labels = (idxs_1 == idxs_2).float().argmax(dim=-1).long()
        return labels

    def forward(self, input_tensor, token_type_id, position_enc=None, labels=None):
        # Inputs, position encodings and token type ids all arrive as one batch;
        # the seq2seq batch iterator can return all of these values.
        input_tensor = input_tensor.to(self.device)
        token_type_id = token_type_id.to(self.device)
        if position_enc is not None:
            position_enc = position_enc.to(self.device)
        if labels is not None:
            labels = labels.to(self.device)
        input_shape = input_tensor.shape
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        # Build the special seq2seq attention mask.
        ones = torch.ones((1, 1, seq_len, seq_len), dtype=torch.float32, device=self.device)
        a_mask = ones.tril()  # lower-triangular matrix
        s_ex12 = token_type_id.unsqueeze(1).unsqueeze(2).float()
        s_ex13 = token_type_id.unsqueeze(1).unsqueeze(3).float()
        a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask

        enc_layers, _ = self.bert(input_tensor, position_ids=position_enc,
                                  token_type_ids=token_type_id, attention_mask=a_mask,
                                  output_all_encoded_layers=True)
        squence_out = enc_layers[-1]  # take the last layer's output
        predictions = self.decoder(squence_out)

        if labels is not None:
            # Computing the loss requires a special output mask.
            # The prediction at the final [SEP] is not needed, hence the :-1 slice.
            predictions = predictions[:, :-1].contiguous()
            target_mask = token_type_id[:, 1:].contiguous()
            loss = self.compute_loss(predictions, labels, target_mask)
            return predictions, loss
        else:
            return predictions

    def generate(self, text, out_max_length=40, beam_size=1, is_poem=False, max_length=256):
        # Generate output for a single sentence.
        # The output budget determines the input budget; longer inputs are truncated.
        self.out_max_length = out_max_length
        input_max_length = max_length - out_max_length
        try:
            token_ids, token_type_ids = self.tokenizer.encode(text, max_length=input_max_length)
        except Exception:
            # Probably a transformers tokenizer.
            tokenizer_out = self.tokenizer.encode_plus(text, max_length=input_max_length, truncation=True)
            token_ids = tokenizer_out["input_ids"]
            token_type_ids = tokenizer_out["token_type_ids"]
        token_ids = torch.tensor(token_ids, device=self.device).view(1, -1)
        token_type_ids = torch.tensor(token_type_ids, device=self.device).view(1, -1)
        if is_poem:  # beam search for poems is slightly different
            out_puts_ids = self.beam_search_poem(text, token_ids, token_type_ids, self.word2ix, beam_size=beam_size, device=self.device)
        else:
            out_puts_ids = self.beam_search(token_ids, token_type_ids, self.word2ix, beam_size=beam_size, device=self.device)
        return self.tokenizer.decode(out_puts_ids.cpu().numpy())

    def sample_generate(self, text, out_max_length=40, top_k=30, top_p=0.0, max_length=256):
        input_max_length = max_length - out_max_length
        token_ids, token_type_ids = self.tokenizer.encode(text, max_length=input_max_length)
        token_ids = torch.tensor(token_ids, device=self.device, dtype=torch.long).view(1, -1)
        token_type_ids = torch.tensor(token_type_ids, device=self.device, dtype=torch.long).view(1, -1)
        device = self.device
        output_ids = []
        sep_id = self.word2ix["[SEP]"]
        with torch.no_grad():
            for step in range(out_max_length):
                scores = self.forward(token_ids, token_type_ids)
                logit_score = torch.log_softmax(scores[:, -1], dim=-1).squeeze(0)
                logit_score[self.word2ix["[UNK]"]] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(logit_score, top_k=top_k, top_p=top_p)
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                if sep_id == next_token.item():
                    break
                output_ids.append(next_token.item())
                token_ids = torch.cat((token_ids, next_token.long().unsqueeze(0)), dim=1)
                token_type_ids = torch.cat([token_type_ids, torch.ones((1, 1), device=device, dtype=torch.long)], dim=1)
        return self.tokenizer.decode(np.array(output_ids))

    def beam_search(self, token_ids, token_type_ids, word2ix, beam_size=1, device="cpu"):
        """
        Beam search.
        """
        sep_id = word2ix["[SEP]"]
        output_ids = torch.empty(1, 0, device=device, dtype=torch.long)  # holds the output sequences
        with torch.no_grad():
            output_scores = torch.zeros(token_ids.shape[0], device=device)  # cumulative scores
            for step in range(self.out_max_length):
                if step == 0:
                    scores = self.forward(token_ids, token_type_ids)
                    # Repeat the input ids beam_size times.
                    token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                    token_type_ids = token_type_ids.view(1, -1).repeat(beam_size, 1)
                else:
                    scores = self.forward(new_input_ids, new_token_type_ids)
                logit_score = torch.log_softmax(scores[:, -1], dim=-1)
                logit_score = output_scores.view(-1, 1) + logit_score  # cumulative scores
                # Flatten before taking the top-k.
                logit_score = logit_score.view(-1)
                hype_score, hype_pos = torch.topk(logit_score, beam_size)
                indice1 = (hype_pos // scores.shape[-1])  # row index
                indice2 = (hype_pos % scores.shape[-1]).long().reshape(-1, 1)  # column index
                # Update the scores and outputs.
                output_scores = hype_score
                output_ids = torch.cat([output_ids[indice1], indice2], dim=1).long()
                new_input_ids = torch.cat([token_ids, output_ids], dim=1)
                new_token_type_ids = torch.cat([token_type_ids, torch.ones_like(output_ids)], dim=1)

                end_counts = (output_ids == sep_id).sum(1)  # count the end markers
                best_one = output_scores.argmax()
                if end_counts[best_one] == 1:
                    # The best sequence has terminated.
                    return output_ids[best_one][:-1]
                else:
                    # Keep only the unfinished hypotheses.
                    flag = (end_counts < 1)  # mark unfinished sequences
                    if not flag.all():  # some sequences have finished
                        token_ids = token_ids[flag]
                        token_type_ids = token_type_ids[flag]
                        new_input_ids = new_input_ids[flag]
                        new_token_type_ids = new_token_type_ids[flag]
                        output_ids = output_ids[flag]  # drop finished sequences
                        output_scores = output_scores[flag]  # drop finished sequences
                        end_counts = end_counts[flag]  # drop finished end counts
                        beam_size = flag.sum()  # shrink the top-k accordingly
        return output_ids[output_scores.argmax()]
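# get_labels_of_similarity assumes training batches are built as adjacent pairs
# of similar sentences [a, a', b, b', ...]; each sample's positive target is
# then its neighbour. A quick check of the index arithmetic for a batch of 4:
import torch

idxs = torch.arange(0, 4)                # batch of [a, a', b, b']
idxs_1 = idxs[None, :]
idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
print((idxs_1 == idxs_2).float().argmax(dim=-1))
# tensor([1, 0, 3, 2])  -> each sample's "true" match is its pair partner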
class Seq2SeqModel(nn.Module):
    """Seq2seq model: a BERT encoder with a UniLM-style attention mask."""
    def __init__(self, word2ix, model_name="roberta"):
        super(Seq2SeqModel, self).__init__()
        self.word2ix = word2ix
        self.tokenizer = Tokenizer(word2ix)
        config = ""
        if model_name == "roberta":
            from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertLMPredictionHead
            config = BertConfig(len(word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(
                config, self.bert.embeddings.word_embeddings.weight)
        elif model_name == "bert":
            from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertLMPredictionHead
            config = BertConfig(len(word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(
                config, self.bert.embeddings.word_embeddings.weight)
        else:
            raise Exception("model_name_err")
        self.hidden_dim = config.hidden_size
        self.vocab_size = len(word2ix)

    def compute_loss(self, predictions, labels, target_mask):
        """
        target_mask: 0 for sentence a and padding, 1 for sentence b.
        """
        predictions = predictions.view(-1, self.vocab_size)
        labels = labels.view(-1)
        target_mask = target_mask.view(-1).float()
        loss = nn.CrossEntropyLoss(ignore_index=0, reduction="none")
        # The mask removes the influence of padding and of sentence a's predictions.
        return (loss(predictions, labels) * target_mask).sum() / target_mask.sum()

    def forward(self, input_tensor, token_type_id, position_enc=None, labels=None, device="cpu"):
        # Inputs, position encodings and token type ids all arrive as one batch;
        # the seq2seq batch iterator can return all of these values.
        input_shape = input_tensor.shape
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        # Build the special seq2seq attention mask.
        ones = torch.ones((1, 1, seq_len, seq_len), dtype=torch.float32, device=device)
        a_mask = ones.tril()  # lower-triangular matrix
        s_ex12 = token_type_id.unsqueeze(1).unsqueeze(2).float()
        s_ex13 = token_type_id.unsqueeze(1).unsqueeze(3).float()
        a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask

        enc_layers, _ = self.bert(input_tensor, position_ids=position_enc,
                                  token_type_ids=token_type_id, attention_mask=a_mask,
                                  output_all_encoded_layers=True)
        squence_out = enc_layers[-1]  # take the last layer's output
        predictions = self.decoder(squence_out)

        if labels is not None:
            # Computing the loss requires a special output mask.
            # The prediction at the final [SEP] is not needed, hence the :-1 slice.
            predictions = predictions[:, :-1].contiguous()
            target_mask = token_type_id[:, 1:].contiguous()
            loss = self.compute_loss(predictions, labels, target_mask)
            return predictions, loss
        else:
            return predictions

    def generate(self, text, out_max_length=40, beam_size=1, device="cpu", is_poem=False, max_length=256):
        # Generate output for a single sentence.
        # The output budget determines the input budget; longer inputs are truncated.
        self.out_max_length = out_max_length
        input_max_length = max_length - out_max_length
        token_ids, token_type_ids = self.tokenizer.encode(text, max_length=input_max_length)
        token_ids = torch.tensor(token_ids, device=device).view(1, -1)
        token_type_ids = torch.tensor(token_type_ids, device=device).view(1, -1)
        if is_poem:  # beam search for poems is slightly different
            out_puts_ids = self.beam_search_poem(text, token_ids, token_type_ids,
                                                 self.word2ix, beam_size=beam_size, device=device)
        else:
            out_puts_ids = self.beam_search(token_ids, token_type_ids,
                                            self.word2ix, beam_size=beam_size, device=device)
        # Decode the ids into text.
        return self.tokenizer.decode(out_puts_ids.cpu().numpy())

    def beam_search(self, token_ids, token_type_ids, word2ix, beam_size=1, device="cpu"):
        """
        Beam search.
        """
        sep_id = word2ix["[SEP]"]
        output_ids = torch.empty(1, 0, device=device, dtype=torch.long)  # holds the output sequences
        with torch.no_grad():
            output_scores = torch.zeros(token_ids.shape[0], device=device)  # cumulative scores
            for step in range(self.out_max_length):
                if step == 0:
                    scores = self.forward(token_ids, token_type_ids, device=device)
                    # Repeat the input ids beam_size times.
                    token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                    token_type_ids = token_type_ids.view(1, -1).repeat(beam_size, 1)
                else:
                    scores = self.forward(new_input_ids, new_token_type_ids, device=device)
                logit_score = torch.log_softmax(scores[:, -1], dim=-1)
                logit_score = output_scores.view(-1, 1) + logit_score  # cumulative scores
                # Flatten before taking the top-k.
                logit_score = logit_score.view(-1)
                hype_score, hype_pos = torch.topk(logit_score, beam_size)
                indice1 = (hype_pos // scores.shape[-1])  # row index
                indice2 = (hype_pos % scores.shape[-1]).long().reshape(-1, 1)  # column index
                # Update the scores and outputs.
                output_scores = hype_score
                output_ids = torch.cat([output_ids[indice1], indice2], dim=1).long()
                new_input_ids = torch.cat([token_ids, output_ids], dim=1)
                new_token_type_ids = torch.cat([token_type_ids, torch.ones_like(output_ids)], dim=1)

                end_counts = (output_ids == sep_id).sum(1)  # count the end markers
                best_one = output_scores.argmax()
                if end_counts[best_one] == 1:
                    # The best sequence has terminated.
                    return output_ids[best_one][:-1]
                else:
                    # Keep only the unfinished hypotheses.
                    flag = (end_counts < 1)  # mark unfinished sequences
                    if not flag.all():  # some sequences have finished
                        token_ids = token_ids[flag]
                        token_type_ids = token_type_ids[flag]
                        new_input_ids = new_input_ids[flag]
                        new_token_type_ids = new_token_type_ids[flag]
                        output_ids = output_ids[flag]  # drop finished sequences
                        output_scores = output_scores[flag]  # drop finished sequences
                        end_counts = end_counts[flag]  # drop finished end counts
                        beam_size = flag.sum()  # shrink the top-k accordingly
        return output_ids[output_scores.argmax()]

    def beam_search_poem(self, text, token_ids, token_type_ids, word2ix, beam_size=1, device="cpu"):
        """
        Beam search for poems, with rhyme and repetition heuristics.
        """
        yayun_pos = []
        title = text.split("##")[0]
        if "五言律诗" in text:
            yayun_pos = [10, 22, 34, 46]
        elif "五言绝句" in text:
            yayun_pos = [10, 22]
        elif "七言律诗" in text:
            yayun_pos = [14, 30, 46, 62]
        elif "七言绝句" in text:
            yayun_pos = [14, 30]
        sep_id = word2ix["[SEP]"]
        douhao_id = word2ix[","]  # comma
        ix2word = {v: k for k, v in word2ix.items()}
        juhao_id = word2ix["。"]  # full stop
        repeat_word = [[] for i in range(beam_size)]
        output_ids = torch.empty(1, 0, device=device, dtype=torch.long)  # holds the output sequences
        last_chars = torch.empty(1, 0, device=device, dtype=torch.long)
        yayun_chars = (-1) * torch.ones(beam_size, dtype=torch.long)
        start = 0
        with torch.no_grad():
            output_scores = torch.zeros(token_ids.shape[0], device=device)
            for step in range(self.out_max_length):
                if step == 0:
                    scores = self.forward(token_ids, token_type_ids, device=device)
                    # Repeat the input ids beam_size times.
                    token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                    token_type_ids = token_type_ids.view(1, -1).repeat(beam_size, 1)
                else:
                    scores = self.forward(new_input_ids, new_token_type_ids, device=device)
                logit_score = torch.log_softmax(scores[:, -1], dim=-1)
                for i, char in enumerate(last_chars):
                    for word in repeat_word[i]:
                        logit_score[i, word] -= 5  # penalise characters already used
                    for word in title:
                        ix = word2ix.get(word, -1)
                        if ix != -1:
                            logit_score[i, ix] += 2  # reward characters from the title
                if step in yayun_pos:
                    # At rhyme positions, boost every character in the saved rhyme group.
                    for i, char in enumerate(last_chars):
                        if yayun_chars[i].item() != -1:
                            yayuns = yayun_list[yayun_chars[i].item()]
                            for char in yayuns:
                                ix = word2ix.get(char, -1)
                                if ix != -1:
                                    logit_score[i, ix] += 10
                logit_score = output_scores.view(-1, 1) + logit_score  # cumulative scores
                # Flatten before taking the top-k.
                logit_score = logit_score.view(-1)
                hype_score, hype_pos = torch.topk(logit_score, beam_size)
                indice1 = (hype_pos // scores.shape[-1])  # row index
                indice2 = (hype_pos % scores.shape[-1]).long().reshape(-1, 1)  # column index
                for index, each_out in zip(indice1, indice2):
                    index = index.item()
                    each_out = each_out.item()
                    if each_out not in repeat_word[index]:
                        repeat_word[index].append(each_out)
                    if start < beam_size and each_out == douhao_id and len(last_chars) != 0:
                        start += 1
                        word = ix2word[last_chars[index].item()]  # previous character; remember its rhyme group
                        for i, each_yayun in enumerate(yayun_list):
                            if word in each_yayun:
                                yayun_chars[index] = i
                                break
                # Update the scores and outputs.
                output_scores = hype_score
                last_chars = indice2
                output_ids = torch.cat([output_ids[indice1], indice2], dim=1).long()
                new_input_ids = torch.cat([token_ids, output_ids], dim=1)
                new_token_type_ids = torch.cat([token_type_ids, torch.ones_like(output_ids)], dim=1)

                end_counts = (output_ids == sep_id).sum(1)  # count the end markers
                best_one = output_scores.argmax()
                if end_counts[best_one] == 1:
                    # The best sequence has terminated.
                    return output_ids[best_one][:-1]
                else:
                    # Keep only the unfinished hypotheses.
                    flag = (end_counts < 1)  # mark unfinished sequences
                    if not flag.all():  # some sequences have finished
                        token_ids = token_ids[flag]
                        token_type_ids = token_type_ids[flag]
                        last_chars = last_chars[flag]
                        yayun_chars = yayun_chars[flag]
                        new_input_ids = new_input_ids[flag]
                        new_token_type_ids = new_token_type_ids[flag]
                        output_ids = output_ids[flag]  # drop finished sequences
                        output_scores = output_scores[flag]  # drop finished sequences
                        end_counts = end_counts[flag]  # drop finished end counts
                        beam_size = flag.sum()  # shrink the top-k accordingly
                        flag = flag.long()
                        new_repeat_word = []
                        for index, i in enumerate(flag):
                            if i.item() == 1:
                                new_repeat_word.append(repeat_word[index])
                        repeat_word = new_repeat_word
        return output_ids[output_scores.argmax()]

    def beam_search_poem_v2(self, text, token_ids, token_type_ids, word2ix, beam_size=1, device="cpu"):
        """
        A second beam-search variant for poems, with a shared repetition list.
        """
        yayun_pos = []
        if "五言律诗" in text:
            yayun_pos = [10, 22, 34, 46]
        elif "五言绝句" in text:
            yayun_pos = [10, 22]
        elif "七言律诗" in text:
            yayun_pos = [14, 30, 46, 62]
        elif "七言绝句" in text:
            yayun_pos = [14, 30]
        sep_id = word2ix["[SEP]"]
        douhao_id = word2ix[","]  # comma
        ix2word = {v: k for k, v in word2ix.items()}
        juhao_id = word2ix["。"]  # full stop
        repeat_word = []
        output_ids = torch.empty(1, 0, device=device, dtype=torch.long)  # holds the output sequences
        last_chars = torch.empty(1, 0, device=device, dtype=torch.long)
        yayun_chars = (-1) * torch.ones(beam_size, dtype=torch.long)
        start = 0
        with torch.no_grad():
            output_scores = torch.zeros(token_ids.shape[0], device=device)
            for step in range(self.out_max_length):
                if step == 0:
                    scores = self.forward(token_ids, token_type_ids, device=device)
                    # Repeat the input ids beam_size times.
                    token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                    token_type_ids = token_type_ids.view(1, -1).repeat(beam_size, 1)
                else:
                    scores = self.forward(new_input_ids, new_token_type_ids, device=device)
                logit_score = torch.log_softmax(scores[:, -1], dim=-1)
                for i, char in enumerate(last_chars):
                    logit_score[i, char] -= 2  # penalise repeating the previous character
                    for word in repeat_word:
                        logit_score[i, word] -= 1  # penalise characters already used
                if step in yayun_pos:
                    # At rhyme positions, boost every character in the saved rhyme group.
                    for i, char in enumerate(last_chars):
                        if yayun_chars[i].item() != -1:
                            yayuns = yayun_list[yayun_chars[i].item()]
                            for char in yayuns:
                                ix = word2ix.get(char, -1)
                                if ix != -1:
                                    logit_score[i, ix] += 3
                logit_score = output_scores.view(-1, 1) + logit_score  # cumulative scores
                # Flatten before taking the top-k.
                logit_score = logit_score.view(-1)
                hype_score, hype_pos = torch.topk(logit_score, beam_size)
                indice1 = (hype_pos // scores.shape[-1])  # row index
                indice2 = (hype_pos % scores.shape[-1]).long().reshape(-1, 1)  # column index
                for index, each_out in zip(indice1, indice2):
                    index = index.item()
                    each_out = each_out.item()
                    if each_out not in repeat_word:
                        repeat_word.append(each_out)
                    if start < beam_size and each_out == douhao_id and len(last_chars) != 0:
                        start += 1
                        word = ix2word[last_chars[index].item()]  # previous character; remember its rhyme group
                        for i, each_yayun in enumerate(yayun_list):
                            if word in each_yayun:
                                yayun_chars[index] = i
                                break
                # Update the scores and outputs.
                output_scores = hype_score
                last_chars = indice2
                output_ids = torch.cat([output_ids[indice1], indice2], dim=1).long()
                new_input_ids = torch.cat([token_ids, output_ids], dim=1)
                new_token_type_ids = torch.cat([token_type_ids, torch.ones_like(output_ids)], dim=1)

                end_counts = (output_ids == sep_id).sum(1)  # count the end markers
                best_one = output_scores.argmax()
                if end_counts[best_one] == 1:
                    # The best sequence has terminated.
                    return output_ids[best_one]
                else:
                    # Keep only the unfinished hypotheses.
                    flag = (end_counts < 1)  # mark unfinished sequences
                    if not flag.all():  # some sequences have finished
                        token_ids = token_ids[flag]
                        token_type_ids = token_type_ids[flag]
                        last_chars = last_chars[flag]
                        yayun_chars = yayun_chars[flag]
                        new_input_ids = new_input_ids[flag]
                        new_token_type_ids = new_token_type_ids[flag]
                        output_ids = output_ids[flag]  # drop finished sequences
                        output_scores = output_scores[flag]  # drop finished sequences
                        end_counts = end_counts[flag]  # drop finished end counts
                        beam_size = flag.sum()  # shrink the top-k accordingly
                        flag = flag.long()
        return output_ids[output_scores.argmax()]
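# Both poem beam searches consult a global yayun_list, assumed to be a list of
# rhyme groups where each entry collects characters sharing one rhyme: the
# rhyme group of the character before the first comma is saved in yayun_chars
# and later boosted at the line-final positions in yayun_pos. A hypothetical
# miniature stand-in, for illustration only (the real table is much larger):
yayun_list = [
    "东同铜桐中",  # tiny stand-in for a rhyme group such as 一东
    "江窗双邦降",  # tiny stand-in for a rhyme group such as 三江
]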
"游戏", "娱乐" ] cls_model = "./state_dict/bert_multi_classify_model.bin" device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if __name__ == "__main__": vocab_path = "./state_dict/roberta_wwm_vocab.txt" # roberta模型字典的位置 model_name = "roberta" # 选择模型名字 # 加载字典 word2idx = load_chinese_base_vocab(vocab_path, simplfied=False) tokenizer = Tokenizer(word2idx) # 定义模型 bert_model = load_bert(word2idx, model_name=model_name, model_class="cls", target_size=len(target)) bert_model.to(device) bert_model.eval() ## 加载训练的模型参数~ load_recent_model(bert_model, recent_model_path=cls_model, device=device) test_data = [ "编剧梁馨月讨稿酬六六何念助阵 公司称协商解决", "西班牙BBVA第三季度净利降至15.7亿美元", "基金巨亏30亿 欲打开云天系跌停自救" ] for text in test_data: with torch.no_grad(): text, text_ids = tokenizer.encode(text) text = torch.tensor(text, device=device).view(1, -1) print(target[torch.argmax(bert_model(text)).item()])
class GPT2(BasicGPT):
    def __init__(self, word2ix, tokenizer=None):
        super().__init__()
        self.word2ix = word2ix
        if tokenizer is not None:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = Tokenizer(word2ix)
        self.config = GPT2Config(len(word2ix))
        self.model = GPT2LMHeadModel(self.config)

    def sample_generate(self, text, input_max_length=256, out_max_length=200,
                        top_k=30, top_p=0.0, add_eos=False):
        token_ids, _ = self.tokenizer.encode(text, max_length=input_max_length)
        if not add_eos:
            # Drop the trailing [SEP] so generation continues from the text itself.
            token_ids = torch.tensor(token_ids, device=self.device, dtype=torch.long)[:-1].view(1, -1)
        else:
            token_ids = torch.tensor(token_ids, device=self.device, dtype=torch.long).view(1, -1)
        output_ids = []
        sep_id = self.word2ix["[SEP]"]
        with torch.no_grad():
            for step in range(out_max_length):
                _, scores = self.model(token_ids)
                logit_score = torch.log_softmax(scores[:, -1], dim=-1).squeeze(0)
                logit_score[self.word2ix["[UNK]"]] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(logit_score, top_k=top_k, top_p=top_p)
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                if sep_id == next_token.item():
                    break
                output_ids.append(next_token.item())
                token_ids = torch.cat((token_ids, next_token.long().unsqueeze(0)), dim=1)
        return self.tokenizer.decode(np.array(output_ids))

    def sample_generate_english(self, text, input_max_length=256, out_max_length=200,
                                top_k=30, top_p=0.0, add_eos=False):
        token_ids = self.tokenizer.encode(text, max_length=input_max_length, truncation=True)
        if add_eos:
            token_ids = token_ids + [self.word2ix["<EOS>"]]
        token_ids = torch.tensor(token_ids, device=self.device, dtype=torch.long).view(1, -1)
        output_ids = []
        sep_id = self.word2ix["<EOS>"]
        with torch.no_grad():
            for step in range(out_max_length):
                _, scores = self.model(token_ids)
                logit_score = torch.log_softmax(scores[:, -1], dim=-1).squeeze(0)
                logit_score[self.word2ix["unk"]] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(logit_score, top_k=top_k, top_p=top_p)
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
                if sep_id == next_token.item():
                    break
                output_ids.append(next_token.item())
                token_ids = torch.cat((token_ids, next_token.long().unsqueeze(0)), dim=1)
        return self.tokenizer.decode(output_ids)

    def _make_causal_mask(self, input_ids_shape: torch.Size):
        bsz, tgt_len = input_ids_shape
        mask = torch.full((tgt_len, tgt_len), 0.0).to(self.device)
        mask_cond = torch.arange(mask.size(-1)).to(self.device)
        mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 1.0)
        return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len)

    def forward(self, x, labels=None):
        if labels is not None:
            labels = labels.to(self.device)
        x = x.to(self.device)
        # Causal mask, additionally zeroed where the labels are -100 (padding).
        attention_mask = self._make_causal_mask(x.shape)
        pad_mask = (labels != -100).float()
        attention_mask = attention_mask * pad_mask.unsqueeze(1).unsqueeze(1)
        loss, lm_logit = self.model(x, labels=labels, attention_mask=attention_mask)
        return loss, lm_logit
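# For reference, _make_causal_mask builds a standard lower-triangular mask
# (1 = may attend), which forward() then multiplies by a padding mask derived
# from the -100 labels. Evaluating the same expressions for seq_len = 4:
import torch

mask = torch.full((4, 4), 0.0)
mask_cond = torch.arange(4)
mask.masked_fill_(mask_cond < (mask_cond + 1).view(4, 1), 1.0)
print(mask)
# tensor([[1., 0., 0., 0.],
#         [1., 1., 0., 0.],
#         [1., 1., 1., 0.],
#         [1., 1., 1., 1.]])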