def __init__(self):
    """Set up the auto-title seq2seq trainer: data, device, model, optimizer, loader.

    Relies on module-level globals: ``word2idx``, ``model_name``, ``model_path``,
    ``keep_tokens``, ``lr``, ``batch_size``, ``collate_fn``, ``BertDataset``,
    ``DataLoader``, ``load_bert``.
    """
    # Load the pre-cleaned source/target sentences (previously saved with torch.save).
    # FIX: removed the dead locals `src_dir`/`tgt_dir` — they were assigned but never
    # used; the data actually comes from the *_clean files below.
    self.sents_src = torch.load("./corpus/auto_title/train_clean.src")
    self.sents_tgt = torch.load("./corpus/auto_title/train_clean.tgt")
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model and bind it to the compute device.
    self.bert_model = load_bert(word2idx, model_name=model_name)
    self.bert_model.set_device(self.device)
    # Load the pretrained parameters (keep_tokens maps the trimmed vocabulary).
    self.bert_model.load_pretrain_params(model_path, keep_tokens=keep_tokens)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = BertDataset(self.sents_src, self.sents_tgt)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up the relation-extraction trainer: data, device, model, optimizer, loader.

    Relies on module-level globals: ``word2idx``, ``model_name``, ``model_path``,
    ``predicate2id``, ``lr``, ``batch_size``, ``collate_fn``, ``load_data``,
    ``ExtractDataset``, ``DataLoader``, ``load_bert``.
    """
    # Load the training and dev data (JSON files).
    data_path = "./state_dict/extract/train_data.json"
    data_dev = "./state_dict/extract/dev_data.json"
    self.data = load_data(data_path)
    self.data_dev = load_data(data_dev)
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model with a relation-extraction head sized by the predicate set.
    self.bert_model = load_bert(word2idx, model_name=model_name, model_class="relation_extrac", target_size=len(predicate2id))
    # Load the pretrained parameters.
    self.bert_model.load_pretrain_params(model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = ExtractDataset(self.data)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    # Best dev F1 observed so far (updated during evaluation).
    self.best_f1 = 0.0
def __init__(self):
    """Set up the poem seq2seq trainer: data, config, device, model, optimizer, loader."""
    # Load the data.
    data_dir = "./corpus/Poetry"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    self.sents_src, self.sents_tgt = read_corpus(data_dir, self.vocab_path)
    self.model_name = "roberta"  # chosen model name
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # roberta weights location
    self.recent_model_path = "./bert_model_poem.bin"  # used to continue training an already-trained model
    self.model_save_path = "./bert_model_poem.bin"
    self.batch_size = 16
    self.lr = 1e-5
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model.
    # NOTE(review): the keyword is spelled `simplfied` here but `simplify` in a
    # sibling trainer — confirm which spelling load_bert's signature actually uses.
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, simplfied=True)
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up a trainer that resumes from a recently saved model checkpoint.

    Relies on module-level globals: ``word2idx``, ``model_name``,
    ``recent_model_path``, ``lr``, ``batch_size``, ``collate_fn``,
    ``BertDataset``, ``DataLoader``, ``load_bert``, ``load_recent_model``.
    """
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    self.bert_model = load_bert(word2idx, model_name=model_name)
    # Alternative: start from pretrained weights instead of a recent checkpoint.
    # load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
    load_recent_model(self.bert_model, recent_model_path)
    self.bert_model.to(self.device)
    # Print the model structure for a quick sanity check.
    print(self.bert_model)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
    # BertDataset here takes no arguments — it presumably loads its own data internally.
    dataset = BertDataset()
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up the coarse-grained NER (CRF) trainer: data, device, model, optimizer, loader."""
    # Load the data.
    data_path = "./corpus/粗粒度NER/example.train"
    self.sents_src, self.sents_tgt = read_corpus(data_path)
    # (sic) attribute name `tokenier` kept as-is — other code in this project refers to it.
    self.tokenier = Tokenizer(word2idx)
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model with a sequence-labeling CRF head sized by the tag set.
    self.bert_model = load_bert(word2idx, model_name=model_name, model_class="sequence_labeling_crf", target_size=len(target))
    # Load the pretrained parameters.
    load_model_params(self.bert_model, model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = NERDataset(self.sents_src, self.sents_tgt)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self, model_save_path="model/", data_path="corpus/", batch_size=64, lr=1e-5, model_name="roberta", device='cpu'):
    """Set up a configurable relation-extraction trainer.

    Args:
        model_save_path: directory prefix for the saved model file.
        data_path: directory prefix containing ``train_data.json``.
        batch_size: mini-batch size for the DataLoader.
        lr: Adam learning rate.
        model_name: backbone name passed to ``load_bert``.
        device: 'cpu' to force CPU; anything else auto-selects CUDA when available.
    """
    # Load the data.
    data_path = data_path + "train_data.json"
    self.vocab_path = "./state_dict/vocab.txt"  # location of the roberta vocab file
    self.data = load_data(data_path)
    self.model_name = model_name  # chosen model name
    self.model_path = "./state_dict/pytorch_model.bin"  # roberta weights location
    self.recent_model_path = ""  # used to continue training an already-trained model
    self.model_save_path = model_save_path + "bert_model_relation_extrac.bin"
    self.batch_size = batch_size
    self.lr = lr
    # Load the vocabulary.
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    # Resolve the compute device: honor an explicit 'cpu', otherwise prefer CUDA.
    if device == 'cpu':
        self.device = device
    else:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model with a relation-extraction head sized by the predicate set.
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="relation_extrac", target_size=len(predicate2id))
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = ExtractDataset(self.data, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up the English auto-title trainer, resuming from a saved checkpoint."""
    # Use GPU #1 when CUDA is available, otherwise fall back to CPU.
    self.device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Alternative: start from pretrained weights.
    # self.bert_model.load_pretrain_params(model_path, keep_tokens=keep_tokens)
    # Load an already-trained model and continue training from it.
    checkpoints = torch.load("./state_dict/bert_english/pytorch_model.bin")
    self.bert_model = load_bert(word2idx, tokenizer=tokenizer, model_name="bert")
    # strict=False: tolerate missing/unexpected keys in the checkpoint.
    self.bert_model.load_state_dict(checkpoints, strict=False)
    # Bind the model to the compute device (GPU or CPU).
    self.bert_model.set_device(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
    # Declare the custom data loader (BertDataset loads its own data internally here).
    dataset = BertDataset()
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    # Best ROUGE-1 triple seen so far, and a global training-step counter.
    self.best_rouge_1 = [0.0, 0.0, 0.0]
    self.global_step = 0
def __init__(self):
    """Set up the news-title multi-class classification trainer."""
    # Load the data.
    data_path = "./corpus/新闻标题文本分类/Train.txt"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    self.sents_src, self.sents_tgt = read_corpus(data_path)
    self.model_name = "roberta"  # chosen model name
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # roberta weights location
    self.recent_model_path = ""  # used to continue training an already-trained model
    self.model_save_path = "./bert_multi_classify_model.bin"
    self.batch_size = 16
    self.lr = 1e-5
    # Load the vocabulary.
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    # (sic) attribute name `tokenier` kept as-is.
    self.tokenier = Tokenizer(self.word2idx)
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model with an encoder/classification head sized by the label set.
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="encoder", target_size=len(target))
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = NLUDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up a seq2seq trainer from pretrained weights.

    Relies on module-level globals: ``word2idx``, ``model_name``, ``model_path``,
    ``keep_tokens``, ``lr``, ``batch_size``, ``collate_fn``, ``BertDataset``,
    ``DataLoader``, ``load_bert``.
    """
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model.
    self.bert_model = load_bert(word2idx, model_name=model_name)
    # Load the pretrained parameters (keep_tokens maps the trimmed vocabulary).
    self.bert_model.load_pretrain_params(model_path, keep_tokens=keep_tokens)
    # Bind the model to the compute device (GPU or CPU).
    self.bert_model.set_device(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
    # Declare the custom data loader (BertDataset loads its own data internally here).
    dataset = BertDataset()
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up the poem/ci/couplet multi-corpus seq2seq trainer."""
    # Load and merge the four corpora (two poem sets, ci, and couplets).
    self.sents_src, self.sents_tgt = read_corpus(data_dir + "/Poetry1")
    sents_src2, sents_tgt2 = read_corpus_2(data_dir + "/Poetry2")
    sents_src3, sents_tgt3 = read_corpus_ci(data_dir)
    sents_src4, sents_tgt4 = read_corpus_duilian(data_dir)
    self.sents_src.extend(sents_src2)
    self.sents_src.extend(sents_src3)
    self.sents_src.extend(sents_src4)
    self.sents_tgt.extend(sents_tgt2)
    self.sents_tgt.extend(sents_tgt3)
    self.sents_tgt.extend(sents_tgt4)
    # Optionally save the merged data so it is easier to reload next time.
    # torch.save(self.sents_src, "./poem_ci_duilian.src")
    # torch.save(self.sents_tgt, "./poem_ci_duilian.tgt")
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model.
    self.bert_model = load_bert(word2idx, model_name=model_name)
    # Load the pretrained parameters (keep_tokens maps the trimmed vocabulary).
    self.bert_model.load_pretrain_params(model_path, keep_tokens=keep_tokens)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = BertDataset(self.sents_src, self.sents_tgt)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up the fine-grained NER trainer with a CRF head and a separate CRF learning rate."""
    # Load the data.
    data_path = "./corpus/细粒度NER/train.json"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    self.sents_src, self.sents_tgt = read_corpus(data_path)
    self.model_name = "roberta"  # chosen model name
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # roberta weights location
    self.recent_model_path = ""  # used to continue training an already-trained model
    self.model_save_path = "./细粒度_bert_ner_model_crf.bin"
    self.batch_size = 8
    self.lr = 1e-5
    self.crf_lr = 1e-2  # CRF-layer learning rate: 0.01 (much higher than the backbone's)
    # Load the vocabulary.
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    # (sic) attribute name `tokenier` kept as-is.
    self.tokenier = Tokenizer(self.word2idx)
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model with a sequence-labeling CRF head sized by the tag set.
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="sequence_labeling_crf", target_size=len(target))
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Split out the CRF-layer parameters (by object id) so they get their own
    # learning rate; everything else trains at the base lr.
    crf_params = list(map(id, self.bert_model.crf_layer.parameters()))
    base_params = filter(lambda p: id(p) not in crf_params, self.bert_model.parameters())
    self.optimizer = torch.optim.Adam([
        {"params": base_params},
        {"params": self.bert_model.crf_layer.parameters(), "lr": self.crf_lr}], lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = NERDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up the couplet (对联) seq2seq trainer: data, device, model, optimizer, loader.

    Relies on module-level globals: ``word2idx``, ``model_name``, ``model_path``,
    ``lr``, ``batch_size``, ``collate_fn``, ``read_corpus``, ``BertDataset``,
    ``DataLoader``, ``load_bert``, ``load_model_params``.
    """
    # Load the data.
    data_dir = "./corpus/对联"
    self.sents_src, self.sents_tgt = read_corpus(data_dir)
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model.
    self.bert_model = load_bert(word2idx, model_name=model_name)
    # Load the pretrained parameters.
    # BUG FIX: this used to read `self.model_path`, but no `self.model_path`
    # attribute is ever assigned in this __init__, so it raised AttributeError at
    # runtime. The path comes from the module-level `model_path`, consistent with
    # the other module globals (word2idx, model_name, lr, batch_size) used here.
    load_model_params(self.bert_model, model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = BertDataset(self.sents_src, self.sents_tgt)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up the poem/ci/couplet trainer (roberta variant with explicit paths)."""
    # Load and merge the four corpora.
    data_dir = "./Poetry_ci_duilian"
    self.vocab_path = "./roberta_wwm_vocab.txt"  # location of the roberta vocab file
    self.sents_src, self.sents_tgt = read_corpus(data_dir + "/Poetry1", self.vocab_path)
    sents_src2, sents_tgt2 = read_corpus_2(data_dir + "/Poetry2", self.vocab_path)
    sents_src3, sents_tgt3 = read_corpus_ci(data_dir, self.vocab_path)
    sents_src4, sents_tgt4 = read_corpus_duilian(data_dir)
    self.sents_src.extend(sents_src2)
    self.sents_src.extend(sents_src3)
    self.sents_src.extend(sents_src4)
    self.sents_tgt.extend(sents_tgt2)
    self.sents_tgt.extend(sents_tgt3)
    self.sents_tgt.extend(sents_tgt4)
    # Optionally save the merged data so it is easier to reload next time.
    # torch.save(self.sents_src, "./poem_ci_duilian.src")
    # torch.save(self.sents_tgt, "./poem_ci_duilian.tgt")
    self.model_name = "roberta"  # chosen model name
    self.model_path = "./roberta_wwm_pytorch_model.bin"  # roberta weights location
    self.recent_model_path = "./bert_model_poem_ci_duilian.bin"  # used to continue training an already-trained model
    self.model_save_path = "./bert_model_poem_ci_duilian.bin"
    self.batch_size = 8
    self.lr = 1e-5
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model.
    # NOTE(review): keyword is spelled `simplify` here but `simplfied` in a sibling
    # trainer — confirm which spelling load_bert's signature actually uses.
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, simplify=True)
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up the auto-title trainer (roberta variant with explicit paths)."""
    # Load the data. `src_dir`/`tgt_dir` are kept for the commented-out
    # read_file(...) alternative below; the active path loads pre-cleaned tensors.
    src_dir = './corpus/auto_title/train.src'
    tgt_dir = './corpus/auto_title/train.tgt'
    # v_src = './data/valid.src'
    # v_tgt = './data/valid.tgt'
    self.sents_src = torch.load("./corpus/auto_title/train_clean.src")
    self.sents_tgt = torch.load("./corpus/auto_title/train_clean.tgt")
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    # self.sents_src, self.sents_tgt = read_file(src_dir, tgt_dir)
    # self.valid_src,self.valid_tgt = read_file(v_src,v_tgt)
    self.model_name = "roberta"  # chosen model name
    # self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # roberta weights location
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # model location
    self.recent_model_path = "./state_dict/bert_auto_title_model.bin"  # used to continue training an already-trained model
    self.model_save_path = "./state_dict/bert_auto_title_model.bin"
    self.batch_size = 16
    self.lr = 1e-5
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model.
    # NOTE(review): keyword spelled `simplfied` here vs `simplify` elsewhere — confirm.
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, simplfied=True)
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Alternative: load an already-trained model and continue training.
    # load_recent_model(self.bert_model, self.recent_model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader.
    dataset = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up a CRF sequence-labeling trainer with a separate CRF learning rate."""
    # Load the data.
    self.sents_src, self.sents_tgt = load_data("./res.txt")
    # (sic) attribute name `tokenier` kept as-is.
    self.tokenier = Tokenizer(word2idx)
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model with a sequence-labeling CRF head sized by the tag set.
    self.bert_model = load_bert(word2idx, model_name=model_name, model_class="sequence_labeling_crf", target_size=len(target))
    # Load the pretrained parameters (keep_tokens maps the trimmed vocabulary).
    self.bert_model.load_pretrain_params(model_path, keep_tokens=keep_tokens)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Split out the CRF-layer parameters (by object id) so they get their own
    # learning rate (module-level `crf_lr`); everything else trains at `lr`.
    crf_params = list(map(id, self.bert_model.crf_layer.parameters()))
    base_params = filter(lambda p: id(p) not in crf_params, self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        [{
            "params": base_params
        }, {
            "params": self.bert_model.crf_layer.parameters(),
            "lr": crf_lr
        }],
        lr=lr,
        weight_decay=1e-5)
    # Declare the custom data loader.
    dataset = NERDataset(self.sents_src, self.sents_tgt)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
def __init__(self):
    """Set up a classification-style trainer tracking best accuracy."""
    # Load the training data from the module-level path.
    data = load_data(train_data_path)
    # print(load_val_data(val_data_path)[:10])
    # os._exit(0)
    # Use a GPU when one is available, otherwise fall back to CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model.
    self.bert_model = load_bert(word2idx, model_name=model_name)
    # Load the pretrained parameters.
    load_model_params(self.bert_model, model_path)
    # load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
    # Alternative: load an already-trained model and continue training.
    # load_recent_model(self.bert_model, recent_model_path)
    # Move the model to the compute device (GPU or CPU).
    self.bert_model.to(self.device)
    # Declare the parameters to optimize.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=lr, weight_decay=1e-5)
    # Declare the custom data loader.
    dataset = BertDataset(data)
    self.dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    # Best validation accuracy observed so far.
    self.best_acc = 0.0
# Inference script: generate a poem title/body with a trained poem model.
import json
import time

import bert_seq2seq
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert, load_model_params, load_recent_model

# Path of the trained poem model checkpoint.
auto_title_model = "./state_dict/bert_model_poem.bin"

if __name__ == "__main__":
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    model_name = "roberta"  # chosen model name
    # model_path = "./state_dict/bert-base-chinese-pytorch_model.bin"  # roberta weights location
    # Load the vocabulary.
    # NOTE(review): `torch` is used below but no `import torch` appears in this
    # excerpt — presumably imported in a part of the file not shown; confirm.
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    # Build the model.
    bert_model = load_bert(vocab_path, model_name=model_name, simplfied=True)
    bert_model.eval()
    # Alternative: load pretrained params instead of the trained checkpoint.
    # load_model_params(bert_model, model_path)
    # NOTE(review): the checkpoint is loaded twice — once into `checkpoint` and
    # once inline in load_state_dict; the second call could reuse `checkpoint`.
    checkpoint = torch.load(auto_title_model, map_location="cpu")
    # print(checkpoint)
    bert_model.load_state_dict(torch.load(auto_title_model, map_location="cpu"), strict=False)
    test_data = ["知己##七言绝句"]
    # (several alternative commented-out test sentences omitted here)
    # NOTE(review): this excerpt is truncated — the body of the loop below is not
    # visible in this chunk.
    for text in test_data:
# Evaluation script: score an English auto-title model with ROUGE over JSON test files.
from rouge import Rouge
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert
from transformers import AutoTokenizer

# NOTE(review): `torch`, `glob` and `json` are used below but not imported in this
# excerpt — presumably imported in a part of the file not shown; confirm.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
word2idx = tokenizer.get_vocab()
auto_title_model = "./state_dict/bert_english_auto_title_model.bin"
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
maxlen = 256

if __name__ == "__main__":
    model_name = "bert"  # chosen model name
    # Build the model and bind it to the compute device.
    bert_model = load_bert(word2idx, tokenizer=tokenizer, model_name=model_name)
    bert_model.set_device(device)
    bert_model.eval()
    # Load the trained model parameters.
    bert_model.load_all_params(model_path=auto_title_model, device=device)
    rouge = Rouge()
    test_file = glob.glob("./corpus/english_autotitle_test/*.json")
    num_file = len(test_file)
    # Accumulator for ROUGE-1 precision/recall/F1.
    rouge_1_item = [0.0, 0.0, 0.0]
    with open("./auto_title_res.txt", "a+") as fw:
        for s_file in test_file:
            with open(s_file, "r") as f:
                c = f.read()
                j = json.loads(c)
                title = j["Title"]
                # NOTE(review): excerpt truncated — the rest of the loop body is
                # not visible in this chunk.
# NOTE(review): this excerpt begins mid-function — the enclosing definition (an
# object-span extraction helper iterating predicted predicate spans) is not
# visible in this chunk; the indentation below is a best-effort reconstruction.
                if _start <= _end and predicate1 == predicate2:
                    # Decode the object span back to text, token by token.
                    object_text = ""
                    for k in range(_start, _end + 1):
                        # print(token_ids(k))
                        object_text += idx2word[token_ids[k]]
                    objects.append((id2predicate[predicate1], object_text))
                    break
    return objects


if __name__ == "__main__":
    # Build the relation-extraction model sized by the predicate set.
    bert_model = load_bert(word2idx, model_class="relation_extrac", model_name=model_name, target_size=len(predicate2id))
    bert_model.eval()
    bert_model.set_device(device)
    # Load the trained checkpoint (also loaded via load_all_params below).
    checkpoint = torch.load(relation_extrac_model, map_location="cpu")
    # print(checkpoint)
    bert_model.load_all_params(model_path=relation_extrac_model, device=device)
    text = [
        "查尔斯·阿兰基斯(Charles Aránguiz),1989年4月17日出生于智利圣地亚哥,智利职业足球运动员,司职中场,效力于德国足球甲级联赛勒沃库森足球俱乐部",
        "李治即位后,萧淑妃受宠,王皇后为了排挤萧淑妃,答应李治让身在感业寺的武则天续起头发,重新纳入后宫",
        "《星空黑夜传奇》是连载于起点中文网的网络小说,作者是啤酒的罪孽"
    ]
    for d in text:
        with torch.no_grad():
            token_ids_test, segment_ids = tokenizer.encode(d, max_length=256)
            # NOTE(review): excerpt truncated — the rest of the loop body is not
            # visible in this chunk.
# Inference script: answer math word problems with a trained seq2seq model.
import bert_seq2seq
from bert_seq2seq.utils import load_bert

# NOTE(review): `torch`, `Tokenizer` and `load_chinese_base_vocab` are used below
# but not imported in this excerpt — presumably imported in a part of the file
# not shown; confirm.
vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
model_name = "roberta"  # chosen model name
model_path = "./state_dict/bert_math_ques_model.bin"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == "__main__":
    # These shadow the module-level constants with the same values.
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    model_name = "roberta"  # chosen model name
    # Load the vocabulary.
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=False)
    tokenizer = Tokenizer(word2idx)
    # Build the seq2seq model.
    bert_model = load_bert(word2idx, model_name=model_name, model_class="seq2seq")
    bert_model.to(device)
    bert_model.eval()
    # Load the trained model parameters.
    bert_model.load_all_params(model_path=model_path, device=device)
    test_data = [
        "王艳家买了一台洗衣机和一台电冰箱,一共花了6000元,电冰箱的价钱是洗衣机的3/5,求洗衣机的价钱.",
        "六1班原来男生占总数的2/5,又转来5名男生,现在男生占总数的5/11,女生有多少人?",
        "两个相同的数相乘,积是3600,这个数是多少.",
        "1加1等于几"
    ]
    # Generate an answer for each question with beam search.
    for text in test_data:
        with torch.no_grad():
            print(bert_model.generate(text, beam_size=3, device=device))
# NOTE(review): this excerpt begins mid-function — the tail of an NER
# result-assembly loop (enclosing def and the matching `if` branch are not
# visible in this chunk); the indentation below is a best-effort reconstruction.
            else:
                # Start a new text segment for this entity type.
                res[each_entity] = [cur_text]
                flag = each_entity
        elif flag == each_entity:
            # Continue the current entity: append the previous character.
            res[each_entity][-1] += text[index - 1]
        else:
            # Reset — no entity currently being accumulated.
            flag = 0
    print(res)


if __name__ == "__main__":
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    model_name = "roberta"  # chosen model name
    # Load the vocabulary.
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=False)
    tokenizer = Tokenizer(word2idx)
    # Build the CRF sequence-labeling model sized by the tag set.
    bert_model = load_bert(word2idx, model_name=model_name, model_class="sequence_labeling_crf", target_size=len(target))
    bert_model.to(device)
    bert_model.eval()
    # Load the trained model parameters.
    load_recent_model(bert_model, recent_model_path=model_path, device=device)
    test_data = [
        "日寇在京掠夺文物详情。",
        "以书结缘,把欧美,港台流行的食品类食谱汇集一堂。",
        "明天天津下雨,不知道杨永康主任还能不能来学校吃个饭。",
        "美国的华莱士,我和他谈笑风生",
        "看包公断案的戏"
    ]
    ner_print(bert_model, test_data, device=device)
# Inference script: generate a news title from an article with a trained model.
import json
import time

import bert_seq2seq
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert, load_model_params, load_recent_model

# Path of the trained auto-title model checkpoint.
auto_title_model = "./state_dict/bert_auto_title_model.bin"

if __name__ == "__main__":
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    model_name = "roberta"  # chosen model name
    # model_path = "./state_dict/bert-base-chinese-pytorch_model.bin"  # roberta weights location
    # Load the vocabulary.
    # NOTE(review): `torch` is used below but not imported in this excerpt; also,
    # a sibling script unpacks load_chinese_base_vocab(..., simplfied=True) into
    # (word2idx, keep_tokens) and passes `word2idx` to load_bert, while this one
    # assigns a single name and passes `vocab_path` — confirm against the library
    # version this script targets.
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    # Build the model.
    bert_model = load_bert(vocab_path, model_name=model_name)
    bert_model.eval()
    # Alternative: load pretrained params instead of the trained checkpoint.
    # load_model_params(bert_model, model_path)
    bert_model.load_state_dict(torch.load(auto_title_model, map_location="cpu"), strict=False)
    test_data = [
        "针对央视3·15晚会曝光的电信行业乱象工信部在公告中表示将严查央视3·15晚会曝光通信违规违法行为工信部称已约谈三大运营商有关负责人并连夜责成三大运营商和所在省通信管理局进行调查依法依规严肃处理"
    ]
    # (several alternative commented-out test sentences omitted here)
    # NOTE(review): this excerpt is truncated — the body of the loop below is not
    # visible in this chunk.
    for text in test_data:
# Inference script: generate a news title from an article with a trained model
# (variant that unpacks the simplified vocabulary into word2idx + keep_tokens).
import json
import time

import bert_seq2seq
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert, load_model_params, load_recent_model

# Path of the trained auto-title model checkpoint.
auto_title_model = "./state_dict/bert_auto_title_model.bin"

if __name__ == "__main__":
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    model_name = "roberta"  # chosen model name
    # model_path = "./state_dict/bert-base-chinese-pytorch_model.bin"  # roberta weights location
    # Load the vocabulary; simplfied=True also yields the keep_tokens mapping.
    # NOTE(review): `torch` is used below but no `import torch` appears in this
    # excerpt — presumably imported in a part of the file not shown; confirm.
    word2idx, keep_tokens = load_chinese_base_vocab(vocab_path, simplfied=True)
    # Build the model.
    bert_model = load_bert(word2idx, model_name=model_name)
    bert_model.eval()
    # Alternative: load pretrained params instead of the trained checkpoint.
    # load_model_params(bert_model, model_path)
    bert_model.load_state_dict(torch.load(auto_title_model, map_location="cpu"), strict=False)
    test_data = ["针对央视3·15晚会曝光的电信行业乱象工信部在公告中表示将严查央视3·15晚会曝光通信违规违法行为工信部称已约谈三大运营商有关负责人并连夜责成三大运营商和所在省通信管理局进行调查依法依规严肃处理"]
    # (several alternative commented-out test sentences omitted here)
    # Generate a title for each article with beam search.
    for text in test_data:
        print(bert_model.generate(text, beam_size=3))
        # print(name[0])
# NOTE(review): this excerpt begins mid-list — the opening of this category list
# (presumably `target = [`) is not visible in this chunk.
    "财经", "彩票", "房产", "股票", "家居", "教育", "科技", "社会", "时尚", "时政", "体育", "星座", "游戏", "娱乐"
]

# Path of the trained multi-class news classifier checkpoint.
cls_model = "./state_dict/bert_multi_classify_model.bin"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if __name__ == "__main__":
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
    model_name = "roberta"  # chosen model name
    # Load the vocabulary.
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=False)
    tokenizer = Tokenizer(word2idx)
    # Build the classification model sized by the category list.
    bert_model = load_bert(word2idx, model_name=model_name, model_class="cls", target_size=len(target))
    bert_model.to(device)
    bert_model.eval()
    # Load the trained model parameters.
    load_recent_model(bert_model, recent_model_path=cls_model, device=device)
    test_data = [
        "编剧梁馨月讨稿酬六六何念助阵 公司称协商解决",
        "西班牙BBVA第三季度净利降至15.7亿美元",
        "基金巨亏30亿 欲打开云天系跌停自救"
    ]
    for text in test_data:
        with torch.no_grad():
            # NOTE(review): `text` is rebound to token ids here (encode presumably
            # returns (token_ids, segment_ids) despite the `text, text_ids` names)
            # — confirm against Tokenizer.encode.
            text, text_ids = tokenizer.encode(text)
            text = torch.tensor(text, device=device).view(1, -1)
            # Print the predicted category name for the argmax logit.
            print(target[torch.argmax(bert_model(text)).item()])