def __init__(self):
    """Prepare data, model, optimizer and loader for coarse-grained NER training.

    Reads the training corpus, creates the tokenizer, selects the compute
    device, builds a ``sequence_labeling_crf`` model initialised from
    pretrained weights, and wires up an Adam optimizer plus a shuffled
    ``DataLoader``.

    Relies on names defined outside this method: ``word2idx``,
    ``model_name``, ``model_path``, ``target``, ``lr``, ``batch_size``
    and ``collate_fn``.
    """
    # Load the training corpus.
    corpus_file = "./corpus/粗粒度NER/example.train"
    self.sents_src, self.sents_tgt = read_corpus(corpus_file)
    self.tokenier = Tokenizer(word2idx)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model; one output class per entry in ``target``.
    num_labels = len(target)
    self.bert_model = load_bert(
        word2idx,
        model_name=model_name,
        model_class="sequence_labeling_crf",
        target_size=num_labels,
    )
    # Load the pretrained parameters.
    load_model_params(self.bert_model, model_path)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        self.optim_parameters,
        lr=lr,
        weight_decay=1e-3,
    )

    # Custom dataset wrapped in a shuffling loader.
    train_set = NERDataset(self.sents_src, self.sents_tgt)
    self.dataloader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self):
    """Prepare data, model, optimizer and loader for training on the Poetry corpus.

    All paths and hyper-parameters are stored on the instance
    (``vocab_path``, ``model_name``, ``model_path``, ``recent_model_path``,
    ``model_save_path``, ``batch_size``, ``lr``) before the model is built.
    """
    # Load the corpus; the reader also needs the vocabulary file.
    corpus_dir = "./corpus/Poetry"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # roberta vocabulary file
    self.sents_src, self.sents_tgt = read_corpus(corpus_dir, self.vocab_path)

    self.model_name = "roberta"  # which model to build
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # pretrained roberta weights
    # Checkpoint used to resume training from an already-trained model.
    self.recent_model_path = "./bert_model_poem.bin"
    self.model_save_path = "./bert_model_poem.bin"
    self.batch_size = 16
    self.lr = 1e-5

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model.
    self.bert_model = load_bert(
        self.vocab_path,
        model_name=self.model_name,
        simplfied=True,
    )
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        self.optim_parameters,
        lr=self.lr,
        weight_decay=1e-3,
    )

    # Custom dataset wrapped in a shuffling loader.
    train_set = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(
        train_set,
        batch_size=self.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self, model_save_path="model/", data_path="corpus/", batch_size=64,
             lr=1e-5, model_name="roberta", device='cpu'):
    """Prepare data, model, optimizer and loader for relation-extraction training.

    Args:
        model_save_path: Prefix that ``"bert_model_relation_extrac.bin"`` is
            appended to (plain string concatenation, so include any
            trailing separator).
        data_path: Prefix that ``"train_data.json"`` is appended to (plain
            string concatenation as well).
        batch_size: Batch size handed to the ``DataLoader``.
        lr: Learning rate for the Adam optimizer.
        model_name: Model name forwarded to ``load_bert``.
        device: ``'cpu'`` keeps the CPU (stored as that string); any other
            value selects CUDA when available and the CPU otherwise.
    """
    # Load the training data.
    train_file = data_path + "train_data.json"
    self.vocab_path = "./state_dict/vocab.txt"  # roberta vocabulary file
    self.data = load_data(train_file)

    self.model_name = model_name  # which model to build
    self.model_path = "./state_dict/pytorch_model.bin"  # pretrained roberta weights
    self.recent_model_path = ""  # checkpoint for resuming training (none set)
    self.model_save_path = model_save_path + "bert_model_relation_extrac.bin"
    self.batch_size = batch_size
    self.lr = lr

    # Load the vocabulary.
    self.word2idx = load_chinese_base_vocab(self.vocab_path)

    # Pick the compute device.
    if device != 'cpu':
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        self.device = device
    print(f"device: {self.device}")

    # Build the model; one output class per known predicate.
    num_predicates = len(predicate2id)
    self.bert_model = load_bert(
        self.vocab_path,
        model_name=self.model_name,
        model_class="relation_extrac",
        target_size=num_predicates,
    )
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        self.optim_parameters,
        lr=self.lr,
        weight_decay=1e-3,
    )

    # Custom dataset wrapped in a shuffling loader.
    train_set = ExtractDataset(self.data, self.vocab_path)
    self.dataloader = DataLoader(
        train_set,
        batch_size=self.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self):
    """Prepare data, model, optimizer and loader for fine-grained NER training.

    The CRF layer is optimised with its own learning rate (``crf_lr``),
    while all remaining parameters use ``lr``.
    """
    # Load the training corpus.
    corpus_file = "./corpus/细粒度NER/train.json"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # roberta vocabulary file
    self.sents_src, self.sents_tgt = read_corpus(corpus_file)

    self.model_name = "roberta"  # which model to build
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # pretrained roberta weights
    self.recent_model_path = ""  # checkpoint for resuming training (none set)
    self.model_save_path = "./细粒度_bert_ner_model_crf.bin"
    self.batch_size = 8
    self.lr = 1e-5
    self.crf_lr = 1e-2  # the CRF layer trains with a learning rate of 0.01

    # Load the vocabulary and build the tokenizer from it.
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    self.tokenier = Tokenizer(self.word2idx)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model; one output class per entry in ``target``.
    num_labels = len(target)
    self.bert_model = load_bert(
        self.vocab_path,
        model_name=self.model_name,
        model_class="sequence_labeling_crf",
        target_size=num_labels,
    )
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Split the parameters so the CRF layer gets its own learning rate.
    crf_layer = self.bert_model.crf_layer
    crf_param_ids = {id(param) for param in crf_layer.parameters()}
    base_params = (
        param
        for param in self.bert_model.parameters()
        if id(param) not in crf_param_ids
    )
    param_groups = [
        {"params": base_params},
        {"params": crf_layer.parameters(), "lr": self.crf_lr},
    ]
    self.optimizer = torch.optim.Adam(param_groups, lr=self.lr, weight_decay=1e-3)

    # Custom dataset wrapped in a shuffling loader.
    train_set = NERDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(
        train_set,
        batch_size=self.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self):
    """Prepare the model, optimizer and data loader for training.

    No corpus is read here; ``BertDataset()`` is constructed without
    arguments. Relies on names defined outside this method: ``word2idx``,
    ``model_name``, ``model_path``, ``keep_tokens``, ``lr``, ``batch_size``
    and ``collate_fn``.
    """
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model.
    self.bert_model = load_bert(word2idx, model_name=model_name)
    # Load the pretrained parameters, passing along ``keep_tokens``.
    load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
    # To resume from an already-trained checkpoint instead, enable:
    # load_recent_model(self.bert_model, recent_model_path)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        self.optim_parameters,
        lr=lr,
        weight_decay=1e-3,
    )

    # Custom dataset wrapped in a shuffling loader.
    train_set = BertDataset()
    self.dataloader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self):
    """Prepare data, model, optimizer and loader for news-title classification.

    Builds an ``encoder``-class model whose output size equals
    ``len(target)`` (``target`` is defined outside this method).
    """
    # Load the training corpus.
    corpus_file = "./corpus/新闻标题文本分类/Train.txt"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # roberta vocabulary file
    self.sents_src, self.sents_tgt = read_corpus(corpus_file)

    self.model_name = "roberta"  # which model to build
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # pretrained roberta weights
    self.recent_model_path = ""  # checkpoint for resuming training (none set)
    self.model_save_path = "./bert_multi_classify_model.bin"
    self.batch_size = 16
    self.lr = 1e-5

    # Load the vocabulary and build the tokenizer from it.
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    self.tokenier = Tokenizer(self.word2idx)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model; one output class per entry in ``target``.
    num_classes = len(target)
    self.bert_model = load_bert(
        self.vocab_path,
        model_name=self.model_name,
        model_class="encoder",
        target_size=num_classes,
    )
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        self.optim_parameters,
        lr=self.lr,
        weight_decay=1e-3,
    )

    # Custom dataset wrapped in a shuffling loader.
    train_set = NLUDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(
        train_set,
        batch_size=self.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self):
    """Prepare data, model, optimizer and loader for triple-extraction training.

    Relies on names defined outside this method: ``word2idx``,
    ``model_name``, ``model_path``, ``keep_tokens``, ``predicate2id``,
    ``lr``, ``batch_size`` and ``collate_fn``.
    """
    # Load the training data.
    train_file = "./corpus/三元组抽取/train_data.json"
    self.data = load_data(train_file)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model; one output class per known predicate.
    num_predicates = len(predicate2id)
    self.bert_model = load_bert(
        word2idx,
        model_name=model_name,
        model_class="relation_extrac",
        target_size=num_predicates,
    )
    # Load the pretrained parameters, passing along ``keep_tokens``.
    load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        self.optim_parameters,
        lr=lr,
        weight_decay=1e-3,
    )

    # Custom dataset wrapped in a shuffling loader.
    train_set = ExtractDataset(self.data)
    self.dataloader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self):
    """Prepare data, model, optimizer and loader for training on the couplet corpus.

    Relies on names defined outside this method: ``word2idx``,
    ``model_name``, ``lr``, ``batch_size`` and ``collate_fn``.
    """
    # Load the corpus.
    corpus_dir = "./corpus/对联"
    self.sents_src, self.sents_tgt = read_corpus(corpus_dir)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model.
    self.bert_model = load_bert(word2idx, model_name=model_name)
    # Load the pretrained parameters.
    # NOTE(review): ``self.model_path`` is not assigned anywhere in this
    # initializer; presumably it is a class-level attribute -- confirm,
    # otherwise this line raises AttributeError.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        self.optim_parameters,
        lr=lr,
        weight_decay=1e-3,
    )

    # Custom dataset wrapped in a shuffling loader.
    train_set = BertDataset(self.sents_src, self.sents_tgt)
    self.dataloader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self):
    """Prepare data, model, optimizer and loader for the merged poetry/ci/couplet corpus.

    Four readers are called on sub-corpora under ``./Poetry_ci_duilian``
    and their source/target lists are concatenated, in reader order, onto
    ``self.sents_src`` / ``self.sents_tgt``.
    """
    # Load the corpora.
    corpus_root = "./Poetry_ci_duilian"
    self.vocab_path = "./roberta_wwm_vocab.txt"  # roberta vocabulary file
    self.sents_src, self.sents_tgt = read_corpus(corpus_root + "/Poetry1", self.vocab_path)
    poetry2_src, poetry2_tgt = read_corpus_2(corpus_root + "/Poetry2", self.vocab_path)
    ci_src, ci_tgt = read_corpus_ci(corpus_root, self.vocab_path)
    couplet_src, couplet_tgt = read_corpus_duilian(corpus_root)

    # Append the extra corpora: all sources first, then all targets.
    for extra_src in (poetry2_src, ci_src, couplet_src):
        self.sents_src.extend(extra_src)
    for extra_tgt in (poetry2_tgt, ci_tgt, couplet_tgt):
        self.sents_tgt.extend(extra_tgt)

    # The merged data can be saved so the next run loads it more easily:
    # torch.save(self.sents_src, "./poem_ci_duilian.src")
    # torch.save(self.sents_tgt, "./poem_ci_duilian.tgt")

    self.model_name = "roberta"  # which model to build
    self.model_path = "./roberta_wwm_pytorch_model.bin"  # pretrained roberta weights
    # Checkpoint used to resume training from an already-trained model.
    self.recent_model_path = "./bert_model_poem_ci_duilian.bin"
    self.model_save_path = "./bert_model_poem_ci_duilian.bin"
    self.batch_size = 8
    self.lr = 1e-5

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model.
    # NOTE(review): the keyword here is ``simplify``; confirm it matches the
    # ``load_bert`` signature in the installed library version.
    self.bert_model = load_bert(
        self.vocab_path,
        model_name=self.model_name,
        simplify=True,
    )
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        self.optim_parameters,
        lr=self.lr,
        weight_decay=1e-3,
    )

    # Custom dataset wrapped in a shuffling loader.
    train_set = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(
        train_set,
        batch_size=self.batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self):
    """Prepare data, model, optimizer and loader for auto-title training.

    The source/target sentences are loaded from previously saved
    ``train_clean.src`` / ``train_clean.tgt`` files via ``torch.load``.
    All paths and hyper-parameters are stored on the instance before the
    model is built.
    """
    # Load the pre-saved training data.
    # NOTE(security): ``torch.load`` unpickles its input; only point these
    # paths at files you created yourself or otherwise trust.
    # (Reading ./corpus/auto_title/train.src and train.tgt through
    # ``read_file`` is the disabled alternative to these saved files.)
    self.sents_src = torch.load("./corpus/auto_title/train_clean.src")
    self.sents_tgt = torch.load("./corpus/auto_title/train_clean.tgt")

    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # roberta vocabulary file

    self.model_name = "roberta"  # which model to build
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # pretrained weights
    # Checkpoint used to resume training from an already-trained model.
    self.recent_model_path = "./state_dict/bert_auto_title_model.bin"
    self.model_save_path = "./state_dict/bert_auto_title_model.bin"
    self.batch_size = 16
    self.lr = 1e-5

    # Use the GPU when one is available, otherwise fall back to the CPU.
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))

    # Build the model.
    self.bert_model = load_bert(self.vocab_path,
                                model_name=self.model_name,
                                simplfied=True)
    # Load the pretrained parameters.
    load_model_params(self.bert_model, self.model_path)
    # To resume from an already-trained checkpoint instead, enable:
    # load_recent_model(self.bert_model, self.recent_model_path)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters,
                                      lr=self.lr,
                                      weight_decay=1e-3)

    # Custom dataset wrapped in a shuffling loader.
    dataset = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset,
                                 batch_size=self.batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn)
def __init__(self):
    """Prepare data, model, optimizer and loader for the merged poetry/ci/couplet corpus.

    Four readers are called on sub-corpora under ``./Poetry_ci_duilian``
    and their source/target lists are concatenated, in reader order, onto
    ``self.sents_src`` / ``self.sents_tgt``.

    Relies on names defined outside this method: ``word2idx``,
    ``model_name``, ``model_path``, ``keep_tokens``, ``lr``,
    ``batch_size`` and ``collate_fn``.
    """
    # Load the corpora.
    corpus_root = "./Poetry_ci_duilian"
    self.sents_src, self.sents_tgt = read_corpus(corpus_root + "/Poetry1")
    poetry2_src, poetry2_tgt = read_corpus_2(corpus_root + "/Poetry2")
    ci_src, ci_tgt = read_corpus_ci(corpus_root)
    couplet_src, couplet_tgt = read_corpus_duilian(corpus_root)

    # Append the extra corpora: all sources first, then all targets.
    for extra_src in (poetry2_src, ci_src, couplet_src):
        self.sents_src.extend(extra_src)
    for extra_tgt in (poetry2_tgt, ci_tgt, couplet_tgt):
        self.sents_tgt.extend(extra_tgt)

    # The merged data can be saved so the next run loads it more easily:
    # torch.save(self.sents_src, "./poem_ci_duilian.src")
    # torch.save(self.sents_tgt, "./poem_ci_duilian.tgt")

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model.
    self.bert_model = load_bert(word2idx, model_name=model_name)
    # Load the pretrained parameters, passing along ``keep_tokens``.
    load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Every model parameter is optimised.
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(
        self.optim_parameters,
        lr=lr,
        weight_decay=1e-3,
    )

    # Custom dataset wrapped in a shuffling loader.
    train_set = BertDataset(self.sents_src, self.sents_tgt)
    self.dataloader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
def __init__(self):
    """Prepare data, model, optimizer and loader for CRF sequence-labeling training.

    The CRF layer is optimised with its own learning rate (``crf_lr``),
    while all remaining parameters use ``lr``.

    Relies on names defined outside this method: ``word2idx``,
    ``model_name``, ``model_path``, ``keep_tokens``, ``target``, ``lr``,
    ``crf_lr``, ``batch_size`` and ``collate_fn``.
    """
    # Load the training data.
    self.sents_src, self.sents_tgt = load_data("./res.txt")
    self.tokenier = Tokenizer(word2idx)

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)
    print(f"device: {self.device}")

    # Build the model; one output class per entry in ``target``.
    num_labels = len(target)
    self.bert_model = load_bert(
        word2idx,
        model_name=model_name,
        model_class="sequence_labeling_crf",
        target_size=num_labels,
    )
    # Load the pretrained parameters, passing along ``keep_tokens``.
    load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
    # Move the model to the selected device.
    self.bert_model.to(self.device)

    # Split the parameters so the CRF layer gets its own learning rate.
    crf_layer = self.bert_model.crf_layer
    crf_param_ids = {id(param) for param in crf_layer.parameters()}
    base_params = (
        param
        for param in self.bert_model.parameters()
        if id(param) not in crf_param_ids
    )
    param_groups = [
        {"params": base_params},
        {"params": crf_layer.parameters(), "lr": crf_lr},
    ]
    self.optimizer = torch.optim.Adam(param_groups, lr=lr, weight_decay=1e-5)

    # Custom dataset wrapped in a shuffling loader.
    train_set = NERDataset(self.sents_src, self.sents_tgt)
    self.dataloader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
import torch
import torch.nn as nn
import sys
# NOTE(review): machine-specific absolute path; it must be on sys.path
# before the bert_seq2seq imports below unless the package is installed.
sys.path.append("/Users/xingzhaohu/Downloads/code/python/ml/ml_code/bert/bert_seq2seq")
from torch.optim import Adam
import pandas as pd
import numpy as np
import os
import json
import time
import bert_seq2seq
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert, load_model_params, load_recent_model

# NOTE(review): not referenced by the script below.
auto_title_model = "./state_dict/bert_model_poem.bin"

if __name__ == "__main__":
    # Build the model with pretrained weights and list its parameter names.
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # roberta vocabulary file
    model_name = "roberta"  # which model to build
    # Alternative weights file (disabled):
    # model_path = "./state_dict/bert-base-chinese-pytorch_model.bin"

    # Load the vocabulary; ``keep_tokens`` is returned alongside it.
    word2idx, keep_tokens = load_chinese_base_vocab(vocab_path, simplfied=True)

    # Build the model and load the pretrained parameters into it.
    bert_model = load_bert(word2idx, model_name=model_name)
    load_model_params(
        bert_model,
        "./state_dict/roberta_wwm_pytorch_model.bin",
        keep_tokens=keep_tokens,
    )

    # Print the name of every model parameter, one per line.
    for param_name, _ in bert_model.named_parameters():
        print(param_name)