Example #1
    def __init__(self):
        # Load the data
        data_path = "./corpus/粗粒度NER/example.train"
        self.sents_src, self.sents_tgt = read_corpus(data_path)

        self.tokenier = Tokenizer(word2idx)
        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(word2idx,
                                    model_name=model_name,
                                    model_class="sequence_labeling_crf",
                                    target_size=len(target))
        ## Load the pretrained model parameters
        load_model_params(self.bert_model, model_path)
        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters,
                                          lr=lr,
                                          weight_decay=1e-3)
        # Declare the custom data loader
        dataset = NERDataset(self.sents_src, self.sents_tgt)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
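These `__init__` snippets come from trainer classes in the bert_seq2seq example scripts; the training loop that accompanies them is not shown. Below is a minimal sketch of such a loop under stated assumptions: the `train` method name, the batch layout produced by `collate_fn`, and the forward signature returning `(predictions, loss)` when labels are supplied are all illustrative guesses, not the project's actual code.

    def train(self):
        # Hypothetical training loop; assumes collate_fn yields
        # (token_ids, token_type_ids, target_ids) and that the model's
        # forward pass returns (predictions, loss) when labels are given.
        self.bert_model.train()
        for token_ids, token_type_ids, target_ids in self.dataloader:
            token_ids = token_ids.to(self.device)
            token_type_ids = token_type_ids.to(self.device)
            target_ids = target_ids.to(self.device)
            predictions, loss = self.bert_model(token_ids,
                                                token_type_ids,
                                                labels=target_ids)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()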
Example #2
 def __init__(self):
     # Load the data
     data_dir = "./corpus/Poetry"
     self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path to the RoBERTa vocab file
     self.sents_src, self.sents_tgt = read_corpus(data_dir, self.vocab_path)
     self.model_name = "roberta"  # model name
     self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # path to the RoBERTa weights
     self.recent_model_path = "./bert_model_poem.bin"  # used to resume training from a previously trained model
     self.model_save_path = "./bert_model_poem.bin"
     self.batch_size = 16
     self.lr = 1e-5
     # Check whether a GPU is available
     self.device = torch.device(
         "cuda" if torch.cuda.is_available() else "cpu")
     print("device: " + str(self.device))
     # Define the model
     self.bert_model = load_bert(self.vocab_path,
                                 model_name=self.model_name,
                                 simplfied=True)
     ## Load the pretrained model parameters
     load_model_params(self.bert_model, self.model_path)
     # Move the model to the compute device (GPU or CPU)
     self.bert_model.to(self.device)
     # Declare the parameters to optimize
     self.optim_parameters = list(self.bert_model.parameters())
     self.optimizer = torch.optim.Adam(self.optim_parameters,
                                       lr=self.lr,
                                       weight_decay=1e-3)
     # Declare the custom data loader
     dataset = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
     self.dataloader = DataLoader(dataset,
                                  batch_size=self.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)
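After training, these seq2seq trainers are normally paired with a decoding step. A minimal sketch, assuming the enclosing class is named `Trainer` (the snippets do not show the class name) and that the seq2seq model exposes a `generate(text, beam_size=...)` method as the repository's test scripts do; treat both names as assumptions:

trainer = Trainer()
trainer.bert_model.eval()
with torch.no_grad():
    # "咏梅" ("Ode to the Plum Blossom") is a hypothetical prompt title.
    print(trainer.bert_model.generate("咏梅", beam_size=3))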
Example #3
 def __init__(self, model_save_path="model/", data_path="corpus/", batch_size=64, lr=1e-5, model_name="roberta", device='cpu'):
     # Load the data
     data_path = data_path + "train_data.json"
     self.vocab_path = "./state_dict/vocab.txt"  # path to the RoBERTa vocab file
     self.data = load_data(data_path)
     self.model_name = model_name  # model name
     self.model_path = "./state_dict/pytorch_model.bin"  # path to the RoBERTa weights
     self.recent_model_path = ""  # used to resume training from a previously trained model
     self.model_save_path = model_save_path+"bert_model_relation_extrac.bin"
     self.batch_size = batch_size
     self.lr = lr
     # Load the vocabulary
     self.word2idx = load_chinese_base_vocab(self.vocab_path)
     # Check whether a GPU is available
     if device == 'cpu':
         self.device = device
     else:
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("device: " + str(self.device))
     # Define the model
     self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="relation_extrac", target_size=len(predicate2id))
     ## Load the pretrained model parameters
     load_model_params(self.bert_model, self.model_path)
     # Move the model to the compute device (GPU or CPU)
     self.bert_model.to(self.device)
     # Declare the parameters to optimize
     self.optim_parameters = list(self.bert_model.parameters())
     self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
     # Declare the custom data loader
     dataset = ExtractDataset(self.data, self.vocab_path)
     self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
Example #4
 def __init__(self):
     # Load the data
     data_path = "./corpus/细粒度NER/train.json"
     self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path to the RoBERTa vocab file
     self.sents_src, self.sents_tgt = read_corpus(data_path)
     self.model_name = "roberta"  # model name
     self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # path to the RoBERTa weights
     self.recent_model_path = ""  # used to resume training from a previously trained model
     self.model_save_path = "./细粒度_bert_ner_model_crf.bin"
     self.batch_size = 8
     self.lr = 1e-5
     self.crf_lr = 1e-2  ## learning rate of 0.01 for the CRF layer
     # Load the vocabulary
     self.word2idx = load_chinese_base_vocab(self.vocab_path)
     self.tokenier = Tokenizer(self.word2idx)
     # Check whether a GPU is available
     self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("device: " + str(self.device))
     # Define the model
     self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="sequence_labeling_crf", target_size=len(target))
     ## Load the pretrained model parameters
     load_model_params(self.bert_model, self.model_path)
     # Move the model to the compute device (GPU or CPU)
     self.bert_model.to(self.device)
     # Declare the parameters to optimize
     crf_params = list(map(id, self.bert_model.crf_layer.parameters()))  ## pull out the CRF layer parameters separately
     base_params = filter(lambda p: id(p) not in crf_params, self.bert_model.parameters())
     self.optimizer = torch.optim.Adam([
                                         {"params": base_params}, 
                                         {"params": self.bert_model.crf_layer.parameters(), "lr": self.crf_lr}], lr=self.lr, weight_decay=1e-3)
     # Declare the custom data loader
     dataset = NERDataset(self.sents_src, self.sents_tgt, self.vocab_path)
     self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
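The CRF example above gives the CRF layer its own learning rate (1e-2) while the BERT body keeps the base rate (1e-5). This two-parameter-group pattern is plain PyTorch, independent of bert_seq2seq; a self-contained sketch with stand-in modules:

import torch
import torch.nn as nn

encoder = nn.Linear(768, 768)  # stands in for the BERT body
crf = nn.Linear(768, 10)       # stands in for the CRF layer

# The second group overrides the default lr; weight_decay is shared.
optimizer = torch.optim.Adam(
    [{"params": encoder.parameters()},
     {"params": crf.parameters(), "lr": 1e-2}],
    lr=1e-5,
    weight_decay=1e-3)

for group in optimizer.param_groups:
    print(group["lr"])  # prints 1e-05, then 0.01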
Example #5
    def __init__(self):
        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(word2idx, model_name=model_name)
        ## Load the pretrained model parameters
        load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
        # Load a previously trained model to continue training
        # load_recent_model(self.bert_model, recent_model_path)

        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters,
                                          lr=lr,
                                          weight_decay=1e-3)
        # Declare the custom data loader
        dataset = BertDataset()
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
Example #6
 def __init__(self):
     # Load the data
     data_path = "./corpus/新闻标题文本分类/Train.txt"
     self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path to the RoBERTa vocab file
     self.sents_src, self.sents_tgt = read_corpus(data_path)
     self.model_name = "roberta"  # model name
     self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # path to the RoBERTa weights
     self.recent_model_path = ""  # used to resume training from a previously trained model
     self.model_save_path = "./bert_multi_classify_model.bin"
     self.batch_size = 16
     self.lr = 1e-5
     # Load the vocabulary
     self.word2idx = load_chinese_base_vocab(self.vocab_path)
     self.tokenier = Tokenizer(self.word2idx)
     # Check whether a GPU is available
     self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("device: " + str(self.device))
     # Define the model
     self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="encoder", target_size=len(target))
     ## Load the pretrained model parameters
     load_model_params(self.bert_model, self.model_path)
     # Move the model to the compute device (GPU or CPU)
     self.bert_model.to(self.device)
     # Declare the parameters to optimize
     self.optim_parameters = list(self.bert_model.parameters())
     self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
     # Declare the custom data loader
     dataset = NLUDataset(self.sents_src, self.sents_tgt, self.vocab_path)
     self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
Example #7
    def __init__(self):
        # Load the data
        data_path = "./corpus/三元组抽取/train_data.json"
        self.data = load_data(data_path)

        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(word2idx,
                                    model_name=model_name,
                                    model_class="relation_extrac",
                                    target_size=len(predicate2id))
        ## Load the pretrained model parameters
        load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters,
                                          lr=lr,
                                          weight_decay=1e-3)
        # Declare the custom data loader
        dataset = ExtractDataset(self.data)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
Example #8
    def __init__(self):
        # Load the data
        data_dir = "./corpus/对联"
        self.sents_src, self.sents_tgt = read_corpus(data_dir)

        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(word2idx, model_name=model_name)
        ## Load the pretrained model parameters
        load_model_params(self.bert_model, model_path)
        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters,
                                          lr=lr,
                                          weight_decay=1e-3)
        # Declare the custom data loader
        dataset = BertDataset(self.sents_src, self.sents_tgt)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
Example #9
    def __init__(self):
        # Load the data
        data_dir = "./Poetry_ci_duilian"
        self.vocab_path = "./roberta_wwm_vocab.txt"  # path to the RoBERTa vocab file
        self.sents_src, self.sents_tgt = read_corpus(data_dir + "/Poetry1",
                                                     self.vocab_path)
        sents_src2, sents_tgt2 = read_corpus_2(data_dir + "/Poetry2",
                                               self.vocab_path)
        sents_src3, sents_tgt3 = read_corpus_ci(data_dir, self.vocab_path)
        sents_src4, sents_tgt4 = read_corpus_duilian(data_dir)
        self.sents_src.extend(sents_src2)
        self.sents_src.extend(sents_src3)
        self.sents_src.extend(sents_src4)

        self.sents_tgt.extend(sents_tgt2)
        self.sents_tgt.extend(sents_tgt3)
        self.sents_tgt.extend(sents_tgt4)

        ## Save the loaded data so it is easier to load next time
        # torch.save(self.sents_src, "./poem_ci_duilian.src")
        # torch.save(self.sents_tgt, "./poem_ci_duilian.tgt")

        self.model_name = "roberta"  # model name
        self.model_path = "./roberta_wwm_pytorch_model.bin"  # path to the RoBERTa weights
        self.recent_model_path = "./bert_model_poem_ci_duilian.bin"  # used to resume training from a previously trained model
        self.model_save_path = "./bert_model_poem_ci_duilian.bin"
        self.batch_size = 8
        self.lr = 1e-5
        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(self.vocab_path,
                                    model_name=self.model_name,
                                    simplify=True)
        ## Load the pretrained model parameters
        load_model_params(self.bert_model, self.model_path)
        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters,
                                          lr=self.lr,
                                          weight_decay=1e-3)
        # Declare the custom data loader
        dataset = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
        self.dataloader = DataLoader(dataset,
                                     batch_size=self.batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
Example #10
    def __init__(self):
        # Load the data
        src_dir = './corpus/auto_title/train.src'
        tgt_dir = './corpus/auto_title/train.tgt'
        # v_src = './data/valid.src'
        # v_tgt = './data/valid.tgt'

        self.sents_src = torch.load("./corpus/auto_title/train_clean.src")
        self.sents_tgt = torch.load("./corpus/auto_title/train_clean.tgt")

        self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path to the RoBERTa vocab file
        # self.sents_src, self.sents_tgt = read_file(src_dir, tgt_dir)
        # self.valid_src,self.valid_tgt = read_file(v_src,v_tgt)
        self.model_name = "roberta"  # model name
        self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # model path
        self.recent_model_path = "./state_dict/bert_auto_title_model.bin"  # used to resume training from a previously trained model
        self.model_save_path = "./state_dict/bert_auto_title_model.bin"
        self.batch_size = 16
        self.lr = 1e-5

        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(self.vocab_path,
                                    model_name=self.model_name,
                                    simplfied=True)
        ## Load the pretrained model parameters
        load_model_params(self.bert_model, self.model_path)
        # Load a previously trained model to continue training
        # load_recent_model(self.bert_model, self.recent_model_path)

        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters,
                                          lr=self.lr,
                                          weight_decay=1e-3)
        # Declare the custom data loader
        dataset = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
        self.dataloader = DataLoader(dataset,
                                     batch_size=self.batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
Example #11
    def __init__(self):
        # Load the data
        data_dir = "./Poetry_ci_duilian"

        self.sents_src, self.sents_tgt = read_corpus(data_dir + "/Poetry1")
        sents_src2, sents_tgt2 = read_corpus_2(data_dir + "/Poetry2")
        sents_src3, sents_tgt3 = read_corpus_ci(data_dir)
        sents_src4, sents_tgt4 = read_corpus_duilian(data_dir)
        self.sents_src.extend(sents_src2)
        self.sents_src.extend(sents_src3)
        self.sents_src.extend(sents_src4)

        self.sents_tgt.extend(sents_tgt2)
        self.sents_tgt.extend(sents_tgt3)
        self.sents_tgt.extend(sents_tgt4)

        ## Save the loaded data so it is easier to load next time
        # torch.save(self.sents_src, "./poem_ci_duilian.src")
        # torch.save(self.sents_tgt, "./poem_ci_duilian.tgt")

        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(word2idx, model_name=model_name)
        ## Load the pretrained model parameters
        load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters,
                                          lr=lr,
                                          weight_decay=1e-3)
        # Declare the custom data loader
        dataset = BertDataset(self.sents_src, self.sents_tgt)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
Example #12
    def __init__(self):
        # Load the data
        self.sents_src, self.sents_tgt = load_data("./res.txt")

        self.tokenier = Tokenizer(word2idx)
        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(word2idx,
                                    model_name=model_name,
                                    model_class="sequence_labeling_crf",
                                    target_size=len(target))
        ## Load the pretrained model parameters
        load_model_params(self.bert_model, model_path, keep_tokens=keep_tokens)
        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        crf_params = list(map(
            id, self.bert_model.crf_layer.parameters()))  ## pull out the CRF layer parameters separately
        base_params = filter(lambda p: id(p) not in crf_params,
                             self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(
            [{
                "params": base_params
            }, {
                "params": self.bert_model.crf_layer.parameters(),
                "lr": crf_lr
            }],
            lr=lr,
            weight_decay=1e-5)
        # Declare the custom data loader
        dataset = NERDataset(self.sents_src, self.sents_tgt)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
Example #13
File: test.py  Project: hzhip/bert_seq2seq
import torch 
import torch.nn as nn 
import sys
sys.path.append("/Users/xingzhaohu/Downloads/code/python/ml/ml_code/bert/bert_seq2seq")
from torch.optim import Adam
import pandas as pd
import numpy as np
import os
import json
import time
import bert_seq2seq
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert, load_model_params, load_recent_model

auto_title_model = "./state_dict/bert_model_poem.bin"

if __name__ == "__main__":
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path to the RoBERTa vocab file
    model_name = "roberta"  # model name
    # model_path = "./state_dict/bert-base-chinese-pytorch_model.bin"  # roberta model path
    # Load the vocabulary
    word2idx, keep_tokens = load_chinese_base_vocab(vocab_path, simplfied=True)
    # Define the model
    bert_model = load_bert(word2idx, model_name=model_name)
    load_model_params(bert_model, "./state_dict/roberta_wwm_pytorch_model.bin", keep_tokens=keep_tokens)

    for name, params in bert_model.named_parameters():
        print(name)
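Note the `simplfied=True` flag (spelled this way in the library): `load_chinese_base_vocab` then also returns `keep_tokens`, and `load_model_params(..., keep_tokens=keep_tokens)` presumably uses those indices to keep only the matching rows of the pretrained embedding for the simplified vocabulary. A self-contained sketch of that row-selection idea in plain PyTorch (sizes and index ranges are illustrative, not the library's actual values):

import torch

# BERT-base-Chinese ships a 21128-token vocabulary; keep_tokens would list
# the indices that survive simplification (hypothetical values below).
full_embedding = torch.randn(21128, 768)
keep_tokens = list(range(106)) + list(range(670, 7992))
trimmed = full_embedding[torch.tensor(keep_tokens)]
print(trimmed.shape)  # torch.Size([7428, 768])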