Example #1
    def __init__(self):
        # Load the data

        self.sents_src, self.sents_tgt = read_corpus(data_path)

        self.tokenier = Tokenizer(word2idx)
        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(word2idx,
                                    model_name=model_name,
                                    model_class="sequence_labeling",
                                    target_size=len(target))
        ## Load the pretrained model parameters
        self.bert_model.load_pretrain_params(model_path)
        # Move the model to the compute device (GPU or CPU)
        self.bert_model.set_device(self.device)
        # Declare the parameters to optimize
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters,
                                          lr=lr,
                                          weight_decay=1e-3)
        # Declare the custom data loader
        dataset = NERDataset(self.sents_src, self.sents_tgt)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
def ner_print(model, test_data, vocab_path, device="cpu"):
    model.eval()
    word2idx = load_chinese_base_vocab(vocab_path)
    tokenier = Tokenizer(word2idx)
    trans = model.state_dict()["crf_layer.trans"]
    for text in test_data:
        decode = []
        text_encode, text_ids = tokenier.encode(text)
        text_tensor = torch.tensor(text_encode, device=device).view(1, -1)
        out = model(text_tensor).squeeze(0)  # these are actually the per-token emission scores (nodes)
        labels = viterbi_decode(out, trans)
        starting = False
        for l in labels:
            if l > 0:
                label = target[l.item()]
                decode.append(label)
            else :
                decode.append("other")
        flag = 0
        res = {}
        for index, each_entity in enumerate(decode):
            if each_entity != "other":
                if flag != each_entity:
                    cur_text = text[index - 1]
                    if each_entity in res.keys():
                        res[each_entity].append(cur_text)
                    else :
                        res[each_entity] = [cur_text]
                    flag = each_entity
                elif flag == each_entity:
                    res[each_entity][-1] += text[index - 1]
            else :
                flag = 0
        print(res)
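The entity-grouping loop above can be read in isolation: consecutive identical labels are merged into one entity string, and the character index is shifted by one because position 0 of the encoded text is the [CLS] token. A minimal, self-contained sketch of just that logic, using a hypothetical helper and toy labels instead of the Viterbi output:

def group_entities(text, decode):
    # decode is aligned with the encoded tokens, i.e. decode[0] belongs to [CLS],
    # so the character for token index i is text[i - 1].
    res = {}
    flag = 0
    for index, each_entity in enumerate(decode):
        if each_entity != "other":
            if flag != each_entity:
                res.setdefault(each_entity, []).append(text[index - 1])
                flag = each_entity
            else:
                res[each_entity][-1] += text[index - 1]
        else:
            flag = 0
    return res

print(group_entities("张三在北京", ["other", "name", "name", "other", "loc", "loc"]))
# -> {'name': ['张三'], 'loc': ['北京']}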
    def __init__(self, word2ix, model_name="roberta", tokenizer=None):
        super(Seq2SeqModel, self).__init__()
        self.word2ix = word2ix
        if tokenizer is None:
            self.tokenizer = Tokenizer(word2ix)
        else:
            self.tokenizer = tokenizer
        config = ""
        if model_name == "roberta":
            from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertLMPredictionHead
            config = BertConfig(len(word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(
                config, self.bert.embeddings.word_embeddings.weight)
        elif model_name == "bert":
            from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertLMPredictionHead
            config = BertConfig(len(word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(
                config, self.bert.embeddings.word_embeddings.weight)
        else:
            raise Exception("model_name_err")

        self.hidden_dim = config.hidden_size
        self.vocab_size = len(word2ix)
 def __init__(self, data) :
     ## The init function typically loads all the data
     super(BertDataset, self).__init__()
     self.data = data
     print("data size is " + str(len(data)))
     self.idx2word = {k: v for v, k in word2idx.items()}
     self.tokenizer = Tokenizer(word2idx)
Example #5
 def __init__(self):
     # Load the data
     data_path = "./corpus/新闻标题文本分类/Train.txt"
     self.vocab_path = "./state_dict/roberta_wwm_vocab.txt" # location of the roberta vocab file
     self.sents_src, self.sents_tgt = read_corpus(data_path)
     self.model_name = "roberta" # 选择模型名字
     self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin" # roberta模型位置
     self.recent_model_path = "" # 用于把已经训练好的模型继续训练
     self.model_save_path = "./bert_multi_classify_model.bin"
     self.batch_size = 16
     self.lr = 1e-5
     # Load the vocabulary
     self.word2idx = load_chinese_base_vocab(self.vocab_path)
     self.tokenier = Tokenizer(self.word2idx)
     # Check whether a GPU is available
     self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("device: " + str(self.device))
     # Define the model
     self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="encoder", target_size=len(target))
     ## Load the pretrained model parameters
     load_model_params(self.bert_model, self.model_path)
     # Move the model to the compute device (GPU or CPU)
     self.bert_model.to(self.device)
     # Declare the parameters to optimize
     self.optim_parameters = list(self.bert_model.parameters())
     self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
     # Declare the custom data loader
     dataset = NLUDataset(self.sents_src, self.sents_tgt, self.vocab_path)
     self.dataloader =  DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
    def __init__(self):
        ## The init function typically loads all the data
        super(BertDataset, self).__init__()
        ## Collect all the file names
        self.txts = glob.glob('./state_dict/THUCNews/*/*.txt')

        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)
Example #7
 def __init__(self, data):
     ## The init function typically loads all the data
     super(ExtractDataset, self).__init__()
     # Read the raw data
     # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
     self.data = data
     self.idx2word = {k: v for v, k in word2idx.items()}
     self.tokenizer = Tokenizer(word2idx)
Example #8
def read_corpus(dir_path, vocab_path):
    """
    读原始数据
    """
    sents_src = []
    sents_tgt = []
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    files = os.listdir(dir_path)  # get all file names in the directory

    for file1 in files:  # iterate over the directory

        if not os.path.isdir(file1):  # only open entries that are not directories
            file_path = dir_path + "/" + file1
            print(file_path)
            if file_path[-3:] != "csv":
                continue
            df = pd.read_csv(file_path)
            # First determine the poem type, then decide whether to build a sample

            for index, row in df.iterrows():
                if type(row[0]) is not str or type(row[3]) is not str:
                    continue
                if len(row[0].split(" ")) > 1:
                    # The title contains a space; keep only the part before it
                    row[0] = row[0].split(" ")[0]

                if len(row[0]) > 10 or len(row[0]) < 1:
                    # Filter out poems whose titles are too long or too short
                    continue

                encode_text = tokenizer.encode(row[3])[0]
                if word2idx["[UNK]"] in encode_text:
                    # Skip texts containing [UNK]
                    continue
                if len(row[3]) == 24 and (row[3][5] == ","
                                          or row[3][5] == "。"):
                    # five-character jueju (五言绝句)
                    sents_src.append(row[0] + "##" + "五言绝句")
                    sents_tgt.append(row[3])
                elif len(row[3]) == 32 and (row[3][7] == ","
                                            or row[3][7] == "。"):
                    # seven-character jueju (七言绝句)
                    sents_src.append(row[0] + "##" + "七言绝句")
                    sents_tgt.append(row[3])
                elif len(row[3]) == 48 and (row[3][5] == ","
                                            or row[3][5] == "。"):
                    # five-character lüshi (五言律诗)
                    sents_src.append(row[0] + "##" + "五言律诗")
                    sents_tgt.append(row[3])
                elif len(row[3]) == 64 and (row[3][7] == ","
                                            or row[3][7] == "。"):
                    # seven-character lüshi (七言律诗)
                    sents_src.append(row[0] + "##" + "七言律诗")
                    sents_tgt.append(row[3])

    print("第一个诗句数据集共: " + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
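The length/punctuation rule used above to assign a form label can be factored out on its own. A small sketch of that rule (poem_form is just an illustrative helper, not part of the library; the returned strings are the same labels appended to sents_src):

def poem_form(poem):
    # Classify a poem by total length and by where the first comma/full stop falls:
    # 24 chars -> 五言绝句, 32 -> 七言绝句, 48 -> 五言律诗, 64 -> 七言律诗.
    if len(poem) == 24 and poem[5] in ",。":
        return "五言绝句"
    if len(poem) == 32 and poem[7] in ",。":
        return "七言绝句"
    if len(poem) == 48 and poem[5] in ",。":
        return "五言律诗"
    if len(poem) == 64 and poem[7] in ",。":
        return "七言律诗"
    return None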
Example #9
 def __init__(self, data, vocab_path) :
     ## The init function typically loads all the data
     super(ExtractDataset, self).__init__()
     # Read the raw data
     # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
     self.data = data
     self.word2idx = load_chinese_base_vocab(vocab_path)
     self.idx2word = {k: v for v, k in self.word2idx.items()}
     self.tokenizer = Tokenizer(self.word2idx)
Example #10
 def __init__(self, word2ix, tokenizer=None):
     super().__init__()
     self.word2ix = word2ix
     if tokenizer is not None:
         self.tokenizer = tokenizer
     else:
         self.tokenizer = Tokenizer(word2ix)
     self.config = GPT2Config(len(word2ix))
     self.model = GPT2LMHeadModel(self.config)
Example #11
 def __init__(self, sents_src, sents_tgt, vocab_path):
     ## The init function typically loads all the data
     super(BertDataset, self).__init__()
     # Read the raw data
     # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
     self.sents_src = sents_src
     self.sents_tgt = sents_tgt
     self.word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
     self.idx2word = {k: v for v, k in self.word2idx.items()}
     self.tokenizer = Tokenizer(self.word2idx)
Example #12
 def __init__(self, word2ix, model_name="roberta", tokenizer=None):
     super(Seq2SeqModel, self).__init__(word2ix=word2ix, model_name=model_name)
     self.word2ix = word2ix
     if tokenizer is None:
         self.tokenizer = Tokenizer(word2ix)
     else:
         self.tokenizer = tokenizer
         
     self.hidden_dim = self.config.hidden_size
     self.vocab_size = len(word2ix)
Example #13
def read_corpus_2(dir_path, vocab_path):
    """读取最近的一个数据集 唐诗和宋诗 """
    sents_src = []
    sents_tgt = []
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    files = os.listdir(dir_path)  # get all file names in the directory

    for file1 in files:  # iterate over the directory

        if not os.path.isdir(file1):  # only open entries that are not directories
            file_path = dir_path + "/" + file1
            print(file_path)
            # data = json.load(file_path)
            with open(file_path) as f:
                poem_list = eval(f.read())

            for each_poem in poem_list:
                string_list = each_poem["paragraphs"]
                poem = ""
                for each_s in string_list:
                    poem += each_s

                cc = opencc.OpenCC('t2s')
                poem = cc.convert(poem)

                encode_text = tokenizer.encode(poem)[0]
                if word2idx["[UNK]"] in encode_text:
                    # Skip texts containing [UNK]
                    continue
                title = cc.convert(each_poem["title"])

                if len(title) > 10 or len(title) < 1:
                    # Filter out poems whose titles are too long or too short
                    continue

                if len(poem) == 24 and (poem[5] == "," or poem[5] == "。"):
                    # five-character jueju (五言绝句)
                    sents_src.append(title + "##" + "五言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 32 and (poem[7] == "," or poem[7] == "。"):
                    # seven-character jueju (七言绝句)
                    sents_src.append(title + "##" + "七言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 48 and (poem[5] == "," or poem[5] == "。"):
                    # five-character lüshi (五言律诗)
                    sents_src.append(title + "##" + "五言律诗")
                    sents_tgt.append(poem)
                elif len(poem) == 64 and (poem[7] == "," or poem[7] == "。"):
                    # seven-character lüshi (七言律诗)
                    sents_src.append(title + "##" + "七言律诗")
                    sents_tgt.append(poem)

    print("第二个诗句数据集共:" + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
Example #14
def read_corpus_ci(dir_path, vocab_path):
    """ 读取宋词数据集"""
    import json, sys
    import sqlite3
    from collections import OrderedDict

    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)

    try:  # Python 2
        reload(sys)
        sys.setdefaultencoding('utf-8')
    except NameError:  # Python 3
        pass

    c = sqlite3.connect(dir_path + '/ci.db')

    cursor = c.execute("SELECT name, long_desc, short_desc from ciauthor;")

    d = {"name": None, "description": None, "short_description": None}

    cursor = c.execute("SELECT rhythmic, author, content from ci;")

    d = {"rhythmic": None, "author": None, "paragraphs": None}

    # cis = []
    sents_src = []
    sents_tgt = []

    for row in cursor:
        ci = OrderedDict(sorted(d.items(), key=lambda t: t[0]))
        ci["rhythmic"] = row[0]
        ci["author"] = row[1]
        ci["paragraphs"] = row[2].split('\n')
        string = ""
        for s in ci["paragraphs"]:
            if s == " >> " or s == "词牌介绍":
                continue
            string += s

        encode_text = tokenizer.encode(string)[0]
        if word2idx["[UNK]"] in encode_text:
            # Skip texts containing [UNK]
            continue
        sents_src.append(row[0] + "##词")
        sents_tgt.append(string)

        # cis.append(ci)

    # print(cis[:10])
    print("词共: " + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
    def __init__(self):
        super(BertDataset, self).__init__()
        self.sents_src = read_file(
            "/content/drive/My Drive/ColabNotebooks/summary/extra_dict/train.src"
        )
        self.sents_tgt = read_file(
            "/content/drive/My Drive/ColabNotebooks/summary/extra_dict/train.tgt"
        )
        self.sents_src = self.sents_src.split('\n')
        self.sents_tgt = self.sents_tgt.split('\n')

        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)
class BertDataset(Dataset):
    """
    Defines how items are fetched for this particular dataset.
    """
    def __init__(self, data) :
        ## The init function typically loads all the data
        super(BertDataset, self).__init__()
        self.data = data
        print("data size is " + str(len(data)))
        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        ## Fetch a single sample
        # print(i)
        single_data = self.data[i]
        original_text = single_data[0]
        ans_text = single_data[1]

        token_ids, token_type_ids = self.tokenizer.encode(
            original_text, ans_text, max_length=maxlen
        )
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output

    def __len__(self):

        return len(self.data)
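Because __getitem__ above returns variable-length id lists, the DataLoader used in the trainer snippets needs a collate_fn that pads each batch to a common length. A minimal padding sketch, assuming 0 is the [PAD] id (the repository's own collate_fn may differ):

import torch

def collate_fn(batch):
    # Pad token_ids and token_type_ids of every sample in the batch to the
    # length of the longest sample, using 0 as the padding id.
    max_len = max(len(item["token_ids"]) for item in batch)
    token_ids = torch.zeros(len(batch), max_len, dtype=torch.long)
    token_type_ids = torch.zeros(len(batch), max_len, dtype=torch.long)
    for i, item in enumerate(batch):
        n = len(item["token_ids"])
        token_ids[i, :n] = torch.tensor(item["token_ids"], dtype=torch.long)
        token_type_ids[i, :n] = torch.tensor(item["token_type_ids"], dtype=torch.long)
    return token_ids, token_type_ids

With that in place, DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) works as in the trainer snippets.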
class BertDataset(Dataset):
    def __init__(self):
        super(BertDataset, self).__init__()
        self.sents_src = read_file(
            "/content/drive/My Drive/ColabNotebooks/summary/extra_dict/train.src"
        )
        self.sents_tgt = read_file(
            "/content/drive/My Drive/ColabNotebooks/summary/extra_dict/train.tgt"
        )
        self.sents_src = self.sents_src.split('\n')
        self.sents_tgt = self.sents_tgt.split('\n')

        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        title = self.sents_tgt[i]
        content = self.sents_src[i]
        token_ids, token_type_ids = self.tokenizer.encode(content,
                                                          title,
                                                          max_length=maxlen)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output


    def __len__(self):

        data_size = len(self.sents_src)
        return data_size
Example #18
class NERDataset(Dataset):
    """
    Defines how items are fetched for this particular dataset.
    """
    def __init__(self, sents_src, sents_tgt):
        ## The init function typically loads all the data
        super(NERDataset, self).__init__()
        # Read the raw data
        # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
        self.sents_src = sents_src
        self.sents_tgt = sents_tgt

        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        ## Fetch a single sample
        # print(i)
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]
        token_ids, token_type_ids = self.tokenizer.encode(src)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
            "target_id": tgt
        }
        return output

    def __len__(self):
        return len(self.sents_src)
Example #19
class BertDataset(Dataset):
    """
    Defines how items are fetched for this particular dataset.
    """
    def __init__(self, sents_src, sents_tgt, vocab_path):
        ## The init function typically loads all the data
        super(BertDataset, self).__init__()
        # Read the raw data
        # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
        self.sents_src = sents_src
        self.sents_tgt = sents_tgt
        self.word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
        self.idx2word = {k: v for v, k in self.word2idx.items()}
        self.tokenizer = Tokenizer(self.word2idx)

    def __getitem__(self, i):
        ## Fetch a single sample
        src = self.sents_src[i]
        tgt = self.sents_tgt[i]
        token_ids, token_type_ids = self.tokenizer.encode(src, tgt)
        output = {
            "token_ids": token_ids,
            "token_type_ids": token_type_ids,
        }
        return output

    def __len__(self):

        return len(self.sents_src)
 def __init__(self):
     # Load the data
     data_path = "./corpus/细粒度NER/train.json"
     self.vocab_path = "./state_dict/roberta_wwm_vocab.txt" # location of the roberta vocab file
     self.sents_src, self.sents_tgt = read_corpus(data_path)
     self.model_name = "roberta" # 选择模型名字
     self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin" # roberta模型位置
     self.recent_model_path = "" # 用于把已经训练好的模型继续训练
     self.model_save_path = "./细粒度_bert_ner_model_crf.bin"
     self.batch_size = 8
     self.lr = 1e-5
     self.crf_lr = 1e-2 ## learning rate for the CRF layer is 0.01
     # Load the vocabulary
     self.word2idx = load_chinese_base_vocab(self.vocab_path)
     self.tokenier = Tokenizer(self.word2idx)
     # Check whether a GPU is available
     self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("device: " + str(self.device))
     # Define the model
     self.bert_model = load_bert(self.vocab_path, model_name=self.model_name, model_class="sequence_labeling_crf", target_size=len(target))
     ## Load the pretrained model parameters
     load_model_params(self.bert_model, self.model_path)
     # Move the model to the compute device (GPU or CPU)
     self.bert_model.to(self.device)
     # Declare the parameters to optimize
     crf_params = list(map(id, self.bert_model.crf_layer.parameters())) ## pull out the CRF layer parameters separately
     base_params = filter(lambda p: id(p) not in crf_params, self.bert_model.parameters())
     self.optimizer = torch.optim.Adam([
                                         {"params": base_params}, 
                                         {"params": self.bert_model.crf_layer.parameters(), "lr": self.crf_lr}], lr=self.lr, weight_decay=1e-3)
     # Declare the custom data loader
     dataset = NERDataset(self.sents_src, self.sents_tgt, self.vocab_path)
     self.dataloader =  DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
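The optimizer above uses two parameter groups so the CRF layer can train with a larger learning rate than the BERT encoder. A minimal, standalone sketch of that pattern with a toy model (ToyTagger and its layer names are placeholders, not part of the library):

import torch
import torch.nn as nn

class ToyTagger(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(10, 10)   # stands in for the BERT encoder
        self.crf_layer = nn.Linear(10, 5)  # stands in for the CRF layer

model = ToyTagger()
crf_param_ids = set(map(id, model.crf_layer.parameters()))
base_params = [p for p in model.parameters() if id(p) not in crf_param_ids]
optimizer = torch.optim.Adam(
    [{"params": base_params},                                # uses the default lr below
     {"params": model.crf_layer.parameters(), "lr": 1e-2}],  # higher lr for the CRF layer
    lr=1e-5, weight_decay=1e-3)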
Example #21
 def __init__(self, vocab_path, model_name="roberta"):
     super(Seq2SeqModel, self).__init__()
     self.word2ix = load_chinese_base_vocab(vocab_path)
     self.tokenizer = Tokenizer(self.word2ix)
     config = ""
     if model_name == "roberta":
         from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertLMPredictionHead
         config = BertConfig(len(self.word2ix))
         self.bert = BertModel(config)
         self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
     elif model_name == "bert":
         from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertLMPredictionHead
         config = BertConfig(len(self.word2ix))
         self.bert = BertModel(config)
         self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
     else :
         raise Exception("model_name_err")
         
     self.hidden_dim = config.hidden_size
     self.vocab_size = config.vocab_size
Example #22
def ner_print(model, test_data):
    model.eval()
    idxtword = {v: k for k, v in word2idx.items()}
    tokenier = Tokenizer(word2idx)
    trans = model.state_dict()["crf_layer.trans"]
    for text in test_data:
        decode = []
        text_encode, text_ids = tokenier.encode(text)
        text_tensor = torch.tensor(text_encode, device=model.device).view(1, -1)
        out = model(text_tensor).squeeze(0)  # these are actually the per-token emission scores (nodes)
        labels = viterbi_decode(out, trans)
        starting = False
        for l in labels:
            if l > 0:
                label = target[l.item()]
                decode.append(label)
            else :
                decode.append("O")
        flag = 0
        res = {}
        # print(decode)
        # print(text)
        decode_text = [idxtword[i] for i in text_encode]
        for index, each_entity in enumerate(decode):
            if each_entity != "O":
                if flag != each_entity:
                    cur_text = decode_text[index]
                    if each_entity in res.keys():
                        res[each_entity].append(cur_text)
                    else :
                        res[each_entity] = [cur_text]
                    flag = each_entity
                elif flag == each_entity:
                    res[each_entity][-1] += decode_text[index]
            else :
                flag = 0
        print(res)
Example #23
    def __init__(self, vocab_path, target_size, model_name="roberta"):
        super(BertClsClassifier, self).__init__()
        self.word2ix = load_chinese_base_vocab(vocab_path)
        self.tokenizer = Tokenizer(self.word2ix)
        self.target_size = target_size
        config = ""
        if model_name == "roberta":
            from bert_seq2seq.model.roberta_model import BertModel, BertConfig
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
        elif model_name == "bert":
            from bert_seq2seq.model.bert_model import BertConfig, BertModel
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
        else:
            raise Exception("model_name_err")

        self.final_dense = nn.Linear(config.hidden_size, self.target_size)
Example #24
    def __init__(self, word2ix, target_size, model_name="roberta"):
        super(BertSeqLabeling, self).__init__()
        self.tokenizer = Tokenizer(word2ix)
        self.target_size = target_size
        config = ""
        if model_name == "roberta":
            from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertPredictionHeadTransform
            config = BertConfig(len(word2ix))
            self.bert = BertModel(config)
            self.transform = BertPredictionHeadTransform(config)
        elif model_name == "bert":
            from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertPredictionHeadTransform
            config = BertConfig(len(word2ix))
            self.bert = BertModel(config)
            self.transform = BertPredictionHeadTransform(config)
        else:
            raise Exception("model_name_err")

        self.final_dense = nn.Linear(config.hidden_size, self.target_size)
    def __init__(self):
        # Load the data
        self.sents_src, self.sents_tgt = load_data("./res.txt")

        self.tokenier = Tokenizer(word2idx)
        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(word2idx,
                                    model_name=model_name,
                                    model_class="sequence_labeling_crf",
                                    target_size=len(target))
        ## Load the pretrained model parameters
        self.bert_model.load_pretrain_params(model_path,
                                             keep_tokens=keep_tokens)
        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        crf_params = list(map(
            id, self.bert_model.crf_layer.parameters()))  ## pull out the CRF layer parameters separately
        base_params = filter(lambda p: id(p) not in crf_params,
                             self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(
            [{
                "params": base_params
            }, {
                "params": self.bert_model.crf_layer.parameters(),
                "lr": crf_lr
            }],
            lr=lr,
            weight_decay=1e-5)
        # Declare the custom data loader
        dataset = NERDataset(self.sents_src, self.sents_tgt)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)
class BertDataset(Dataset):
    """
    Defines how items are fetched for this particular dataset.
    """
    def __init__(self):
        ## The init function typically loads all the data
        super(BertDataset, self).__init__()
        ## Collect all the file names
        self.txts = glob.glob('./state_dict/THUCNews/*/*.txt')

        self.idx2word = {k: v for v, k in word2idx.items()}
        self.tokenizer = Tokenizer(word2idx)

    def __getitem__(self, i):
        ## Fetch a single sample
        # print(i)
        text_name = self.txts[i]
        with open(text_name, "r", encoding="utf-8") as f:
            text = f.read()
        text = text.split('\n')
        if len(text) > 1:
            title = text[0]
            content = '\n'.join(text[1:])
            token_ids, token_type_ids = self.tokenizer.encode(
                content, title, max_length=maxlen)
            output = {
                "token_ids": token_ids,
                "token_type_ids": token_type_ids,
            }
            return output

        # Fall back to the next file when this one has no title/content split
        return self.__getitem__(i + 1)

    def __len__(self):

        return len(self.txts)
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert
import numpy as np
import time
import json  # needed by load_data below

vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
model_name = "roberta"  # model name to use
model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # location of the roberta weights
recent_model_path = ""  # used to continue training an already-trained model
model_save_path = "./state_dict/bert_model_relation_extrac.bin"
batch_size = 16
lr = 1e-5

word2idx = load_chinese_base_vocab(vocab_path)
idx2word = {v: k for k, v in word2idx.items()}
tokenizer = Tokenizer(word2idx)


def load_data(filename):
    D = []
    with open(filename, encoding='utf-8') as f:
        for l in f:
            l = json.loads(l)
            D.append({
                'text':
                l['text'],
                'spo_list': [(spo['subject'], spo['predicate'], spo['object'])
                             for spo in l['spo_list']]
            })
    return D
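A hypothetical smoke test for load_data; the single record below only illustrates the JSON-lines format the function expects and is not data from the source corpus:

import json
import tempfile

record = {"text": "周杰伦出生于台湾",
          "spo_list": [{"subject": "周杰伦", "predicate": "出生地", "object": "台湾"}]}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False, encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
print(load_data(f.name))
# -> [{'text': '周杰伦出生于台湾', 'spo_list': [('周杰伦', '出生地', '台湾')]}]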
Example #28
class Seq2SeqModel(nn.Module):
    """
    """
    def __init__(self, vocab_path, model_name="roberta"):
        super(Seq2SeqModel, self).__init__()
        self.word2ix = load_chinese_base_vocab(vocab_path)
        self.tokenizer = Tokenizer(self.word2ix)
        config = ""
        if model_name == "roberta":
            from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertLMPredictionHead
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
        elif model_name == "bert":
            from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertLMPredictionHead
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
            self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
        else :
            raise Exception("model_name_err")
            
        self.hidden_dim = config.hidden_size
        self.vocab_size = config.vocab_size


    def compute_loss(self, predictions, labels, target_mask):
        """
        target_mask: 0 for the sentence-A part and padding, 1 for the sentence-B part
        """
        predictions = predictions.view(-1, self.vocab_size)
        labels = labels.view(-1)
        target_mask = target_mask.view(-1).float()
        loss = nn.CrossEntropyLoss(ignore_index=0, reduction="none")
        return (loss(predictions, labels) * target_mask).sum() / target_mask.sum()  ## the mask removes the influence of padding and sentence-A positions
    
    def forward(self, input_tensor, token_type_id, position_enc=None, labels=None, device="cpu"):
        ## Takes the inputs, position encodings and token type ids for a whole batch;
        ## all of these values are returned by the seq2seq batch-iteration function
        input_shape = input_tensor.shape
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        ## Build the special seq2seq attention mask
        ones = torch.ones((1, 1, seq_len, seq_len), dtype=torch.float32, device=device)
        a_mask = ones.tril() # lower-triangular matrix
        s_ex12 = token_type_id.unsqueeze(1).unsqueeze(2).float()
        s_ex13 = token_type_id.unsqueeze(1).unsqueeze(3).float()
        a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask 
            
        enc_layers, _ = self.bert(input_tensor, position_ids=position_enc, token_type_ids=token_type_id, attention_mask=a_mask, 
                                    output_all_encoded_layers=True)
        squence_out = enc_layers[-1] ## take the output of the last layer

        predictions = self.decoder(squence_out)

        if labels is not None:
            ## Compute the loss
            ## A special output mask is needed to compute the loss correctly
            # The prediction for the final [SEP] token is not needed, hence the slice up to -1
            predictions = predictions[:, :-1].contiguous()
            target_mask = token_type_id[:, 1:].contiguous()
            loss = self.compute_loss(predictions, labels, target_mask)
            return predictions, loss 
        else :
            return predictions
    
    def generate(self, text, out_max_length=80, beam_size=1, device="cpu", is_poem=False):
        # Generate the output for a single sentence
        ## Derive the maximum input length from the maximum output length; inputs longer than that are truncated
        self.out_max_length = out_max_length
        input_max_length = max_length - out_max_length
        # print(text)
        token_ids, token_type_ids = self.tokenizer.encode(text, max_length=input_max_length)
        token_ids = torch.tensor(token_ids, device=device).view(1, -1)
        token_type_ids = torch.tensor(token_type_ids, device=device).view(1, -1)
        if is_poem:  ## beam search for classical poems works slightly differently
            out_puts_ids = self.poem_beam_search(token_ids, token_type_ids, self.word2ix, beam_size=beam_size, device=device)
        else:
            out_puts_ids = self.beam_search(token_ids, token_type_ids, self.word2ix, beam_size=beam_size, device=device)
        
        # Decode the ids into the output text
        return self.tokenizer.decode(out_puts_ids)
    
    def poem_beam_search(self, token_ids, token_type_ids, word2ix, beam_size=1, device="cpu"):
        """
        专门针对写诗的beam-search
        """
        ix2word = {v: k for k, v in word2ix.items()}
        sep_id = word2ix["[SEP]"]
        douhao_id = word2ix[","]# 逗号
        juhao_id = word2ix["。"]# 句号
        # 用来保存输出序列
        output_ids = [[]]
        word_list = {} # 保证不重复生成
        last_chars = []
        yayun_save = -1
        # 用来保存累计得分
        output_scores = torch.zeros(token_ids.shape[0], device=device)
        flag = 0 # 判断第一次遇到逗号
        for step in range(self.out_max_length):
            scores = self.forward(token_ids, token_type_ids, device=device)
            if step == 0:
                # Repeat the input ids beam_size times
                token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                token_type_ids = token_type_ids.view(1, -1).repeat(beam_size, 1)
            ## Compute the log scores, shape (beam_size, vocab_size)
            logit_score = torch.log_softmax(scores, dim=-1)[:, -1]
            logit_score = output_scores.view(-1, 1) + logit_score # accumulated scores
            ## Flatten before calling topk
            logit_score = logit_score.view(-1)
            hype_score, hype_pos = torch.topk(logit_score, beam_size)
            indice1 = hype_pos // scores.shape[-1] # row index (integer division keeps it usable as an index)
            indice2 = hype_pos % scores.shape[-1] # column index

            # Update the candidate outputs
            new_hype_scores = []
            new_hype_ids = []
            next_chars = [] # holds each newly predicted character, to be appended to the input sequence for the next step
            index = 0
            for i_1, i_2, score in zip(indice1, indice2, hype_score):
                i_1 = i_1.item()
                i_2 = i_2.item()
                score = score.item()
                if i_2 != douhao_id and i_2 != juhao_id:
                    if i_2 not in word_list.keys():
                        word_list[i_2] = 1
                    else :
                        # Apply a repetition penalty
                        word_list[i_2] += 1
                        score -= 1 * word_list[i_2]
                        hype_score[index] -= 1 * word_list[i_2]
                if flag == 0 and i_2 == douhao_id:
                    if len(last_chars) - 1 < index:
                        # A comma was predicted right at the start, before any character was stored
                        break
                    flag += 1
                    word = ix2word[last_chars[index]]  # look up the previous character and remember its rhyme group
                    for i, each_yayun in enumerate(yayun_list):
                        if word in each_yayun:
                            yayun_save = i
                            break
                if i_2 == juhao_id:
                    word = ix2word[last_chars[index]]
                    # Reward the candidate if it rhymes
                    if word in yayun_list[yayun_save]:
                        score += 5
                        hype_score[index] += 5
                    else:
                        score -= 2
                        hype_score[index] -= 2
                hype_id = output_ids[i_1] + [i_2] # keep the whole output sequence, not just the newly predicted character

                if i_2 == sep_id:
                    # Decoding has reached the end
                    if score == torch.max(hype_score).item():
                        return hype_id[: -1]
                    else:
                        # A sequence finished decoding, but its score is not the highest, so drop it
                        beam_size -= 1
                else :
                    new_hype_ids.append(hype_id)
                    new_hype_scores.append(score)
                    next_chars.append(i_2) # collect it, to be appended to the current input sequence
                index += 1

            output_ids = new_hype_ids

            last_chars = next_chars.copy() # remember the previous characters
            output_scores = torch.tensor(new_hype_scores, dtype=torch.float32, device=device)
            # Rebuild the input: concatenate the previous input with the newly predicted characters and feed it to BERT again
            token_ids = token_ids[:len(output_ids)].contiguous() # truncate, to drop sequences that have already finished
            token_type_ids = token_type_ids[: len(output_ids)].contiguous()

            next_chars = torch.tensor(next_chars, dtype=torch.long, device=device).view(-1, 1)
            next_token_type_ids = torch.ones_like(next_chars, device=device)
            # Concatenate
            token_ids = torch.cat((token_ids, next_chars), dim=1)
            token_type_ids = torch.cat((token_type_ids, next_token_type_ids), dim=1)
            if beam_size < 1:
                break

        # If the maximum length is reached, just return the highest-scoring sequence
        return output_ids[output_scores.argmax().item()] 

    def beam_search(self, token_ids, token_type_ids, word2ix, beam_size=1, device="cpu"):
        """
        beam-search操作
        """
        sep_id = word2ix["[SEP]"]
        # 用来保存输出序列
        output_ids = [[]]
        # 用来保存累计得分
        output_scores = torch.zeros(token_ids.shape[0], device=device)
        for step in range(self.out_max_length):
            
            scores = self.forward(token_ids, token_type_ids, device=device)
            if step == 0:
                # Repeat the input ids beam_size times
                token_ids = token_ids.view(1, -1).repeat(beam_size, 1)
                token_type_ids = token_type_ids.view(1, -1).repeat(beam_size, 1)
            ## Compute the log scores, shape (beam_size, vocab_size)
            logit_score = torch.log_softmax(scores, dim=-1)[:, -1]
            logit_score = output_scores.view(-1, 1) + logit_score # accumulated scores
            ## Flatten before calling topk
            logit_score = logit_score.view(-1)
            hype_score, hype_pos = torch.topk(logit_score, beam_size)
            indice1 = hype_pos // scores.shape[-1] # row index (integer division keeps it usable as an index)
            indice2 = hype_pos % scores.shape[-1] # column index

            # Update the candidate outputs
            new_hype_scores = []
            new_hype_ids = []
            # The list is rebuilt here because finished sequences get filtered out
            next_chars = [] # holds each newly predicted character, to be appended to the input sequence for the next step
            for i_1, i_2, score in zip(indice1, indice2, hype_score):
                i_1 = i_1.item()
                i_2 = i_2.item()
                score = score.item()
                
                hype_id = output_ids[i_1] + [i_2] # keep the whole output sequence, not just the newly predicted character

                if i_2 == sep_id:
                    # Decoding has reached the end
                    if score == torch.max(hype_score).item():
                        # This is the highest-scoring sequence, so return it directly
                        return hype_id[: -1]
                    else:
                        # A sequence finished decoding, but its score is not the highest, so drop it
                        beam_size -= 1
                else :
                    new_hype_ids.append(hype_id)
                    new_hype_scores.append(score)
                    next_chars.append(i_2) # collect it, to be appended to the current input sequence

            output_ids = new_hype_ids
           
            output_scores = torch.tensor(new_hype_scores, dtype=torch.float32, device=device)
            # Rebuild the input: concatenate the previous input with the newly predicted characters and feed it to BERT again
            token_ids = token_ids[:len(output_ids)].contiguous() # truncate, to drop sequences that have already finished
            token_type_ids = token_type_ids[: len(output_ids)].contiguous()

            next_chars = torch.tensor(next_chars, dtype=torch.long, device=device).view(-1, 1)
            next_token_type_ids = torch.ones_like(next_chars, device=device)
            # Concatenate
            token_ids = torch.cat((token_ids, next_chars), dim=1)
            token_type_ids = torch.cat((token_type_ids, next_token_type_ids), dim=1)
            if beam_size < 1:
                break

        # If the maximum length is reached, just return the highest-scoring sequence
        return output_ids[output_scores.argmax().item()] 
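The forward() method above builds the UniLM-style seq2seq attention mask from token_type_id: source tokens (type 0) attend bidirectionally, while target tokens (type 1) attend causally. A standalone sketch of just that construction, using a toy batch:

import torch

token_type_id = torch.tensor([[0, 0, 0, 1, 1]])  # one sequence: 3 source tokens, 2 target tokens
seq_len = token_type_id.shape[1]
ones = torch.ones((1, 1, seq_len, seq_len))
a_mask = ones.tril()                              # lower-triangular (causal) part
s_ex12 = token_type_id.unsqueeze(1).unsqueeze(2).float()
s_ex13 = token_type_id.unsqueeze(1).unsqueeze(3).float()
a_mask = (1.0 - s_ex12) * (1.0 - s_ex13) + s_ex13 * a_mask
print(a_mask[0, 0])
# Source columns are visible to every position; target columns only to target positions at or after them.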
Example #29
class Trainer:
    def __init__(self):
        # Load the data
        data_path = "./corpus/新闻标题文本分类/Train.txt"
        self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
        self.sents_src, self.sents_tgt = read_corpus(data_path)
        self.model_name = "roberta"  # 选择模型名字
        self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # roberta模型位置
        self.recent_model_path = ""  # 用于把已经训练好的模型继续训练
        self.model_save_path = "./bert_multi_classify_model.bin"
        self.batch_size = 16
        self.lr = 1e-5
        # Load the vocabulary
        self.word2idx = load_chinese_base_vocab(self.vocab_path)
        self.tokenier = Tokenizer(self.word2idx)
        # Check whether a GPU is available
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("device: " + str(self.device))
        # Define the model
        self.bert_model = load_bert(self.vocab_path,
                                    model_name=self.model_name,
                                    model_class="cls",
                                    target_size=len(target))
        ## Load the pretrained model parameters
        load_model_params(self.bert_model, self.model_path)
        # Move the model to the compute device (GPU or CPU)
        self.bert_model.to(self.device)
        # Declare the parameters to optimize
        self.optim_parameters = list(self.bert_model.parameters())
        self.optimizer = torch.optim.Adam(self.optim_parameters,
                                          lr=self.lr,
                                          weight_decay=1e-3)
        # Declare the custom data loader
        dataset = NLUDataset(self.sents_src, self.sents_tgt, self.vocab_path)
        self.dataloader = DataLoader(dataset,
                                     batch_size=self.batch_size,
                                     shuffle=True,
                                     collate_fn=collate_fn)

    def train(self, epoch):
        # Train for one epoch
        self.bert_model.train()
        self.iteration(epoch, dataloader=self.dataloader, train=True)

    def save(self, save_path):
        """
        保存模型
        """
        torch.save(self.bert_model.state_dict(), save_path)
        print("{} saved!".format(save_path))

    def iteration(self, epoch, dataloader, train=True):
        total_loss = 0
        start_time = time.time()  ## record the current time
        step = 0
        for token_ids, token_type_ids, target_ids in tqdm(dataloader,
                                                          position=0,
                                                          leave=True):
            step += 1
            if step % 2000 == 0:
                self.bert_model.eval()
                test_data = [
                    "编剧梁馨月讨稿酬六六何念助阵 公司称协商解决", "西班牙BBVA第三季度净利降至15.7亿美元",
                    "基金巨亏30亿 欲打开云天系跌停自救"
                ]
                for text in test_data:
                    text, text_ids = self.tokenier.encode(text)
                    text = torch.tensor(text, device=self.device).view(1, -1)
                    print(target[torch.argmax(self.bert_model(text)).item()])
                self.bert_model.train()

            token_ids = token_ids.to(self.device)
            token_type_ids = token_type_ids.to(self.device)
            target_ids = target_ids.to(self.device)
            # Since target labels are passed in, the loss is computed and returned
            predictions, loss = self.bert_model(
                token_ids,
                labels=target_ids,
            )
            # Backpropagation
            if train:
                # Clear the previous gradients
                self.optimizer.zero_grad()
                # Backpropagate to compute new gradients
                loss.backward()
                # Update the model parameters with the gradients
                self.optimizer.step()

            # Accumulate the loss to compute the average for this epoch
            total_loss += loss.item()

        end_time = time.time()
        spend_time = end_time - start_time
        # Print training information
        print("epoch is " + str(epoch) + ". loss is " + str(total_loss) +
              ". spend time is " + str(spend_time))
        # Save the model
        self.save(self.model_save_path)
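A hypothetical entry point for this trainer, assuming the corpus and pretrained weights referenced in __init__ exist on disk (the epoch count is an arbitrary choice):

if __name__ == "__main__":
    trainer = Trainer()
    # Run a few epochs; each epoch also evaluates on the hard-coded test sentences
    # every 2000 steps and saves the model at the end.
    for epoch in range(5):
        trainer.train(epoch)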
Example #30
import torch  # needed for torch.device below
import numpy as np
import os
import json
import time
import bert_seq2seq
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert

relation_extrac_model = "./state_dict/bert_model_relation_extrac.bin"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # location of the roberta vocab file
model_name = "roberta"  # model name to use
# model_path = "./state_dict/bert-base-chinese-pytorch_model.bin"  # roberta model path
# Load the vocabulary
word2idx = load_chinese_base_vocab(vocab_path, simplfied=False)
tokenizer = Tokenizer(word2idx)
idx2word = {v: k for k, v in word2idx.items()}

predicate2id, id2predicate = {}, {}
with open('./corpus/三元组抽取/all_50_schemas') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)


def search(pattern, sequence):
    """从sequence中寻找子串pattern
    如果找到,返回第一个下标;否则返回-1。
    """