Example #1

def __init__(self):
    # Load the data
    data_dir = "./corpus/对联"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path of the roberta vocab file
    self.sents_src, self.sents_tgt = read_corpus(data_dir)
    self.model_name = "roberta"  # which model to use
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # path of the roberta weights
    self.recent_model_path = ""  # used to continue training from an already trained model
    self.model_save_path = "./bert_duilian_model.bin"
    self.batch_size = 16
    self.lr = 1e-5
    # Load the vocabulary
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    # Check whether a GPU is available
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model
    self.bert_model = load_bert(self.vocab_path,
                                model_name=self.model_name)
    # Load the pretrained weights
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU)
    self.bert_model.to(self.device)
    # Declare the parameters to optimize
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters,
                                      lr=self.lr,
                                      weight_decay=1e-3)
    # Declare the custom data loader
    dataset = BertDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset,
                                 batch_size=self.batch_size,
                                 shuffle=True,
                                 collate_fn=collate_fn)
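A hypothetical companion to this trainer (not part of the original snippet): one training epoch over the objects it sets up. The sketch assumes, as is typical for bert_seq2seq's seq2seq models, that calling the model with token ids, token type ids and target labels returns predictions together with a loss; the method name train_one_epoch is made up.

def train_one_epoch(self):
    self.bert_model.train()
    for token_ids, token_type_ids, target_ids in self.dataloader:
        token_ids = token_ids.to(self.device)
        token_type_ids = token_type_ids.to(self.device)
        target_ids = target_ids.to(self.device)
        # Assumption: the model returns (predictions, loss) when labels are given.
        predictions, loss = self.bert_model(token_ids,
                                            token_type_ids,
                                            labels=target_ids)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    # Persist the weights so training can later be resumed or evaluated.
    torch.save(self.bert_model.state_dict(), self.model_save_path)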
Example #2

def ner_print(model, test_data, vocab_path, device="cpu"):
    model.eval()
    word2idx = load_chinese_base_vocab(vocab_path)
    tokenizer = Tokenizer(word2idx)
    trans = model.state_dict()["crf_layer.trans"]
    for text in test_data:
        decode = []
        text_encode, text_ids = tokenizer.encode(text)
        text_tensor = torch.tensor(text_encode, device=device).view(1, -1)
        out = model(text_tensor).squeeze(0)  # these are the emission scores (nodes)
        labels = viterbi_decode(out, trans)
        starting = False
        for l in labels:
            if l > 0:
                label = target[l.item()]
                decode.append(label)
            else:
                decode.append("other")
        flag = 0
        res = {}
        for index, each_entity in enumerate(decode):
            if each_entity != "other":
                if flag != each_entity:
                    cur_text = text[index - 1]
                    if each_entity in res.keys():
                        res[each_entity].append(cur_text)
                    else:
                        res[each_entity] = [cur_text]
                    flag = each_entity
                elif flag == each_entity:
                    res[each_entity][-1] += text[index - 1]
            else:
                flag = 0
        print(res)
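A hedged usage sketch for ner_print (illustrative, not from the original): it assumes the global label list `target` used during training, and reuses the vocabulary path and the checkpoint that the CRF trainer in the next example writes to. The sample input sentence is made up.

if __name__ == "__main__":
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Assumption: `target` is the same label list used when the model was trained.
    bert_model = load_bert(vocab_path, model_name="roberta",
                           model_class="sequence_labeling_crf", target_size=len(target))
    bert_model.load_state_dict(
        torch.load("./细粒度_bert_ner_model_crf.bin", map_location="cpu"))
    bert_model.to(device)
    test_data = ["张华是清华大学计算机系的教授。"]  # illustrative input sentence
    ner_print(bert_model, test_data, vocab_path, device=device)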
Example #3

def __init__(self):
    # Load the data
    data_path = "./corpus/细粒度NER/train.json"
    self.vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path of the roberta vocab file
    self.sents_src, self.sents_tgt = read_corpus(data_path)
    self.model_name = "roberta"  # which model to use
    self.model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # path of the roberta weights
    self.recent_model_path = ""  # used to continue training from an already trained model
    self.model_save_path = "./细粒度_bert_ner_model_crf.bin"
    self.batch_size = 8
    self.lr = 1e-5
    self.crf_lr = 1e-2  # the CRF layer uses a learning rate of 0.01
    # Load the vocabulary
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    self.tokenier = Tokenizer(self.word2idx)
    # Check whether a GPU is available
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name,
                                model_class="sequence_labeling_crf", target_size=len(target))
    # Load the pretrained weights
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU)
    self.bert_model.to(self.device)
    # Declare the parameters to optimize: the CRF layer gets its own parameter group
    crf_params = list(map(id, self.bert_model.crf_layer.parameters()))  # ids of the CRF-layer parameters
    base_params = filter(lambda p: id(p) not in crf_params, self.bert_model.parameters())
    self.optimizer = torch.optim.Adam([
        {"params": base_params},
        {"params": self.bert_model.crf_layer.parameters(), "lr": self.crf_lr}],
        lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader
    dataset = NERDataset(self.sents_src, self.sents_tgt, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
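The optimizer here uses two torch.optim.Adam parameter groups so the randomly initialised CRF layer can learn faster (lr=1e-2) than the pretrained encoder (lr=1e-5). A tiny self-contained check of how per-group learning rates behave (the two Linear layers below are just stand-ins):

import torch
import torch.nn as nn

encoder = nn.Linear(8, 8)   # stand-in for the BERT parameters
crf = nn.Linear(8, 8)       # stand-in for the CRF layer
optimizer = torch.optim.Adam(
    [{"params": encoder.parameters()},
     {"params": crf.parameters(), "lr": 1e-2}],
    lr=1e-5, weight_decay=1e-3)
# The group without an explicit "lr" inherits the default.
print([group["lr"] for group in optimizer.param_groups])  # [1e-05, 0.01]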
Example #4

def __init__(self, model_save_path="model/", data_path="corpus/", batch_size=64, lr=1e-5, model_name="roberta", device='cpu'):
    # Load the data
    data_path = data_path + "train_data.json"
    self.vocab_path = "./state_dict/vocab.txt"  # path of the roberta vocab file
    self.data = load_data(data_path)
    self.model_name = model_name  # which model to use
    self.model_path = "./state_dict/pytorch_model.bin"  # path of the roberta weights
    self.recent_model_path = ""  # used to continue training from an already trained model
    self.model_save_path = model_save_path + "bert_model_relation_extrac.bin"
    self.batch_size = batch_size
    self.lr = lr
    # Load the vocabulary
    self.word2idx = load_chinese_base_vocab(self.vocab_path)
    # Use the requested device, or pick a GPU automatically if one is available
    if device == 'cpu':
        self.device = device
    else:
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device: " + str(self.device))
    # Build the model
    self.bert_model = load_bert(self.vocab_path, model_name=self.model_name,
                                model_class="relation_extrac", target_size=len(predicate2id))
    # Load the pretrained weights
    load_model_params(self.bert_model, self.model_path)
    # Move the model to the compute device (GPU or CPU)
    self.bert_model.to(self.device)
    # Declare the parameters to optimize
    self.optim_parameters = list(self.bert_model.parameters())
    self.optimizer = torch.optim.Adam(self.optim_parameters, lr=self.lr, weight_decay=1e-3)
    # Declare the custom data loader
    dataset = ExtractDataset(self.data, self.vocab_path)
    self.dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True, collate_fn=collate_fn)
Example #5

def read_corpus(dir_path, vocab_path):
    """
    Read the raw data.
    """
    sents_src = []
    sents_tgt = []
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    files = os.listdir(dir_path)  # all file names in the directory

    for file1 in files:  # iterate over the directory

        if not os.path.isdir(file1):  # only open entries that are not directories
            file_path = dir_path + "/" + file1
            print(file_path)
            if file_path[-3:] != "csv":
                continue
            df = pd.read_csv(file_path)
            # First determine the poem's form, then decide whether to build a sample

            for index, row in df.iterrows():
                if type(row[0]) is not str or type(row[3]) is not str:
                    continue
                if len(row[0].split(" ")) > 1:
                    # The title contains a space; keep only the part before it
                    row[0] = row[0].split(" ")[0]

                if len(row[0]) > 10 or len(row[0]) < 1:
                    # Drop poems whose titles are too long or too short
                    continue

                encode_text = tokenizer.encode(row[3])[0]
                if word2idx["[UNK]"] in encode_text:
                    # Skip texts that contain [UNK] tokens
                    continue
                if len(row[3]) == 24 and (row[3][5] == ","
                                          or row[3][5] == "。"):
                    # Five-character jueju (五言绝句)
                    sents_src.append(row[0] + "##" + "五言绝句")
                    sents_tgt.append(row[3])
                elif len(row[3]) == 32 and (row[3][7] == ","
                                            or row[3][7] == "。"):
                    # Seven-character jueju (七言绝句)
                    sents_src.append(row[0] + "##" + "七言绝句")
                    sents_tgt.append(row[3])
                elif len(row[3]) == 48 and (row[3][5] == ","
                                            or row[3][5] == "。"):
                    # Five-character lüshi (五言律诗)
                    sents_src.append(row[0] + "##" + "五言律诗")
                    sents_tgt.append(row[3])
                elif len(row[3]) == 64 and (row[3][7] == ","
                                            or row[3][7] == "。"):
                    # Seven-character lüshi (七言律诗)
                    sents_src.append(row[0] + "##" + "七言律诗")
                    sents_tgt.append(row[3])

    print("第一个诗句数据集共: " + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
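The four branches above encode the filtering rules: 24 characters with punctuation at index 5 is treated as a five-character jueju, 32 characters with punctuation at index 7 as a seven-character jueju, 48 as a five-character lüshi, and 64 as a seven-character lüshi. The same rules, pulled out into a small standalone helper for clarity (hypothetical, not part of the library):

def classify_poem_form(poem):
    """Return the form label used above, or None if the poem matches none of them."""
    if len(poem) == 24 and poem[5] in ",。":
        return "五言绝句"
    if len(poem) == 32 and poem[7] in ",。":
        return "七言绝句"
    if len(poem) == 48 and poem[5] in ",。":
        return "五言律诗"
    if len(poem) == 64 and poem[7] in ",。":
        return "七言律诗"
    return None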
Example #6

def __init__(self, data, vocab_path):
    # The init function usually loads all of the data
    super(ExtractDataset, self).__init__()
    # Read the raw data
    # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
    self.data = data
    self.word2idx = load_chinese_base_vocab(vocab_path)
    self.idx2word = {k: v for v, k in self.word2idx.items()}
    self.tokenizer = Tokenizer(self.word2idx)
Example #7

def __init__(self, sents_src, sents_tgt, vocab_path):
    # The init function usually loads all of the data
    super(BertDataset, self).__init__()
    # Read the raw data
    # self.sents_src, self.sents_tgt = read_corpus(poem_corpus_dir)
    self.sents_src = sents_src
    self.sents_tgt = sents_tgt
    self.word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    self.idx2word = {k: v for v, k in self.word2idx.items()}
    self.tokenizer = Tokenizer(self.word2idx)
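Only the raw source/target pairs are stored here; the remaining Dataset methods are not shown. A hedged sketch of what they typically look like in this library (the exact implementation may differ): encode each pair with the Tokenizer, which, as in the NER example above, returns token ids plus token type ids.

def __getitem__(self, i):
    src = self.sents_src[i]
    tgt = self.sents_tgt[i]
    # Assumption: Tokenizer.encode(first, second) returns (token_ids, token_type_ids).
    token_ids, token_type_ids = self.tokenizer.encode(src, tgt)
    return {"token_ids": token_ids, "token_type_ids": token_type_ids}

def __len__(self):
    return len(self.sents_src)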
Example #8

def read_corpus_2(dir_path, vocab_path):
    """Read the most recently added dataset: Tang and Song poems."""
    sents_src = []
    sents_tgt = []
    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)
    files = os.listdir(dir_path)  # all file names in the directory

    for file1 in files:  # iterate over the directory

        if not os.path.isdir(file1):  # only open entries that are not directories
            file_path = dir_path + "/" + file1
            print(file_path)
            # data = json.load(file_path)
            with open(file_path) as f:
                poem_list = eval(f.read())

            for each_poem in poem_list:
                string_list = each_poem["paragraphs"]
                poem = ""
                for each_s in string_list:
                    poem += each_s

                # Convert traditional characters to simplified
                cc = opencc.OpenCC('t2s')
                poem = cc.convert(poem)

                encode_text = tokenizer.encode(poem)[0]
                if word2idx["[UNK]"] in encode_text:
                    # Skip texts that contain [UNK] tokens
                    continue
                title = cc.convert(each_poem["title"])

                if len(title) > 10 or len(title) < 1:
                    # Drop poems whose titles are too long or too short
                    continue

                if len(poem) == 24 and (poem[5] == "," or poem[5] == "。"):
                    # Five-character jueju (五言绝句)
                    sents_src.append(title + "##" + "五言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 32 and (poem[7] == "," or poem[7] == "。"):
                    # Seven-character jueju (七言绝句)
                    sents_src.append(title + "##" + "七言绝句")
                    sents_tgt.append(poem)
                elif len(poem) == 48 and (poem[5] == "," or poem[5] == "。"):
                    # Five-character lüshi (五言律诗)
                    sents_src.append(title + "##" + "五言律诗")
                    sents_tgt.append(poem)
                elif len(poem) == 64 and (poem[7] == "," or poem[7] == "。"):
                    # Seven-character lüshi (七言律诗)
                    sents_src.append(title + "##" + "七言律诗")
                    sents_tgt.append(poem)

    print("第二个诗句数据集共:" + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
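read_corpus_2 parses each file with eval(f.read()); the commented-out json.load line suggests the files are ordinary JSON, in which case a safer drop-in replacement for those two lines (assuming the files really are valid JSON) would be:

import json

with open(file_path, encoding="utf-8") as f:
    poem_list = json.load(f)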
Example #9

def read_corpus_ci(dir_path, vocab_path):
    """Read the Song ci (宋词) dataset."""
    import json, sys
    import sqlite3
    from collections import OrderedDict

    word2idx = load_chinese_base_vocab(vocab_path, simplfied=True)
    tokenizer = Tokenizer(word2idx)

    try:  # Python 2
        reload(sys)
        sys.setdefaultencoding('utf-8')
    except NameError:  # Python 3
        pass

    c = sqlite3.connect(dir_path + '/ci.db')

    # This first query and template dict are immediately overwritten below.
    cursor = c.execute("SELECT name, long_desc, short_desc from ciauthor;")

    d = {"name": None, "description": None, "short_description": None}

    cursor = c.execute("SELECT rhythmic, author, content from ci;")

    d = {"rhythmic": None, "author": None, "paragraphs": None}

    # cis = []
    sents_src = []
    sents_tgt = []

    for row in cursor:
        ci = OrderedDict(sorted(d.items(), key=lambda t: t[0]))
        ci["rhythmic"] = row[0]
        ci["author"] = row[1]
        ci["paragraphs"] = row[2].split('\n')
        string = ""
        for s in ci["paragraphs"]:
            if s == " >> " or s == "词牌介绍":
                continue
            string += s

        encode_text = tokenizer.encode(string)[0]
        if word2idx["[UNK]"] in encode_text:
            # Skip texts that contain [UNK] tokens
            continue
        sents_src.append(row[0] + "##词")
        sents_tgt.append(string)

        # cis.append(ci)

    # print(cis[:10])
    print("词共: " + str(len(sents_src)) + "篇")
    return sents_src, sents_tgt
Example #10
    def __init__(self, vocab_path, size="base"):
        super().__init__()
        if size == "base":
            config = T5Config()
        elif size == "small":
            config = T5SmallConfig()
        else:
            raise Exception("not support this model type")
        self.model = T5ForConditionalGeneration(config)

        self.word2idx = load_chinese_base_vocab(vocab_path)
        self.tokenizer = T5PegasusTokenizer(self.word2idx)
        self.bos_id = self.word2idx["[CLS]"]
        self.eos_id = self.word2idx["[SEP]"]
        self.unk_id = self.word2idx["[UNK]"]
Example #11
    def __init__(self, vocab_path, target_size, model_name="roberta"):
        super(BertClsClassifier, self).__init__()
        self.word2ix = load_chinese_base_vocab(vocab_path)
        self.tokenizer = Tokenizer(self.word2ix)
        self.target_size = target_size
        config = ""
        if model_name == "roberta":
            from bert_seq2seq.model.roberta_model import BertModel, BertConfig
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
        elif model_name == "bert":
            from bert_seq2seq.model.bert_model import BertConfig, BertModel
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
        else:
            raise Exception("model_name_err")

        self.final_dense = nn.Linear(config.hidden_size, self.target_size)
Example #12
    def __init__(self, vocab_path, target_size, model_name="roberta"):
        super(BertSeqLabelingCRF, self).__init__()
        self.word2ix = load_chinese_base_vocab(vocab_path)
        self.target_size = target_size
        config = ""
        if model_name == "roberta":
            from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertPredictionHeadTransform
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
            self.transform = BertPredictionHeadTransform(config)
        elif model_name == "bert":
            from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertPredictionHeadTransform
            config = BertConfig(len(self.word2ix))
            self.bert = BertModel(config)
            self.transform = BertPredictionHeadTransform(config)
        else:
            raise Exception("model_name_err")

        self.final_dense = nn.Linear(config.hidden_size, self.target_size)
        self.crf_layer = CRFLayer(self.target_size)
Example #13
def __init__(self, vocab_path, model_name="roberta"):
    super(Seq2SeqModel, self).__init__()
    self.word2ix = load_chinese_base_vocab(vocab_path)
    self.tokenizer = Tokenizer(self.word2ix)
    config = ""
    if model_name == "roberta":
        from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertLMPredictionHead
        config = BertConfig(len(self.word2ix))
        self.bert = BertModel(config)
        self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
    elif model_name == "bert":
        from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertLMPredictionHead
        config = BertConfig(len(self.word2ix))
        self.bert = BertModel(config)
        self.decoder = BertLMPredictionHead(config, self.bert.embeddings.word_embeddings.weight)
    else:
        raise Exception("model_name_err")

    self.hidden_dim = config.hidden_size
    self.vocab_size = config.vocab_size
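In both branches the prediction head is built from self.bert.embeddings.word_embeddings.weight, i.e. the output projection is tied to the input embedding matrix. A minimal, library-independent illustration of that weight-tying idea:

import torch
import torch.nn as nn

vocab_size, hidden_size = 100, 16
embeddings = nn.Embedding(vocab_size, hidden_size)
decoder = nn.Linear(hidden_size, vocab_size, bias=False)
decoder.weight = embeddings.weight          # tie: one tensor, used in two places
print(decoder.weight is embeddings.weight)  # True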
Example #14
def __init__(self, vocab_path, predicate_num, model_name="roberta"):
    super(BertRelationExtrac, self).__init__()
    self.word2ix = load_chinese_base_vocab(vocab_path)
    self.predicate_num = predicate_num
    config = ""
    if model_name == "roberta":
        from bert_seq2seq.model.roberta_model import BertModel, BertConfig, BertPredictionHeadTransform, BertLayerNorm
        config = BertConfig(len(self.word2ix))
        self.bert = BertModel(config)
        self.layer_norm = BertLayerNorm(config.hidden_size)
        self.layer_norm_cond = BertLayerNorm(config.hidden_size, conditional=True)
    elif model_name == "bert":
        from bert_seq2seq.model.bert_model import BertConfig, BertModel, BertPredictionHeadTransform, BertLayerNorm
        config = BertConfig(len(self.word2ix))
        self.bert = BertModel(config)
        self.layer_norm = BertLayerNorm(config.hidden_size)
        self.layer_norm_cond = BertLayerNorm(config.hidden_size, conditional=True)
    else:
        raise Exception("model_name_err")

    self.subject_pred = nn.Linear(config.hidden_size, 2)
    self.activation = nn.Sigmoid()
    self.object_pred = nn.Linear(config.hidden_size, 2 * self.predicate_num)
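The two heads implement a pointer-style extraction scheme: subject_pred scores a start and an end position at every token, and object_pred scores one start/end pair per predicate. A small, self-contained shape check (the hidden size and predicate count below are made up for illustration):

import torch
import torch.nn as nn

hidden_size, predicate_num = 768, 49
subject_pred = nn.Linear(hidden_size, 2)
object_pred = nn.Linear(hidden_size, 2 * predicate_num)
hidden = torch.randn(4, 30, hidden_size)                            # (batch, seq_len, hidden)
subject_scores = torch.sigmoid(subject_pred(hidden))                # (4, 30, 2)
object_scores = object_pred(hidden).view(4, 30, predicate_num, 2)   # start/end per predicate
print(subject_scores.shape, object_scores.shape)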
Example #15

from tqdm import tqdm
import torch.nn as nn
from torch.optim import Adam
import numpy as np
import os
import json
import time
import glob
import bert_seq2seq
from torch.utils.data import Dataset, DataLoader
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert
import re

vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path of the roberta vocab file
word2idx = load_chinese_base_vocab(vocab_path)
model_name = "roberta"  # which model to use
model_path = "./state_dict/roberta_wwm_pytorch_model.bin"  # path of the model weights
recent_model_path = "./state_dict/bert_math_ques_model.bin"  # used to continue training from an already trained model
model_save_path = "./state_dict/bert_math_ques_model.bin"
batch_size = 16
lr = 1e-5
maxlen = 256
train_data_path = "./state_dict/train.ape.json"
val_data_path = "./state_dict/test.ape.json"

def remove_bucket(equation):
    """Remove redundant brackets from the equation string.
    """
    l_buckets, buckets = [], []
    for i, c in enumerate(equation):
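        # The original example is cut off here. A hedged reconstruction of the
        # rest of this helper (not necessarily the source's exact code): track
        # matching bracket pairs, then drop any pair whose removal does not
        # change the evaluated value of the equation.
        if c == "(":
            l_buckets.append(i)
        elif c == ")":
            buckets.append((l_buckets.pop(), i))
    eval_equation = eval(equation)
    for l, r in buckets:
        new_equation = "%s %s %s" % (equation[:l], equation[l + 1:r], equation[r + 1:])
        try:
            if abs(eval(new_equation.replace(" ", "")) - eval_equation) < 1e-9:
                equation = new_equation
        except Exception:
            pass
    return equation.replace(" ", "")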
Example #16

import torch
import numpy as np
import os
import json
import time
import bert_seq2seq
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert, load_model_params, load_recent_model

auto_title_model = "./state_dict/bert_auto_title_model.bin"

if __name__ == "__main__":
    vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path of the roberta vocab file
    model_name = "roberta"  # which model to use
    # model_path = "./state_dict/bert-base-chinese-pytorch_model.bin"  # path of the roberta weights
    # Load the vocabulary
    word2idx, keep_tokens = load_chinese_base_vocab(vocab_path, simplfied=True)
    # Build the model
    bert_model = load_bert(word2idx, model_name=model_name)
    bert_model.eval()
    # Load the fine-tuned weights instead of the raw pretrained checkpoint
    # load_model_params(bert_model, model_path)
    bert_model.load_state_dict(torch.load(auto_title_model, map_location="cpu"), strict=False)
    test_data = ["针对央视3·15晚会曝光的电信行业乱象工信部在公告中表示将严查央视3·15晚会曝光通信违规违法行为工信部称已约谈三大运营商有关负责人并连夜责成三大运营商和所在省通信管理局进行调查依法依规严肃处理"]
    # Additional sample inputs kept (commented out) from the original example:
    # test_data = [
    #     "本文总结了十个可穿戴产品的设计原则而这些原则同样也是笔者认为是这个行业最吸引人的地方1为人们解决重复性问题2从人开始而不是从机器开始3要引起注意但不要刻意4提升用户能力而不是取代人",
    #     "2007年乔布斯向人们展示iPhone并宣称它将会改变世界还有人认为他在夸大其词然而在8年后以iPhone为代表的触屏智能手机已经席卷全球各个角落未来智能手机将会成为真正的个人电脑为人类发展做出更大的贡献",
    #     "雅虎发布2014年第四季度财报并推出了免税方式剥离其持有的阿里巴巴集团15%股权的计划打算将这一价值约400亿美元的宝贵投资分配给股东截止发稿前雅虎股价上涨了大约7%至5145美元",
    #     "新华社受权于18日全文播发修改后的《中华人民共和国立法法》修改后的立法法分为“总则”“法律”“行政法规”“地方性法规自治条例和单行条例规章”“适用与备案审查”“附则”等6章共计105条"]
    for text in test_data:
        print(bert_model.generate(text, beam_size=3))
Example #17
from re import I
import torch 
import sys
sys.path.append("/Users/xingzhaohu/Downloads/code/python/ml/ml_code/bert/bert_seq2seq")
from bert_seq2seq.utils import load_bert
import torch.nn.functional as F 

from bert_seq2seq.bert_cls_classifier_sigmoid import BertClsClassifier
from bert_seq2seq.model.bert_model import BertConfig
from bert_seq2seq.tokenizer import load_chinese_base_vocab

if __name__ == '__main__':
    # config = BertConfig(vocab_size=1000)
    word2ix = load_chinese_base_vocab("./state_dict/roberta_wwm_vocab.txt")
    model = BertClsClassifier(word2ix, target_size=1, model_name="")

Example #18
                                           num_samples=1)
            if eos_id == next_token.item():
                break
                # pass
            output_ids.append(next_token.item())
            # token_ids = torch.cat((token_ids, next_token.long().unsqueeze(0)), dim=1)
            input_decoder_ids = torch.cat(
                (input_decoder_ids, next_token.long().unsqueeze(0)), dim=1)

    return tokenizer.decode(output_ids)


if __name__ == '__main__':
    config = T5Config()
    model = T5ForConditionalGeneration(config)

    checkpoints = torch.load("./state_dict/t5-chinese/pytorch_model.bin")

    model.load_state_dict(checkpoints)

    word2ix = load_chinese_base_vocab("./state_dict/t5-chinese/vocab.txt")
    tokenizer = T5PegasusTokenizer(word2ix)

    text = '从那之后,一发不可收拾。此后的近百年间,一共有十七位新娘在与君山一带失踪。有时十几年相安无事,有时短短一个月内失踪两名。一个恐怖传说迅速传开:与君山里住着一位鬼新郎,若是他看中了一位女子,便会在她出嫁的路上将她掳走,再把送亲的队伍吃掉。'
    out = sample_generate_encoder_decoder(model,
                                          text,
                                          tokenizer,
                                          word2ix["[SEP]"],
                                          word2ix["[CLS]"],
                                          device="cpu")
    print(out)
Example #19

import torch
import pandas as pd
import numpy as np
import os
import json
import time
import bert_seq2seq
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab
from bert_seq2seq.utils import load_bert

relation_extrac_model = "./state_dict/bert_model_relation_extrac.bin"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_path = "./state_dict/roberta_wwm_vocab.txt"  # path of the roberta vocab file
model_name = "roberta"  # which model to use
# model_path = "./state_dict/bert-base-chinese-pytorch_model.bin"  # path of the roberta weights
# Load the vocabulary
word2idx = load_chinese_base_vocab(vocab_path, simplfied=False)
tokenizer = Tokenizer(word2idx)
idx2word = {v: k for k, v in word2idx.items()}

predicate2id, id2predicate = {}, {}
with open('./corpus/三元组抽取/all_50_schemas') as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)


def search(pattern, sequence):
    """Search `sequence` for the subsequence `pattern`.
    Return the index of the first match, or -1 if it is not found.
    """
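    # The listing is truncated here. A straightforward body consistent with the
    # docstring above (a reconstruction, not necessarily the source's exact code):
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1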