checkpoint_path = '/root/kg/bert/CPM_LM_2.6B_TF/model.ckpt'
spm_path = '/root/kg/bert/CPM_LM_2.6B_TF/chinese_vocab.model'


def pre_tokenize(text):
    """Preprocessing before tokenization: segment with jieba and protect whitespace.
    Spaces become u'\u2582' and newlines become u'\u2583' so they survive sentencepiece.
    """
    return [
        w.replace(' ', u'\u2582').replace('\n', u'\u2583')
        for w in jieba.cut(text, cut_all=False)
    ]


tokenizer = SpTokenizer(
    spm_path,
    token_start=None,
    token_end=None,
    pre_tokenize=pre_tokenize,
    # u'\u2583' stands for a newline; translate it to the special '<cls>' token.
    token_translate={u'\u2583': '<cls>'}
)  # build the tokenizer

model = build_transformer_model(
    config_path=config_path, checkpoint_path=checkpoint_path, model='gpt2'
)  # build the model and load the weights


class TextExpansion(AutoRegressiveDecoder):
    """Text continuation via random sampling.
    """
    @AutoRegressiveDecoder.wraps(default_rtype='probas')
    def predict(self, inputs, output_ids, states):
        token_ids = np.concatenate([inputs[0], output_ids], 1)
        return model.predict(token_ids)[:, -1]
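A minimal usage sketch for the decoder above, assuming the usual bert4keras AutoRegressiveDecoder interface; the end_id, maxlen and sampling parameters below are illustrative placeholders, not values from the original script.

# Illustrative only: instantiate the decoder and continue a prompt with top-p sampling.
text_expansion = TextExpansion(
    start_id=None,
    end_id=3,       # assumed end-of-text id; adjust to the actual CPM vocabulary
    maxlen=64
)


def generate(text, n=1, topp=0.95):
    """Continue `text` with nucleus (top-p) sampling and restore spaces/newlines."""
    token_ids, _ = tokenizer.encode(text)
    results = text_expansion.random_sample([token_ids], n=n, topp=topp)
    return [
        text + tokenizer.decode(ids).replace(u'\u2582', ' ').replace(u'\u2583', '\n')
        for ids in results
    ]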
""" D = [] with open(filename, encoding='utf-8') as f: for l in f: title, content = l.strip().split('\t') D.append((title, content)) return D # 加载数据集 train_data = load_data('/root/csl/train.tsv') valid_data = load_data('/root/csl/val.tsv') test_data = load_data('/root/csl/test.tsv') # 加载分词器 tokenizer = SpTokenizer(spm_path, token_start=None, token_end='</s>') keep_tokens = json.load(open(keep_tokens_path)) class data_generator(DataGenerator): """数据生成器 """ def __iter__(self, random=False): batch_c_token_ids, batch_t_token_ids = [], [] for is_end, (title, content) in self.sample(random): c_token_ids, _ = tokenizer.encode(content, maxlen=max_c_len) t_token_ids, _ = tokenizer.encode(title, maxlen=max_t_len) batch_c_token_ids.append(c_token_ids) batch_t_token_ids.append([0] + t_token_ids) if len(batch_c_token_ids) == self.batch_size or is_end: batch_c_token_ids = sequence_padding(batch_c_token_ids)
""" @File : load_albert.py @Author : Pengy @Date : 2020/9/28 @Description : Input your description here ... """ from bert4keras.models import build_transformer_model from bert4keras.tokenizers import SpTokenizer from keras.layers import LSTM, Dense from keras.models import Model import numpy as np config_path = '../Models/albert_base_v2/albert_base/albert_config.json' checkpoint_path = '../Models/albert_base_v2/albert_base/model.ckpt-best' vocab_path = '../Models/albert_base_v2/albert_base/30k-clean.vocab' spm_path = '../Models/albert_base_v2/albert_base/30k-clean.model' tokenizer = SpTokenizer(spm_path) model = build_transformer_model(config_path=config_path, checkpoint_path=checkpoint_path, model='albert') model.summary() token_ids, segment_ids = tokenizer.encode('language model') print(model.predict([np.array([token_ids]), np.array([segment_ids])])) output = LSTM(64)(model.output) output = Dense(32)(output) my_model = Model(model.input, output) my_model.summary()
checkpoint_path = os.path.join(bert_path, 'model.ckpt-best')
dict_path = os.path.join(bert_path, '30k-clean.vocab')
spm_path = os.path.join(bert_path, '30k-clean.model')


# load data
def load_data(filename):
    D = []
    with open(filename, encoding='gb2312') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D


# create the tokenizer
tokenizer = SpTokenizer(spm_path)
# tokenizer = Tokenizer(dict_path, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator.
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == self.batch_size or is_end:
                # pad everything, emit the batch, then reset the buffers
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []
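Batches of ([token_ids, segment_ids], labels) like the ones produced above usually feed a small classification head on top of the encoder; the sketch below is an assumption for illustration (config_path, num_classes, the pooling choice and the optimizer are not taken from the excerpt).

# Hypothetical classifier head for the batches above; placeholder names are marked.
from bert4keras.models import build_transformer_model
from keras.layers import Lambda, Dense
from keras.models import Model

num_classes = 2  # placeholder

bert = build_transformer_model(
    config_path=config_path,          # assumed to be defined alongside checkpoint_path
    checkpoint_path=checkpoint_path,
    model='albert'
)
output = Lambda(lambda x: x[:, 0])(bert.output)             # use the [CLS] vector
output = Dense(num_classes, activation='softmax')(output)
model = Model(bert.input, output)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)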
epochs = 100000
summary_rate = 0.25
t_maxlen = maxlen // 4
s_maxlen = maxlen - t_maxlen

# mT5 configuration
config_path = '/root/kg/bert/mt5/mt5_base/mt5_base_config.json'
checkpoint_path = '/root/kg/bert/mt5/mt5_base/model.ckpt-1000000'
spm_path = '/root/kg/bert/mt5/sentencepiece.model'

# PEGASUS
dict_path_1 = '/root/kg/bert/chinese_pegasus_L-12_H-768_A-12/vocab.txt'
dict_path_2 = '/root/kg/bert/chinese_t5_pegasus_base/vocab.txt'

# build the vocabulary mapping
sp_tokenizer = SpTokenizer(spm_path, token_start=None, token_end=None)
token_dict = load_vocab(dict_path_1)
keep_tokens, new_token_dict, n = [], {}, 0

for t, _ in sorted(token_dict.items(), key=lambda s: s[1]):
    if n < 106:
        # keep the first 106 entries (special tokens) with fresh consecutive ids
        new_token_dict[t] = n
        n += 1
        continue
    if t.startswith('##'):
        # subword piece: look it up without the '##' prefix first
        i = sp_tokenizer.token_to_id(t[2:])
        if i == 2:  # 2 signals a failed lookup (<unk>), so retry the other form
            i = sp_tokenizer.token_to_id(u'\u2581' + t)
    else:
        # whole word: u'\u2581' is the sentencepiece word-boundary marker
        i = sp_tokenizer.token_to_id(u'\u2581' + t)
        if i == 2:
            i = sp_tokenizer.token_to_id(t)
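The excerpt stops before keep_tokens is actually filled; assuming the loop goes on to append each resolved sentencepiece id `i`, the mapping is typically consumed by passing keep_tokens to the model builder so that only the matching rows of the mT5 embedding are loaded. A hedged sketch (the model string and argument set are illustrative):

# Hedged sketch, not from the original script: load mT5 with the trimmed vocabulary.
from bert4keras.models import build_transformer_model

model = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='mt5.1.1',                 # assumed mT5 variant string
    keep_tokens=keep_tokens          # restrict embeddings to the mapped token ids
)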
@classmethod
def setUpClass(cls) -> None:
    model_path = '../models/mt5_base/sentencepiece_cn.model'
    cls.raw_tokenizer = SpTokenizer(
        model_path, token_start=None, token_end='</s>'
    )
    cls.my_tokenizer = SentencePieceTokenizer(
        model_path, token_start=None, token_end='</s>'
    )
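A minimal follow-up test sketch, assuming SentencePieceTokenizer is meant to reproduce SpTokenizer's encode() behaviour; the sample text and method name are illustrative.

def test_encode_matches_reference(self):
    # Illustrative check: both tokenizers should produce identical ids for the same input.
    text = '今天天气不错'
    raw_ids, _ = self.raw_tokenizer.encode(text)
    my_ids, _ = self.my_tokenizer.encode(text)
    self.assertEqual(raw_ids, my_ids)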