def __init__(self):
    self.maxlen = 128
    self.config_path = 'chinese_simbert_L-12_H-768_A-12/bert_config.json'
    self.checkpoint_path = 'chinese_simbert_L-12_H-768_A-12/bert_model.ckpt'
    self.dict_path = 'chinese_simbert_L-12_H-768_A-12/vocab.txt'
    # Load and simplify the vocabulary, then build the tokenizer
    self.token_dict, self.keep_tokens = load_vocab(
        dict_path=self.dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
    # Build the model and load the pre-trained weights
    self.bert = build_transformer_model(
        self.config_path,
        self.checkpoint_path,
        with_pool='linear',
        application='unilm',
        keep_tokens=self.keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
        return_keras_model=False,
    )
    self.encoder = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[0])
    self.seq2seq = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[1])
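# --- Hedged usage sketch (not part of the original snippet) ---
# Assuming the __init__ above belongs to a class named, say, SimBertVecs, the
# `encoder` sub-model returns the pooled sentence vector (outputs[0], because
# with_pool='linear'); it is typically L2-normalized before cosine similarity.
import numpy as np

sim = SimBertVecs()  # hypothetical class name
token_ids, segment_ids = sim.tokenizer.encode('今天天气不错')
vec = sim.encoder.predict([np.array([token_ids]), np.array([segment_ids])])[0]
vec /= (np.linalg.norm(vec) + 1e-12)  # unit-length vector for cosine similarity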
def build_model():
    config_path = GUWEN_CONFIG_PATH if use_guwenbert else ROBERTA_CONFIG_PATH
    checkpoint_path = GUWEN_CHECKPOINT_PATH if use_guwenbert else ROBERTA_CHECKPOINT_PATH
    dict_path = GUWEN_DICT_PATH if use_guwenbert else ROBERTA_DICT_PATH
    token_dict, keep_tokens = load_vocab(
        dict_path=dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    model = build_transformer_model(
        config_path,
        checkpoint_path,
        application='unilm',
        # keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
    )
    # Load the trained weights
    model.load_weights(BEST_MODEL_PATH)
    autotitle = AutoTitle(start_id=None, end_id=tokenizer._token_end_id, maxlen=50)
    text = '却话巴山夜雨时'
    token_ids, segment_ids = tokenizer.encode(text)
    inputs = [np.array([token_ids]), np.array([segment_ids])]
    print(autotitle.predict(inputs, np.empty((1, 0), dtype=int), states=None))
    print(autotitle.generate('却话巴山夜雨时'))
    return autotitle
def buildmodel(self):
    self.token_dict, self.keep_tokens = load_vocab(
        dict_path=self.dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
    )
    self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
    if self.pretrain_type == 'albert':
        model = build_transformer_model(
            config_path,
            checkpoint_path,
            model='albert',
            with_mlm=True,
            keep_tokens=self.keep_tokens,
        )
    elif self.pretrain_type == 'bert':
        model = build_transformer_model(
            config_path,
            checkpoint_path,
            model='bert',
            with_mlm=True,
            keep_tokens=self.keep_tokens,
        )
    # Keep only the answer span of the MLM output (positions 1 .. max_a_len)
    output = Lambda(lambda x: x[:, 1:self.max_a_len + 1])(model.output)
    self.model = Model(model.input, output)
    self.model.compile(loss=self.masked_cross_entropy, optimizer=Adam(self.lr))
    self.model.summary()
def loadDic(dicPath):
    token_dict, keep_tokens = load_vocab(
        dict_path=dicPath,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    return token_dict, keep_tokens, tokenizer
def __iter__(self):
    token_dict, keep_tokens = load_vocab(
        dict_path=Config.COCAB_PATH,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[SEP]', '[MASK]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    model_path = Config.seq2seq_model_path
    if not os.path.exists(model_path):
        raise Exception('seq2seq model file not found: %s' % model_path)
    set_session(sess)
    model = load_model(model_path)
    self.autotitle = AutoTitle(
        model=model,
        tokenizer=tokenizer,
        start_id=None,
        end_id=tokenizer._token_end_id,
        maxlen=128,
    )
def get_keep_tokens():
    counts = json.load(open('counts.json'))
    del counts['[CLS]']
    del counts['[SEP]']
    token_dict = load_vocab(BaseConfig.dict_path)
    # Frequency of every token, in original vocabulary-id order
    freqs = [
        counts.get(i, 0)
        for i, j in sorted(token_dict.items(), key=lambda s: s[1])
    ]
    # Vocabulary ids sorted by descending corpus frequency
    keep_tokens = list(np.argsort(freqs)[::-1])
    return keep_tokens
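# --- Hedged sketch (not from the original code) ---
# get_keep_tokens() returns every vocabulary id sorted by descending corpus
# frequency, so a truncated prefix can be passed to build_transformer_model as
# keep_tokens to load only the N most frequent embedding rows. The config and
# checkpoint attributes below are placeholders assumed to exist alongside
# BaseConfig.dict_path.
from bert4keras.models import build_transformer_model

keep_tokens = get_keep_tokens()[:20000]  # hypothetical cut-off at 20k tokens
model = build_transformer_model(
    BaseConfig.config_path,
    BaseConfig.checkpoint_path,
    keep_tokens=keep_tokens,  # embedding rows gathered in frequency order
)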
def initTokenizer(dicPath='../data/dic.txt', diclenth=1000, handle=EnglishDicHandle):
    token_dict, keep_tokens = load_vocab(
        dict_path=dicPath,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    token_dict = handle(token_dict, diclenth)
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    return tokenizer, token_dict
def create_tokenizer(self):
    keep_tokens = []
    if self.simplified_tokenizer:
        token_dict, keep_tokens = load_vocab(
            dict_path=self.pre_trained_model_dict_path,
            simplified=True,
            startswith=["[PAD]", "[UNK]", "[CLS]", "[SEP]"],
        )
        tokenizer = Tokenizer(token_dict, do_lower_case=True)
    else:
        tokenizer = Tokenizer(self.pre_trained_model_dict_path, do_lower_case=True)
    return tokenizer, keep_tokens
def build_keras4bert(self):
    import os
    import bert4keras
    from bert4keras.models import build_transformer_model
    from bert4keras.tokenizers import Tokenizer, load_vocab

    self.embedding_type = 'bert'
    config_path = os.path.join(self.corpus_path, 'bert_config.json')
    checkpoint_path = os.path.join(self.corpus_path, 'bert_model.ckpt')
    dict_path = os.path.join(self.corpus_path, 'vocab.txt')
    self.model = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
    )
    # Load and simplify the vocabulary, then build the tokenizer
    self.token_dict, keep_tokens = load_vocab(
        dict_path=dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    self.vocab_size = len(self.token_dict)
    self.tokenizer = Tokenizer(self.token_dict, do_lower_case=True)
def create_tokenizer(sentences: typing.List[str]) -> typing.Tuple[Tokenizer, typing.List]:
    """Simplify the vocabulary according to the new dataset and rebuild the tokenizer.

    Args:
        sentences: list of comment sentences
    Returns:
        tokenizer, keep_tokens
    """
    # Load the downloaded vocabulary
    _token_dict = load_vocab(settings.DICT_PATH)
    _tokenizer = Tokenizer(_token_dict, do_lower_case=True)
    # Count token frequencies
    counter = Counter()
    for sentence in sentences:
        _tokens = _tokenizer.tokenize(sentence)
        # Drop [CLS] and [SEP] when counting
        counter.update(_tokens[1:-1])
    # Filter out low-frequency tokens
    tokens_and_counts = [(token, count) for token, count in counter.items()
                         if count >= settings.MIN_WORD_FREQUENCY]
    # Sort by frequency, descending
    sorted_tokens_and_counts = sorted(tokens_and_counts, key=lambda x: -x[1])
    # Drop the counts, keep only the tokens
    most_tokens = [token for token, count in sorted_tokens_and_counts]
    # Build the new vocabulary
    tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]'] + most_tokens
    keep_tokens = []
    token_dict = {}
    for token in tokens:
        if token in _token_dict:
            token_dict[token] = len(token_dict)
            keep_tokens.append(_token_dict[token])
    # Build the tokenizer over the new vocabulary
    tokenizer = Tokenizer(token_dict, do_lower_case=True)
    return tokenizer, keep_tokens
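# --- Hedged usage sketch (not part of the original function) ---
# keep_tokens aligns the new, smaller vocabulary with the pre-trained weights:
# new id k corresponds to original id keep_tokens[k], so passing it to
# build_transformer_model loads only those rows of the token embedding.
# The sentences list and the config/checkpoint paths below are placeholders.
from bert4keras.models import build_transformer_model

tokenizer, keep_tokens = create_tokenizer(sentences)
model = build_transformer_model(
    config_path='bert_config.json',
    checkpoint_path='bert_model.ckpt',
    keep_tokens=keep_tokens,  # trim the embedding to the simplified vocabulary
)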
parse.add_argument('-t', '--TEST_DATA_PATH', default=os.path.join(sys.path[0], 'test.txt'), help='path to the test data')
parse.add_argument('-c', '--BERT_CONFIG', default=os.path.join(os.path.dirname(os.path.dirname(sys.path[0])), 'chinese_L-12_H-768_A-12', 'bert_config.json'), help='path to the BERT config')
parse.add_argument('-m', '--BERT_MODEL', default=os.path.join(os.path.dirname(os.path.dirname(sys.path[0])), 'chinese_L-12_H-768_A-12', 'bert_model.ckpt'), help='path to the BERT checkpoint')
parse.add_argument('-v', '--BERT_VOCAB', default=os.path.join(os.path.dirname(os.path.dirname(sys.path[0])), 'chinese_L-12_H-768_A-12', 'vocab.txt'), help='path to the BERT vocabulary')
parse.add_argument('-ck', '--MODEL_PATH', default=os.path.join(os.path.dirname(os.path.dirname(sys.path[0])), 'checkpoints', 'model.h5'), help='path where the model is saved')
parse.add_argument('-l', '--BERT_LAYER', default='Transformer-11-FeedForward-Norm', help='BERT layer to modify')
parse.add_argument('-b', '--BATCH_SIZE', default=32, help='batch size')
parse.add_argument('-e', '--EPOCHS', default=2, help='epochs')
parse.add_argument('-M', '--MAX_LEN', default=68, help='maximum sentence length')
args = parse.parse_args()

_labels = ['TIME', 'LOC', 'PER', 'ORG']
_labels_num = len(_labels) * 2 + 1

# Load the tokenizer
token_dict = load_vocab(dict_path=args.BERT_VOCAB)
tokenizer = Tokenizer(token_dict=token_dict)
token_head = tokenizer._token_start_id
token_end = tokenizer._token_end_id


def id_label_dict():
    """Dictionaries mapping labels to ids and back."""
    id2label = dict(enumerate(_labels))
    label2id = {}
    for k, v in id2label.items():
        label2id[v] = k
    return label2id, id2label
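# --- Illustration (not in the original script) ---
# With 4 entity types and, presumably, BIO tagging, each type contributes a B-
# and an I- tag plus a single O tag, hence _labels_num = 4 * 2 + 1 = 9.
label2id, id2label = id_label_dict()
print(id2label)  # {0: 'TIME', 1: 'LOC', 2: 'PER', 3: 'ORG'}
print(label2id)  # {'TIME': 0, 'LOC': 1, 'PER': 2, 'ORG': 3}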
# Model configuration
maxlen = 128
batch_size = 32
num_classes = 2
epochs = 20

# BERT configuration
config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


def load_data(filenames):
    D = []
    for filename in filenames:
        with open(filename, encoding='utf-8') as f:
            for l in f:
                text, label = l.strip().split('\t')
                if len(text) <= maxlen - 2:
                    D.append((text, int(label)))
                else:
                    tmp = ''
from bert4keras.tokenizers import Tokenizer, load_vocab
import json
import numpy as np

dict_path = "vocab.txt"
tokenizer = Tokenizer(load_vocab(dict_path))
maskID = tokenizer.token_to_id(tokenizer._token_mask)


def write_Json(content, fileName):
    with open(fileName, "w") as f:
        json.dump(content, f, indent=2)


def read_json(fileName):
    with open(fileName, "r") as fp:
        return json.load(fp)


def cal_mask(inputs, corrupts, labels):
    assert inputs.shape == corrupts.shape and corrupts.shape == labels.shape
    masked = (labels == 1)          # positions that were masked
    correct = (inputs == corrupts)  # positions the corruption left unchanged
    masked = masked.astype(float)
    correct = correct.astype(float)
    mask = masked * correct
    return mask
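# --- Toy example of cal_mask (not in the original file) ---
# A position scores 1.0 only if it was masked (labels == 1) AND the corrupted
# input still matches the original token; everything else is 0.0.
inputs   = np.array([[5, 6, 7, 8]])
corrupts = np.array([[5, 9, 7, 8]])
labels   = np.array([[1, 1, 0, 0]])
print(cal_mask(inputs, corrupts, labels))  # [[1. 0. 0. 0.]]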
                    continue
                data.append((wrong1, right1))
            except Exception as err:
                print(line)
    return data


all_data = load_data(corpus_path)
random.shuffle(all_data)
valid_data = all_data[:len(all_data) // 8]
train_data = all_data[len(all_data) // 8:]

# Load the simplified vocabulary
token_dict, keep_words = load_vocab(
    dict_path=vocab_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'])
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class MyDataGenerator(DataGenerator):

    def __iter__(self, random=True):
        """Single-sample format: [CLS] wrong tokens [SEP][MASK][MASK]...[SEP]"""
        batch_tokens_ids, batch_segment_ids, batch_right_token_ids = [], [], []
        for is_end, D in self.sample(random):
            wrong, right = D
    # The second half must not contain any forbidden word
    __, last_part = line.split(':')
    ignore_flag = False
    for dis_word in disallowed_words:
        if dis_word in last_part:
            ignore_flag = True
            break
    if ignore_flag:
        continue
    # Skip lines that exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# Vocabulary and tokenizer of the pre-trained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1
# Filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items() if count >= min_word_frequency]
# Sort by frequency
tokens = sorted(tokens, key=lambda x: -x[1])
# Drop the counts, keep only the token list
tokens = [token for token, count in tokens]
# Build the new token -> id mapping and the new vocabulary
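# --- Hedged sketch of the step announced above (the original snippet is truncated here) ---
# Mirroring the create_tokenizer pattern elsewhere in this collection: keep the
# special tokens plus the surviving high-frequency tokens, record their original
# ids in keep_tokens, and build a tokenizer over the new, smaller vocabulary.
tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]'] + tokens
keep_tokens, token_dict = [], {}
for token in tokens:
    if token in _token_dict:
        token_dict[token] = len(token_dict)
        keep_tokens.append(_token_dict[token])
tokenizer = Tokenizer(token_dict, do_lower_case=True)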
from transformers import *

pretrained_weights = 'bert-base-chinese'
pre_tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# tokenizer.vocab_size = 21128

# parameters
maxlen = 64
batch_size = 128
epochs = 99999
ser = 'dango'

# setting vocabulary
config_path = '/home/' + ser + '/STC3/code/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/home/' + ser + '/STC3/code/chinese_roberta_wwm_ext_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/home/' + ser + '/STC3/code/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'

token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]',
                '[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
# [unused1]: Other, [unused2]: Like, [unused3]: Sadness, [unused4]: Disgust, [unused5]: Anger, [unused6]: Happiness
tokenizer = Tokenizer(token_dict, do_lower_case=True)

# reading data
questions, answers, answer_ids = [], [], []
f = open('/home/' + ser + '/STC3/data/questions.txt', 'r', encoding='gbk')
lines = f.readlines()
for line in lines:
    line = line.strip()
    questions.append(line)
f.close()
f = open('/home/' + ser + '/STC3/data/answers.txt', 'r', encoding='gbk')
lines = f.readlines()
for line in lines:
    line = line.strip()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path', type=str, required=True, help='path to the BERT config file')
    parser.add_argument('--checkpoint_path', type=str, required=True, help='path to the BERT weights')
    parser.add_argument('--dict_path', type=str, required=True, help='path to the vocabulary')
    parser.add_argument('--train_data_path', type=str, required=True, help='path to the training set')
    parser.add_argument('--epochs', default=5, type=int, required=False, help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False, help='training batch_size')
    parser.add_argument('--lr', default=1e-5, type=float, required=False, help='learning rate')
    parser.add_argument('--topk1', default=25, type=int, required=False, help='top-k for sampling the first sentence')
    parser.add_argument('--topk2', default=2, type=int, required=False, help='beam width for the second sentence')
    parser.add_argument('--max_seq_len', default=256, type=int, required=False, help='maximum sequence length')
    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    maxlen = args.max_seq_len
    config_path = args.config_path
    checkpoint_path = args.checkpoint_path
    dict_path = args.dict_path
    batch_size = args.batch_size
    epochs = args.epochs
    topk1 = args.topk1
    topk2 = args.topk2
    num_classes = 2
    lr = args.lr
    train_data = args.train_data_path

    token_dict, keep_tokens = load_vocab(
        dict_path=dict_path,
        simplified=True,
        startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
    )
    tokenizer = Tokenizer(token_dict, do_lower_case=True)

    train_df = pd.read_csv(train_data, sep='\t', header=None)
    train_df.columns = ['s1', 's2', 'label']

    class data_generator(DataGenerator):
        """Data generator."""

        def __iter__(self, r=False):
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            batch_token_ids, batch_segment_ids, batch_labels = [], [], []
            for i in idxs:
                line = self.data.loc[i]
                if random.random() < 0.5:
                    s1 = line['s1'].replace('***', '*')
                    s2 = line['s2'].replace('***', '*')
                else:
                    s2 = line['s1'].replace('***', '*')
                    s1 = line['s2'].replace('***', '*')
                token_ids, segment_ids = tokenizer.encode(s1, s2, max_length=maxlen)
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_labels.append([line['label']])
                if len(batch_token_ids) == self.batch_size or i == idxs[-1]:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_labels = sequence_padding(batch_labels)
                    yield [batch_token_ids, batch_segment_ids, batch_labels], None
                    batch_token_ids, batch_segment_ids, batch_labels = [], [], []

    class CrossEntropy(Loss):
        """Cross-entropy loss with the padding positions masked out."""

        def compute_loss(self, inputs, mask=None):
            y_true, y_pred = inputs
            if mask[1] is None:
                y_mask = 1.0
            else:
                y_mask = K.cast(mask[1], K.floatx())[:, 1:]
            y_true = y_true[:, 1:]   # target token_ids
            y_pred = y_pred[:, :-1]  # predictions, shifted by one position
            loss = K.sparse_categorical_crossentropy(y_true, y_pred)
            loss = K.sum(loss * y_mask) / K.sum(y_mask)
            return loss

    # Label embedding used to condition the LM via Layer Normalization
    c_in = Input(shape=(1,))
    c = Embedding(num_classes, maxlen)(c_in)
    c = Reshape((maxlen,))(c)

    model = build_transformer_model(
        config_path,
        checkpoint_path,
        application='lm',
        keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
        layer_norm_cond=c,
        additional_input_layers=c_in,
    )

    output = CrossEntropy(1)([model.inputs[0], model.outputs[0]])
    model = Model(model.inputs, output)
    model.compile(optimizer=Adam(lr))
    model.summary()

    def random_generate(c=0, n=2, s1_topk=5):
        """Randomly sample sentence pairs.
        At each step, sample one token from the top-k most probable tokens.
        """
        label_ids = [[c] for _ in range(n)]
        target_ids = [[2] for _ in range(n)]
        sep_index = [0 for _ in range(n)]
        R = []
        for i in range(64):
            segment_ids = []
            for t, index in zip(target_ids, sep_index):
                if index > 0:
                    segment_ids.append([0] * index + [1] * (len(t) - index))
                else:
                    segment_ids.append([0] * len(t))
            # Ignore [PAD], [UNK], [CLS] below; index k then maps to vocabulary id k + 3
            _probas = model.predict([target_ids, segment_ids, label_ids])[:, -1, 3:]
            for j, p in enumerate(_probas):
                p_arg_topk = p.argsort()[::-1][:s1_topk]
                p_topk = p[p_arg_topk]
                p = p_topk / sum(p_topk)
                idx = np.random.choice(len(p), p=p)
                target_ids[j].append(p_arg_topk[idx] + 3)
                if p_arg_topk[idx] + 3 == 3 and sep_index[j] == 0:
                    sep_index[j] = i
        for tokens in target_ids:
            tokens.append(3)
            cls_index = tokens.index(3)
            R.append(tokenizer.decode(tokens[:cls_index]))
        return R

    def gen_sent(s, label, topk=2):
        """Beam search decoding.
        Keep only the topk best candidates at each step; topk=1 reduces to greedy search.
        """
        label_ids = [[label] for _ in range(topk)]
        token_ids, segment_ids = tokenizer.encode(s)
        target_ids = [[] for _ in range(topk)]  # candidate answer ids
        target_scores = [0] * topk  # candidate answer scores
        for i in range(64):  # force the output to stay within max_output_len tokens
            _target_ids = [token_ids + t for t in target_ids]
            _segment_ids = [segment_ids + [1] * len(t) for t in target_ids]
            # Ignore [PAD], [UNK], [CLS]
            _probas = model.predict([_target_ids, _segment_ids, label_ids])[:, -1, 3:]
            _log_probas = np.log(_probas + 1e-6)  # take logs for easier accumulation
            _topk_arg = _log_probas.argsort(axis=1)[:, -topk:]  # topk per candidate
            _candidate_ids, _candidate_scores = [], []
            for j, (ids, sco) in enumerate(zip(target_ids, target_scores)):
                # When predicting the first token, all topk inputs are identical,
                # so only the first one needs to be considered.
                if i == 0 and j > 0:
                    continue
                for k in _topk_arg[j]:
                    _candidate_ids.append(ids + [k + 3])
                    _candidate_scores.append(sco + _log_probas[j][k])
            _topk_arg = np.argsort(_candidate_scores)[-topk:]  # select the new topk
            target_ids = [_candidate_ids[k] for k in _topk_arg]
            target_scores = [_candidate_scores[k] for k in _topk_arg]
            best_one = np.argmax(target_scores)
            if target_ids[best_one][-1] == 3:
                return tokenizer.decode(target_ids[best_one])
        # If no end token appears within max_output_len steps, return the best candidate
        return tokenizer.decode(target_ids[np.argmax(target_scores)])

    def gen_sen_pair(label, n, s1_topk, s2_topk):
        s1_pair = random_generate(label, n, s1_topk)
        output = []
        for line in s1_pair:
            s2 = gen_sent(line, label, s2_topk)
            output.append([line, s2])
        return output

    class Evaluate(keras.callbacks.Callback):

        def __init__(self):
            self.lowest = 1e10

        def on_epoch_end(self, epoch, logs=None):
            # Save the best model
            if logs['loss'] <= self.lowest:
                self.lowest = logs['loss']
                model.save_weights('./best_model.weights')
            print("Positive samples:")
            print(gen_sen_pair(1, 2, topk1, topk2))
            print("Negative samples:")
            print(gen_sen_pair(0, 2, topk1, topk2))

    train_generator = data_generator(train_df, batch_size)
    evaluator = Evaluate()
    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[evaluator],
    )