def make_model(keep_words, label2id, train=False):
    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)
    class_num = len(label2id)
    # output = Attention(512, name='attention_1')(model.output)
    output = Lambda(lambda x: x[:, 0])(model.output)  # take the [CLS] vector
    output = Dense(class_num, activation='softmax')(output)
    model = Model(model.input, output)
    # if train:
    #     # fine-tune only the top few layers of ALBERT
    #     model.trainable = True
    #     set_trainable = False
    #     for layer in model.layers:
    #         if layer.name == 'Encoder-1-FeedForward-Norm':  # 'attention_1':
    #             set_trainable = True
    #         if set_trainable:
    #             layer.trainable = True
    #         else:
    #             layer.trainable = False
    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy'])
    # save a plot of the model graph
    plot_model(model, 'classify-albert.png')
    model.summary()
    return model
def predict(fold=0):
    # from accum_optimizer import AccumOptimizer
    model = load_pretrained_model(
        config_path,
        checkpoint_path,
        seq2seq=False,
        keep_words=keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
    )
    x_in = keras.Input(shape=(None, ), name='Token')
    s_in = keras.Input(shape=(None, ), name='Segment')
    output = model([x_in, s_in])
    output = keras.layers.core.Lambda(lambda x: x[:, 0, :])(output)
    output = keras.layers.Dense(2, activation='sigmoid')(output)
    model = keras.Model([x_in, s_in], output)

    opt = Adam(5e-6)
    model.compile(opt, loss=[focal_loss(alpha=0.85)], metrics=['accuracy'])
    if fold == 0:
        model.summary()

    save_dir = join(MODEL_PATH, 'bert_wwm_aug_focal/out_{}'.format(fold))
    # save_dir = join(MODEL_PATH, 'bert_res/out_{}'.format(fold))
    save_path = join(save_dir, 'trained.ckpt')
    model.load_weights(save_path)

    logger.info('predict...')
    results = model.predict([trains_x, trains_s], batch_size=batch_size)
    print('result shape: {}'.format(results.shape))
    assert len(results) == dev.shape[0]
    keras.backend.clear_session()
    return results[:, 1]
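# Usage sketch (the fold count of 5 and the 0.5 threshold are assumptions, not taken from the
# snippet above): average the per-fold positive-class probabilities into an ensemble prediction.
import numpy as np

fold_preds = [predict(fold=f) for f in range(5)]  # one probability vector per trained fold
avg_pred = np.mean(fold_preds, axis=0)            # ensemble by simple averaging
labels = (avg_pred > 0.5).astype(int)             # optional: turn probabilities into hard labels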
def predict(text):
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "rb") as f:
        tokenizer = pickle.load(f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "rb") as f:
        keep_words = pickle.load(f)

    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)
    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(1, activation='sigmoid')(output)
    model = Model(model.input, output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy'])
    model.summary()
    model.load_weights(os.path.join(model_save_path,
                                    'checkpoint-02-0.15-0.939.hdf5'),
                       by_name=True,
                       skip_mismatch=True,
                       reshape=True)

    text = text[:maxlen]
    x1, x2 = tokenizer.encode(first=text)
    X1 = seq_padding([x1])
    X2 = seq_padding([x2])
    ret = model.predict([X1, X2])
    return ret
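# Usage sketch (the sample sentence is illustrative only): the function returns a (1, 1) array
# holding the positive-class probability for the given text.
score = predict(u'这家酒店的服务态度非常好')
print('positive probability: {:.4f}'.format(float(score[0][0])))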
def _get_model(self):
    model = load_pretrained_model(
        self.albert_config_path,
        self.albert_checkpoint_path,
        keep_words=self.keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
        albert=True)
    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(1, activation='sigmoid')(output)
    model = Model(model.input, output)
    return model
def make_model(keep_words):
    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)
    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(1, activation='sigmoid')(output)
    model = Model(inputs=model.input, outputs=output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy'])
    model.summary()
    return model
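# Sketch of how keep_words can be built before calling make_model (assumptions: dict_path points
# at the pretrained vocab.txt and `chars` is a {token: frequency} count over the task corpus).
# Special tokens plus every corpus token present in the original vocabulary are kept, and their
# original ids are recorded in keep_words so the embedding table can be pruned.
from bert4keras.utils import SimpleTokenizer, load_vocab

_token_dict = load_vocab(dict_path)  # full pretrained vocabulary
token_dict, keep_words = {}, []
for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])
for t, _ in sorted(chars.items(), key=lambda kv: -kv[1]):
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

tokenizer = SimpleTokenizer(token_dict)  # tokenizer over the reduced vocabulary
model = make_model(keep_words)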
def train(train_data, valid_data, tokenizer, keep_words):
    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)
    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(1, activation='sigmoid')(output)
    model = Model(model.input, output)
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy'])
    model.summary()

    train_D = data_generator(train_data, tokenizer=tokenizer)
    valid_D = data_generator(valid_data, tokenizer=tokenizer)

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    model_checkpoint = ModelCheckpoint(
        filepath=os.path.join(
            model_save_path,
            'checkpoint-{epoch:02d}-{val_loss:.2f}-{val_accuracy:.3f}.hdf5'),
        save_best_only=True,
        save_weights_only=False)
    tb = TensorBoard(
        log_dir=log_dir,       # log directory
        histogram_freq=0,      # how often (in epochs) to compute weight histograms; 0 disables them
        batch_size=32,         # amount of data used to compute the histograms
        write_graph=True,      # whether to store the network graph
        write_grads=False,     # whether to visualize gradient histograms
        write_images=False,    # whether to visualize the weights as images
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None)

    model.fit_generator(train_D.__iter__(),
                        steps_per_epoch=len(train_D),
                        epochs=5,
                        validation_data=valid_D.__iter__(),
                        validation_steps=len(valid_D),
                        callbacks=[early_stopping, model_checkpoint, tb])
def __init__(self,
             initial_model=True,
             model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
    self.initial_model = initial_model
    token_dict = load_vocab(DICT_PATH)
    self.tokenizer = SimpleTokenizer(token_dict)
    self.model_path = model_path
    if initial_model:
        self.albert_model = load_pretrained_model(
            CONFIG_PATH,
            CHECKPOINT_PATH,
            # keep_words=keep_words,
            albert=True)
    else:
        self.load(model_path)
    for l in self.albert_model.layers:
        l.trainable = True
def make_model(keep_words, label2id):
    model = load_pretrained_model(
        config_path,
        checkpoint_path,
        keep_words=keep_words,
        albert=True
    )
    class_num = len(label2id)
    output = Lambda(lambda x: x[:, 0])(model.output)
    output = Dense(class_num, activation='softmax')(output)
    model = Model(model.input, output)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=['accuracy']
    )
    model.summary()
    return model
def build_model(keep_words, ner_units=None, rel_units=None):
    # construct model
    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=keep_words,
                                  albert=True)
    output = Lambda(lambda x: x[:, 0])(model.output)
    # dense = Dense(200, activation='relu')(output)
    # dense = BatchNormalization()(dense)
    ner_out = CRF(ner_units, sparse_target=True, name='ner_out')(model.output)
    # dense = Lambda(lambda x: x, output_shape=lambda s: s)(dense)
    # attention_out = Attention(200, name='attention_1')(dense)
    # lambda_out = Lambda(lambda x: x[:, 0])(dense)
    # lambda_out = BatchNormalization()(lambda_out)
    rel_out = Dense(rel_units, activation='softmax', name='rel_out')(output)
    model = Model(model.input, outputs=[ner_out, rel_out])
    # model.compile(optimizer='rmsprop', loss='binary_crossentropy', loss_weights=[1., 0.2])

    # The learning rate must not be too low (e.g. 5e-7); if it is, training stops early
    # before the model reaches a good optimum.
    model.compile(optimizer=Adam(lr=5e-6),
                  loss={
                      'ner_out': crf_loss,
                      'rel_out': 'categorical_crossentropy'
                  },
                  metrics={
                      'ner_out': crf_viterbi_accuracy,
                      'rel_out': 'accuracy'
                  },
                  loss_weights={
                      'ner_out': 0.5,
                      'rel_out': 0.5
                  })
    model.summary()
    # save a plot of the model graph
    plot_model(model, 'ner_classify_albert_tiny.png')
    return model
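# Training sketch for the joint model (X1, X2, ner_y, rel_y, ner_labels and rel_labels are
# hypothetical names, not from the snippet above): ner_y holds per-token label ids shaped
# (batch, seq_len, 1) because the CRF uses sparse_target=True, while rel_y is one-hot over the
# relation classes; targets are passed as a dict keyed by the output layer names.
model = build_model(keep_words, ner_units=len(ner_labels), rel_units=len(rel_labels))
model.fit([X1, X2],
          {'ner_out': ner_y, 'rel_out': rel_y},
          batch_size=32,
          epochs=5,
          validation_split=0.1)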
def creat_model(config, keep_words):
    """Build the seq2seq model."""
    model = load_pretrained_model(
        config.bert_config,
        config.bert_checkpoint,  # config.pretrained_model_path,
        seq2seq=True,
        keep_words=keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
    )

    # Cross entropy as the loss, masking out predictions for the input part.
    y_in = model.input[0][:, 1:]    # target tokens
    y_mask = model.input[1][:, 1:]  # segment ids double as the target mask
    y = model.output[:, :-1]        # predicted tokens, shifted one position relative to the targets
    cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
    cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

    model.add_loss(cross_entropy)
    model.compile(optimizer=Adam(1e-5))
    return model
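# Why the segment ids can serve as the loss mask: in the seq2seq (UniLM-style) input format the
# source and target are packed into one token sequence, and the target span carries segment id 1.
# Minimal encoding sketch (the texts below are illustrative): a "content -> title" pair is encoded
# so that the masked cross entropy above only counts the title tokens.
content, title = u'一段输入正文', u'目标标题'
token_ids, segment_ids = tokenizer.encode(content, title)
# segment_ids looks like [0, 0, ..., 0, 1, 1, ..., 1]; the 1s mark the tokens the model must predict.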
def build_model(self):
    model = load_pretrained_model(config_path,
                                  checkpoint_path,
                                  keep_words=self.keep_words,
                                  albert=True)
    # output = Lambda(lambda x: x[:, 0])(model.output)
    output = CRF(len(self.label2id), sparse_target=True)(model.output)
    model = Model(model.input, output)
    model.compile(
        loss=crf_loss,
        optimizer=Adam(1e-5),  # use a sufficiently small learning rate
        # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
        metrics=[crf_accuracy])
    # save a plot of the model graph
    plot_model(model, 'ner-albert.png')
    model.summary()
    # print('model.input_shape: {}, model.output_shape: {}'.format(model.input_shape, model.output_shape))
    return model
import tensorflow as tf
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, " Type:", gpu.device_type)
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
print(tf.__version__)

base_path = 'D:/AI/Data/chinese_L-12_H-768_A-12/'
config_path = base_path + 'bert_config.json'
checkpoint_path = base_path + 'bert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# quick encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
import tensorflow as tf
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])

base_path = 'D:/AI/Data/albert_large_zh/'
config_path = base_path + 'albert_config_large.json'
checkpoint_path = base_path + 'albert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

model = load_pretrained_model(config_path, checkpoint_path, albert=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术"
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the prediction is indeed "技术"
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

model = load_pretrained_model(config_path,
                              checkpoint_path,
                              keep_words=keep_words,
                              albert=True)

output = Lambda(lambda x: x[:, 0])(model.output)
output = Dense(1, activation='sigmoid')(output)
model = Model(model.input, output)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
    metrics=['accuracy'])
model.summary()

train_D = data_generator(train_data)
valid_D = data_generator(valid_data)
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

model = load_pretrained_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
    albert=True)

output = Lambda(lambda x: x[:, 0])(model.output)
output = Dense(1, activation='sigmoid')(output)
model = Model(model.input, output)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
    metrics=['accuracy'])
model.summary()

train_D = data_generator(train_data)
import os
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

albert_model_path = '/home/gswyhq/github_projects/albert_zh/albert_large_zh'
# albert_model_path = '/notebooks/albert_zh/albert_large_zh'
# https://storage.googleapis.com/albert_zh/albert_large_zh.zip
config_path = os.path.join(albert_model_path, 'albert_config_large.json')
checkpoint_path = os.path.join(albert_model_path, 'albert_model.ckpt')
dict_path = os.path.join(albert_model_path, 'vocab.txt')

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

model = load_pretrained_model(config_path, checkpoint_path, with_mlm=True)  # build the model and load the weights

# token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')
token_ids, segment_ids = tokenizer.encode(u'中国的首都是北京')
print('token_ids: {}, segment_ids: {}'.format(token_ids, segment_ids))

# mask out two tokens (the commented-out line masked "技术" in the previous sentence)
# token_ids[3] = token_ids[4] = token_dict['[MASK]']
token_ids[4] = token_ids[5] = token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
# print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the prediction is indeed "技术"
print(tokenizer.decode(probas.argmax(axis=1)))
def data_generator():
    while True:
        X, S = [], []
        for a, b in read_text():
            x, s = tokenizer.encode(a, b)
            X.append(x)
            S.append(s)
            if len(X) == batch_size:
                X = padding(X)
                S = padding(S)
                yield [X, S], None
                X, S = [], []


model = load_pretrained_model(config_path,
                              checkpoint_path,
                              seq2seq=True,
                              keep_words=keep_words)
model.summary()

# Cross entropy as the loss, masking out predictions for the input part.
y_in = model.input[0][:, 1:]    # target tokens
y_mask = model.input[1][:, 1:]  # segment ids double as the target mask
y = model.output[:, :-1]        # predicted tokens, shifted one position relative to the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))
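# Training sketch (steps_per_epoch and epochs are illustrative values): the generator yields None
# as the target because the masked cross entropy is already attached via model.add_loss above.
model.fit_generator(data_generator(),
                    steps_per_epoch=1000,
                    epochs=10)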
def train(fold=0, only_predict=False, need_val=True):
    # from accum_optimizer import AccumOptimizer
    if fold in []:
        only_predict = True

    model = load_pretrained_model(
        config_path,
        checkpoint_path,
        seq2seq=False,
        keep_words=keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
    )
    x_in = keras.Input(shape=(None, ), name='Token')
    s_in = keras.Input(shape=(None, ), name='Segment')
    output = model([x_in, s_in])
    output = keras.layers.core.Lambda(lambda x: x[:, 0, :])(output)
    output = keras.layers.Dense(2, activation='sigmoid')(output)
    model = keras.Model([x_in, s_in], output)

    opt = Adam(5e-6)
    model.compile(opt, loss=[focal_loss(alpha=0.85)], metrics=['accuracy'])
    if fold == 0:
        model.summary()

    save_dir = join(MODEL_PATH, 'bert_res/out_{}'.format(fold))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if not only_predict and init_epoch == 0:
        for l in os.listdir(save_dir):  # empty the directory
            os.remove(join(save_dir, l))
    save_path = join(save_dir, 'trained.ckpt')

    checkpoint_callback = keras.callbacks.ModelCheckpoint(save_path,
                                                          monitor='val_loss',
                                                          verbose=0,
                                                          save_best_only=False,
                                                          save_weights_only=True,
                                                          mode='min',
                                                          period=1)
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir=save_dir,
                                                       histogram_freq=0,
                                                       write_graph=False,
                                                       write_grads=False,
                                                       update_freq=320)
    # weight_decay_callback = keras.callbacks.LearningRateScheduler(
    #     schedule=lambda epoch, lr: lr * (epochs - epoch) / epochs if epoch > 0 else 1e-6
    # )

    if only_predict:
        model.load_weights(save_path)
    else:
        if init_epoch > 0:
            logger.info('Continue train. Load weight...')
            model.load_weights(save_path)
        model.fit_generator(
            data_generator(fold, True),
            validation_data=data_generator(fold, False),
            validation_steps=100,
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            verbose=2,
            # workers=3,
            class_weight=None,
            initial_epoch=init_epoch,
            callbacks=[
                checkpoint_callback,
                tensorboard_callback,
                # weight_decay_callback,
                LogRecord()
            ])

    if need_val:
        logger.info('evaluate...')
        if only_predict:
            # run the generator once to re-compute eva_len
            next(data_generator(fold, False))
        eva_result = model.evaluate_generator(data_generator(fold, False),
                                              steps=int(eva_len / batch_size))
    else:
        eva_result = []

    logger.info('predict...')
    results = model.predict([trains_x, trains_s], batch_size=batch_size)
    print('result shape: {}'.format(results.shape))
    assert len(results) == test.shape[0]
    keras.backend.clear_session()
    return results[:, 1], eva_result