import tensorflow as tf
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, " Type:", gpu.device_type)
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
print(tf.__version__)

base_path = 'D:\\AI\\Data\\chinese_L-12_H-768_A-12\\'
config_path = base_path + 'bert_config.json'
checkpoint_path = base_path + 'bert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
# assumed imports (Keras 2.x and the same bert4keras API used in the other snippets in these notes)
import numpy as np
import pandas as pd
from collections import defaultdict
from keras.layers import Lambda, Dense
from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab


class Albert(object):
    def __init__(self, mode='inference', mode_='part', model_name=None, dataset_name=None):
        self.maxlen = 32
        self.albert_config_path = '/Data/public/Bert/albert_tiny_489k/albert_config_tiny.json'
        self.albert_checkpoint_path = '/Data/public/Bert/albert_tiny_489k/albert_model.ckpt'
        self.albert_dict_path = '/Data/public/Bert/albert_tiny_489k/vocab.txt'
        self.train_data_path = 'data/train_{}.csv'.format(dataset_name)
        self.dev_data_path = 'data/dev_{}.csv'.format(dataset_name)
        self.test_data_path = 'data/test_{}.csv'.format(dataset_name)
        # albert_tiny_250k.h5 works quite well
        # self.restore_model_path = 'saved_models/test_albert_tiny_{}.h5'.format(model_name)
        self.restore_model_path = '/Data/models/{}'.format(model_name)

        # albert
        self.albert_process_data(mode_)
        if mode == 'train':
            self.model = self._get_model()
            self.train()
        elif mode == 'inference':
            self._init_model()

    # TODO: the keep_words reduction should be removed in industrial settings
    def albert_process_data(self, mode='part'):
        _token_dict = load_vocab(self.albert_dict_path)  # load the vocabulary
        # keep only the characters that appear in the datasets
        if mode == 'part':
            train_df = pd.read_csv(self.train_data_path, names=['seq1', 'seq2', 'label'])
            valid_df = pd.read_csv(self.dev_data_path, names=['seq1', 'seq2', 'label'])
            test_df = pd.read_csv(self.test_data_path, names=['seq1', 'seq2', 'label'])
            # total data
            tmp_df = pd.concat([train_df, valid_df, test_df])
            chars = defaultdict(int)
            for _, tmp_row in tmp_df.iterrows():
                for tmp_char in tmp_row.seq1:
                    chars[tmp_char] += 1
                for tmp_char in tmp_row.seq2:
                    chars[tmp_char] += 1
            # filter out low-frequency characters
            chars = {i: j for i, j in chars.items() if j >= 4}
            self.token_dict, self.keep_words = {}, []  # keep_words are the indices kept from the BERT vocabulary
            # keep the special tokens
            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
            # the vocabulary keeps only the high-frequency characters seen in the data
            for c in chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])
        elif mode == 'full':
            self.token_dict, self.keep_words = _token_dict, []
            for k in self.token_dict:
                self.keep_words.append(self.token_dict[k])
        self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer

    # data pre-processing operation
    def _data_preprocessing(self, sentence1, sentence2):
        X1, X2 = [], []
        for tmp_sent1, tmp_sent2 in zip(sentence1, sentence2):
            x1, x2 = self.tokenizer.encode(first=tmp_sent1[:self.maxlen],
                                           second=tmp_sent2[:self.maxlen])
            X1.append(x1)
            X2.append(x2)
        X1 = self._seq_padding(X1)
        X2 = self._seq_padding(X2)
        # X1 = pad_sequences(X1, maxlen=67, padding='post', truncating='post')
        # X2 = pad_sequences(X2, maxlen=67, padding='post', truncating='post')
        return X1, X2

    def _seq_padding(self, X, padding=0):
        L = [len(x) for x in X]
        ML = max(L)
        padded_sent = np.array([
            np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
            for x in X
        ])
        return padded_sent

    # prepare data for training
    def _prepare_data(self, data_path):
        data = pd.read_csv(data_path)
        sent_1 = data['sentence1'].values
        sent_2 = data['sentence2'].values
        label = data['label'].values
        X1_pad, X2_pad = self._data_preprocessing(sent_1, sent_2)
        # X1 = np.vstack((X1_pad, X2_pad))
        # X2 = np.vstack((X2_pad, X1_pad))
        # y_train = np.hstack((label, label))
        return X1_pad, X2_pad, label

    # ALBERT for semantic matching: model architecture
    def _get_model(self):
        model = load_pretrained_model(
            self.albert_config_path,
            self.albert_checkpoint_path,
            keep_words=self.keep_words,  # keep only the tokens in keep_words, shrinking the original vocabulary
            albert=True)
        output = Lambda(lambda x: x[:, 0])(model.output)
        output = Dense(1, activation='sigmoid')(output)
        model = Model(model.input, output)
        return model

    # model training operation
    def train(self):
        # train data
        train_x1, train_x2, train_label = self._prepare_data(self.train_data_path)
        # dev data
        dev_x1, dev_x2, dev_label = self._prepare_data(self.dev_data_path)
        checkpoint = ModelCheckpoint(self.restore_model_path,
                                     monitor='val_accuracy',
                                     verbose=0,
                                     save_best_only=True,
                                     save_weights_only=False)
        early_stop = EarlyStopping(monitor='val_accuracy',
                                   patience=3,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)
        self.model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(1e-4),  # use a sufficiently small learning rate
            metrics=['accuracy'])
        self.model.summary()
        self.model.fit(x=[train_x1, train_x2],
                       y=train_label,
                       batch_size=64,
                       epochs=10,
                       verbose=1,
                       callbacks=[checkpoint, early_stop],
                       validation_data=([dev_x1, dev_x2], dev_label))

    # model predict operation
    def predict(self, sentence1, sentence2):
        X1, X2 = self._data_preprocessing(sentence1, sentence2)
        y_pred = self.model.predict([X1, X2], batch_size=1024)
        return y_pred

    def test(self):
        self.model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(1e-4),  # use a sufficiently small learning rate
            metrics=['accuracy'])
        # test data (loaded from the dev file, as in the original code)
        test_x1, test_x2, test_label = self._prepare_data(self.dev_data_path)
        test_loss, test_acc = self.model.evaluate(x=[test_x1, test_x2], y=test_label)
        print('test loss: {}'.format(test_loss))
        print('test acc: {}'.format(test_acc))

    def _init_model(self):
        self.model = load_model(self.restore_model_path)
        sentence1 = '干嘛呢'
        sentence2 = '你是机器人'
        print('model albert loaded successfully. ({})'.format(
            self.predict([sentence1], [sentence2]).item()))
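A minimal usage sketch for the `Albert` class above, assuming the ALBERT-tiny checkpoint and the `data/*_<dataset_name>.csv` files exist at the hard-coded paths; the dataset name `lcqmc` and model file name `albert_tiny_demo.h5` are hypothetical placeholders:

# fine-tune on a sentence-pair dataset (placeholder names)
albert = Albert(mode='train', mode_='part',
                model_name='albert_tiny_demo.h5', dataset_name='lcqmc')

# reload the saved model and score a sentence pair
albert = Albert(mode='inference', model_name='albert_tiny_demo.h5')
print(albert.predict(['怎么退款'], ['如何申请退款']))  # sigmoid probability that the pair matches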
# assumed imports (Keras 2.x and the same bert4keras API used in the other snippets);
# read_datas, seq_padding and the module-level constants (TRAIN_DATA_FILE, DEV_DATA_FILE,
# TEST_DATA_FILE, dict_path, config_path, checkpoint_path, model_save_path, log_dir,
# INPUT_LENGTH) are assumed to be defined elsewhere in the project
import os
import pickle
import itertools
import unicodedata
from keras.layers import Lambda, Dense
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.optimizers import Adam
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab


class SemanticModel():
    def __init__(self, batch_size=32, train=False):
        self.batch_size = batch_size
        if train:
            chars = set()
            train_datas = read_datas(TRAIN_DATA_FILE)
            dev_datas = read_datas(DEV_DATA_FILE)
            test_datas = read_datas(TEST_DATA_FILE)
            for text1, text2, label in itertools.chain(train_datas, dev_datas):
                chars.update(set(text1))
                chars.update(set(text2))
            _token_dict = load_vocab(dict_path)  # load the vocabulary
            self.token_dict, self.keep_words = {}, []
            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
            for c in chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])
            self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer
            with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
                pickle.dump(self.tokenizer, f)
            with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
                pickle.dump(self.keep_words, f)
        else:
            with open(os.path.join(model_save_path, 'tokenizer.pkl'), "rb") as f:
                self.tokenizer = pickle.load(f)
            with open(os.path.join(model_save_path, 'keep_words.pkl'), "rb") as f:
                self.keep_words = pickle.load(f)
        self.model = self.make_model()

    def make_model(self):
        model = load_pretrained_model(config_path,
                                      checkpoint_path,
                                      keep_words=self.keep_words,
                                      albert=True)
        output = Lambda(lambda x: x[:, 0])(model.output)
        # print(output.shape)
        output = Dense(1, activation='sigmoid')(output)  # tanh, sigmoid, softmax
        model = Model(inputs=model.input, outputs=output)
        model.compile(
            loss='binary_crossentropy',  # categorical_crossentropy / binary_crossentropy
            optimizer=Adam(2e-6),  # use a sufficiently small learning rate
            # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
            metrics=['accuracy'])
        model.summary()
        return model

    def generator_data(self, file_name):
        X1, X2, Y = [], [], []
        while True:
            for text1, text2, label in read_datas(file_name):
                text1 = text1[:INPUT_LENGTH]
                text2 = text2[:INPUT_LENGTH]
                text1 = unicodedata.normalize('NFKD', text1).strip().lower()
                text2 = unicodedata.normalize('NFKD', text2).strip().lower()
                x1, x2 = self.tokenizer.encode(first=text1, second=text2)
                y = int(label)
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                # Y.append(to_categorical(y))
                if len(X1) == self.batch_size:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    # print(X1.shape, X2.shape, Y.shape)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []

    def train(self):
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        model_checkpoint = ModelCheckpoint(
            filepath=os.path.join(
                model_save_path,
                'similarity-{epoch:02d}-{val_loss:.2f}-{val_acc:.3f}.hdf5'),
            save_best_only=True,
            save_weights_only=False)
        tb = TensorBoard(
            log_dir=log_dir,  # log directory
            histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables them
            batch_size=32,  # how much data to use when computing histograms
            write_graph=True,  # whether to store the network graph
            write_grads=False,  # whether to visualise gradient histograms
            write_images=False,  # whether to visualise the parameters
            embeddings_freq=0,
            embeddings_layer_names=None,
            embeddings_metadata=None)
        hist = self.model.fit_generator(
            self.generator_data(TRAIN_DATA_FILE),
            steps_per_epoch=1000,
            epochs=100,
            validation_data=self.generator_data(DEV_DATA_FILE),
            validation_steps=100,
            callbacks=[early_stopping, model_checkpoint, tb])
        print(hist.history.items())

    def predict(self, text1, text2, weight_file='similarity-01-0.55-0.741.hdf5'):
        self.model.load_weights(os.path.join(model_save_path, weight_file),
                                by_name=True,
                                skip_mismatch=True,
                                reshape=True)
        text1 = text1[:INPUT_LENGTH]
        text2 = text2[:INPUT_LENGTH]
        text1 = unicodedata.normalize('NFKD', text1).strip().lower()
        text2 = unicodedata.normalize('NFKD', text2).strip().lower()
        x1, x2 = self.tokenizer.encode(first=text1, second=text2)
        X1 = seq_padding([x1])
        X2 = seq_padding([x2])
        ret = self.model.predict([X1, X2])
        return ret

    def batch_predict(self, question, database):
        text1 = question
        text1 = text1[:INPUT_LENGTH]
        X1, X2 = [], []
        for text2 in database:
            text2 = text2[:INPUT_LENGTH]
            text1 = unicodedata.normalize('NFKD', text1).strip().lower()
            text2 = unicodedata.normalize('NFKD', text2).strip().lower()
            x1, x2 = self.tokenizer.encode(first=text1, second=text2)
            X1.append(x1)
            X2.append(x2)
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        ret = self.model.predict([X1, X2])
        return ret
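A minimal usage sketch for `SemanticModel`, assuming the module-level constants and helpers above point to existing files, and that a fine-tuned weight file such as the default `similarity-01-0.55-0.741.hdf5` has already been saved under `model_save_path`:

# first run: build the reduced vocabulary, pickle the tokenizer/keep_words, then fine-tune
sm = SemanticModel(batch_size=32, train=True)
sm.train()

# later runs: reload the pickled tokenizer/keep_words and score sentence pairs
sm = SemanticModel(train=False)
print(sm.predict('今天天气怎么样', '今天天气好吗'))  # single sentence pair
print(sm.batch_predict('今天天气怎么样', ['今天天气好吗', '明天会下雨吗']))  # one question against a candidate list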
import os
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

albert_model_path = '/home/gswyhq/github_projects/albert_zh/albert_large_zh'
# albert_model_path = '/notebooks/albert_zh/albert_large_zh'
# https://storage.googleapis.com/albert_zh/albert_large_zh.zip
config_path = os.path.join(albert_model_path, 'albert_config_large.json')
checkpoint_path = os.path.join(albert_model_path, 'albert_model.ckpt')
dict_path = os.path.join(albert_model_path, 'vocab.txt')

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path, with_mlm=True)  # build the model and load the weights

# token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')
token_ids, segment_ids = tokenizer.encode(u'中国的首都是北京')
print('token_ids: {}, segment_ids: {}'.format(token_ids, segment_ids))

# mask two characters (for the commented-out sentence this masked "技术"; here positions 4 and 5 mask "首都")
# token_ids[3] = token_ids[4] = token_dict['[MASK]']
token_ids[4] = token_ids[5] = token_dict['[MASK]']

# use the MLM model to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
# print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
print(tokenizer.decode(probas.argmax(axis=1)))
# CONFIG, DICT_PATH, CONFIG_PATH, CHECKPOINT_PATH, DataGenerator, seq_padding, process
# and PiecewiseLinearLearningRate are assumed to be defined/imported elsewhere in the project
class AlbertClassify:
    def __init__(self, initial_model=True,
                 model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
        self.initial_model = initial_model
        token_dict = load_vocab(DICT_PATH)
        self.tokenizer = SimpleTokenizer(token_dict)
        self.model_path = model_path
        if initial_model:
            self.albert_model = load_pretrained_model(
                CONFIG_PATH,
                CHECKPOINT_PATH,
                # keep_words=keep_words,
                albert=True)
        else:
            self.load(model_path)
        for l in self.albert_model.layers:
            l.trainable = True

    def train(self, train_data, valid_data):
        train_D = DataGenerator(train_data, self.tokenizer, CONFIG['batch_size'], CONFIG['max_len'])
        valid_D = DataGenerator(valid_data, self.tokenizer, CONFIG['batch_size'], CONFIG['max_len'])

        output = Lambda(lambda x: x[:, 0])(self.albert_model.output)
        output = Dense(1, activation='sigmoid')(output)
        self.model = Model(self.albert_model.input, output)

        save = ModelCheckpoint(self.model_path,
                               monitor='val_acc',
                               verbose=1,
                               save_best_only=True,
                               mode='auto')
        early_stopping = EarlyStopping(monitor='val_acc',
                                       min_delta=0,
                                       patience=3,
                                       verbose=1,
                                       mode='auto')
        callbacks = [save, early_stopping]

        # when starting from the pre-trained ALBERT, rebuild the graph with explicit
        # Input layers (this overrides the model constructed just above)
        if self.initial_model:
            x1_in = Input(shape=(None,))
            x2_in = Input(shape=(None,))
            x_in = self.albert_model([x1_in, x2_in])
            x_in = Lambda(lambda x: x[:, 0])(x_in)
            p = Dense(1, activation='sigmoid')(x_in)
            self.model = Model([x1_in, x2_in], p)
        else:
            self.model = self.albert_model

        self.model.compile(
            loss='binary_crossentropy',
            # optimizer=RAdam(1e-5),  # use a sufficiently small learning rate
            optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {
                1000: 1e-5,
                2000: 6e-5
            }),
            metrics=[
                'accuracy', process.get_precision, process.get_recall,
                process.get_f1
            ])
        self.model.summary()
        self.model.fit_generator(
            train_D.__iter__(),
            steps_per_epoch=len(train_D),
            epochs=CONFIG['epochs'],
            validation_data=valid_D.__iter__(),
            validation_steps=len(valid_D),
            callbacks=callbacks,
            use_multiprocessing=CONFIG['use_multiprocessing'],
        )

    def predict(self, test_data):
        """Predict.
        :param test_data: list of input texts
        :return: prediction scores
        """
        X1 = []
        X2 = []
        for s in test_data:
            x1, x2 = self.tokenizer.encode(first=s[:CONFIG['max_len']])
            X1.append(x1)
            X2.append(x2)
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        predict_results = self.model.predict([X1, X2])
        return predict_results

    def load(self, model_path):
        """Load the pre-trained model."""
        try:
            self.albert_model = load_model(str(model_path),
                                           custom_objects=get_custom_objects(),
                                           compile=False)
        except Exception as ex:
            print('load error')
        return self
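A usage sketch for `AlbertClassify`; `train_data` and `valid_data` are placeholders in whatever format the project's `DataGenerator` expects, and the project-level configuration and helpers noted above must already be set up:

clf = AlbertClassify(initial_model=True)
clf.train(train_data, valid_data)        # fine-tunes ALBERT with a sigmoid head
scores = clf.predict(['这个产品怎么样'])  # one sigmoid score per input text
print(scores)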
import tensorflow as tf
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])

base_path = 'D:\\AI\\Data\\albert_large_zh\\'
config_path = base_path + 'albert_config_large.json'
checkpoint_path = base_path + 'albert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path, albert=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术"
token_ids[3] = token_ids[4] = token_dict['[MASK]']

# use the MLM model to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
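The same fill-in-the-blank pattern can be wrapped in a small helper; a sketch that reuses the `tokenizer`, `token_dict` and MLM `model` built above, where `start`/`end` are positions in the `[CLS]`-prefixed token sequence:

def fill_mask(text, start, end):
    """Mask token positions [start, end) and decode the MLM prediction for that span."""
    token_ids, segment_ids = tokenizer.encode(text)
    for i in range(start, end):
        token_ids[i] = token_dict['[MASK]']
    probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
    return tokenizer.decode(probas[start:end].argmax(axis=1))

print(fill_mask(u'科学技术是第一生产力', 3, 5))  # expected to recover "技术", as in the demo above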