class ALBertEmbedding(ModelBase):
    '''Sentence embeddings via ALBERT.'''

    def __init__(self,
                 config_path=const.ALBERT_CONFIG_PATH,
                 albert_checkpoint_path=const.ALBERT_CHECKPOINT_PATH,
                 dict_path=const.ALBERT_DICT_PATH,
                 train_mode=False):
        self.session = tf.Session()
        keras.backend.set_session(self.session)
        if train_mode:
            self.bert = build_bert_model(
                model='albert',
                config_path=config_path,
                checkpoint_path=albert_checkpoint_path,
                with_pool=True,
                return_keras_model=False)
        else:
            # In inference mode the checkpoint is loaded below via
            # load_weights(by_name=True) instead of by build_bert_model.
            self.bert = build_bert_model(
                model='albert',
                config_path=config_path,
                # checkpoint_path=albert_checkpoint_path,
                with_pool=True,
                return_keras_model=False)
        self.encoder = keras.models.Model(self.bert.model.inputs,
                                          self.bert.model.outputs[0])
        self.tokenizer = Tokenizer(dict_path, do_lower_case=True)
        self.encoder.load_weights(albert_checkpoint_path, by_name=True)

    def init(self, words_list=None, update=True):
        '''Pre-encode and normalize a list of candidate sentences.'''
        if words_list is not None:
            token_ids_list, segment_ids_list = [], []
            for words in words_list:
                token_ids, segment_ids = self.tokenizer.encode(words)
                token_ids_list.append(token_ids)
                segment_ids_list.append(segment_ids)
            token_ids_list = sequence_padding(token_ids_list)
            segment_ids_list = sequence_padding(segment_ids_list)
            self.words_list_pre = self.encoder.predict(
                [token_ids_list, segment_ids_list])
            self.words_list_pre = self._normalize(self.words_list_pre)
        return self

    def _predict(self, words):
        '''Encode a single sentence into a normalized sentence vector.'''
        with self.session.as_default():
            with self.session.graph.as_default():
                token_ids, segment_ids = self.tokenizer.encode(words)
                pre = self.encoder.predict(
                    [np.array([token_ids]), np.array([segment_ids])])
                return self._normalize(pre)

    def predict(self, words):
        '''Similarity scores between `words` and the pre-encoded sentences.'''
        pre = self._predict(words)
        return np.dot(self.words_list_pre, pre[0])
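# A minimal usage sketch (an illustration, not part of the original file:
# the candidate sentences and the query are made-up examples). init()
# pre-encodes and normalizes the candidate list; predict() then returns
# the dot product of the query vector with every candidate vector, i.e.
# one cosine-similarity score per candidate.
if __name__ == '__main__':
    embedder = ALBertEmbedding().init(words_list=['今天天气不错', '今天天气很好'])
    scores = embedder.predict('今天天气如何')
    print(scores)  # one similarity score per candidate sentence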
class SiameseDataGenerator(DataGenerator):
    """Data generator for SiameseBert. Each yielded batch is composed of:
    [q1 token ids, q1 segment ids, q2 token ids, q2 segment ids], labels
    """

    def __init__(self, data_path: str, batch_size: int, maxlen: int,
                 dict_path: str):
        super().__init__(data=self.__load_data(data_path),
                         batch_size=batch_size)
        self._tokenizer = Tokenizer(dict_path, do_lower_case=True)
        self._maxlen = maxlen

    @staticmethod
    def __load_data(filename: str):
        D = []
        with open(filename, encoding='utf-8') as f:
            for line in f:
                category, text1, text2, label = line.strip().split(',')
                if category != 'category':  # skip the CSV header row
                    D.append((text1, text2, int(label)))
        return D

    def __iter__(self, random=False):
        idxs = list(range(len(self.data)))
        if random:
            np.random.shuffle(idxs)
        q1_batch_token_ids, q1_batch_segment_ids = [], []
        q2_batch_token_ids, q2_batch_segment_ids = [], []
        batch_labels = []
        for i in idxs:
            text1, text2, label = self.data[i]
            q1_token_ids, q1_segment_ids = self._tokenizer.encode(
                text1, max_length=self._maxlen)
            q2_token_ids, q2_segment_ids = self._tokenizer.encode(
                text2, max_length=self._maxlen)
            q1_batch_token_ids.append(q1_token_ids)
            q1_batch_segment_ids.append(q1_segment_ids)
            q2_batch_token_ids.append(q2_token_ids)
            q2_batch_segment_ids.append(q2_segment_ids)
            batch_labels.append([label])
            if len(batch_labels) == self.batch_size or i == idxs[-1]:
                q1_batch_token_ids = sequence_padding(q1_batch_token_ids)
                q1_batch_segment_ids = sequence_padding(q1_batch_segment_ids)
                q2_batch_token_ids = sequence_padding(q2_batch_token_ids)
                q2_batch_segment_ids = sequence_padding(q2_batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [
                    q1_batch_token_ids, q1_batch_segment_ids,
                    q2_batch_token_ids, q2_batch_segment_ids
                ], batch_labels
                q1_batch_token_ids, q1_batch_segment_ids = [], []
                q2_batch_token_ids, q2_batch_segment_ids = [], []
                batch_labels = []
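# A usage sketch, assuming a CSV file 'train.csv' with the header row
# 'category,text1,text2,label' expected by __load_data above, and a BERT
# vocab at 'vocab.txt' (both paths are assumptions). Each batch is ready
# to feed a two-tower (Siamese) BERT model with four inputs.
if __name__ == '__main__':
    generator = SiameseDataGenerator(data_path='train.csv',
                                     batch_size=32,
                                     maxlen=64,
                                     dict_path='vocab.txt')
    for batch_inputs, batch_labels in generator:
        print([x.shape for x in batch_inputs], batch_labels.shape)
        break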
config_path = args.config_path
checkpoint_path = args.checkpoint_path
dict_path = args.dict_path
min_count = 0
max_input_len = args.max_input_len
max_output_len = args.max_output_len
batch_size = args.batch_size
epochs = args.epochs
topk = args.topk
train_data_path = args.train_data_path
val_data_path = args.val_data_path

token_dict = load_vocab(dict_path)  # load the vocabulary
tokenizer = Tokenizer(token_dict, do_lower_case=True)  # build the tokenizer
# Encoding an empty string yields just [CLS] [SEP], so the last id is [SEP]
sep_id = tokenizer.encode('')[0][-1]
rouge = Rouge()

model = get_model(config_path, checkpoint_path, args.albert, args.lr)
evaluator = Evaluate(val_data_path, topk)
model.fit_generator(DataGenerator(train_data_path, batch_size),
                    epochs=epochs,
                    callbacks=[evaluator])
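# A hedged sketch of the argparse setup that produces the `args` object
# consumed above. The flag names follow the attributes accessed above,
# but the defaults and required/optional choices are assumptions, not the
# original script's.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--config_path', type=str, required=True)
parser.add_argument('--checkpoint_path', type=str, required=True)
parser.add_argument('--dict_path', type=str, required=True)
parser.add_argument('--max_input_len', type=int, default=256)
parser.add_argument('--max_output_len', type=int, default=32)
parser.add_argument('--batch_size', type=int, default=16)
parser.add_argument('--epochs', type=int, default=10)
parser.add_argument('--topk', type=int, default=1)
parser.add_argument('--train_data_path', type=str, required=True)
parser.add_argument('--val_data_path', type=str, required=True)
parser.add_argument('--albert', action='store_true')  # use ALBERT weights
parser.add_argument('--lr', type=float, default=1e-5)
args = parser.parse_args()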
#! -*- coding: utf-8 -*-
# Sanity test: MLM (masked language model)

from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path, with_mlm=True)  # build model, load weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# Mask out "技术" (token positions 3 and 4)
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# Use the MLM head to predict the masked tokens
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # prints "技术", as expected
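# Optional extension (an assumption, not part of the original test): instead
# of only taking the argmax, inspect the top-5 MLM candidates for each
# masked position to see what else the model considers plausible.
for pos in (3, 4):
    top5 = probas[pos].argsort()[-5:][::-1]  # ids of the 5 highest scores
    print(pos, [tokenizer.decode([i]) for i in top5])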
from bert4keras.backend import keras
from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path)  # build model, load weights

# Encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
Output (truncated):
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
    0.5369075 ]
  [-0.7473459   0.49431565  0.7185162  ...  0.3848612  -0.74090636
    ...
"""
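# A small follow-up sketch (an addition, not in the original test): the
# prediction has shape (batch, seq_len, hidden); for '语言模型' that is
# (1, 6, 768), i.e. [CLS] 语 言 模 型 [SEP] with 768-dim hidden states.
# Taking the [CLS] row gives a crude sentence vector.
output = model.predict([np.array([token_ids]), np.array([segment_ids])])
print(output.shape)        # (1, 6, 768)
cls_vector = output[0, 0]  # hidden state of [CLS]
print(cls_vector.shape)    # (768,)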
samples = len(all_data)
train_samples = int(samples * TRAIN_SPLIT)
dev_samples = int(samples * DEV_SPLIT)
train_data, train_labels = all_data[:train_samples], all_labels[:train_samples]
dev_data, dev_labels = (all_data[train_samples:train_samples + dev_samples],
                        all_labels[train_samples:train_samples + dev_samples])
test_data, test_labels = (all_data[train_samples + dev_samples:],
                          all_labels[train_samples + dev_samples:])

# Load the pretrained model's vocabulary
_token_dict = load_vocab(DICT_PATH)
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)

print(all_data[0])
print(_tokenizer.encode(all_data[0]))
print(_tokenizer.tokenize(all_data[0]))
print([_tokenizer.id_to_token(21934)])
print(_tokenizer.token_to_id('[PAD]'))

# Count token frequencies over the dataset
counter = Counter()
for line in all_data:
    _tokens = _tokenizer.tokenize(line)
    # Drop the [CLS] and [SEP] tokens when counting
    counter.update(_tokens[1:-1])
print(len(counter))

# Keep only tokens at or above the frequency threshold
_tokens = [
    token for token, cnt in counter.items() if cnt >= MIN_WORD_FREQUENCY
]
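# Hypothetical continuation (an assumption based on the common bert4keras
# vocabulary-simplification pattern, not part of the original snippet):
# rebuild a compact token dict that keeps the special tokens plus the
# frequent tokens selected above, recording each token's original id in
# keep_words so the pretrained embedding matrix can be sliced to match.
token_dict, keep_words = {}, []
for token in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[token] = len(token_dict)
    keep_words.append(_token_dict[token])
for token in _tokens:
    if token in _token_dict and token not in token_dict:
        token_dict[token] = len(token_dict)
        keep_words.append(_token_dict[token])
tokenizer = Tokenizer(token_dict, do_lower_case=True)  # tokenizer over the reduced vocab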