def test():
    bp = bert_params()
    # token_dict = load_vocab(bp.bert_vocab)
    t = Tokenizer(bp.bert_vocab)
    return t
def test_bert_params():
    """Test that the BERT model's pretrained weights are imported correctly.

    Test note: with_pool defaults to False; comment out the final Dense layer
    in call() before running this check.
    """
    bp = bert_params()  # with_pool defaults to False
    # dg = DataGenerator(bp, batch_size=2, num_neg=2, shuffle=True)
    my_model = Bert4QA(bp)
    tokenizer = Tokenizer(bp.bert_vocab, do_lower_case=True)
    token_ids, segment_ids = tokenizer.encode("语言模型")
    print("token_ids:", token_ids)
    print('\n ===== predicting =====\n')
    token_ids = tf.convert_to_tensor([token_ids], dtype=tf.float32)
    segment_ids = tf.convert_to_tensor([segment_ids], dtype=tf.float32)
    # calling .call() directly skips the Keras build step; my_model(...) is the usual form
    ans = my_model.call([token_ids, segment_ids])
    print(ans)
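# The encode-then-convert pattern above recurs throughout this file. A small
# helper could factor it out (a sketch; "encode_pair_as_tensors" is a
# hypothetical name, and float32 is kept only because that is what Bert4QA is
# fed above, although token ids are conventionally int32 in TensorFlow BERT
# implementations):
def encode_pair_as_tensors(tokenizer, text_a, text_b=None, dtype=tf.float32):
    token_ids, segment_ids = tokenizer.encode(text_a, text_b)
    return (tf.convert_to_tensor([token_ids], dtype=dtype),
            tf.convert_to_tensor([segment_ids], dtype=dtype))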
def build_model(with_pool=False, with_nsp=False, with_mlm=False):
    bp = bert_params(with_pool=with_pool, with_nsp=with_nsp, with_mlm=with_mlm)
    print("bp.with_mlm:", bp.with_mlm)
    bert_model = Bert(bp.vocab_size, bp.hidden_size, bp.num_hidden_layers,
                      bp.num_attention_heads, bp.intermediate_size,
                      bp.hidden_act, bp.max_position_embeddings,
                      bp.hidden_dropout_prob, bp.attention_probs_dropout_prob,
                      bp.with_pool, bp.with_nsp, bp.with_mlm)
    inputs = [Input((None, )), Input((None, ))]
    outputs = bert_model(inputs)  # for a prediction model, outputs is a list
    # print(outputs.shape)
    bert_model.load_weights_from_checkpoint(bp.bert_ckpt)
    return bert_model
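# A hedged usage sketch for build_model: load the checkpointed encoder and run
# one sentence through it. demo_build_model is a hypothetical helper; it
# assumes the default with_pool=False configuration returns per-token features.
def demo_build_model():
    model = build_model()
    bp = bert_params()
    tokenizer = Tokenizer(bp.bert_vocab, do_lower_case=True)
    token_ids, segment_ids = tokenizer.encode("语言模型")
    features = model([tf.convert_to_tensor([token_ids], dtype=tf.float32),
                      tf.convert_to_tensor([segment_ids], dtype=tf.float32)])
    print(features)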
def main():
    # hyper-parameters for training and prediction
    mp = model_params()
    # parameters for building BERT
    bp = bert_params(with_pool=True)
    datagen = DataGenerator(bp, batch_size=mp.batch_size, num_neg=mp.num_neg,
                            shuffle=mp.shuffle)
    # other optimizers can be tried later
    optimizer = Adagrad(learning_rate=mp.learning_rate)
    my_model = Bert4QA(bp)
    # main training class
    t = TrainOrPredict(mp)
    # final_model is the trained model
    final_model = t.train(my_model, optimizer, datagen)
    data = datagen.data_faq
    tokenizer = datagen.tokenizer
    # after training, inspect retrieval quality on one query
    real_query_text = "月球和地球是什么关系?"
    question_score = {}
    for query_name in data.query_dict.keys():
        query_text = data.query_dict[query_name]
        token_ids, segment_ids = tokenizer.encode(real_query_text, query_text)
        question_score[query_name] = final_model.predict(
            [token_ids, segment_ids])
    question_score = {k: v.numpy() for k, v in question_score.items()}
    qs = dict(sorted(question_score.items(), key=lambda x: x[1], reverse=True))
    # print the 10 highest-scoring candidates
    c = 0
    for k, v in qs.items():
        c += 1
        print(k, data.query_dict[k], v)
        if c == 10:
            break
    return final_model
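# The loop in main() scores candidates one forward pass at a time. A batched
# variant can be considerably faster; the sketch below is hypothetical and
# assumes final_model.predict accepts padded batches and that 0 is the [PAD] id.
def score_queries_batched(final_model, tokenizer, real_query_text, query_dict,
                          batch_size=32):
    names = list(query_dict.keys())
    encoded = [tokenizer.encode(real_query_text, query_dict[n]) for n in names]
    max_len = max(len(t) for t, _ in encoded)
    # right-pad every pair to the longest sequence in the candidate set
    token_ids = [t + [0] * (max_len - len(t)) for t, _ in encoded]
    segment_ids = [s + [0] * (max_len - len(s)) for _, s in encoded]
    scores = {}
    for i in range(0, len(names), batch_size):
        preds = final_model.predict([token_ids[i:i + batch_size],
                                     segment_ids[i:i + batch_size]])
        for name, p in zip(names[i:i + batch_size], preds):
            scores[name] = p
    return scores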
def test():
    mp = model_params()
    bp = bert_params(with_pool=True)
    # strictly speaking, Bert4QA need not reload the pretrained BERT weights here
    my_model = Bert4QA(bp)
    tokenizer = Tokenizer(bp.bert_vocab, do_lower_case=True)
    optimizer = Adagrad(learning_rate=mp.learning_rate)
    # the argument names must match the ones used when the checkpoint was saved
    ckpt = tf.train.Checkpoint(optimizer=optimizer, model=my_model)
    ckpt.restore(tf.train.latest_checkpoint("./save_checkpoint/"))
    # everything below is unrelated to weight loading
    data_faq = DataBasic(bp.FAQ_file_path)
    # alternative test queries; only the last assignment takes effect
    real_query_text = "月球和地球是什么关系?"
    real_query_text = "月球和地球的关系"
    real_query_text = "月球是地球的卫星吗"
    print("actual query:", real_query_text)
    question_score = {}
    for query_name in data_faq.query_dict.keys():
        query_text = data_faq.query_dict[query_name]
        token_ids, segment_ids = tokenizer.encode(real_query_text, query_text)
        question_score[query_name] = my_model.predict([token_ids, segment_ids])
    question_score = {k: v.numpy() for k, v in question_score.items()}
    qs = dict(sorted(question_score.items(), key=lambda x: x[1], reverse=True))
    for k, v in qs.items():
        print(k, data_faq.query_dict[k], v)
    return qs
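# For ckpt.restore above to find anything, training must have saved a
# checkpoint under the same argument names ("optimizer", "model"). A minimal
# saving sketch with tf.train.CheckpointManager (hypothetical helper; the
# "./save_checkpoint/" directory matches the restore path above):
def save_checkpoint_sketch(my_model, optimizer, step):
    ckpt = tf.train.Checkpoint(optimizer=optimizer, model=my_model)
    manager = tf.train.CheckpointManager(ckpt, "./save_checkpoint/",
                                         max_to_keep=3)
    path = manager.save(checkpoint_number=step)
    print("saved checkpoint to", path)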
if __name__ == "__main__":
    bp = bert_params()
    print(bp.vocab_size)
    my_model = Bert(bp.vocab_size, bp.hidden_size, bp.num_hidden_layers,
                    bp.num_attention_heads, bp.intermediate_size,
                    bp.hidden_act, bp.max_position_embeddings,
                    bp.hidden_dropout_prob, bp.attention_probs_dropout_prob,
                    bp.with_pool, bp.with_nsp, bp.with_mlm)
    T = 256  # intended sequence length; currently unused
    inputs = [Input((None, )), Input((None, ))]
    outputs = my_model(inputs)  # for a prediction model, outputs is a list
    # print(outputs.shape)
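    # outputs may be a list when prediction heads (pool/nsp/mlm) are enabled;
    # a hedged way to inspect it either way:
    if isinstance(outputs, (list, tuple)):
        for i, o in enumerate(outputs):
            print(i, o.shape)
    else:
        print(outputs.shape)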