def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.0):
    """
        evaluate the model on a test set
    :param path_hyper_parameter: str, path of the hyper-parameter json
    :param path_test: str, path of test data, the test set
    :param rate: float, fraction of the corpus sampled for evaluation
    :return: None
    """
    hyper_parameters = load_json(path_hyper_parameter)
    if path_test:  # override the test-data path from outside
        hyper_parameters['data']['val_data'] = path_test
    time_start = time.time()
    # initialize the graph
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    graph.load_model()
    print("graph load ok!")
    ra_ed = graph.word_embedding
    # data preprocessing
    pt = PreprocessText()
    y, x = read_and_process(hyper_parameters['data']['val_data'])
    # evaluate on only a fraction (rate) of the data set;
    # slice from index 0 so the first sample is not silently dropped
    len_rate = int(len(y) * rate)
    x = x[:len_rate]
    y = y[:len_rate]
    y_pred = []
    count = 0
    for x_one in x:
        count += 1
        ques_embed = ra_ed.sentence2idx(x_one)
        if hyper_parameters['embedding_type'] == 'bert':
            # bert input: token ids and segment ids
            x_val_1 = np.array([ques_embed[0]])
            x_val_2 = np.array([ques_embed[1]])
            x_val = [x_val_1, x_val_2]
        else:
            x_val = ques_embed
        # predict
        pred = graph.predict(x_val)
        pre = pt.prereocess_idx(pred[0])
        label_pred = pre[0][0][0]
        if count % 1000 == 0:
            print(label_pred)
        y_pred.append(label_pred)
    print("data pred ok!")
    # convert gold labels and predictions to integer indices
    index_y = [pt.l2i_i2l['l2i'][i] for i in y]
    index_pred = [pt.l2i_i2l['l2i'][i] for i in y_pred]
    target_names = [pt.l2i_i2l['i2l'][str(i)] for i in list(set((index_pred + index_y)))]
    # evaluation
    report_predict = classification_report(index_y, index_pred,
                                           target_names=target_names, digits=9)
    print(report_predict)
    print("time cost: " + str(time.time() - time_start))
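# A minimal usage sketch (an assumption, not code from the repo): the default
# conf paths must already point at a trained model and labeled data; any file
# passed via path_test is your own labeled test csv.
if __name__ == "__main__":
    pred_tet(rate=1.0)  # evaluate on the full stored val_data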
def pred_input():
    hyper_parameters = {
        'model': {
            'label': 17,
            'batch_size': 64,
            'embed_size': 30,
            'filters': [2, 3, 4],
            'filters_num': 30,
            'channel_size': 1,
            'dropout': 0.5,
            'decay_step': 100,
            'decay_rate': 0.9,
            'epochs': 20,
            'len_max': 50,
            'vocab_size': 20000,  # placeholder; reset from the corpus in code
            'lr': 1e-3,
            'l2': 1e-6,
            'activate_classify': 'softmax',
            'embedding_type': 'bert',  # one of 'random', 'bert' or 'word2vec'
            'is_training': True,
            'model_path': path_model_fast_text_baiduqa_2019,  # path can be customized
            'rnn_type': 'GRU',  # type of rnn, select 'LSTM', 'GRU', 'CuDNNGRU', 'CuDNNLSTM', 'Bidirectional-LSTM', 'Bidirectional-GRU'
            'rnn_units': 256,  # large 650, small is 300
        },
        'embedding': {
            'embedding_type': 'bert',
            'corpus_path': path_embedding_bert,
            'level_type': 'char',
            'embed_size': 30,
            'len_max': 50,
            'layer_indexes': [12]  # range 1 to 12
        },
    }
    pt = PreprocessText()
    graph = Graph(hyper_parameters)
    graph.load_model()
    ra_ed = graph.word_embedding
    ques = '我要打王者荣耀'
    ques_embed = ra_ed.sentence2idx(ques)
    x_val_1 = np.array([ques_embed[0]])
    x_val_2 = np.array([ques_embed[1]])
    x_val = [x_val_1, x_val_2]
    pred = graph.predict(x_val)
    pre = pt.prereocess_idx(pred[0])
    print(pre)
    while True:
        print("input: ")
        ques = input()
        ques_embed = ra_ed.sentence2idx(ques)
        print(ques_embed)
        ques_embed_1 = np.array([ques_embed[0]])
        ques_embed_2 = np.array([ques_embed[1]])
        pred = graph.predict([ques_embed_1, ques_embed_2])
        pre = pt.prereocess_idx(pred[0])
        print(pre)
def pred_input(path_hyper_parameter=path_hyper_parameters):
    """
        interactive prediction from stdin
    :param path_hyper_parameter: str, path of the stored hyper-parameters
    :return: None
    """
    # load hyper-parameters
    hyper_parameters = load_json(path_hyper_parameter)
    pt = PreprocessSim(path_model_dir)
    # initialize and load the model
    graph = Graph(hyper_parameters)
    graph.load_model()
    ra_ed = graph.word_embedding
    sen1 = '我要打王者荣耀'
    sen2 = '我要打梦幻西游'
    # str to token
    ques_embed = ra_ed.sentence2idx(text=sen1, second_text=sen2)
    if hyper_parameters['embedding_type'] in ['bert', 'albert']:
        x_val_1 = np.array([ques_embed[0]])
        x_val_2 = np.array([ques_embed[1]])
        x_val = [x_val_1, x_val_2]
        # predict
        pred = graph.predict(x_val)
        # map ids back to labels
        pre = pt.prereocess_idx(pred[0])
        print(pre)
        while True:
            print("input sen1: ")
            sen1 = input()
            print("input sen2: ")
            sen2 = input()
            ques_embed = ra_ed.sentence2idx(text=sen1, second_text=sen2)
            print(ques_embed)
            if hyper_parameters['embedding_type'] in ['bert', 'albert']:
                x_val_1 = np.array([ques_embed[0]])
                x_val_2 = np.array([ques_embed[1]])
                x_val = [x_val_1, x_val_2]
                pred = graph.predict(x_val)
                pre = pt.prereocess_idx(pred[0])
                print(pre)
            else:
                print("error, just support bert or albert")
    else:
        print("error, just support bert or albert")
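# The token-to-batch wrapping above is repeated in every loop pass; the helper
# below is a hypothetical refactoring sketch, not part of the library. It
# assumes, as the code above does, that sentence2idx returns a
# [token_ids, segment_ids] pair for 'bert'/'albert' embeddings.
import numpy as np

def to_bert_input(ques_embed):
    """Wrap one [token_ids, segment_ids] pair into a batch of size 1."""
    return [np.array([ques_embed[0]]), np.array([ques_embed[1]])]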
def pred_input(path_hyper_parameter=path_hyper_parameters):
    """
        interactive prediction from stdin
    :param path_hyper_parameter: str, path of the stored hyper-parameters
    :return: None
    """
    # load hyper-parameters
    hyper_parameters = load_json(path_hyper_parameter)
    pt = PreprocessText()
    # initialize and load the model
    graph = Graph(hyper_parameters)
    graph.load_model()
    ra_ed = graph.word_embedding
    ques = '我要打王者荣耀'
    # str to token
    ques_embed = ra_ed.sentence2idx(ques)
    if hyper_parameters['embedding_type'] == 'bert':
        x_val_1 = np.array([ques_embed[0]])
        x_val_2 = np.array([ques_embed[1]])
        x_val = [x_val_1, x_val_2]
    else:
        x_val = ques_embed
    # predict
    pred = graph.predict(x_val)
    # map ids back to labels
    pre = pt.prereocess_idx(pred[0])
    print(pre)
    while True:
        print("input: ")
        ques = input()
        ques_embed = ra_ed.sentence2idx(ques)
        print(ques_embed)
        if hyper_parameters['embedding_type'] == 'bert':
            x_val_1 = np.array([ques_embed[0]])
            x_val_2 = np.array([ques_embed[1]])
            x_val = [x_val_1, x_val_2]
        else:
            x_val = ques_embed
        pred = graph.predict(x_val)
        pre = pt.prereocess_idx(pred[0])
        print(pre)
def train(hyper_parameters=None, rate=1.0):
    """
        training entry
    :param hyper_parameters: json, hyper-parameters
    :param rate: float, fraction of the corpus sampled for training
    :return: None
    """
    if not hyper_parameters:
        hyper_parameters = {
            'len_max': 32,  # max sentence length, fixed; 20-50 recommended
            'embed_size': 768,  # char/word embedding dimension
            'vocab_size': 20000,  # placeholder; reset from the corpus in code
            'trainable': True,  # whether the embedding is static or dynamic, i.e. whether it can be fine-tuned
            'level_type': 'char',  # smallest unit, 'char' or 'word'
            'embedding_type': 'bert',  # embedding type, one of 'xlnet', 'random', 'bert', 'albert' or 'word2vec'
            'gpu_memory_fraction': 0.76,  # gpu memory fraction
            'model': {
                'label': 2,  # number of classes
                'batch_size': 2,  # batch size; in principle the larger the better, especially with imbalanced samples; this setting has a big impact
                'filters': [2, 3, 4, 5],  # convolution kernel sizes
                'filters_num': 300,  # number of filters, text-cnn: 300-600
                'channel_size': 1,  # number of CNN channels
                'dropout': 0.5,  # dropout probability
                'decay_step': 100,  # decay the learning rate every N steps
                'decay_rate': 0.9,  # multiplicative learning-rate decay factor
                'epochs': 20,  # max training epochs
                'patience': 3,  # early-stopping patience, 2-3 is enough
                'lr': 5e-5,  # learning rate; has a large impact on training, tune it if accuracy stalls
                'l2': 1e-9,  # l2 regularization
                'activate_classify': 'sigmoid',  # activation of the final classification layer
                'loss': 'binary_crossentropy',  # loss function
                'metrics': 'accuracy',  # metric used to decide whether a better model is saved
                'is_training': True,  # training or testing mode
                'model_path': path_model,  # model path; saved whenever loss improves, save_best_only=True, save_weights_only=True
                'path_hyper_parameters': path_hyper_parameters,  # path of the hyper-parameters (model, including embedding)
                'path_fineture': path_fineture,  # path of the trainable fine-tuned embedding, e.g. char/word/bert vectors
            },
            'embedding': {
                'layer_indexes': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],  # bert layers to use, including the embedding layer
                # 'corpus_path': '',  # path of the pretrained embedding; defaults to the path in conf if unset. keras-bert can load Google bert, Baidu ernie (needs conversion, https://github.com/ArthurRizar/tensorflow_ernie) and HIT bert-wwm (tf framework, https://github.com/ymcui/Chinese-BERT-wwm)
            },
            'data': {
                'train_data': path_sim_webank_train,  # training data
                'val_data': path_sim_webank_valid  # validation data
            },
        }

    # delete any previously saved model / fine-tuned embedding
    delete_file(path_model_dir)
    time_start = time.time()
    # initialize the graph
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    ra_ed = graph.word_embedding
    # data preprocessing
    pt = PreprocessSim()
    x_train, y_train = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
                                                       hyper_parameters['data']['train_data'],
                                                       ra_ed, rate=rate, shuffle=True)
    x_val, y_val = pt.preprocess_label_ques_to_idx(hyper_parameters['embedding_type'],
                                                   hyper_parameters['data']['val_data'],
                                                   ra_ed, rate=rate, shuffle=True)
    print("data preprocess ok!")
    print(len(y_train))
    # train
    graph.fit(x_train, y_train, x_val, y_val)
    print("time cost: " + str(time.time() - time_start))
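# Smoke-test sketch (assumes the webank data paths and the bert corpus path in
# conf are configured): rate=0.01 trains on 1% of the corpus, just enough to
# verify that the pipeline runs end to end.
if __name__ == "__main__":
    train(rate=0.01)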
def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.0):
    """
        evaluate the model on a test set
    :param path_hyper_parameter: str, path of the hyper-parameter json
    :param path_test: str, path of test data, the test set
    :param rate: float, fraction of the corpus sampled for evaluation
    :return: None
    """
    hyper_parameters = load_json(path_hyper_parameter)
    if path_test:  # override the test-data path from outside
        hyper_parameters['data']['test_data'] = path_test
    time_start = time.time()
    # initialize the graph
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    graph.load_model()
    print("graph load ok!")
    ra_ed = graph.word_embedding
    # data preprocessing
    pt = PreprocessSim(path_model_dir)
    data = pd.read_csv(hyper_parameters['data']['test_data'])
    sentence_1 = data["sentence1"].values.tolist()
    sentence_2 = data["sentence2"].values.tolist()
    labels = data["label"].values.tolist()
    sentence_1 = [extract_chinese(str(line1).upper()) for line1 in sentence_1]
    sentence_2 = [extract_chinese(str(line2).upper()) for line2 in sentence_2]
    labels = [extract_chinese(str(line3).upper()) for line3 in labels]
    # evaluate on only a fraction (rate) of the data set
    len_rate = int(len(labels) * rate)
    sentence_1 = sentence_1[0:len_rate]
    sentence_2 = sentence_2[0:len_rate]
    labels = labels[0:len_rate]
    y_pred = []
    count = 0
    for i in range(len_rate):
        count += 1
        ques_embed = ra_ed.sentence2idx(text=sentence_1[i], second_text=sentence_2[i])
        # print(hyper_parameters['embedding_type'])
        if hyper_parameters['embedding_type'] in ['bert', 'albert']:
            # bert input: token ids and segment ids
            x_val_1 = np.array([ques_embed[0]])
            x_val_2 = np.array([ques_embed[1]])
            x_val = [x_val_1, x_val_2]
            # predict
            pred = graph.predict(x_val)
            pre = pt.prereocess_idx(pred[0])
            label_pred = pre[0][0][0]
            if count % 1000 == 0:
                print(label_pred)
            y_pred.append(label_pred)
    print("data pred ok!")
    # convert gold labels and predictions to integer indices
    index_y = [pt.l2i_i2l['l2i'][i] for i in labels]
    index_pred = [pt.l2i_i2l['l2i'][i] for i in y_pred]
    target_names = [pt.l2i_i2l['i2l'][str(i)] for i in list(set((index_pred + index_y)))]
    # evaluation
    report_predict = classification_report(index_y, index_pred,
                                           target_names=target_names, digits=9)
    print(report_predict)
    print("time cost: " + str(time.time() - time_start))
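# A tiny fixture builder, a hypothetical helper for illustration only: it
# writes a csv with the three columns pred_tet reads via pandas ("sentence1",
# "sentence2", "label"), so the evaluation path can be smoke-tested.
def write_test_fixture(path="test_sim_fixture.csv"):
    import pandas as pd
    df = pd.DataFrame({"sentence1": ["我要打王者荣耀"],
                       "sentence2": ["我要打梦幻西游"],
                       "label": ["0"]})
    df.to_csv(path, index=False, encoding="utf-8")
    return path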
        'rnn_units': 256,  # large 650, small is 300
    },
    'embedding': {
        'embedding_type': 'bert',
        'corpus_path': path_embedding_bert,
        'level_type': 'char',
        'embed_size': 30,
        'len_max': 50,
        'layer_indexes': [12]  # range 1 to 12
    },
}

import time
time_start = time.time()
graph = Graph(hyper_parameters)
print("graph init ok!")
ra_ed = graph.word_embedding
pt = PreprocessText()
x_train, y_train = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_train,
                                                   ra_ed, rate=0.01)
x_val, y_val = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid,
                                               ra_ed, rate=0.01)
# shuffle the training set
indexs = [ids for ids in range(len(y_train))]
random.shuffle(indexs)
x_train, y_train = x_train[indexs], y_train[indexs]
print("data preprocess ok 1 !")
print(len(y_train))
x_train = x_train.tolist()
x_val = x_val.tolist()
def pred_tet():  # accuracy on the test set
    from keras_textclassification.conf.path_config import path_baidu_qa_2019_valid
    hyper_parameters = {
        'model': {
            'label': 17,
            'batch_size': 64,
            'embed_size': 30,
            'filters': [2, 3, 4],
            'filters_num': 30,
            'channel_size': 1,
            'dropout': 0.5,
            'decay_step': 100,
            'decay_rate': 0.9,
            'epochs': 20,
            'len_max': 50,
            'vocab_size': 20000,  # placeholder; reset from the corpus in code
            'lr': 1e-3,
            'l2': 1e-6,
            'activate_classify': 'softmax',
            'embedding_type': 'bert',  # one of 'random', 'bert' or 'word2vec'
            'is_training': True,
            'model_path': path_model_fast_text_baiduqa_2019,  # path can be customized
            'rnn_type': 'GRU',  # type of rnn, select 'LSTM', 'GRU', 'CuDNNGRU', 'CuDNNLSTM', 'Bidirectional-LSTM', 'Bidirectional-GRU'
            'rnn_units': 256,  # large 650, small is 300
        },
        'embedding': {
            'embedding_type': 'bert',
            'corpus_path': path_embedding_bert,
            'level_type': 'char',
            'embed_size': 30,
            'len_max': 50,
            'layer_indexes': [12]  # range 1 to 12
        },
    }
    pt = PreprocessText()
    graph = Graph(hyper_parameters)
    graph.load_model()
    ra_ed = graph.word_embedding
    x_val, y_val = pt.preprocess_baidu_qa_2019_idx(path_baidu_qa_2019_valid,
                                                   ra_ed, rate=1)
    x_val = x_val.tolist()
    y_val = y_val.tolist()
    y_pred = []
    count = 0
    for x_val_one in x_val:
        count += 1
        print(x_val_one)
        ques_embed_1 = np.array([x_val_one[0]])
        ques_embed_2 = np.array([x_val_one[1]])
        pred = graph.predict([ques_embed_1, ques_embed_2])
        print(pred)
        pred_top1 = pt.prereocess_pred_id(pred[0])
        print(pred_top1)
        y_pred.append(pred_top1)
    # count correct predictions
    acc = 0
    for i in range(len(y_val)):
        if y_val[i] == y_pred[i]:
            acc += 1
    print('true: {} total: {} acc: {}'.format(acc, len(y_val), acc / len(y_val)))
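# The counting loop above computes plain accuracy; scikit-learn (already a
# dependency here, since classification_report is used elsewhere) gives the
# same number in one call. A sketch, assuming y_val and y_pred are flat lists:
from sklearn.metrics import accuracy_score

def report_accuracy(y_true, y_pred):
    """Print the same accuracy as the manual counting loop above."""
    print('true: {} total: {} acc: {}'.format(
        sum(int(a == b) for a, b in zip(y_true, y_pred)),
        len(y_true), accuracy_score(y_true, y_pred)))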