def sta_test_x(self):
    """Return the cached statistical features for the test set.

    Tries the precomputed CSV cache first; if loading fails for any
    reason, recomputes the statistics from the raw test texts and writes
    them back to the cache file.

    Returns:
        The statistical feature matrix for the test set (column set
        given by ``self.__sta_ltdfCol``).
    """
    if self.__sta_test_x is None:
        try:
            # Fast path: load the cached statistics from disk.
            df = pd.read_csv(PATH_CONFIG.get('file_sta_test'))
            self.__sta_test_x = df[self.__sta_ltdfCol].values
        except Exception:
            # Was a bare `except:` — narrowed so SystemExit and
            # KeyboardInterrupt are no longer swallowed.
            print('重新计算统计数据 - test')
            texts = self.test_org_texts()
            # create_sta_data both computes and persists the cache file.
            self.__sta_test_x = self.create_sta_data(
                texts, PATH_CONFIG.get('file_sta_test'))
    return self.__sta_test_x
def cut_test_text():
    """Return jieba-segmented test texts, caching the result on disk.

    Loads the cut-word cache when present; otherwise segments the raw
    test data with jieba (precise mode), replaces newlines with spaces,
    and saves the cache for subsequent calls.

    Returns:
        list[str]: one space-joined segmented string per test document.
    """
    try:
        lt_cut_text = read_cut_file(PATH_CONFIG.get('file_cut_test'))
    except Exception:
        # Was a bare `except:` — narrowed so SystemExit and
        # KeyboardInterrupt are no longer swallowed; any load failure
        # still falls back to recomputing from the raw data.
        print('测试数据首次加载...')
        lt_org_data = src2.d_org_data.org_test_data()
        lt_cut_text = [
            ' '.join(jieba.cut(text, cut_all=False)) for text in lt_org_data
        ]
        # Normalize line breaks so each document stays on one line.
        lt_cut_text = [
            text.replace('\n', ' ').replace('\r', ' ') for text in lt_cut_text
        ]
        # Persist so the next call hits the cache.
        save_cut_file(PATH_CONFIG.get('file_cut_test'), lt_cut_text)
    return lt_cut_text
def __init__(self, f_mgr):
    """Build the CNN predictor from a feature manager.

    Loads the word2vec matrix and the id->label map, fills the runtime
    sizes into a fresh config, builds the model graph, then restores the
    best saved checkpoint into a new TensorFlow session.

    Args:
        f_mgr: project feature manager (src2.features.Features) —
            provides feature matrices, vocab sizes and the label map.
    """
    self.f_mgr = f_mgr
    self.w2v_matrix = self.f_mgr.w2v_matrix()
    self.map_id_to_label = self.f_mgr.map_id_to_label()
    self.config = src2.ProConfig.ProConfig()
    # These sizes depend on the loaded features, so they are filled in
    # here rather than being static config values.
    self.config.w2v_vocab_size = len(self.w2v_matrix)
    self.config.cix_vocab_size = self.f_mgr.cix_vocab_size
    # f2 dimension taken from the first training row — assumes the
    # train and test f2 features share one width (TODO confirm).
    self.config.f2_dimension = len((self.f_mgr.f2_train_x())[0])
    # Build the graph before creating the session/saver.
    self.model = src2.ModelCnn.ModleCnn_3(self.config)
    self.session = tf.Session()
    self.session.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    # Restore the best checkpoint saved during training.
    saver.restore(sess=self.session, save_path=PATH_CONFIG.get('file_cnn_best'))
def process():
    """Run the trained CNN over the test set and write the submission file.

    Loads all four test feature views, predicts in mini-batches of 20,
    maps predicted class ids back to label strings, and writes
    ``<id>,<label>`` lines (UTF-8) to the configured submission path.
    """
    # Data manager and prediction wrapper around the restored model.
    f_mgr = src2.features.Features(src2.ProConfig.ProConfig())
    run = CnnPredict(f_mgr)

    print('加载测试数据...')
    test_voc_x = f_mgr.voc_test_x()
    test_w2v_x = f_mgr.w2v_test_x()
    test_f2_x = f_mgr.f2_test_x()
    test_cix_x = f_mgr.cix_test_x()

    print('加载测试数据id号 (从原始数据)...')
    test_id = src2.d_org_data.org_test_index()

    print('开始生成结果...')
    data_len = len(test_w2v_x)
    batch_size = 20
    num_batch = int((data_len - 1) / batch_size) + 1  # ceil division
    lt_ids = []
    for i in range(num_batch):
        # Coarse progress indicator (roughly every 10% of batches).
        if int((i * 1.0 / num_batch) * 100) % 10 == 0:
            print("%d / %d" % (i, num_batch))
        bng_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        lt_ids += run.predict_data(test_voc_x[bng_id:end_id],
                                   test_w2v_x[bng_id:end_id],
                                   test_f2_x[bng_id:end_id],
                                   test_cix_x[bng_id:end_id])

    print('结果存入文件...')
    # `label_id` renamed from `id`, which shadowed the builtin.
    lines = '\n'.join(str(tid) + ',' + run.map_id2label(label_id)
                      for tid, label_id in zip(test_id, lt_ids))
    save_path = PATH_CONFIG.get('file_sub')
    with codecs.open(save_path, "w", "utf-8") as f:
        f.write(lines)
        f.write('\n')
    print('完成: ' + save_path)
def org_test_index():
    """Return the ordered list of record ids from the raw test file."""
    frame = pd.read_json(PATH_CONFIG.get('file_org_test'), lines=True)
    indexed = frame.set_index('id')
    return list(indexed.index.values)
def org_train_data():
    """Load the raw training set.

    Returns:
        tuple: (texts, labels) — values of the '内容' and '标签'
        columns, in file order, keyed by 'id'.
    """
    frame = pd.read_json(PATH_CONFIG.get('file_org_train'), lines=True)
    frame = frame.set_index('id')
    texts = list(frame['内容'].values)
    labels = list(frame['标签'].values)
    return texts, labels
def save_path(self):
    """Path of the word2vec model file for the current vector size."""
    directory = PATH_CONFIG.get('dir_w2v')
    return '%s/w2v_%d.mod' % (directory, self.__vec_size)
def process():
    """Train the CNN model with checkpointing, early stopping and
    learning-rate decay.

    Loads all four training feature views plus labels, splits off a
    120-sample validation set, then trains ModleCnn_3, periodically
    logging to TensorBoard and saving the best checkpoint (judged
    primarily by validation accuracy, with train accuracy / losses as
    tie-breakers). Training aborts after 8000 batches without
    improvement; the learning rate is decayed after 2000 stale batches.
    """
    print('载入数据...')
    start_time = time.time()
    # Global project configuration.
    config = src2.ProConfig.ProConfig()
    # Data/feature manager.
    f_mgr = src2.features.Features(config)
    # Feature matrices; vocab sizes and dimensions are filled into the
    # config from the loaded data as a side effect.
    train_voc_x = f_mgr.voc_train_2_x()
    train_cix_x = f_mgr.cix_train_2_x()
    config.cix_vocab_size = f_mgr.cix_vocab_size
    train_w2v_x = f_mgr.w2v_train_2_x()
    w2v_matrix = f_mgr.w2v_matrix()
    config.w2v_vocab_size = len(w2v_matrix)
    train_f2_x = f_mgr.f2_train_2_x()
    config.f2_dimension = len(train_f2_x[0])
    # Training labels.
    train_y = f_mgr.train_2_y()
    # Hold out a validation split (120 samples — TODO confirm whether
    # this is a count or another unit in get_train_validation).
    train_voc_x, train_w2v_x, train_f2_x, train_cix_x, train_y, valid_voc_x, valid_w2v_x, valid_f2_x, valid_cix_x, valid_y = \
        get_train_validation(train_voc_x, train_w2v_x, train_f2_x, train_cix_x, train_y, 120)
    time_dif = get_time_dif(start_time)
    print("数据载入完成:", time_dif)

    # Build the model graph.
    # model = src2.ModelCnn.ModleCnn_1(config)
    model = src2.ModelCnn.ModleCnn_3(config)

    # TensorBoard scalars.
    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(PATH_CONFIG.get('dir_cnn_tb'))

    # Saver for the best checkpoint.
    saver = tf.train.Saver()

    # Create the session.
    # session = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    writer.add_graph(session.graph)

    print('开始执行训练...')
    start_time = time.time()
    total_batch = 0            # total batches processed
    best_acc_val = 0.0         # best validation accuracy so far
    best_acc_tra = 0.0         # train accuracy at the best checkpoint
    best_los_tra = 0.0         # train loss at the best checkpoint
    best_los_val = 0.0         # validation loss at the best checkpoint
    learning_rate = config.learning_rate
    last_improved = 0          # batch index of the last improvement
    last_learn_rate_dec = 0    # batch index of the last LR decay
    require_improvement = 8000     # stop after this many stale batches
    learnrate_dec_rounds = 2000    # decay LR after this many stale batches
    flag = False               # set when early stopping triggers
    for epoch in range(config.epochs_num):
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(train_voc_x, train_w2v_x, train_f2_x, train_cix_x, train_y, config.batch_size)
        for x1_batch, x2_batch, x3_batch, x4_batch, y_batch in batch_train:
            feed_dict = feed_data(model, x1_batch, x2_batch, x3_batch, x4_batch, y_batch, config.dropout_keep_prob,
                                  w2v_matrix, learning_rate)
            if total_batch % config.save_per_batch == 0:
                # Periodically write training scalars to TensorBoard.
                s = session.run(merged_summary, feed_dict=feed_dict)
                writer.add_summary(s, total_batch)
            if total_batch % config.print_per_batch == 0:
                # Periodically report performance on train and validation
                # sets (dropout disabled for evaluation).
                feed_dict[model.keep_prob] = 1.0
                loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                loss_val, acc_val = evaluate(session, model, valid_voc_x, valid_w2v_x, valid_f2_x, valid_cix_x, valid_y, w2v_matrix, learning_rate)  # todo
                if acc_val > best_acc_val:
                    # Strictly better validation accuracy: checkpoint.
                    best_acc_val = acc_val
                    best_acc_tra = acc_train
                    best_los_tra = loss_train
                    best_los_val = loss_val
                    last_improved = total_batch
                    saver.save(sess=session, save_path=PATH_CONFIG.get('file_cnn_best'))
                    improved_str = '*'
                elif acc_val == best_acc_val:
                    # Validation accuracy tied: accept only if at least one
                    # of (train acc, train loss, val loss) strictly improved
                    # and none regressed.
                    if (acc_train > best_acc_tra and loss_train <= best_los_tra and loss_val <= best_los_val) or \
                            (acc_train >= best_acc_tra and loss_train < best_los_tra and loss_val <= best_los_val) or \
                            (acc_train >= best_acc_tra and loss_train <= best_los_tra and loss_val < best_los_val):
                        # Checkpoint the tie-broken best result.
                        best_acc_val = acc_val
                        best_acc_tra = acc_train
                        best_los_tra = loss_train
                        best_los_val = loss_val
                        last_improved = total_batch
                        saver.save(sess=session, save_path=PATH_CONFIG.get('file_cnn_best'))
                        improved_str = '*'
                    else:
                        improved_str = ''
                else:
                    improved_str = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))
            session.run(model.optim, feed_dict=feed_dict)  # run one optimization step
            total_batch += 1
            if total_batch - last_improved > require_improvement:
                print("验证集长时间没有提升,强制训练结束...")
                flag = True
                break  # leave the batch loop; epoch loop exits via `flag`
            if total_batch - last_improved > learnrate_dec_rounds and total_batch - last_learn_rate_dec > learnrate_dec_rounds:
                # No improvement for a while: decay the learning rate,
                # bounded below by the configured minimum.
                learning_rate = max(learning_rate * config.learning_rate_dec_rate, config.learning_rate_min)
                last_learn_rate_dec = total_batch
                print("%d轮无结果,降低学习率%f" % (total_batch - last_improved, learning_rate))
        if flag:  # early stopping propagated from the inner loop
            break
    session.close()
def save_path(self):
    """Path of the tf-idf model file for the current vector size."""
    directory = PATH_CONFIG.get('dir_tfidt')
    return '%s/tfidf_%d.mod' % (directory, self.__vec_size)