def run(cls, action='predict', question='', question_list=None):
    """Dispatch ``action`` to the matching routine on ``cls``.

    Args:
        action: One of ``'train'``, ``'test'``, ``'doc'``, ``'softmax'``;
            any other value falls through to single-question prediction.
        question: Passed to ``cls.predict`` on the fall-through path.
        question_list: Passed to ``cls.predict_doc`` /
            ``cls.predict_doc_softmax`` for the ``'doc'`` / ``'softmax'``
            actions.

    Returns:
        ``None`` for ``'train'`` and ``'test'``; otherwise whatever the
        selected predict routine returns.
    """
    log.info('action : %s' % action)

    # The class object itself is handed over as the first argument of
    # every routine, exactly as the callers of this dispatcher expect.
    if action == 'train':
        cls.train(cls)
        return None
    if action == 'test':
        cls.test(cls)
        return None
    if action == 'doc':
        return cls.predict_doc(cls, question_list)
    if action == 'softmax':
        return cls.predict_doc_softmax(cls, question_list)
    return cls.predict(cls, question=question)
def pandas_read_sql(cls, sql):
    """Run ``sql`` through ``pd.read_sql`` and return the resulting frame.

    When ``cls._db_inst_use_pool`` is truthy, the object returned by
    ``cls.get_db_inst()`` is passed straight to ``pd.read_sql``. Otherwise
    a connection is opened for this single query and the object returned
    by ``cls.get_db_inst()`` is disposed afterwards.

    Args:
        sql: SQL text to execute.

    Returns:
        The value returned by ``pd.read_sql``, or ``None`` when the
        non-pooled read fails (the error is logged and its traceback is
        printed, but it is not re-raised).
    """
    use_pool = cls._db_inst_use_pool
    log.info("sql=" + sql)
    if use_pool:
        return pd.read_sql(sql, con=cls.get_db_inst())

    # Presumably a SQLAlchemy engine (it exposes connect()/dispose()) --
    # verify against get_db_inst().
    engine = cls.get_db_inst()
    try:
        # connect() is inside the outer try so that dispose() also runs
        # when opening the connection fails; that failure still propagates
        # to the caller, as it did before.
        connection = engine.connect()
        try:
            with connection:
                return pd.read_sql(sql, con=connection)
        except Exception as e:
            log.error("Read from Database failed: %s" % e)
            traceback.print_exc()
            return None
    finally:
        engine.dispose()
def initData():
    """Reload both source tables and publish them on ``Application``.

    Reads ``gx_regular_label`` and ``gx_regular_nolabel`` in full via
    ``SqlalchemyUtil.pandas_read_sql`` and stores the two results in
    ``Application.all_data``, keyed by table name.
    """
    labelled = SqlalchemyUtil.pandas_read_sql(
        sql='select * from gx_regular_label')
    log.info('数据已经更新')
    unlabelled = SqlalchemyUtil.pandas_read_sql(
        sql='select * from gx_regular_nolabel')

    # Assigned only after both reads return, so a failing read leaves the
    # previously published data untouched.
    Application.all_data = dict(
        gx_regular_label=labelled,
        gx_regular_nolabel=unlabelled,
    )
def tasker():
    """Refresh the cached data unless it was refreshed recently.

    While holding ``mutex``: if ``Application.lastTime`` is set and fewer
    than ``Application.updataTime`` whole seconds have passed since then,
    nothing is reloaded. Otherwise ``initData()`` is called, the refresh
    timestamp is updated, and ``Application.init_data`` /
    ``Application.init_model`` are set to ``True``.

    The mutex is always released, including on the early return and when
    ``initData()`` raises.
    """
    mutex.acquire()
    try:
        log.info('Application更新时间:%s' % Application.lastTime)
        # Identity test: None is a singleton, so ``is not`` is the
        # correct comparison.
        if Application.lastTime is not None:
            elapsed = int(time.time()) - int(Application.lastTime)
            if elapsed < Application.updataTime:
                log.info('---无需更新数据---')
                return
        initData()
        Application.lastTime = time.time()
        Application.init_data = True
        Application.init_model = True
    finally:
        mutex.release()
        log.debug('释放互斥锁')
def LoadConfig(self, conf_path):
    """Load database settings from the config file at ``conf_path``.

    The file is (re)read into ``self.cf`` only when ``conf_path`` differs
    from the currently recorded ``self.config_path``. The ``db_*``
    attributes are then populated from the ``[db]`` section.

    Args:
        conf_path: Path of the configuration file.

    Raises:
        KeyError: If ``self.cf`` has no ``db`` section.
    """
    # The message needs a placeholder: passing the path as a bare extra
    # argument makes the logging module fail to format the record.
    log.info('配置文件路径: %s', os.path.abspath(conf_path))
    if self.config_path != conf_path:
        self.config_path = conf_path
        self.cf.read(conf_path)

    # DB
    db_sec = self.cf['db']
    self.db_type = db_sec.get("type", "mysql")
    self.db_driver = db_sec.get("driver", "pymysql")
    self.db_host = db_sec.get('host')
    self.db_port = db_sec.get('port', '3306')
    self.db_user = db_sec.get('user')
    self.db_passwd = db_sec.get('passwd')
    self.db_name = db_sec.get('dbname', '')
    self.db_charset = db_sec.get('charset', 'utf8')
    self.db_insertInterval = db_sec.get('insertInterval')
    self.db_verStartTime = db_sec.get('verStartTime')
def train(self):
    """Train the TextCNN model and checkpoint it along the way.

    Builds a ``TextCnnModel`` with the configured dropout keep
    probability, prepares train/validation data from
    ``Application.all_data`` via ``dataHelper``, and runs up to
    ``self.config.num_epochs`` epochs inside a ``tf.Session``.

    Every ``self.config.print_per_batch`` batches the model is evaluated
    on the validation data; when validation accuracy is at least the best
    seen so far, a checkpoint named ``./best_validation`` is written after
    changing the working directory. Once 800 batches have run, the model
    is saved to ``TextCnnConfig.save_path`` and the batch loop is left.

    NOTE(review): the ``os.chdir`` below targets a hard-coded absolute
    path on drive ``G:`` and changes the process working directory for
    everything that runs afterwards, including how any later relative
    save path resolves.
    """
    Continue = False
    model = TextCnnModel(self.config, keep_prob=self.config.dropout_keep_prob)
    # Make sure the checkpoint directory for the Saver exists
    if not os.path.exists(TextCnnConfig.save_dir):
        os.makedirs(TextCnnConfig.save_dir)
    # Load the training and validation sets
    start_time = time.time()
    data = dataHelper.process_file(
        Application.all_data,
        vocab_dir=TextCnnConfig.vocab_dir,
        categories_dir=TextCnnConfig.categories_dir,
        max_length=TextCnnConfig.seq_length)
    train_data, val_data = dataHelper.build_train_val(
        data, reset=True)  # use True for a full run; test uses False
    # Create the session
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Continue is always False at this point, so the restore branch
        # never runs as written.
        if Continue:
            model.saver.restore(sess=sess, save_path=TextCnnConfig.save_path)
            Continue = False
        print('Training and evaluating...')
        # Restart the clock so reported times exclude data preparation
        start_time = time.time()
        total_batch = 0  # total number of batches run so far
        best_acc_val = 0.0  # best validation accuracy so far
        last_improved = 0  # batch index of the last improvement
        require_improvement = 1000  # stop early after more than 1000 batches without improvement
        loss_val = 0.0
        acc_val = 0.0
        flag = False
        for epoch in range(self.config.num_epochs):
            print('Train Epoch:', epoch + 1)
            print(val_data.shape)
            # train_data, val_data = self.reset_train_val(train_data,val_data)
            for batch_x, batch_y in dataHelper.batch_iter(train_data):
                # Hard cap on the number of batches: save and leave the
                # batch loop.
                # NOTE(review): `flag` is not set here, so the epoch loop
                # keeps going and each remaining epoch saves again and
                # breaks on its first batch -- confirm this is intended.
                if total_batch >= 800:
                    model.saver.save(sess=sess,
                                     save_path=TextCnnConfig.save_path)
                    break
                feed_dict = {
                    model.input_x: batch_x,
                    model.input_y: batch_y
                }
                if total_batch % self.config.print_per_batch == 0:
                    # Periodically report performance on the training
                    # batch and on the validation set
                    # feed_dict[model.keep_prob] = 1.0
                    loss_train, acc_train = sess.run(
                        [model.loss, model.acc], feed_dict=feed_dict)
                    loss_val, acc_val = self.evaluate(
                        sess, val_data, model)  # todo
                    # if (acc_val + 0.01) > best_acc_val and abs(acc_train-acc_val)<0.02:
                    if acc_val >= best_acc_val:
                        # Save the best result so far
                        best_acc_val = acc_val
                        last_improved = total_batch
                        # NOTE(review): hard-coded, machine-specific
                        # directory; this also changes the process-wide
                        # working directory.
                        os.chdir(
                            r"G:\项目3—广西文本分类\GxClassify\Model\TextCnnModel\checkpoints\best_validation"
                        )
                        print(os.getcwd())
                        model.saver.save(sess=sess,
                                         save_path="./best_validation")
                        improved_str = '*'
                    else:
                        improved_str = ''
                    time_dif = self.get_time_dif(start_time)
                    msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                        + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                    print(
                        msg.format(total_batch, loss_train, acc_train,
                                   loss_val, acc_val, time_dif, improved_str))
                sess.run(model.optim, feed_dict=feed_dict)  # run the optimizer step
                total_batch += 1
                # NOTE(review): with the 800-batch cap above, this
                # difference appears unable to exceed 1000 -- confirm
                # whether the early stop is still reachable.
                if total_batch - last_improved > require_improvement:
                    # Validation accuracy has not improved for a long
                    # time; stop training early
                    log.warning(
                        "No optimization for a long time, auto-stopping..."
                    )
                    flag = True
                    break  # leave the batch loop
            if flag:  # leave the epoch loop as well
                break
        # model.saver.save(sess=sess, save_path=TextCnnConfig.save_path)  # final model
    log.info('train finish use time :%s' % (self.get_time_dif(start_time)))
def __init__(self):
    """Log that CNN model configuration is starting; sets no state."""
    startup_message = 'Configuring CNN model...'
    log.info(startup_message)
def test(self):
    """Evaluate the saved model on the pickled test set and print a report.

    Loads the test data from ``self.config.test_dir``, restores the
    checkpoint at ``TextCnnConfig.save_path``, prints test loss/accuracy,
    a per-class precision/recall/F1 report, and the predicted and expected
    class ids of every misclassified sample, then logs the elapsed time.

    The TensorFlow session is closed once all predictions are collected.
    """
    model = TextCnnModel(self.config, keep_prob=1)
    start_time = time.time()

    # NOTE(review): pickle.load can execute arbitrary code; only safe as
    # long as the file at test_dir comes from a trusted source.
    with open(self.config.test_dir, 'rb') as f:
        test_data = pickle.load(f)
    test_size, _ = test_data.shape
    _, categories = dataHelper.read_category(
        categories_dir=TextCnnConfig.categories_dir)

    y_pred_cls = []
    y_test_cls = None
    # Context manager so the session is released even if evaluation fails.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Load the saved model
        model.saver.restore(sess=sess, save_path=TextCnnConfig.save_path)
        loss_test, acc_test = self.evaluate(sess, test_data, model)
        msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
        print(msg.format(loss_test, acc_test))

        # batch_size is the full test size, so this is expected to yield
        # a single batch; each iteration overwrites the previous results.
        for x, y in dataHelper.batch_iter(test_data, batch_size=test_size):
            feed_dict = {
                model.input_x: x,
            }
            y_pred_cls = sess.run(model.y_pred_cls, feed_dict=feed_dict)
            y_test_cls = np.argmax(y, 1)
    print(y_pred_cls)

    # Evaluation
    print("Precision, Recall and F1-Score...")
    print(
        metrics.classification_report(y_test_cls,
                                      y_pred_cls,
                                      target_names=list(categories)))

    # Collect the predicted / expected class ids of every mismatch.
    expected_ids = y_test_cls.tolist()
    predicted_ids = y_pred_cls.tolist()
    mismatches = [(predicted, expected)
                  for predicted, expected in zip(predicted_ids, expected_ids)
                  if predicted != expected]
    wc = [predicted for predicted, _ in mismatches]
    wct = [expected for _, expected in mismatches]
    print(wc)
    print(wct)
    print(len(wc))
    # Writing the mismatches back to the database (table 'q_label_test')
    # was already disabled in the previous implementation.

    time_dif = self.get_time_dif(start_time)
    # A placeholder is required; passing time_dif as a bare extra argument
    # makes the logging module fail to format the record.
    log.info("Time usage: %s", time_dif)