def train_per_nn(self, verbose=False):
    """
    Train each nn in nn_list for total_epoch epochs, then ensemble their
    predictions (precondition: n_class == 3).

    verbose: if False, print less information for each nn.
    """
    ensemble_metrics = {
        'val': {'score_seq': []},
        'test': {'score_seq': []},
    }

    # Train each nn for total_epoch epochs.
    for nn in self.nn_list:
        nn.train(
            self.ds,
            self.batch_size,
            self.total_epoch,
            self.feature_shuffle,  # whether to shuffle features other than swell_t-1
            self.train_all_data,   # whether to also train on the val/test data
            verbose)

    print("\n[Ensembled Model Testing]")
    for i in range(self.total_epoch):
        print("[Ensemble EPOCH: {}]".format(i))
        val_softmax = np.zeros((len(self.ds['val']['x']), 3))
        test_softmax = np.zeros((len(self.ds['test']['x']), 3))
        problem_softmax = np.zeros((len(self.ds['problem']['x']), 3))

        # Sum the softmax outputs of every nn, then take the argmax as the
        # final prediction.
        for nn in self.nn_list:
            val_softmax += nn.predicts['val_softmax'][i]
            test_softmax += nn.predicts['test_softmax'][i]
            problem_softmax += nn.predicts['problem_softmax'][i]
        val_pred_seq = np.argmax(val_softmax, axis=1)
        test_pred_seq = np.argmax(test_softmax, axis=1)
        problem_pred_seq = np.argmax(problem_softmax, axis=1)

        val_acc_seq, val_score_seq, val_max_score = util.calc_metric(
            self.ds['val']['y'].ravel(), val_pred_seq, 3)
        test_acc_seq, test_score_seq, test_max_score = util.calc_metric(
            self.ds['test']['y'].ravel(), test_pred_seq, 3)

        print("[SUMMARY] val_acc_seq :{:.5} val_score_seq :{:.5} (max:{:.5})"
              .format(val_acc_seq, val_score_seq, val_max_score))
        print("test_acc_seq :{:.5} test_score_seq :{:.5} (max:{:.5})"
              .format(test_acc_seq, test_score_seq, test_max_score))

        ensemble_metrics['val']['score_seq'].append(val_score_seq)
        ensemble_metrics['test']['score_seq'].append(test_score_seq)
        plot_metrics(**ensemble_metrics)

        # Save the ensemble's predictions for this epoch to an Excel file.
        util.save_result_excel(problem_pred_seq,
                               filename='result_' + str(i) + 'ep.xlsx')
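

# A self-contained sketch of the soft-voting rule used above: sum the
# per-model class probabilities and take the argmax. The arrays are toy
# values, not outputs of the real models; `soft_vote` is an illustrative
# helper, not part of this codebase.
import numpy as np

def soft_vote(softmax_stack):
    """softmax_stack: (n_models, n_samples, n_class) -> (n_samples,) labels."""
    return np.argmax(softmax_stack.sum(axis=0), axis=1)

m1 = np.array([[0.7, 0.2, 0.1], [0.1, 0.5, 0.4], [0.3, 0.3, 0.4]])
m2 = np.array([[0.6, 0.3, 0.1], [0.4, 0.2, 0.4], [0.2, 0.5, 0.3]])
print(soft_vote(np.stack([m1, m2])))  # -> [0 2 1]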
def train_per_nn(self, sess, nn_list):
    """
    Train each nn in nn_list for total_epoch epochs, then ensemble their
    predictions.
    """
    ensemble_metrics = {'val': {'score': []}, 'test': {'score': []}}

    # Train each nn for total_epoch epochs.
    for nn in nn_list:
        nn.train(
            self.etler,
            -1,  # drop_num_cols
            -1,  # drop_cat_cols
            self.batch_size,
            self.total_epoch,
            self.train_all_data,  # whether to also train on the val/test data
        )
    self.save_model(sess)

    print("\n[Ensembled Model Testing]")
    for i in range(self.total_epoch):
        print("[Ensemble EPOCH: {}]".format(i))
        # Accumulators shaped like the input frames (rows x input columns).
        val_pred = np.zeros(self.etler.val_input_df.shape)
        test_pred = np.zeros(self.etler.test_input_df.shape)
        problem_pred = np.zeros(self.etler.problem_input_df.shape)

        # Average the predictions of every nn model.
        for nn in nn_list:
            val_pred += nn.predicts['val']['x_pred'][i]
            test_pred += nn.predicts['test']['x_pred'][i]
            problem_pred += nn.predicts['problem']['x_pred'][i]
        val_pred = val_pred / len(nn_list)
        test_pred = test_pred / len(nn_list)
        problem_pred = problem_pred / len(nn_list)

        val_acc, val_score = util.calc_metric(
            self.etler.val_input_df.values, val_pred,
            self.etler.val_nan_pos, self.etler)
        test_acc, test_score = util.calc_metric(
            self.etler.test_input_df.values, test_pred,
            self.etler.test_nan_pos, self.etler)

        print('Validation Score')
        print(val_score)
        print('Test Score')
        print(test_score)
        print('[SUMMARY]')
        print('[val ] score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
            val_score.mean(), val_score[self.etler.num_vars].mean(),
            val_score[self.etler.cat_vars].mean()))
        print('[test] score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
            test_score.mean(), test_score[self.etler.num_vars].mean(),
            test_score[self.etler.cat_vars].mean()))

        ensemble_metrics['val']['score'].append(val_score.mean())
        ensemble_metrics['test']['score'].append(test_score.mean())
        plot_metrics(**ensemble_metrics)

        # Shape the predictions on the problem data into the required output
        # format.
        problem_imputed_df = pd.DataFrame(
            np.array(problem_pred),
            columns=self.etler.problem_input_df.columns)
        problem_imputed_df = self.etler.generate_output_df(problem_imputed_df)
        result_df = util.fill_result_df(self.etler.result_df,
                                        problem_imputed_df)

        # Save the answer sheet for this epoch to result/ensemble_#epoch.csv.
        if not os.path.exists('result'):
            os.mkdir('result')
        result_df.to_csv(os.path.join('result',
                                      'ensemble_{}epoch.csv'.format(i)),
                         index=False, encoding='cp949')
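

# `plot_metrics` is called above but not defined in this section. A minimal
# sketch of a compatible helper, assuming matplotlib; the project's real
# implementation may differ.
import matplotlib.pyplot as plt

def plot_metrics(**metrics):
    """Plot per-epoch curves, e.g. plot_metrics(val={'score': [...]}, ...)."""
    for split, curves in metrics.items():
        for name, values in curves.items():
            plt.plot(values, label='{} {}'.format(split, name))
    plt.xlabel('epoch')
    plt.legend()
    plt.show()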
def train(self,
          etler,
          drop_num_cols,
          drop_cat_cols,
          batch_size,
          total_epoch,
          train_all_data=False):
    # Data to use.
    train_input_df = etler.train_input_df
    val_input_df = etler.val_input_df
    test_input_df = etler.test_input_df
    problem_input_df = etler.problem_input_df
    result_df = etler.result_df
    val_nan_mask = etler.val_nan_mask
    test_nan_mask = etler.test_nan_mask
    val_nan_pos = etler.val_nan_pos
    test_nan_pos = etler.test_nan_pos
    problem_nan_mask = problem_input_df.notnull().values.astype(
        float)  # NaN -> 0, otherwise -> 1
    problem_input_df = problem_input_df.fillna(0)
    train_data = train_input_df.values
    val_data = val_input_df.values
    test_data = test_input_df.values
    problem_data = problem_input_df.values

    self.metrics = {
        'train': {'loss': [], 'score': []},
        'val': {'loss': [], 'score': []},
        'test': {'loss': [], 'score': []},
    }
    self.predicts = {
        'train': {'x_pred': []},
        'val': {'x_pred': []},
        'test': {'x_pred': []},
        'problem': {'x_pred': []},
    }

    for i in range(total_epoch):
        print('[NAME: {}, EPOCH: {}]'.format(self.name, i))

        print('> Train...')
        # Hide a fresh random set of cells each epoch (denoising-style training).
        train_nan_mask, train_nan_pos = etler.gen_random_nan_mask(
            len(train_data), drop_num_cols, drop_cat_cols)
        train_loss, train_num_loss, train_cat_loss, train_pred = self.run_batch(
            train_data, train_nan_mask, batch_size, is_training=True)
        train_acc, train_score = util.calc_metric(train_data, train_pred,
                                                  train_nan_pos, etler)

        print('> Validation...')
        val_loss, val_num_loss, val_cat_loss, val_pred = self.run_batch(
            val_data, val_nan_mask, batch_size, is_training=train_all_data)
        val_acc, val_score = util.calc_metric(val_data, val_pred,
                                              val_nan_pos, etler)
        print(val_score)

        print('> Test...')
        test_loss, test_num_loss, test_cat_loss, test_pred = self.run_batch(
            test_data, test_nan_mask, batch_size, is_training=train_all_data)
        test_acc, test_score = util.calc_metric(test_data, test_pred,
                                                test_nan_pos, etler)
        print(test_score)

        print('[train] loss: {:.4} num_loss: {:.4} cat_loss: {:.4}'.format(
            train_loss, train_num_loss, train_cat_loss))
        print('score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
            train_score.mean(), train_score[etler.num_vars].mean(),
            train_score[etler.cat_vars].mean()))
        print('[val ] loss: {:.4} num_loss: {:.4} cat_loss: {:.4}'.format(
            val_loss, val_num_loss, val_cat_loss))
        print('score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
            val_score.mean(), val_score[etler.num_vars].mean(),
            val_score[etler.cat_vars].mean()))
        print('[test] loss: {:.4} num_loss: {:.4} cat_loss: {:.4}'.format(
            test_loss, test_num_loss, test_cat_loss))
        print('score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
            test_score.mean(), test_score[etler.num_vars].mean(),
            test_score[etler.cat_vars].mean()))
        print()

        problem_pred = self.problem_predict(problem_data, problem_nan_mask)

        # Shape the predictions on the problem data into the required output
        # format.
        problem_imputed_df = pd.DataFrame(np.array(problem_pred),
                                          columns=problem_input_df.columns)
        problem_imputed_df = etler.generate_output_df(problem_imputed_df)
        result_df = util.fill_result_df(result_df, problem_imputed_df)

        # Save the answer sheet for this epoch to result/result_#epoch.csv.
        if not os.path.exists('result'):
            os.mkdir('result')
        result_df.to_csv(os.path.join('result',
                                      'result_{}epoch.csv'.format(i)),
                         index=False, encoding='cp949')

        self.metrics['train']['loss'].append(train_loss)
        self.metrics['train']['score'].append(train_score.mean())
        self.metrics['val']['loss'].append(val_loss)
        self.metrics['val']['score'].append(val_score.mean())
        self.metrics['test']['loss'].append(test_loss)
        self.metrics['test']['score'].append(test_score.mean())
        self.predicts['train']['x_pred'].append(train_pred)
        self.predicts['val']['x_pred'].append(val_pred)
        self.predicts['test']['x_pred'].append(test_pred)
        self.predicts['problem']['x_pred'].append(problem_pred)

        plot_metrics(**self.metrics)
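

# `etler.gen_random_nan_mask` is defined elsewhere. A minimal sketch of the
# idea, assuming it hides a fixed number of random cells per row and returns
# a float mask (0 = hidden, 1 = kept) plus the hidden positions; the separate
# numeric/categorical column handling of the real method is omitted here.
import numpy as np

def gen_random_nan_mask_sketch(n_rows, n_cols, n_drop_per_row, seed=None):
    rng = np.random.default_rng(seed)
    mask = np.ones((n_rows, n_cols), dtype=float)
    for r in range(n_rows):
        hidden = rng.choice(n_cols, size=n_drop_per_row, replace=False)
        mask[r, hidden] = 0.0  # these cells are hidden from the network
    nan_pos = mask == 0.0      # positions the model must reconstruct
    return mask, nan_pos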
def train(self,
          ds,
          BATCH_SIZE,
          EPOCH,
          feature_shuffle=False,
          train_all_data=False,
          verbose=True):
    # Randomly shuffle the feature order, keeping the last feature
    # (swell_t-1) fixed.
    if feature_shuffle:
        p = np.random.permutation(ds['train']['x'].shape[2] - 1)
        ds['train']['x'][:, :, :len(p)] = ds['train']['x'][:, :, p]
        ds['val']['x'][:, :, :len(p)] = ds['val']['x'][:, :, p]
        ds['test']['x'][:, :, :len(p)] = ds['test']['x'][:, :, p]
        ds['problem']['x'][:, :, :len(p)] = ds['problem']['x'][:, :, p]

    # One-hot encode the labels.
    ds['train']['y_onehot'] = to_one_hot(ds['train']['y'])
    ds['val']['y_onehot'] = to_one_hot(ds['val']['y'])
    ds['test']['y_onehot'] = to_one_hot(ds['test']['y'])

    # Sample weights: double the weight of samples whose label differs from
    # the previous swell state (swell_t-1), i.e. the transition cases.
    # Inverse class-frequency alternatives are kept commented out below.
    for d0 in ['train', 'val', 'test']:
        # w0 = (ds[d0]['y'] == 0).sum() / len(ds[d0]['y'])
        # w1 = (ds[d0]['y'] == 1).sum() / len(ds[d0]['y'])
        # w2 = (ds[d0]['y'] == 2).sum() / len(ds[d0]['y'])
        # ds[d0]['w'] = np.array([1 / w0, 1 / w1, 1 / w2])[ds[d0]['y']]
        # ds[d0]['w'] = np.array([1 / 0.45, 1 / 0.36, 1 / 0.19])[ds[d0]['y']]
        diff_samples = ds[d0]['x'][:, -1, -1] != ds[d0]['y'][:, -1]
        ds[d0]['w'] = np.array([1, 2])[diff_samples.astype(int)]

    for i in range(EPOCH):
        print('[NAME: {}, EPOCH: {}]'.format(self.name, i))

        # Train
        train_loss, _ = self.run_batch(ds['train']['x'],
                                       ds['train']['y_onehot'],
                                       ds['train']['w'],
                                       BATCH_SIZE,
                                       is_training=True)

        # Validation
        if verbose:
            print('predict ONE Validation')
        val_loss, val_pred_one = self.run_batch(ds['val']['x'],
                                                ds['val']['y_onehot'],
                                                ds['val']['w'],
                                                BATCH_SIZE,
                                                is_training=train_all_data)
        val_acc_one, val_score_one, val_max_score = util.calc_metric(
            ds['val']['y'].ravel(),
            val_pred_one.round().astype(int).ravel(), self.n_class, verbose)
        if verbose:
            print('predict SEQ Validation')
        val_pred_seq, val_softmax_seq = self.predict_sequence(ds['val']['x'])
        val_pred_seq = val_pred_seq.round().astype(int).ravel()
        val_acc_seq, val_score_seq, val_max_score = util.calc_metric(
            ds['val']['y'].ravel(), val_pred_seq, self.n_class, verbose)

        # Test
        if verbose:
            print('predict ONE Test')
        test_loss, test_pred_one = self.run_batch(ds['test']['x'],
                                                  ds['test']['y_onehot'],
                                                  ds['test']['w'],
                                                  BATCH_SIZE,
                                                  is_training=train_all_data)
        test_acc_one, test_score_one, test_max_score = util.calc_metric(
            ds['test']['y'].ravel(),
            test_pred_one.ravel().round().astype(int), self.n_class, verbose)
        if verbose:
            print('predict SEQ Test')
        test_pred_seq, test_softmax_seq = self.predict_sequence(
            ds['test']['x'])
        test_pred_seq = test_pred_seq.round().astype(int).ravel()
        test_acc_seq, test_score_seq, test_max_score = util.calc_metric(
            ds['test']['y'].ravel(), test_pred_seq, self.n_class, verbose)

        print("[SUMMARY]\n(Loss) train: {:.5} val: {:.5} test: {:.5}".format(
            train_loss, val_loss, test_loss))
        print("val_acc_seq : {:.5} val_score_seq : {:.5} (max: {:.5})".format(
            val_acc_seq, val_score_seq, val_max_score))
        print("test_acc_seq: {:.5} test_score_seq: {:.5} (max: {:.5})\n".format(
            test_acc_seq, test_score_seq, test_max_score))

        # Append this epoch's metrics.
        self.metrics['train']['loss'].append(train_loss)
        self.metrics['val']['loss'].append(val_loss)
        self.metrics['test']['loss'].append(test_loss)
        self.metrics['val']['score_seq'].append(val_score_seq)
        self.metrics['test']['score_seq'].append(test_score_seq)
        if verbose:
            plot_metrics(**self.metrics)

        # Predict the problem data.
        problem_pred, problem_softmax = self.predict_sequence(
            ds['problem']['x'])
        problem_pred = problem_pred.round().astype(int).ravel()

        # Append this epoch's predictions.
        self.predicts['val'].append(val_pred_seq)
        self.predicts['test'].append(test_pred_seq)
        self.predicts['problem'].append(problem_pred)
        self.predicts['val_softmax'].append(val_softmax_seq)
        self.predicts['test_softmax'].append(test_softmax_seq)
        self.predicts['problem_softmax'].append(problem_softmax)
    # end of "for i in range(EPOCH)" loop

    return self.predicts
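

# `to_one_hot` is called in train() above but not defined in this section.
# A minimal sketch for integer labels with n_class classes (the real helper
# may differ, e.g. in dtype or axis handling):
import numpy as np

def to_one_hot(y, n_class=3):
    """Map an integer label array of any shape to one-hot vectors on a new last axis."""
    return np.eye(n_class)[np.asarray(y).astype(int)]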