Ejemplo n.º 1
0
    def train_per_nn(self, verbose=False):
        """Train every network in nn_list for total_epoch epochs, then ensemble.

        Each epoch's ensemble prediction is the argmax of the element-wise sum
        of every member network's softmax output.

        Precondition: n_class == 3.
        verbose: when False, each member network prints less information.
        """
        ensemble_metrics = {
            'val': {'score_seq': []},
            'test': {'score_seq': []},
        }

        # Train each member network up to total_epoch.
        for nn in self.nn_list:
            nn.train(self.ds,
                     self.batch_size,
                     self.total_epoch,
                     self.feature_shuffle,  # shuffle features other than swell_t-1
                     self.train_all_data,   # also train on val/test data
                     verbose)

        print("\n[Ensembled Model Testing]")
        n_val = len(self.ds['val']['x'])
        n_test = len(self.ds['test']['x'])
        n_problem = len(self.ds['problem']['x'])
        for epoch in range(self.total_epoch):
            print("[Ensemble EPOCH: {}]".format(epoch))

            # Sum each model's softmax output, then take argmax for the
            # final ensemble prediction.
            val_softmax = np.zeros((n_val, 3))
            test_softmax = np.zeros((n_test, 3))
            problem_softmax = np.zeros((n_problem, 3))
            for nn in self.nn_list:
                val_softmax += nn.predicts['val_softmax'][epoch]
                test_softmax += nn.predicts['test_softmax'][epoch]
                problem_softmax += nn.predicts['problem_softmax'][epoch]

            val_pred_seq = np.argmax(val_softmax, axis=1)
            test_pred_seq = np.argmax(test_softmax, axis=1)
            problem_pred_seq = np.argmax(problem_softmax, axis=1)

            val_acc_seq, val_score_seq, val_max_score = util.calc_metric(
                self.ds['val']['y'].ravel(), val_pred_seq, 3)
            test_acc_seq, test_score_seq, test_max_score = util.calc_metric(
                self.ds['test']['y'].ravel(), test_pred_seq, 3)

            print(
                "[SUMMARY] val_acc_seq :{:.5}  val_score_seq :{:.5} (max:{:.5})"
                .format(val_acc_seq, val_score_seq, val_max_score))
            print("test_acc_seq :{:.5}  test_score_seq :{:.5} (max:{:.5})".
                  format(test_acc_seq, test_score_seq, test_max_score))

            ensemble_metrics['val']['score_seq'].append(val_score_seq)
            ensemble_metrics['test']['score_seq'].append(test_score_seq)
            plot_metrics(**ensemble_metrics)

            # Persist this epoch's ensemble predictions to an Excel file.
            util.save_result_excel(problem_pred_seq,
                                   filename='result_' + str(epoch) + 'ep.xlsx')
Ejemplo n.º 2
0
    def train_per_nn(self, sess, nn_list):
        """
        Train each nn in nn_list for total_epoch epochs, then ensemble
        their per-epoch predictions by averaging.

        sess: session handed to save_model once all networks are trained.
        nn_list: list of network objects exposing train() and a predicts dict.
        """

        ensemble_metrics = {'val': {'score': []}, 'test': {'score': []}}

        # Train each nn up to total_epoch.
        for nn in nn_list:
            nn.train(
                self.etler,
                -1,  # NOTE(review): presumably a drop_num_cols sentinel — confirm against nn.train
                -1,  # NOTE(review): presumably a drop_cat_cols sentinel — confirm against nn.train
                self.batch_size,
                self.total_epoch,
                self.train_all_data,  # whether val/test data are also used for training
            )
        self.save_model(sess)

        print("\n[Ensembled Model Testing]")
        for i in range(self.total_epoch):
            print("[Ensemble EPOCH: {}]".format(i))
            # NOTE(review): 333 appears to be the fixed number of prediction
            # columns — confirm it matches the input DataFrames' width.
            val_pred = np.zeros((len(self.etler.val_input_df), 333))
            test_pred = np.zeros((len(self.etler.test_input_df), 333))
            problem_pred = np.zeros((len(self.etler.problem_input_df), 333))

            # Sum each nn's epoch-i predictions, then average below to get
            # the ensemble output.
            for nn in nn_list:
                val_pred += nn.predicts['val']['x_pred'][i]
                test_pred += nn.predicts['test']['x_pred'][i]
                problem_pred += nn.predicts['problem']['x_pred'][i]

            # Average over the number of member models.
            val_pred = val_pred / len(nn_list)
            test_pred = test_pred / len(nn_list)
            problem_pred = problem_pred / len(nn_list)

            val_acc, val_score = util.calc_metric(
                self.etler.val_input_df.values, val_pred,
                self.etler.val_nan_pos, self.etler)
            test_acc, test_score = util.calc_metric(
                self.etler.test_input_df.values, test_pred,
                self.etler.test_nan_pos, self.etler)
            print('Validation Score')
            print(val_score)
            print('Test Score')
            print(test_score)

            print('[SUMMARY]')
            print(
                '[val ] score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
                    val_score.mean(), val_score[self.etler.num_vars].mean(),
                    val_score[self.etler.cat_vars].mean()))
            print(
                '[test] score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
                    test_score.mean(), test_score[self.etler.num_vars].mean(),
                    test_score[self.etler.cat_vars].mean()))

            ensemble_metrics['val']['score'].append(val_score.mean())
            ensemble_metrics['test']['score'].append(test_score.mean())
            plot_metrics(**ensemble_metrics)

            # Save this epoch's ensemble predictions to a file.
            # Reshape the problem-data predictions into the submission format.
            problem_imputed_df = pd.DataFrame(
                np.array(problem_pred),
                columns=self.etler.problem_input_df.columns)
            problem_imputed_df = self.etler.generate_output_df(
                problem_imputed_df)
            result_df = util.fill_result_df(self.etler.result_df,
                                            problem_imputed_df)

            # Save the answer sheet to result/ensemble_#epoch.csv.
            if not os.path.exists('result'):
                os.mkdir('result')
            result_df.to_csv(os.path.join('result',
                                          'ensemble_{}epoch.csv'.format(i)),
                             index=False,
                             encoding='cp949')
Ejemplo n.º 3
0
    def train(self,
              etler,
              drop_num_cols,
              drop_cat_cols,
              batch_size,
              total_epoch,
              train_all_data=False):
        """Train the imputation network for total_epoch epochs.

        etler: data provider exposing the split input DataFrames, NaN
            masks/positions, gen_random_nan_mask, generate_output_df, and the
            num_vars / cat_vars column groups.
        drop_num_cols / drop_cat_cols: forwarded to gen_random_nan_mask to
            control how the training NaN mask is sampled each epoch.
        batch_size: mini-batch size for run_batch.
        total_epoch: number of epochs to run.
        train_all_data: when True, val/test batches also update the model
            (is_training is set for those run_batch calls).

        Side effects: populates self.metrics and self.predicts, writes
        result/result_{epoch}epoch.csv every epoch, and plots metrics.
        """

        # data to use
        train_input_df = etler.train_input_df
        val_input_df = etler.val_input_df
        test_input_df = etler.test_input_df
        problem_input_df = etler.problem_input_df
        result_df = etler.result_df

        val_nan_mask = etler.val_nan_mask
        test_nan_mask = etler.test_nan_mask
        val_nan_pos = etler.val_nan_pos
        test_nan_pos = etler.test_nan_pos

        problem_nan_mask = problem_input_df.notnull().values.astype(
            float)  # nan -> 0, else -> 1
        # Fill NaNs with 0 so the network receives finite inputs; the mask
        # above records where the original NaNs were.
        problem_input_df = problem_input_df.fillna(0)

        train_data = train_input_df.values
        val_data = val_input_df.values
        test_data = test_input_df.values
        problem_data = problem_input_df.values

        # Per-epoch loss/score history for each split.
        self.metrics = {
            'train': {
                'loss': [],
                'score': []
            },
            'val': {
                'loss': [],
                'score': []
            },
            'test': {
                'loss': [],
                'score': []
            }
        }

        # Per-epoch raw predictions for each split (used later for ensembling).
        self.predicts = {
            'train': {
                'x_pred': []
            },
            'val': {
                'x_pred': []
            },
            'test': {
                'x_pred': []
            },
            'problem': {
                'x_pred': []
            }
        }

        for i in range(total_epoch):
            print('[NAME: {}, EPOCH: {}]'.format(self.name, i))
            print('> Train...')
            # Re-sample the training NaN mask every epoch so the model sees
            # different masked positions.
            train_nan_mask, train_nan_pos = etler.gen_random_nan_mask(
                len(train_data), drop_num_cols, drop_cat_cols)
            train_loss, train_num_loss, train_cat_loss, train_pred = self.run_batch(
                train_data, train_nan_mask, batch_size, is_training=True)
            train_acc, train_score = util.calc_metric(train_data, train_pred,
                                                      train_nan_pos, etler)

            print('> Validation...')

            val_loss, val_num_loss, val_cat_loss, val_pred = self.run_batch(
                val_data, val_nan_mask, batch_size, is_training=train_all_data)
            val_acc, val_score = util.calc_metric(val_data, val_pred,
                                                  val_nan_pos, etler)
            print(val_score)

            print('> Test...')
            test_loss, test_num_loss, test_cat_loss, test_pred = self.run_batch(
                test_data,
                test_nan_mask,
                batch_size,
                is_training=train_all_data)
            test_acc, test_score = util.calc_metric(test_data, test_pred,
                                                    test_nan_pos, etler)
            print(test_score)

            print('[train] loss: {:.4} num_loss: {:.4} cat_loss: {:.4}'.format(
                train_loss, train_num_loss, train_cat_loss))
            print('score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
                train_score.mean(), train_score[etler.num_vars].mean(),
                train_score[etler.cat_vars].mean()))
            print('[val]   loss:{:.4} num_loss:{:.4} cat_loss:{:.4}'.format(
                val_loss, val_num_loss, val_cat_loss))
            print('score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
                val_score.mean(), val_score[etler.num_vars].mean(),
                val_score[etler.cat_vars].mean()))
            print('[test]  loss:{:.4} num_loss:{:.4} cat_loss:{:.4}'.format(
                test_loss, test_num_loss, test_cat_loss))
            print('score: {:.4} num_score: {:.4} cat_score: {:.4}'.format(
                test_score.mean(), test_score[etler.num_vars].mean(),
                test_score[etler.cat_vars].mean()))
            print()

            problem_pred = self.problem_predict(problem_data, problem_nan_mask)

            # Reshape the problem-data predictions into the submission format.
            problem_imputed_df = pd.DataFrame(np.array(problem_pred),
                                              columns=problem_input_df.columns)
            problem_imputed_df = etler.generate_output_df(problem_imputed_df)
            result_df = util.fill_result_df(result_df, problem_imputed_df)

            # Save the answer sheet to result/result_#epoch.csv.
            if not os.path.exists('result'):
                os.mkdir('result')
            result_df.to_csv(os.path.join('result',
                                          'result_{}epoch.csv'.format(i)),
                             index=False,
                             encoding='cp949')

            self.metrics['train']['loss'].append(train_loss)
            self.metrics['train']['score'].append(train_score.mean())
            self.metrics['val']['loss'].append(val_loss)
            self.metrics['val']['score'].append(val_score.mean())
            self.metrics['test']['loss'].append(test_loss)
            self.metrics['test']['score'].append(test_score.mean())

            self.predicts['train']['x_pred'].append(train_pred)
            self.predicts['val']['x_pred'].append(val_pred)
            self.predicts['test']['x_pred'].append(test_pred)
            self.predicts['problem']['x_pred'].append(problem_pred)

            plot_metrics(**self.metrics)
Ejemplo n.º 4
0
    def train(self,
              ds,
              BATCH_SIZE,
              EPOCH,
              feature_shuffle=False,
              train_all_data=False,
              verbose=True):
        """Train the network for EPOCH epochs and collect per-epoch predictions.

        ds: dict of splits 'train'/'val'/'test'/'problem'; each split holds
            'x' (a 3-D array whose last feature is assumed to be swell_t-1 —
            see the weighting loop below) and, for all but 'problem', 'y'.
        BATCH_SIZE: mini-batch size handed to run_batch.
        EPOCH: number of training epochs.
        feature_shuffle: randomly permute the feature order in place across all
            splits; the last feature (swell_t-1) stays fixed.
        train_all_data: when True, the val/test run_batch calls also update
            the model (is_training=True).
        verbose: print extra per-step info and plot metrics every epoch.

        Returns:
            self.predicts — per-epoch sequence predictions and softmax outputs
            for the val/test/problem splits.
        """

        # Randomly shuffle the feature order in place; the last feature
        # (swell_t-1) is excluded from the permutation and stays fixed.
        if feature_shuffle:
            p = np.random.permutation(ds['train']['x'].shape[2] - 1)
            ds['train']['x'][:, :, :len(p)] = ds['train']['x'][:, :, p]
            ds['val']['x'][:, :, :len(p)] = ds['val']['x'][:, :, p]
            ds['test']['x'][:, :, :len(p)] = ds['test']['x'][:, :, p]
            ds['problem']['x'][:, :, :len(p)] = ds['problem']['x'][:, :, p]

        # One-hot encode labels for the loss computation.
        ds['train']['y_onehot'] = to_one_hot(ds['train']['y'])
        ds['val']['y_onehot'] = to_one_hot(ds['val']['y'])
        ds['test']['y_onehot'] = to_one_hot(ds['test']['y'])

        # Sample weights: samples whose last input label differs from the
        # target get double weight. (The commented-out variants are earlier
        # per-class inverse-frequency weighting experiments.)
        for d0 in ['train', 'val', 'test']:
            # w0 = (ds[d0]['y'] == 0).sum() / len(ds[d0]['y'])
            # w1 = (ds[d0]['y'] == 1).sum() / len(ds[d0]['y'])
            # w2 = (ds[d0]['y'] == 2).sum() / len(ds[d0]['y'])
            # ds[d0]['w'] = np.array([1 / w0, 1 / w1, 1 / w2])[ds[d0]['y']]
            # ds[d0]['w'] = np.array([1 / 0.45, 1 / 0.36, 1 / 0.19])[ds[d0]['y']]
            diff_samples = ds[d0]['x'][:, -1, -1] != ds[d0]['y'][:, -1]
            ds[d0]['w'] = np.array([1, 2])[diff_samples.astype(int)]

        for i in range(EPOCH):

            print('[NAME: {}, EPOCH: {}]'.format(self.name, i))
            # Train
            train_loss, _ = self.run_batch(ds['train']['x'],
                                           ds['train']['y_onehot'],
                                           ds['train']['w'],
                                           BATCH_SIZE,
                                           is_training=True)

            # Validation: one-step predictions...
            if verbose:
                print('predict ONE Validation')
            val_loss, val_pred_one = self.run_batch(ds['val']['x'],
                                                    ds['val']['y_onehot'],
                                                    ds['val']['w'],
                                                    BATCH_SIZE,
                                                    is_training=train_all_data)
            val_acc_one, val_score_one, val_max_score = util.calc_metric(
                ds['val']['y'].ravel(),
                val_pred_one.round().astype(int).ravel(), self.n_class,
                verbose)

            # ...and full-sequence predictions.
            if verbose:
                print('predict SEQ Validation')
            val_pred_seq, val_softmax_seq = self.predict_sequence(
                ds['val']['x'])
            val_pred_seq = val_pred_seq.round().astype(int).ravel()
            val_acc_seq, val_score_seq, val_max_score = util.calc_metric(
                ds['val']['y'].ravel(), val_pred_seq, self.n_class, verbose)

            # Test: one-step predictions...
            if verbose:
                print('predict ONE Test')
            test_loss, test_pred_one = self.run_batch(
                ds['test']['x'],
                ds['test']['y_onehot'],
                ds['test']['w'],
                BATCH_SIZE,
                is_training=train_all_data)
            test_acc_one, test_score_one, test_max_score = util.calc_metric(
                ds['test']['y'].ravel(),
                test_pred_one.ravel().round().astype(int), self.n_class,
                verbose)

            # ...and full-sequence predictions.
            if verbose:
                print('predict SEQ Test')
            test_pred_seq, test_softmax_seq = self.predict_sequence(
                ds['test']['x'])
            test_pred_seq = test_pred_seq.round().astype(int).ravel()
            test_acc_seq, test_score_seq, test_max_score = util.calc_metric(
                ds['test']['y'].ravel(), test_pred_seq, self.n_class, verbose)

            print(
                "[SUMMARY]\n(Loss) train: {:.5} val: {:.5} test: {:.5}".format(
                    train_loss, val_loss, test_loss))
            print("val_acc_seq : {:.5} val_score_seq : {:.5} (max: {:.5})".
                  format(val_acc_seq, val_score_seq, val_max_score))
            print("test_acc_seq: {:.5} test_score_seq: {:.5} (max: {:.5})\n".
                  format(test_acc_seq, test_score_seq, test_max_score))

            # append current epoch's metrics
            self.metrics['train']['loss'].append(train_loss)
            self.metrics['val']['loss'].append(val_loss)
            self.metrics['test']['loss'].append(test_loss)
            self.metrics['val']['score_seq'].append(val_score_seq)
            self.metrics['test']['score_seq'].append(test_score_seq)

            if verbose:
                plot_metrics(**self.metrics)

            # predict Problem
            problem_pred, problem_softmax = self.predict_sequence(
                ds['problem']['x'])
            # BUGFIX: round before casting, matching the val/test handling
            # above — a bare astype(int) truncates float predictions toward
            # zero instead of rounding to the nearest class.
            problem_pred = problem_pred.round().astype(int).ravel()

            # append current epoch's predictions
            self.predicts['val'].append(val_pred_seq)
            self.predicts['test'].append(test_pred_seq)
            self.predicts['problem'].append(problem_pred)
            self.predicts['val_softmax'].append(val_softmax_seq)
            self.predicts['test_softmax'].append(test_softmax_seq)
            self.predicts['problem_softmax'].append(problem_softmax)
        # END for i in range(EPOCH):

        return self.predicts