Exemple #1
0
    def run_predict_cv(self) -> None:
        """Predict the test data by averaging the models of every CV fold.

        ``run_train_cv`` must have been executed beforehand: each fold model
        is rebuilt with ``build_model`` and loaded from ``self.out_dir_name``.
        The averaged prediction is pickled as a DataFrame to
        ``<out_dir_name><run_name>_pred.pkl``.
        """
        self.logger.info(f'{self.run_name} - start prediction cv')
        test_x = self.load_x_test()
        fold_preds = []

        # One prediction per fold model.
        for i_fold in range(self.n_splits):
            self.logger.info(f'{self.run_name} - start prediction fold:{i_fold}')
            model = self.build_model(i_fold)
            model.load_model(self.out_dir_name)
            fold_pred = model.predict(test_x)
            if self.metrics_name == 'RMSLE':
                # For RMSLE the raw model output is mapped through expm1.
                fold_pred = np.expm1(fold_pred)
            fold_preds.append(fold_pred)
            self.logger.info(f'{self.run_name} - end prediction fold:{i_fold}')

        # Average across folds; for accuracy the average is rounded.
        pred_avg = np.mean(fold_preds, axis=0)
        if self.metrics_name == 'ACC':
            pred_avg = np.round(pred_avg)

        # Persist the inference result (the data to be submitted).
        out_path = self.out_dir_name + f'{self.run_name}_pred.pkl'
        Util.dump_df_pickle(pd.DataFrame(pred_avg), out_path)

        self.logger.info(f'{self.run_name} - end prediction cv')
Exemple #2
0
    def run_predict_all(self) -> None:
        """Predict the test data with the model trained on all training data.

        ``run_train_all`` must have been executed beforehand: the ``'all'``
        model is rebuilt with ``build_model`` and loaded from
        ``self.out_dir_name``. The prediction is dumped to
        ``../model/pred/<run_name>-test.pkl``.
        """
        self.logger.info(f'{self.run_name} - start prediction all')

        test_x = self.load_x_test()

        # The model fitted on the full training set is stored under 'all'.
        full_model = self.build_model('all')
        full_model.load_model(self.out_dir_name)
        test_pred = full_model.predict(test_x)

        # Persist the prediction.
        Util.dump(test_pred, f'../model/pred/{self.run_name}-test.pkl')

        self.logger.info(f'{self.run_name} - end prediction all')
Exemple #3
0
    def train(self, tr_x, tr_y, va_x=None, va_y=None):
        """Build and fit a feed-forward Keras network on one-hot encoded inputs.

        The categorical columns (``self.categoricals``) are transformed with
        the encoder loaded from ``'one-hot-enc.pkl'`` and then standardised
        with a ``StandardScaler`` fitted on the training data. The fitted
        network and scaler are kept on ``self.model`` / ``self.scaler``.

        Args:
            tr_x: Training features; indexed with ``self.categoricals``.
            tr_y: Training targets.
            va_x: Optional validation features. When given, early stopping
                and best-weight checkpointing on ``val_loss`` are enabled.
            va_y: Validation targets, used together with ``va_x``.

        Reads from ``self.params``: ``classes`` (units of the output layer),
        ``layers`` (number of hidden blocks), ``dropout``, ``units`` (width of
        the first hidden layer, halved for each further one), ``nb_epoch``
        and ``patience``.
        """
        validation = va_x is not None
        checkpoint_path = 'nn_model.w8'

        # Encode and scale the data.
        self.one_hot_encoder = Util.load('one-hot-enc.pkl')
        tr_x = self.one_hot_encoder.transform(tr_x[self.categoricals])

        scaler = StandardScaler()
        scaler.fit(tr_x)
        tr_x = scaler.transform(tr_x)

        if validation:
            va_x = self.one_hot_encoder.transform(va_x[self.categoricals])
            va_x = scaler.transform(va_x)

        # Hyper-parameters.
        classes = self.params['classes']
        layers = self.params['layers']
        dropout = self.params['dropout']
        units = self.params['units']
        nb_epoch = self.params['nb_epoch']
        patience = self.params['patience']

        # Build the network: Dense -> PReLU -> BatchNorm -> Dropout blocks.
        model = Sequential()
        model.add(Dense(units, input_shape=(tr_x.shape[1],)))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(dropout))

        for _ in range(layers - 1):
            # Each further hidden layer is half as wide as the previous one.
            units = int(units / 2)
            model.add(Dense(units))
            model.add(PReLU())
            model.add(BatchNormalization())
            model.add(Dropout(dropout))

        model.add(Dense(classes))
        adam = optimizers.Adam(lr=1e-4)
        model.compile(optimizer=adam, loss="mean_absolute_error")

        if validation:
            early_stopping = EarlyStopping(monitor='val_loss', patience=patience,
                                           verbose=1, restore_best_weights=True)
            save_best = ModelCheckpoint(checkpoint_path, save_weights_only=True,
                                        save_best_only=True, verbose=1)
            model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=128, verbose=2,
                      validation_data=(va_x, va_y),
                      callbacks=[save_best, early_stopping])
            # Reload the best checkpoint written during this fit.
            model.load_weights(checkpoint_path)
        else:
            # No checkpoint is written without validation data, so the
            # weights obtained by this fit are kept as they are. Loading
            # 'nn_model.w8' here would either fail or overwrite them with a
            # stale file left by an earlier run.
            model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=128, verbose=2)

        # Keep the model and the scaler for prediction and saving.
        self.model = model
        self.scaler = scaler
Exemple #4
0
 def load_model(self, path):
     """Load this run/fold's model from ``path`` into ``self.model``.

     The file read is ``<path>/<run_fold_name>.model``, via ``Util.load``.
     """
     file_name = f'{self.run_fold_name}.model'
     self.model = Util.load(os.path.join(path, file_name))
Exemple #5
0
 def save_model(self, path):
     """Dump ``self.model`` to ``<path>/<run_fold_name>.model``.

     The target directory is created first if it does not exist yet.
     """
     target = os.path.join(path, f'{self.run_fold_name}.model')
     parent_dir = os.path.dirname(target)
     os.makedirs(parent_dir, exist_ok=True)
     Util.dump(self.model, target)
Exemple #6
0
    def calc_feature_importance(self,
                                dir_name,
                                run_name,
                                features,
                                n_splits,
                                type='gain'):
        """Aggregate per-fold feature importances and save them as a bar chart.

        Loads ``{run_name}-fold{i}.model`` from ``dir_name`` for every fold,
        reads each model's ``feature_importance`` and writes
        ``_fi_<kind>.png`` (and, except for multi-fold split importance,
        ``_importance_<kind>.csv``). Output paths are built as
        ``dir_name + run_name + suffix``.

        Args:
            dir_name: Directory holding the fold models; also the prefix of
                every output path (concatenated as-is, without a separator).
            run_name: Run identifier used in model and output file names.
            features: Feature names used as the index of the importance
                table; its length must match the models' importance vectors.
            n_splits: Number of fold models to load. With exactly one fold
                the raw importance is reported; otherwise the mean, standard
                deviation and coefficient of variation across folds.
            type: ``'gain'`` selects gain importance; any other value selects
                split importance.

        Raises:
            ValueError: If ``n_splits`` is less than 1.
        """
        if n_splits < 1:
            raise ValueError(f'n_splits must be at least 1, got {n_splits}')

        # `type` shadows the builtin but is part of the public signature.
        importance_type = 'gain' if type == 'gain' else 'split'
        out_prefix = dir_name + run_name

        # One importance vector per fold model.
        fold_importances = []
        for i in range(n_splits):
            model_path = os.path.join(dir_name, f'{run_name}-fold{i}.model')
            model = Util.load(model_path)
            fold_importances.append(
                pd.Series(model.feature_importance(
                    importance_type=importance_type)))

        if n_splits == 1:
            df = pd.DataFrame(fold_importances[0].values,
                              index=features,
                              columns=['importance'])
            df = df.sort_values('importance', ascending=False)
            df.to_csv(out_prefix + f'_importance_{importance_type}.csv')
            # Plot only the 100 most important features, largest at the top.
            df = df.sort_values('importance', ascending=True).tail(100)

            bar_columns = ['importance']
            xlabel = 'feature importance'
            figsize, labelsize, dpi = (10, 30), 10, 200
        else:
            per_fold = pd.concat(fold_importances, axis=1)

            # Mean and standard deviation across folds, indexed by feature.
            importance_df_mean = pd.DataFrame(
                per_fold.mean(axis=1).values, index=features,
                columns=['importance']).sort_values('importance')
            importance_df_std = pd.DataFrame(
                per_fold.std(axis=1).values, index=features,
                columns=['importance']).sort_values('importance')
            df = pd.merge(importance_df_mean,
                          importance_df_std,
                          left_index=True,
                          right_index=True,
                          suffixes=('_mean', '_std'))

            # Coefficient of variation; 0/0 (NaN) is reported as 0.
            df['coef_of_var'] = df['importance_std'] / df['importance_mean']
            df['coef_of_var'] = df['coef_of_var'].fillna(0)

            if importance_type == 'gain':
                df = df.sort_values('importance_mean', ascending=False)
                df.to_csv(out_prefix + '_importance_gain.csv')
                df = df.sort_values('importance_mean',
                                    ascending=True).tail(100)
                figsize, labelsize, dpi = (10, 30), 10, 200
            else:
                # NOTE(review): unlike the other three cases, multi-fold
                # split importance writes no CSV and plots every feature on
                # a taller figure. Kept as-is; confirm whether intentional.
                df = df.sort_values('importance_mean', ascending=True)
                figsize, labelsize, dpi = (10, 90), 8, 300

            bar_columns = ['importance_mean', 'importance_std']
            xlabel = 'feature importance mean & std'

        # Horizontal bar chart of the importance column(s).
        _, ax1 = plt.subplots(figsize=figsize)
        plt.tick_params(labelsize=labelsize)  # font size of the tick labels
        ax1.set_title(f'feature importance {importance_type}')
        ax1.set_xlabel(xlabel)
        for column in bar_columns:
            ax1.barh(df.index,
                     df[column],
                     label=column,
                     align="center",
                     alpha=0.6)

        # With several folds, overlay the coefficient of variation as a line
        # on a second x axis.
        ax2 = None
        if n_splits != 1:
            ax2 = ax1.twiny()
            ax2.plot(df['coef_of_var'],
                     df.index,
                     linewidth=1,
                     color="crimson",
                     marker="o",
                     markersize=8,
                     label='coef_of_var')
            ax2.set_xlabel('Coefficient of variation')

        # Legends in the upper right; the ax2 legend sits just below ax1's.
        ax1.legend(bbox_to_anchor=(1, 1),
                   loc='upper right',
                   borderaxespad=0.5,
                   fontsize=12)
        if ax2 is not None:
            ax2.legend(bbox_to_anchor=(1, 0.94),
                       loc='upper right',
                       borderaxespad=0.5,
                       fontsize=12)

        # Grid on ax1 only.
        ax1.grid(True)
        if ax2 is not None:
            ax2.grid(False)

        plt.tight_layout()
        plt.savefig(out_prefix + f'_fi_{importance_type}.png',
                    dpi=dpi,
                    bbox_inches="tight")
        plt.close()
Exemple #7
0
    def run_train_cv(self) -> None:
        """Train and evaluate with cross-validation.

        For every fold the model is trained via ``train_fold`` and saved to
        ``self.out_dir_name``. Afterwards the out-of-fold predictions are put
        back into index order, the overall score is computed, the scores are
        appended to ``<run_name>_score.csv`` and logged to mlflow, and the
        results are written to the logger. Optionally the out-of-fold
        predictions are pickled (``self.save_train_pred``) and SHAP feature
        importance is computed (``self.calc_shap``).
        """
        self.logger.info(f'{self.run_name} - start training cv')
        if self.cv_method in ['KFold', 'TrainTestSplit', 'CustomTimeSeriesSplitter']:
            self.logger.info(f'{self.run_name} - cv method: {self.cv_method}')
        else:
            self.logger.info(f'{self.run_name} - cv method: {self.cv_method} - group: {self.cv_target_gr_column} - stratify: {self.cv_target_sf_column}')

        scores = []  # score of each fold
        va_idxes = []  # validation indices of each fold
        preds = []  # validation predictions of each fold

        # Train fold by fold.
        for i_fold in range(self.n_splits):
            self.logger.info(f'{self.run_name} fold {i_fold} - start training')
            model, va_idx, va_pred, score = self.train_fold(i_fold)
            self.logger.info(f'{self.run_name} fold {i_fold} - end training - score {score}')

            # Save the fold model.
            model.save_model(self.out_dir_name)

            # Keep the fold results.
            va_idxes.append(va_idx)
            scores.append(score)
            preds.append(va_pred)

        # Concatenate the fold predictions and reorder them by their
        # validation index.
        va_idxes = np.concatenate(va_idxes)
        order = np.argsort(va_idxes)
        preds = np.concatenate(preds, axis=0)
        preds = preds[order]

        # Overall score; skipped for the two listed cv methods.
        if self.cv_method not in ['TrainTestSplit', 'CustomTimeSeriesSplitter']:
            if self.metrics_name == 'RMSLE':
                score_all_data = np.sqrt(self.metrics(np.expm1(self.train_y), preds))
            else:
                score_all_data = self.metrics(self.train_y, preds)
        else:
            score_all_data = None

        # Append the overall, mean and per-fold scores to the score CSV
        # (used for per-fold analysis).
        score_csv_path = self.out_dir_name + f'{self.run_name}_score.csv'
        self.score_list.append(['score_all_data', score_all_data])
        self.score_list.append(['score_fold_mean', np.mean(scores)])
        self.score_list.extend(self.fold_score_list)
        # newline='' lets the csv module control line endings itself;
        # without it every row is followed by a blank line on Windows.
        with open(score_csv_path, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(self.score_list)

        # Track the scores with mlflow as well: average the CSV rows per
        # 'run_name', transpose so those values become the column labels,
        # and log the first remaining value of every column.
        _score_df = pd.read_csv(score_csv_path)
        _score_df = (_score_df.groupby('run_name').mean().round(4)
                     .reset_index().sort_values('run_name'))
        _score_df = _score_df.T
        _score_df.columns = _score_df.iloc[0]
        _score_df = _score_df.drop(_score_df.index[0])
        for col in _score_df.columns.tolist():
            mlflow.log_metric(col, _score_df[col].values[0])

        # Save the predictions on the training data.
        if self.save_train_pred:
            # NOTE(review): the leading '.' in the file name looks like a
            # typo (compare '<run_name>_pred.pkl'), but it is kept because
            # readers elsewhere may rely on this exact path.
            Util.dump_df_pickle(pd.DataFrame(preds), self.out_dir_name + f'.{self.run_name}_train.pkl')

        # Save the evaluation result.
        self.logger.result_scores(self.run_name, scores, score_all_data)

        # Save the SHAP feature importance data.
        if self.calc_shap:
            self.shap_feature_importance()
Exemple #8
0
 def load_model(self, path):
     """Restore the network, scaler and one-hot encoder for this run/fold.

     The network is read from ``<path>/<run_fold_name>.h5`` and the scaler
     from ``<path>/<run_fold_name>-scaler.pkl``; the encoder always comes
     from ``'one-hot-enc.pkl'``.
     """
     h5_file = os.path.join(path, f'{self.run_fold_name}.h5')
     scaler_file = os.path.join(path, f'{self.run_fold_name}-scaler.pkl')
     # `load_model` here is the module-level loader, not this method.
     self.model = load_model(h5_file)
     self.scaler = Util.load(scaler_file)
     self.one_hot_encoder = Util.load('one-hot-enc.pkl')
Exemple #9
0
 def save_model(self, path):
     """Persist the network and its scaler for this run/fold under ``path``.

     Writes ``<run_fold_name>.h5`` with the model's own ``save`` and
     ``<run_fold_name>-scaler.pkl`` with ``Util.dump``, creating the target
     directory first if needed.
     """
     h5_file = os.path.join(path, f'{self.run_fold_name}.h5')
     scaler_file = os.path.join(path, f'{self.run_fold_name}-scaler.pkl')
     target_dir = os.path.dirname(h5_file)
     os.makedirs(target_dir, exist_ok=True)
     self.model.save(h5_file)
     Util.dump(self.scaler, scaler_file)