Esempio n. 1
0
    def train(self, tr_x, tr_y, va_x=None, va_y=None):
        """Build and fit a dense Keras network on one-hot encoded, scaled inputs.

        The categorical columns named in ``self.categoricals`` are one-hot
        encoded with the encoder loaded from ``one-hot-enc.pkl``, standardized
        with a ``StandardScaler`` fitted on the training data, and fed to a
        stack of Dense/PReLU/BatchNormalization/Dropout layers trained with
        Adam on mean absolute error.

        Args:
            tr_x: Training features; indexed with ``self.categoricals``, so
                presumably a pandas DataFrame (confirm against the caller).
            tr_y: Training targets.
            va_x: Optional validation features. When given, early stopping and
                best-weight checkpointing on ``val_loss`` are enabled.
            va_y: Validation targets; used only when ``va_x`` is given.

        Side effects:
            Sets ``self.one_hot_encoder``, ``self.model`` and ``self.scaler``.
            With validation data, writes the best weights to ``nn_model.w8``.
        """
        # Data preparation and scaling
        validation = va_x is not None

        self.one_hot_encoder = Util.load('one-hot-enc.pkl')
        tr_x = self.one_hot_encoder.transform(tr_x[self.categoricals])

        scaler = StandardScaler()
        scaler.fit(tr_x)
        tr_x = scaler.transform(tr_x)

        if validation:
            # Reuse the statistics fitted on the training data.
            va_x = self.one_hot_encoder.transform(va_x[self.categoricals])
            va_x = scaler.transform(va_x)

        # Hyperparameters
        classes = self.params['classes']
        layers = self.params['layers']
        dropout = self.params['dropout']
        units = self.params['units']
        nb_epoch = self.params['nb_epoch']
        patience = self.params['patience']

        # Model construction
        model = Sequential()
        model.add(Dense(units, input_shape=(tr_x.shape[1],)))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(dropout))

        # Each additional hidden layer has half the units of the previous one.
        for _ in range(layers - 1):
            units = int(units / 2)
            model.add(Dense(units))
            model.add(PReLU())
            model.add(BatchNormalization())
            model.add(Dropout(dropout))

        model.add(Dense(classes))
        adam = optimizers.Adam(lr=1e-4)
        model.compile(optimizer=adam, loss="mean_absolute_error")

        if validation:
            early_stopping = EarlyStopping(monitor='val_loss', patience=patience,
                                           verbose=1, restore_best_weights=True)
            save_best = ModelCheckpoint('nn_model.w8', save_weights_only=True,
                                        save_best_only=True, verbose=1)
            model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=128, verbose=2,
                      validation_data=(va_x, va_y),
                      callbacks=[save_best, early_stopping])
            # Reload the checkpointed best weights. This only makes sense here:
            # without validation no checkpoint is written, so loading the file
            # would fail or silently replace the trained weights with stale ones.
            model.load_weights('nn_model.w8')
        else:
            # `epochs` is the supported keyword (the legacy `nb_epoch` keyword
            # is rejected by current Keras) and matches the branch above.
            model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=128, verbose=2)

        # Keep the model and scaler for later use
        self.model = model
        self.scaler = scaler
Esempio n. 2
0
 def load_model(self, path):
     """Restore ``self.model`` from ``<path>/<run_fold_name>.model``."""
     file_name = f'{self.run_fold_name}.model'
     self.model = Util.load(os.path.join(path, file_name))
Esempio n. 3
0
    def calc_feature_importance(self,
                                dir_name,
                                run_name,
                                features,
                                n_splits,
                                type='gain'):
        """Aggregate the fold models' feature importances and plot them.

        Loads ``{run_name}-fold{i}.model`` from ``dir_name`` for every
        ``i < n_splits`` and calls ``feature_importance(importance_type=...)``
        on each model (presumably LightGBM boosters; confirm against the
        code that saves them). With a single fold the raw importances are
        saved and plotted; with several folds the per-feature mean, standard
        deviation and coefficient of variation across folds are plotted.

        Args:
            dir_name: Directory holding the fold models. Output paths are built
                by plain concatenation ``dir_name + run_name + suffix``, so a
                trailing path separator is needed for the files to land inside
                the directory.
            run_name: Run identifier used in model and output file names.
            features: Labels used as the index of the importance table.
            n_splits: Number of fold models to load.
            type: ``'gain'`` for gain importance; any other value selects
                split importance.

        Raises:
            IndexError: If ``n_splits`` yields no models to load.
        """
        # Any value other than 'gain' has always meant split importance.
        kind = 'gain' if type == 'gain' else 'split'

        models = [
            Util.load(os.path.join(dir_name, f'{run_name}-fold{i}.model'))
            for i in range(n_splits)
        ]

        # One column of importances per fold. Uses the locally loaded models;
        # the split path previously read a `self.model_array` attribute instead.
        values = pd.Series(models[0].feature_importance(importance_type=kind))
        for fold_model in models[1:]:
            fold_values = pd.Series(
                fold_model.feature_importance(importance_type=kind))
            values = pd.concat([values, fold_values], axis=1)

        prefix = dir_name + run_name
        if n_splits == 1:
            self._plot_single_fold_importance(values, features, prefix, kind)
        else:
            self._plot_multi_fold_importance(values, features, prefix, kind)

    def _plot_single_fold_importance(self, values, features, prefix, kind):
        """Save the importances of one model as CSV and as a bar chart."""
        df = pd.DataFrame(values.values,
                          index=features,
                          columns=['importance']).sort_values('importance',
                                                              ascending=False)
        df.to_csv(f'{prefix}_importance_{kind}.csv')
        # Plot only the 100 most important features, largest at the top.
        df = df.sort_values('importance', ascending=True).tail(100)

        fig, ax1 = plt.subplots(figsize=(10, 30))
        plt.tick_params(labelsize=10)  # font size of the tick labels

        # Bar chart
        ax1.set_title(f'feature importance {kind}')
        ax1.set_xlabel('feature importance')
        ax1.barh(df.index,
                 df['importance'],
                 label='importance',
                 align="center",
                 alpha=0.6)

        # Legend
        ax1.legend(bbox_to_anchor=(1, 1),
                   loc='upper right',
                   borderaxespad=0.5,
                   fontsize=12)

        ax1.grid(True)

        plt.tight_layout()
        # The split plot used to be written to '_fi_gain.png', overwriting
        # the gain plot; each importance type now gets its own file.
        plt.savefig(f'{prefix}_fi_{kind}.png', dpi=200, bbox_inches="tight")
        plt.close()

    def _plot_multi_fold_importance(self, values, features, prefix, kind):
        """Plot mean, std and coefficient of variation across folds."""
        # Mean over folds
        importance_df_mean = pd.DataFrame(
            values.mean(axis=1).values, index=features,
            columns=['importance']).sort_values('importance')

        # Standard deviation over folds
        importance_df_std = pd.DataFrame(
            values.std(axis=1).values, index=features,
            columns=['importance']).sort_values('importance')

        # Merge on the feature index
        df = pd.merge(importance_df_mean,
                      importance_df_std,
                      left_index=True,
                      right_index=True,
                      suffixes=['_mean', '_std'])

        # Coefficient of variation; 0/0 (feature never used) becomes 0.
        df['coef_of_var'] = df['importance_std'] / df['importance_mean']
        df['coef_of_var'] = df['coef_of_var'].fillna(0)

        if kind == 'gain':
            df = df.sort_values('importance_mean', ascending=False)
            df.to_csv(f'{prefix}_importance_gain.csv')
            # Plot only the 100 most important features.
            df = df.sort_values('importance_mean', ascending=True).tail(100)
            figsize, labelsize, dpi = (10, 30), 10, 200
        else:
            # NOTE(review): the multi-fold split path has never written a CSV
            # and plots every feature on a taller figure; preserved as-is.
            df = df.sort_values('importance_mean', ascending=True)
            figsize, labelsize, dpi = (10, 90), 8, 300

        fig, ax1 = plt.subplots(figsize=figsize)
        plt.tick_params(labelsize=labelsize)  # font size of the tick labels

        # Bar charts: mean and std share the same axis
        ax1.set_title(f'feature importance {kind}')
        ax1.set_xlabel('feature importance mean & std')
        ax1.barh(df.index,
                 df['importance_mean'],
                 label='importance_mean',
                 align="center",
                 alpha=0.6)
        ax1.barh(df.index,
                 df['importance_std'],
                 label='importance_std',
                 align="center",
                 alpha=0.6)

        # Line plot of the coefficient of variation on a second x axis
        ax2 = ax1.twiny()
        ax2.plot(df['coef_of_var'],
                 df.index,
                 linewidth=1,
                 color="crimson",
                 marker="o",
                 markersize=8,
                 label='coef_of_var')
        ax2.set_xlabel('Coefficient of variation')

        # Legends: the ax2 legend is anchored slightly below the ax1 legend
        ax1.legend(bbox_to_anchor=(1, 1),
                   loc='upper right',
                   borderaxespad=0.5,
                   fontsize=12)
        ax2.legend(bbox_to_anchor=(1, 0.94),
                   loc='upper right',
                   borderaxespad=0.5,
                   fontsize=12)

        # Grid on ax1 only
        ax1.grid(True)
        ax2.grid(False)

        plt.tight_layout()
        plt.savefig(f'{prefix}_fi_{kind}.png', dpi=dpi, bbox_inches="tight")
        plt.close()
Esempio n. 4
0
 def load_model(self, path):
     """Restore the network, its scaler and the one-hot encoder.

     The network is read from ``<path>/<run_fold_name>.h5`` and the scaler
     from ``<path>/<run_fold_name>-scaler.pkl``; the encoder is read from
     ``one-hot-enc.pkl`` relative to the working directory.
     """
     network_file = os.path.join(path, f'{self.run_fold_name}.h5')
     scaler_file = os.path.join(path, f'{self.run_fold_name}-scaler.pkl')
     # `load_model` here resolves to the module-level loader, not this method.
     self.model = load_model(network_file)
     self.scaler = Util.load(scaler_file)
     self.one_hot_encoder = Util.load('one-hot-enc.pkl')