def train(self, tr_x, tr_y, va_x=None, va_y=None):
    """Train a dense neural network on one-hot encoded, standardised features.

    The categorical columns listed in ``self.categoricals`` are selected from
    the inputs, encoded with the encoder loaded from ``one-hot-enc.pkl`` and
    standardised with a ``StandardScaler`` fitted on the training data only.
    The network is a stack of Dense -> PReLU -> BatchNormalization -> Dropout
    blocks whose width is halved at every additional layer, followed by a
    linear ``Dense(classes)`` output, and is trained with Adam on mean
    absolute error.

    Hyper-parameters are read from ``self.params``: ``classes``, ``layers``,
    ``dropout``, ``units``, ``nb_epoch`` and ``patience``.

    Args:
        tr_x: Training features; indexed with ``self.categoricals``
            (presumably a pandas DataFrame -- confirm against the caller).
        tr_y: Training targets.
        va_x: Optional validation features. When given, early stopping and
            best-weights checkpointing on ``val_loss`` are enabled.
        va_y: Validation targets, used only when ``va_x`` is given.

    Side effects:
        Sets ``self.one_hot_encoder``, ``self.model`` and ``self.scaler``.
        With validation data, writes the checkpoint file ``nn_model.w8``
        in the current working directory.
    """
    validation = va_x is not None

    # Data setup and scaling. The scaler is fitted on the training data only
    # and then re-used for the validation data.
    self.one_hot_encoder = Util.load('one-hot-enc.pkl')
    tr_x = self.one_hot_encoder.transform(tr_x[self.categoricals])
    scaler = StandardScaler()
    tr_x = scaler.fit_transform(tr_x)

    if validation:
        va_x = self.one_hot_encoder.transform(va_x[self.categoricals])
        va_x = scaler.transform(va_x)

    # Hyper-parameters
    classes = self.params['classes']
    layers = self.params['layers']
    dropout = self.params['dropout']
    units = self.params['units']
    nb_epoch = self.params['nb_epoch']
    patience = self.params['patience']

    # Model construction
    model = Sequential()
    model.add(Dense(units, input_shape=(tr_x.shape[1],)))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(dropout))

    for _ in range(layers - 1):
        # Each additional hidden layer is half as wide as the previous one.
        units = int(units / 2)
        model.add(Dense(units))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(dropout))

    model.add(Dense(classes))
    adam = optimizers.Adam(lr=1e-4)
    model.compile(optimizer=adam, loss="mean_absolute_error")

    if validation:
        early_stopping = EarlyStopping(monitor='val_loss', patience=patience,
                                       verbose=1, restore_best_weights=True)
        save_best = ModelCheckpoint('nn_model.w8', save_weights_only=True,
                                    save_best_only=True, verbose=1)
        model.fit(tr_x, tr_y,
                  epochs=nb_epoch,
                  batch_size=128, verbose=2,
                  validation_data=(va_x, va_y),
                  callbacks=[save_best, early_stopping])
        # Reload the best checkpoint. This is done only here because the
        # checkpoint is written only when validation data is supplied;
        # loading it unconditionally would replace freshly trained weights
        # with a stale file from an earlier run (or fail if none exists).
        model.load_weights('nn_model.w8')
    else:
        # `epochs` (not the legacy `nb_epoch` keyword) to match the
        # validation branch above.
        model.fit(tr_x, tr_y,
                  epochs=nb_epoch,
                  batch_size=128, verbose=2)

    # Keep the model and the scaler for prediction / saving.
    self.model = model
    self.scaler = scaler
def load_model(self, path):
    """Load this run/fold's serialized model from ``path`` into ``self.model``.

    The file name is ``{self.run_fold_name}.model`` and it is read with
    ``Util.load``.
    """
    file_name = f'{self.run_fold_name}.model'
    self.model = Util.load(os.path.join(path, file_name))
def calc_feature_importance(self, dir_name, run_name, features, n_splits, type='gain'):
    """Compute feature importance over the fold models; write a CSV and a chart.

    Loads ``{run_name}-fold{i}.model`` for every ``i`` in ``range(n_splits)``
    from ``dir_name`` and calls ``feature_importance(importance_type=...)`` on
    each loaded model (presumably LightGBM boosters -- confirm against the
    training code).

    With a single fold the raw importances are reported. With several folds
    the per-feature mean and standard deviation across folds are reported,
    together with the coefficient of variation (std / mean, NaN replaced by 0).

    Output paths are built as ``dir_name + run_name + suffix`` (plain string
    concatenation), so ``dir_name`` must already end with a path separator:

    * ``_importance_{gain|split}.csv`` -- table sorted by importance, descending.
    * ``_fi_{gain|split}.png`` -- horizontal bar chart.

    Args:
        dir_name: Directory holding the fold models; also the output prefix.
        run_name: Run identifier used in the model and output file names.
        features: Index labels for the importance values, one per feature.
        n_splits: Number of fold models to load.
        type: ``'gain'`` selects gain importance; any other value selects
            split importance.

    Raises:
        IndexError: If ``n_splits`` is smaller than 1.
    """
    # Any value other than 'gain' falls back to split importance.
    importance_type = 'gain' if type == 'gain' else 'split'

    out_prefix = dir_name + run_name
    csv_path = f'{out_prefix}_importance_{importance_type}.csv'
    png_path = f'{out_prefix}_fi_{importance_type}.png'

    models = [
        Util.load(os.path.join(dir_name, f'{run_name}-fold{i}.model'))
        for i in range(n_splits)
    ]
    if not models:
        # Same exception type as indexing the empty model list would raise.
        raise IndexError('n_splits must be at least 1')

    # One Series of importances per fold.
    fold_importances = [
        pd.Series(m.feature_importance(importance_type=importance_type))
        for m in models
    ]

    if n_splits == 1:
        ranked = pd.DataFrame(
            fold_importances[0].values, index=features, columns=['importance']
        ).sort_values('importance', ascending=False)
        ranked.to_csv(csv_path)
        # barh draws bottom-up, so sort ascending and keep the top 100.
        df = ranked.sort_values('importance', ascending=True).tail(100)

        _, ax1 = plt.subplots(figsize=(10, 30))
        plt.tick_params(labelsize=10)  # tick label font size

        # Bar chart
        ax1.set_title(f'feature importance {importance_type}')
        ax1.set_xlabel('feature importance')
        ax1.barh(df.index, df['importance'], label='importance',
                 align="center", alpha=0.6)

        ax1.legend(bbox_to_anchor=(1, 1), loc='upper right',
                   borderaxespad=0.5, fontsize=12)
        ax1.grid(True)

        plt.tight_layout()
        plt.savefig(png_path, dpi=200, bbox_inches="tight")
        plt.close()
        return

    # Several folds: one column per fold, then aggregate per feature.
    per_fold = pd.concat(fold_importances, axis=1)

    # Mean across folds
    mean_df = pd.DataFrame(
        per_fold.mean(axis=1).values, index=features, columns=['importance']
    ).sort_values('importance')

    # Standard deviation across folds
    std_df = pd.DataFrame(
        per_fold.std(axis=1).values, index=features, columns=['importance']
    ).sort_values('importance')

    # Merge into importance_mean / importance_std columns
    merged = pd.merge(mean_df, std_df, left_index=True, right_index=True,
                      suffixes=('_mean', '_std'))

    # Coefficient of variation; 0 / 0 yields NaN, which is reported as 0.
    merged['coef_of_var'] = (
        merged['importance_std'] / merged['importance_mean']
    ).fillna(0)

    ranked = merged.sort_values('importance_mean', ascending=False)
    ranked.to_csv(csv_path)

    if importance_type == 'gain':
        # Top 100 features only.
        df = ranked.sort_values('importance_mean', ascending=True).tail(100)
        figsize, labelsize, dpi = (10, 30), 10, 200
    else:
        # The split chart shows every feature on a taller canvas.
        df = merged.sort_values('importance_mean', ascending=True)
        figsize, labelsize, dpi = (10, 90), 8, 300

    _, ax1 = plt.subplots(figsize=figsize)
    plt.tick_params(labelsize=labelsize)  # tick label font size

    # Bar chart: mean and std drawn on the same axis
    ax1.set_title(f'feature importance {importance_type}')
    ax1.set_xlabel('feature importance mean & std')
    ax1.barh(df.index, df['importance_mean'], label='importance_mean',
             align="center", alpha=0.6)
    ax1.barh(df.index, df['importance_std'], label='importance_std',
             align="center", alpha=0.6)

    # Line plot of the coefficient of variation on a second x axis
    ax2 = ax1.twiny()
    ax2.plot(df['coef_of_var'], df.index, linewidth=1, color="crimson",
             marker="o", markersize=8, label='coef_of_var')
    ax2.set_xlabel('Coefficient of variation')

    # Legends: both anchored at the upper right, ax2's slightly below ax1's
    ax1.legend(bbox_to_anchor=(1, 1), loc='upper right',
               borderaxespad=0.5, fontsize=12)
    ax2.legend(bbox_to_anchor=(1, 0.94), loc='upper right',
               borderaxespad=0.5, fontsize=12)

    # Grid on ax1 only
    ax1.grid(True)
    ax2.grid(False)

    plt.tight_layout()
    plt.savefig(png_path, dpi=dpi, bbox_inches="tight")
    plt.close()
def load_model(self, path):
    """Restore the network, its scaler and the one-hot encoder.

    Reads ``{self.run_fold_name}.h5`` and ``{self.run_fold_name}-scaler.pkl``
    from ``path``; the encoder is read from ``one-hot-enc.pkl`` relative to
    the current working directory. Results are stored on ``self.model``,
    ``self.scaler`` and ``self.one_hot_encoder``.
    """
    fold = self.run_fold_name
    # Load order is kept: network first, then scaler, then encoder.
    self.model = load_model(os.path.join(path, f'{fold}.h5'))
    self.scaler = Util.load(os.path.join(path, f'{fold}-scaler.pkl'))
    self.one_hot_encoder = Util.load('one-hot-enc.pkl')