def run_predict_cv(self) -> None:
    """Predict the test data by averaging the per-fold cross-validation models.

    ``run_train_cv`` must have been run beforehand so that the model of each
    fold can be loaded from ``self.out_dir_name``. The averaged prediction is
    pickled as a DataFrame to ``{out_dir_name}{run_name}_pred.pkl``.
    """
    self.logger.info(f'{self.run_name} - start prediction cv')
    x_test = self.load_x_test()

    # Collect one prediction per fold.
    fold_predictions = []
    for fold in range(self.n_splits):
        self.logger.info(f'{self.run_name} - start prediction fold:{fold}')
        fold_model = self.build_model(fold)
        fold_model.load_model(self.out_dir_name)
        raw = fold_model.predict(x_test)
        # With RMSLE the raw output is passed through expm1
        # (presumably the target was log1p-transformed -- confirm in training).
        fold_predictions.append(
            np.expm1(raw) if self.metrics_name == 'RMSLE' else raw
        )
        self.logger.info(f'{self.run_name} - end prediction fold:{fold}')

    # Average over the folds; for the 'ACC' metric the mean is passed through np.round.
    averaged = np.mean(fold_predictions, axis=0)
    if self.metrics_name == 'ACC':
        averaged = np.round(averaged)

    # Save the inference result (the data to be submitted).
    Util.dump_df_pickle(
        pd.DataFrame(averaged),
        self.out_dir_name + f'{self.run_name}_pred.pkl',
    )
    self.logger.info(f'{self.run_name} - end prediction cv')
def run_predict_all(self) -> None:
    """Predict the test data with the model trained on all of the training data.

    ``run_train_all`` must have been run beforehand so that the model can be
    loaded from ``self.out_dir_name``.
    """
    self.logger.info(f'{self.run_name} - start prediction all')
    x_test = self.load_x_test()

    # 'all' is the fold identifier of the model trained on the full training data.
    full_model = self.build_model('all')
    full_model.load_model(self.out_dir_name)
    prediction = full_model.predict(x_test)

    # Save the prediction.
    # NOTE(review): the output path is fixed and no metric-specific
    # post-processing is applied here -- confirm this is intended.
    output_path = f'../model/pred/{self.run_name}-test.pkl'
    Util.dump(prediction, output_path)

    self.logger.info(f'{self.run_name} - end prediction all')
def train(self, tr_x, tr_y, va_x=None, va_y=None):
    """Fit a feed-forward Keras network on one-hot encoded, standardised features.

    The one-hot encoder is loaded from ``one-hot-enc.pkl`` and applied to the
    ``self.categoricals`` columns; a ``StandardScaler`` is fitted on the encoded
    training data. On completion ``self.model``, ``self.scaler`` and
    ``self.one_hot_encoder`` are set.

    Args:
        tr_x: Training features; must be indexable by ``self.categoricals``.
        tr_y: Training targets.
        va_x: Optional validation features. When given, early stopping and
            best-weights checkpointing (to ``nn_model.w8``) are enabled.
        va_y: Validation targets; only used when ``va_x`` is given.
    """
    has_validation = va_x is not None
    checkpoint_path = 'nn_model.w8'

    # Encode the categorical columns and standardise them.
    self.one_hot_encoder = Util.load('one-hot-enc.pkl')
    tr_x = self.one_hot_encoder.transform(tr_x[self.categoricals])
    scaler = StandardScaler()
    scaler.fit(tr_x)
    tr_x = scaler.transform(tr_x)
    if has_validation:
        va_x = self.one_hot_encoder.transform(va_x[self.categoricals])
        va_x = scaler.transform(va_x)

    # Hyper-parameters.
    classes = self.params['classes']
    layers = self.params['layers']
    dropout = self.params['dropout']
    units = self.params['units']
    nb_epoch = self.params['nb_epoch']
    patience = self.params['patience']

    # Build the network: each hidden block is Dense -> PReLU -> BatchNorm -> Dropout,
    # and the width is halved for every block after the first.
    model = Sequential()
    model.add(Dense(units, input_shape=(tr_x.shape[1],)))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    for _ in range(layers - 1):
        units = int(units / 2)
        model.add(Dense(units))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(dropout))
    model.add(Dense(classes))

    adam = optimizers.Adam(lr=1e-4)
    model.compile(optimizer=adam, loss="mean_absolute_error")

    if has_validation:
        early_stopping = EarlyStopping(monitor='val_loss', patience=patience,
                                       verbose=1, restore_best_weights=True)
        save_best = ModelCheckpoint(checkpoint_path, save_weights_only=True,
                                    save_best_only=True, verbose=1)
        model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=128, verbose=2,
                  validation_data=(va_x, va_y),
                  callbacks=[save_best, early_stopping])
        # Reload the best checkpoint written during this fit.
        model.load_weights(checkpoint_path)
    else:
        # No checkpoint is written without validation data, so the weights are
        # kept as trained; loading 'nn_model.w8' here would either fail or
        # silently restore weights left over from a previous run.
        model.fit(tr_x, tr_y, epochs=nb_epoch, batch_size=128, verbose=2)

    # Keep the model and the scaler on the instance.
    self.model = model
    self.scaler = scaler
def load_model(self, path):
    """Load this fold's model from ``path`` into ``self.model``.

    The file name is ``{self.run_fold_name}.model``.
    """
    file_name = f'{self.run_fold_name}.model'
    self.model = Util.load(os.path.join(path, file_name))
def save_model(self, path):
    """Dump ``self.model`` to ``{path}/{self.run_fold_name}.model``.

    The target directory is created first if it does not exist.
    """
    file_name = f'{self.run_fold_name}.model'
    destination = os.path.join(path, file_name)
    target_dir = os.path.dirname(destination)
    os.makedirs(target_dir, exist_ok=True)
    Util.dump(self.model, destination)
def calc_feature_importance(self, dir_name, run_name, features, n_splits, type='gain'):
    """Compute, save and plot the feature importance of the saved fold models.

    Loads ``{run_name}-fold{i}.model`` for every ``i`` in ``range(n_splits)``
    from ``dir_name`` and calls ``feature_importance`` on each model. With a
    single fold the raw importance is reported; with several folds the
    per-feature mean, standard deviation and coefficient of variation across
    folds are reported. A CSV (``_importance_{gain|split}.csv``) and a bar
    chart (``_fi_{gain|split}.png``) are written.

    Args:
        dir_name: Directory holding the models. It is also used as a plain
            string prefix for the output files, so it should end with a
            path separator.
        run_name: Run name used in the model and output file names.
        features: Row labels for the importance values, one per feature.
        n_splits: Number of fold models to load.
        type: ``'gain'`` selects gain importance; any other value selects
            split importance. (The name shadows the builtin but is kept for
            backward compatibility.)

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError('n_splits must be at least 1')

    importance_type = 'gain' if type == 'gain' else 'split'
    csv_path = dir_name + run_name + f'_importance_{importance_type}.csv'
    png_path = dir_name + run_name + f'_fi_{importance_type}.png'

    # One importance Series per fold model.
    per_fold = []
    for i in range(n_splits):
        model = Util.load(os.path.join(dir_name, f'{run_name}-fold{i}.model'))
        per_fold.append(
            pd.Series(model.feature_importance(importance_type=importance_type))
        )

    if n_splits == 1:
        df = pd.DataFrame(per_fold[0].values, index=features, columns=['importance'])
        df = df.sort_values('importance', ascending=False)
        df.to_csv(csv_path)
        # Plot only the 100 most important features, largest at the top.
        plot_df = df.sort_values('importance', ascending=True).tail(100)

        _, ax1 = plt.subplots(figsize=(10, 30))
        plt.tick_params(labelsize=10)  # font size of the tick labels
        ax1.set_title(f'feature importance {importance_type}')
        ax1.set_xlabel('feature importance')
        ax1.barh(plot_df.index, plot_df['importance'], label='importance',
                 align="center", alpha=0.6)
        ax1.legend(bbox_to_anchor=(1, 1), loc='upper right',
                   borderaxespad=0.5, fontsize=12)
        ax1.grid(True)
        plt.tight_layout()
        plt.savefig(png_path, dpi=200, bbox_inches="tight")
        plt.close()
        return

    # Several folds: one column per fold, one row per feature.
    fold_frame = pd.concat(per_fold, axis=1)

    # Mean across folds.
    importance_df_mean = pd.DataFrame(
        fold_frame.mean(axis=1).values, index=features,
        columns=['importance']).sort_values('importance')
    # Standard deviation across folds.
    importance_df_std = pd.DataFrame(
        fold_frame.std(axis=1).values, index=features,
        columns=['importance']).sort_values('importance')

    df = pd.merge(importance_df_mean, importance_df_std, left_index=True,
                  right_index=True, suffixes=['_mean', '_std'])

    # Coefficient of variation; 0/0 (feature never used) is reported as 0.
    df['coef_of_var'] = df['importance_std'] / df['importance_mean']
    df['coef_of_var'] = df['coef_of_var'].fillna(0)

    ranked = df.sort_values('importance_mean', ascending=False)
    ranked.to_csv(csv_path)

    if importance_type == 'gain':
        # Gain chart shows the top 100 features.
        plot_df = ranked.sort_values('importance_mean', ascending=True).tail(100)
        figsize, labelsize, dpi = (10, 30), 10, 200
    else:
        # Split chart shows every feature on a taller canvas.
        plot_df = df.sort_values('importance_mean', ascending=True)
        figsize, labelsize, dpi = (10, 90), 8, 300

    _, ax1 = plt.subplots(figsize=figsize)
    plt.tick_params(labelsize=labelsize)  # font size of the tick labels

    # Bar charts for mean and standard deviation.
    ax1.set_title(f'feature importance {importance_type}')
    ax1.set_xlabel('feature importance mean & std')
    ax1.barh(plot_df.index, plot_df['importance_mean'], label='importance_mean',
             align="center", alpha=0.6)
    ax1.barh(plot_df.index, plot_df['importance_std'], label='importance_std',
             align="center", alpha=0.6)

    # Line plot for the coefficient of variation on a second x axis.
    ax2 = ax1.twiny()
    ax2.plot(plot_df['coef_of_var'], plot_df.index, linewidth=1, color="crimson",
             marker="o", markersize=8, label='coef_of_var')
    ax2.set_xlabel('Coefficient of variation')

    # Legends in the upper right; ax2's legend sits slightly below ax1's.
    ax1.legend(bbox_to_anchor=(1, 1), loc='upper right',
               borderaxespad=0.5, fontsize=12)
    ax2.legend(bbox_to_anchor=(1, 0.94), loc='upper right',
               borderaxespad=0.5, fontsize=12)

    # Grid on ax1 only.
    ax1.grid(True)
    ax2.grid(False)

    plt.tight_layout()
    plt.savefig(png_path, dpi=dpi, bbox_inches="tight")
    plt.close()
def run_train_cv(self) -> None:
    """Train and evaluate with cross-validation.

    For each fold the model is trained via ``self.train_fold`` and saved to
    ``self.out_dir_name``, and its score is logged. Afterwards the
    out-of-fold predictions are put back into index order, the overall score
    is computed, the scores are appended to ``{run_name}_score.csv`` and
    logged to mlflow, and (optionally) the out-of-fold predictions and SHAP
    feature importance are saved.

    Side effects: extends ``self.score_list`` with the overall score, the
    fold mean and the entries of ``self.fold_score_list``.
    """
    self.logger.info(f'{self.run_name} - start training cv')
    if self.cv_method in ['KFold', 'TrainTestSplit', 'CustomTimeSeriesSplitter']:
        self.logger.info(f'{self.run_name} - cv method: {self.cv_method}')
    else:
        self.logger.info(f'{self.run_name} - cv method: {self.cv_method} - group: {self.cv_target_gr_column} - stratify: {self.cv_target_sf_column}')

    scores = []    # score of each fold
    va_idxes = []  # validation indices of each fold
    preds = []     # validation predictions of each fold

    # Train fold by fold.
    for i_fold in range(self.n_splits):
        self.logger.info(f'{self.run_name} fold {i_fold} - start training')
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        self.logger.info(f'{self.run_name} fold {i_fold} - end training - score {score}')

        # Save the fold model.
        model.save_model(self.out_dir_name)

        # Keep the fold results.
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Restore the original row order of the out-of-fold predictions.
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)
    preds = preds[order]

    # Overall score; skipped for the CV methods listed below.
    if self.cv_method not in ['TrainTestSplit', 'CustomTimeSeriesSplitter']:
        if self.metrics_name == 'RMSLE':
            score_all_data = np.sqrt(self.metrics(np.expm1(self.train_y), preds))
        else:
            score_all_data = self.metrics(self.train_y, preds)
    else:
        score_all_data = None

    # Append the scores to the CSV (used for per-fold analysis).
    self.score_list.append(['score_all_data', score_all_data])
    self.score_list.append(['score_fold_mean', np.mean(scores)])
    self.score_list.extend(self.fold_score_list)
    score_path = self.out_dir_name + f'{self.run_name}_score.csv'
    # newline='' is required by the csv module; without it rows are written
    # with '\r\r\n' line endings on platforms that translate '\n'.
    with open(score_path, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(self.score_list)

    # Track the scores with mlflow as well: average rows sharing a
    # 'run_name' value, then transpose so each becomes one metric column.
    _score_df = pd.read_csv(score_path)
    _score_df = (_score_df.groupby('run_name').mean().round(4)
                 .reset_index().sort_values('run_name'))
    _score_df = _score_df.T
    _score_df.columns = _score_df.iloc[0]
    _score_df = _score_df.drop(_score_df.index[0])
    for col in _score_df.columns.tolist():
        mlflow.log_metric(col, _score_df[col].values[0])

    # Save the out-of-fold predictions on the training data.
    # NOTE(review): the file name starts with '.', unlike the '_pred.pkl'
    # output of run_predict_cv -- confirm whether this is intended.
    if self.save_train_pred:
        Util.dump_df_pickle(pd.DataFrame(preds), self.out_dir_name + f'.{self.run_name}_train.pkl')

    # Log the evaluation result.
    self.logger.result_scores(self.run_name, scores, score_all_data)

    # Save the SHAP feature importance.
    if self.calc_shap:
        self.shap_feature_importance()
def load_model(self, path):
    """Restore the network, the fitted scaler and the one-hot encoder.

    The network is read from ``{run_fold_name}.h5`` and the scaler from
    ``{run_fold_name}-scaler.pkl`` under ``path``; the encoder is read from
    the fixed file ``one-hot-enc.pkl``.
    """
    fold_name = self.run_fold_name
    network_file = os.path.join(path, f'{fold_name}.h5')
    scaler_file = os.path.join(path, f'{fold_name}-scaler.pkl')

    # Here load_model resolves to the module-level loader, not this method.
    self.model = load_model(network_file)
    self.scaler = Util.load(scaler_file)
    self.one_hot_encoder = Util.load('one-hot-enc.pkl')
def save_model(self, path):
    """Persist the network and the fitted scaler under ``path``.

    The network goes to ``{run_fold_name}.h5`` and the scaler to
    ``{run_fold_name}-scaler.pkl``; the directory is created if missing.
    """
    fold_name = self.run_fold_name
    network_file = os.path.join(path, f'{fold_name}.h5')
    scaler_file = os.path.join(path, f'{fold_name}-scaler.pkl')

    target_dir = os.path.dirname(network_file)
    os.makedirs(target_dir, exist_ok=True)

    self.model.save(network_file)
    Util.dump(self.scaler, scaler_file)