def run_predict_cv(self) -> None:
    """Predict on the test data by averaging the per-fold models trained
    with cross-validation.

    ``run_train_cv`` must have been executed beforehand so that every
    fold's model exists on disk.
    """
    logger.info(f'{self.run_name} - start prediction cv')
    X_test = self.X_test
    preds = []
    # Feature importance is only collected for LightGBM-based models.
    show_feature_importance = 'LGBM' in str(self.model_cls)
    if show_feature_importance:
        feature_importances = pd.DataFrame()
    # Predict with each fold's saved model.
    for i_fold in range(self.cv.n_splits):
        logger.info(f'{self.run_name} - start prediction fold:{i_fold}')
        model = self.build_model(i_fold)
        model.load_model()
        pred = model.predict(X_test)
        preds.append(pred)
        logger.info(f'{self.run_name} - end prediction fold:{i_fold}')
        if show_feature_importance:
            # Stack each fold's importance rows for later averaging.
            feature_importances = pd.concat(
                [feature_importances, model.feature_importance(X_test)],
                axis=0)
    # Output the mean of the fold predictions.
    pred_avg = np.mean(preds, axis=0)
    # Persist the averaged test prediction.
    Data.dump(pred_avg, f'../output/pred/{self.run_name}-test.pkl')
    logger.info(f'{self.run_name} - end prediction cv')
    # Feature importance: average across folds, save the ranking CSV
    # and a bar plot of the top 200 features.
    if show_feature_importance:
        aggs = feature_importances.groupby('Feature').mean().sort_values(
            by="importance", ascending=False)
        cols = aggs[:200].index
        # NOTE(review): the CSV stores the full ranked feature list
        # (aggs.index), not just the top 200 used for the plot.
        pd.DataFrame(aggs.index).to_csv(
            f'../output/importance/{self.run_name}-fi.csv', index=False)
        best_features = feature_importances.loc[
            feature_importances.Feature.isin(cols)]
        plt.figure(figsize=(14, 26))
        sns.barplot(x="importance",
                    y="Feature",
                    data=best_features.sort_values(by="importance",
                                                   ascending=False))
        plt.title('LightGBM Features (averaged over folds)')
        plt.tight_layout()
        plt.savefig(f'../output/importance/{self.run_name}-fi.png')
        plt.show()
        # mlflow: attach the importance plot to the existing training run.
        mlflow.start_run(run_id=self.run_id)
        log_artifact(f'../output/importance/{self.run_name}-fi.png')
        mlflow.end_run()
def make_predictions(data: list, weights: list):
    """Blend test-set predictions with the given weights.

    Each entry of ``data`` is indexed at position 1 for its test-set
    prediction. Entries beyond ``len(weights)`` share the residual weight
    ``1 - sum(weights)``. The blended prediction is dumped to
    ``../output/pred/{run_name}-test.pkl`` and returned.
    """
    residual = 1 - sum(weights)
    pred = 0
    for idx, item in enumerate(data):
        w = weights[idx] if idx < len(weights) else residual
        pred += item[1] * w
    Data.dump(pred, f'../output/pred/{run_name}-test.pkl')
    return pred
def f(x):
    """Weight-optimization objective: negative ROC-AUC of the blend.

    Blends train-set predictions (index 0 of each ``data`` entry) using
    weights ``x``; entries beyond ``len(x)`` share the residual weight
    ``1 - sum(x)``. The blended train prediction is dumped as a side
    effect on every evaluation.
    """
    leftover = 1 - sum(x)
    pred = 0
    for idx, item in enumerate(data):
        pred += item[0] * (x[idx] if idx < len(x) else leftover)
    score = -1 * roc_auc_score(y_train, pred)
    Data.dump(pred, f'../output/pred/{run_name}-train.pkl')
    return score
def f(x):
    """Weight-optimization objective: RMSE of the blended prediction.

    Blends train-set predictions (index 0 of each ``data`` entry) using
    weights ``x``; entries beyond ``len(x)`` share the residual weight
    ``1 - sum(x)``. The blended train prediction is dumped as a side
    effect on every evaluation.
    """
    leftover = 1 - sum(x)
    pred = 0
    for idx, item in enumerate(data):
        pred += item[0] * (x[idx] if idx < len(x) else leftover)
    score = np.sqrt(mean_squared_error(y_train, pred))
    Data.dump(pred, f'../output/pred/{run_name}-train.pkl')
    return score
def run_train_cv(self) -> None:
    """Train and evaluate every cross-validation fold.

    Saves each fold's model, dumps the out-of-fold predictions restored
    to the original row order, and logs the per-fold scores.
    """
    logger.info(f'{self.run_name} - start training cv')

    scores = []
    va_idxes = []
    preds = []

    for i_fold in range(self.n_fold):
        logger.info(f'{self.run_name} fold {i_fold} - start training')
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        logger.info(
            f'{self.run_name} fold {i_fold} - end training - score {score}'
        )
        # Persist the fold model, then keep this fold's results.
        model.save_model()
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Stitch the out-of-fold predictions back into the original row order.
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)[order]

    logger.info(
        f'{self.run_name} - end training cv - score {np.mean(scores)}')

    # Persist the out-of-fold predictions.
    Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

    # Record the per-fold evaluation results.
    logger.result_scores(self.run_name, scores)
def save_as_pickle(
        train: pd.DataFrame,
        test: pd.DataFrame,
        target_col: str,
        exp_id: str,
        output_dir: str = '../input') -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Save X_train, X_test and y_train in pickle format.

    Args:
        train (pd.DataFrame): training data, including the target column
        test (pd.DataFrame): test data (target column optional)
        target_col (str): target column name
        exp_id (str): experiment id embedded in the output file names
        output_dir (str, optional): output directory. Defaults to '../input'.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: X_train, X_test
    """
    X_train = train.drop(target_col, axis=1)
    y_train = train[target_col]
    # The test set may or may not carry the target column.
    if target_col in test.columns:
        X_test = test.drop(target_col, axis=1)
    else:
        X_test = test
    Data.dump(X_train, join(output_dir, f"X_train_{exp_id}.pkl"))
    Data.dump(y_train, join(output_dir, f"y_train_{exp_id}.pkl"))
    Data.dump(X_test, join(output_dir, f"X_test_{exp_id}.pkl"))
    # Bug fix: the signature and docstring promise a tuple, but the
    # original implementation implicitly returned None.
    return X_train, X_test
def save_as_pickle(train: pd.DataFrame,
                   test: pd.DataFrame,
                   col_definition: dict,
                   option: dict):
    """Split train/test into X/y and save them as pickles under ../input.

    col_definition:
        target_col: name of the target column
    option:
        exp_id: experiment id appended to the X file names
    """
    X_train = train.drop(col_definition['target_col'], axis=1)
    y_train = train[col_definition['target_col']]
    # The test set may or may not include the target column.
    if col_definition['target_col'] in test.columns:
        X_test = test.drop(col_definition['target_col'], axis=1)
    else:
        X_test = test
    # NOTE(review): the X files embed exp_id with no separating underscore
    # ("X_train{exp_id}.pkl"), while y_train is always "y_train.pkl" with
    # no exp_id — different experiments overwrite the same y file. Confirm
    # this is intentional; sibling variants of this function use
    # "<stem>_<exp_id>.pkl" for all three files.
    Data.dump(X_train, join('../input', f"X_train{option['exp_id']}.pkl"))
    Data.dump(y_train, join('../input', 'y_train.pkl'))
    Data.dump(X_test, join('../input', f"X_test{option['exp_id']}.pkl"))
def save_as_pickle(train: pd.DataFrame,
                   test: pd.DataFrame,
                   col_definition: dict,
                   option: dict):
    """Split train/test into features/target and pickle them.

    col_definition:
        target_col
    option:
        output_dir, exp_id
    """
    target = col_definition['target_col']
    out_dir = option['output_dir']
    exp_id = option['exp_id']

    X_train = train.drop(target, axis=1)
    y_train = train[target]
    # Drop the target from the test set only when it is present.
    X_test = test.drop(target, axis=1) if target in test.columns else test

    Data.dump(X_train, join(out_dir, f"X_train_{exp_id}.pkl"))
    Data.dump(y_train, join(out_dir, f"y_train_{exp_id}.pkl"))
    Data.dump(X_test, join(out_dir, f"X_test_{exp_id}.pkl"))
def save_model(self) -> None:
    """Persist the trained model for this fold under ../output/model.

    Saved via pickle (Data.dump) rather than the model's own serializer
    so that attributes such as best_ntree_limit are not lost.
    """
    out_name = f"{self.run_fold_name}.model"
    model_path = os.path.join("../output/model", out_name)
    # Create the output directory on first use.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    Data.dump(self.model, model_path)
# NOTE(review): this chunk begins mid-call — the opening of the feature
# aggregation call (and its earlier arguments) is outside this view.
        'agg': ['mean', 'sum', 'median', 'min', 'max', 'var', 'std']
    },
],
    nunique_dict=[
        {
            'key': ['Sex'],
            'var': ['SibSp'],
            'agg': ['nunique']
        },
        {
            'key': ['Sex'],
            'var': ['Cabin'],
            'agg': ['nunique']
        },
    ])
print(X_train.shape, X_test.shape)

# Remove constant, duplicated, and highly correlated (> 0.99) columns,
# leaving categorical columns untouched.
unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
    X_train, X_test, escape_col=categorical_cols, threshold=0.99)
X_train.drop(unique_cols + duplicated_cols + high_corr_cols,
             axis=1, inplace=True)
X_test.drop(unique_cols + duplicated_cols + high_corr_cols,
            axis=1, inplace=True)
print(X_train.shape, X_test.shape)

# Persist the fe000 feature set.
Data.dump(X_train, output_dir + 'X_train_fe000.pkl')
Data.dump(X_test, output_dir + 'X_test_fe000.pkl')
Data.dump(y_train, output_dir + 'y_train_fe000.pkl')
"""Create the fe000 baseline feature set: raw columns with the target split off."""
import pandas as pd

from ayniy.utils import Data

INPUT_DIR = "../input/data_v2/"

if __name__ == "__main__":
    train = pd.read_csv(INPUT_DIR + "train.csv")
    test = pd.read_csv(INPUT_DIR + "test.csv")

    # Features are everything except the target column "score".
    X_train = train.drop("score", axis=1)
    y_train = train["score"]

    Data.dump(X_train, "../input/X_train_fe000.pkl")
    Data.dump(y_train, "../input/y_train_fe000.pkl")
    Data.dump(test, "../input/X_test_fe000.pkl")
def save_model(self) -> None:
    """Persist this fold's trained model as a pickle under ../output/model."""
    out_dir = "../output/model"
    # Ensure the output directory exists before writing.
    os.makedirs(out_dir, exist_ok=True)
    model_path = os.path.join(out_dir, f"{self.run_fold_name}.model")
    Data.dump(self.model, model_path)
def save_model(self):
    """Write the fold model to '../output/model/<run_fold_name>.model'."""
    model_path = os.path.join('../output/model',
                              f'{self.run_fold_name}.model')
    # Create the parent directory if it does not exist yet.
    parent = os.path.dirname(model_path)
    os.makedirs(parent, exist_ok=True)
    Data.dump(self.model, model_path)
# Build the "fe000_nn_small" feature set: fe000 features with NaNs filled,
# per-column standardization, and a small train sample for quick NN runs.
fe_id = "fe000"
fe_name = f"{fe_id}_nn_small"

X_train = Data.load(f"../input/pickle/X_train_{fe_id}.pkl")
y_train = Data.load(f"../input/pickle/y_train_{fe_id}.pkl")
X_test = Data.load(f"../input/pickle/X_test_{fe_id}.pkl")

del_col = []
for c in X_train.columns:
    X_train[c].fillna(-1, inplace=True)
    X_test[c].fillna(-1, inplace=True)
    try:
        # Standardize numeric columns: fit on train, apply to test.
        prep = StandardScaler()
        X_train[c] = prep.fit_transform(X_train[[c]])
        X_test[c] = prep.transform(X_test[[c]])
    except ValueError:
        # Columns that cannot be scaled (non-numeric) are dropped below.
        del_col.append(c)
print(del_col)
print(len(del_col))
X_train.drop(del_col, axis=1, inplace=True)
X_test.drop(del_col, axis=1, inplace=True)
print(X_train.shape)

# NOTE(review): .loc is label-based, so this keeps rows up to label 100
# (101 rows on a default RangeIndex) — confirm the intended sample size.
X_train = X_train.loc[:100]
y_train = y_train.loc[:100]

Data.dump(X_train, f"../input/pickle/X_train_{fe_name}.pkl")
Data.dump(y_train, f"../input/pickle/y_train_{fe_name}.pkl")
Data.dump(X_test, f"../input/pickle/X_test_{fe_name}.pkl")
# Combine the fe005 feature set with the additional ("add") features and
# save the result as the fe005_add feature set.
fe005_tr = Data.load('../input/X_train_fe005.pkl')
fe005_te = Data.load('../input/X_test_fe005.pkl')

# add_tr / add_te are expected to be defined earlier in this script.
train_fitting_ef_add = pd.concat([fe005_tr, add_tr], axis=1)
test_fitting_ef_add = pd.concat([fe005_te, add_te], axis=1)

fe_name = 'fe005_add'
Data.dump(train_fitting_ef_add, f'../input/X_train_{fe_name}.pkl')
Data.dump(test_fitting_ef_add, f'../input/X_test_{fe_name}.pkl')
if __name__ == "__main__":
    # Build the fe002 feature set: parse user_id and join per-user
    # attribute tables onto train (and then test).
    train = pd.read_csv(INPUT_DIR + "train.csv")
    train = split_user_id(train)
    train = add_user_ages(train)
    train = add_user_purposes(train)
    train = add_user_vecs(train)
    train = add_user_strengths(train)
    train = add_user_works(train)
    train = add_user_skills(train)
    train = add_user_educations(train)
    # Record the resulting column names for later inspection.
    pd.Series(train.columns).to_csv("../input/col_names.csv", index=False)
    Data.dump(
        train.drop(DELETE_COLS + ["score"], axis=1),
        "../input/X_train_fe002.pkl",
    )
    Data.dump(train["score"], "../input/y_train_fe002.pkl")
    # Free the train frame before building the test frame.
    del train
    gc.collect()

    test = pd.read_csv(INPUT_DIR + "test.csv")
    test = split_user_id(test)
    test = add_user_ages(test)
    test = add_user_purposes(test)
    test = add_user_vecs(test)
    test = add_user_strengths(test)
    test = add_user_works(test)
    test = add_user_skills(test)
    test = add_user_educations(test)
    # NOTE(review): the X_test dump presumably follows beyond this excerpt.
def run_train_cv(self) -> None:
    """Train and evaluate all cross-validation folds.

    Saves each fold's model, dumps the out-of-fold predictions in the
    original row order, computes the CV score with the configured
    evaluation metric, and records parameters/metrics to mlflow.

    Raises:
        ValueError: if ``self.evaluation_metric`` is not one of the
            supported metric names.
    """
    # mlflow: open the run that all parameters/metrics below attach to.
    mlflow.set_experiment(self.exp_name)
    mlflow.start_run(run_name=self.run_name)

    logger.info(f"{self.run_name} - start training cv")

    scores = []
    va_idxes = []
    preds = []

    # Adversarial validation: relabel train=0 / test=1 and train a model
    # to separate them, to gauge train/test distribution shift.
    if self.advanced and "adversarial_validation" in self.advanced:
        X_train = self.X_train
        X_test = self.X_test
        X_train["target"] = 0
        X_test["target"] = 1
        X_train = pd.concat([X_train, X_test], sort=False).reset_index(drop=True)
        y_train = X_train["target"]
        X_train.drop("target", axis=1, inplace=True)
        X_test.drop("target", axis=1, inplace=True)
        self.X_train = X_train
        self.y_train = y_train

    # Train each fold.
    for i_fold in range(self.cv.n_splits):
        logger.info(f"{self.run_name} fold {i_fold} - start training")
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        logger.info(
            f"{self.run_name} fold {i_fold} - end training - score {score}"
        )
        # Save the fold model, then keep this fold's results.
        model.save_model()
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Stitch the out-of-fold predictions back into original row order.
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)
    preds = preds[order]

    if self.evaluation_metric == "log_loss":
        cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
    elif self.evaluation_metric == "mean_absolute_error":
        cv_score = mean_absolute_error(self.y_train, preds)
    elif self.evaluation_metric == "rmse":
        cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
    elif self.evaluation_metric == "auc":
        cv_score = roc_auc_score(self.y_train, preds)
    elif self.evaluation_metric == "prauc":
        cv_score = average_precision_score(self.y_train, preds)
    else:
        # Bug fix: an unknown metric previously left cv_score unbound and
        # crashed below with a NameError; fail fast with a clear message.
        raise ValueError(
            f"unsupported evaluation_metric: {self.evaluation_metric}")

    logger.info(f"{self.run_name} - end training cv - score {cv_score}")

    # Persist the out-of-fold predictions.
    Data.dump(preds, f"../output/pred/{self.run_name}-train.pkl")

    # mlflow: record run parameters and the CV score.
    self.run_id = mlflow.active_run().info.run_id
    # Bug fix: self.model_cls is a class (sibling code parses its str()
    # representation), so __class__.__name__ logged the metaclass name
    # (e.g. "type"), not the model class name.
    log_param("model_name", self.model_cls.__name__)
    log_param("fe_name", self.fe_name)
    log_param("train_params", self.params)
    log_param("cv_strategy", str(self.cv))
    log_param("evaluation_metric", self.evaluation_metric)
    log_metric("cv_score", cv_score)
    log_param(
        "fold_scores",
        dict(
            zip([f"fold_{i}" for i in range(len(scores))],
                [round(s, 4) for s in scores])),
    )
    log_param("cols_definition", self.cols_definition)
    log_param("description", self.description)
    mlflow.end_run()
import pandas as pd

from ayniy.utils import Data

if __name__ == '__main__':
    # Append the "efficient" features to the fe001 top-500 feature set.
    ef_tr = pd.read_csv('../input/efficient_tr.csv')
    ef_te = pd.read_csv('../input/efficient_te.csv')
    base_tr = Data.load('../input/X_train_fe001_top500.pkl')
    base_te = Data.load('../input/X_test_fe001_top500.pkl')

    fe_name = 'fe001_top500_ef'
    Data.dump(pd.concat([base_tr, ef_tr], axis=1),
              f'../input/X_train_{fe_name}.pkl')
    Data.dump(pd.concat([base_te, ef_te], axis=1),
              f'../input/X_test_{fe_name}.pkl')
# NOTE(review): this excerpt appears to start inside a per-column loop —
# the loop header defining `col` (and the earlier definitions of
# `drop_cols` and `test_id`) is outside this view.
lbl = OrdinalEncoder(dtype='int')
train[col] = lbl.fit_transform(
    train[col].astype('str').fillna('-1').values.reshape(-1, 1))
test[col] = lbl.transform(
    test[col].astype('str').fillna('-1').values.reshape(-1, 1))

# Rank-transformed frequency encoding computed over train+test combined.
temp = pd.concat([train[[col]], test[[col]]], axis=0)
temp_mapping = temp.groupby(col).size() / len(temp)
temp['enc'] = temp[col].map(temp_mapping)
temp['enc'] = stats.rankdata(temp['enc'])
temp = temp.reset_index(drop=True)
train[f'rank_frqenc_{col}'] = temp[['enc']].values[:train.shape[0]]
test[f'rank_frqenc_{col}'] = temp[['enc']].values[train.shape[0]:]
test[col] = test[col].astype('category')
train[col] = train[col].astype('category')

# Drop the accumulated delete candidates (deduplicated) and shrink dtypes.
drop_cols = list(set(drop_cols))
print(len(drop_cols))
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
gc.collect()
print(train.shape, test.shape)

# Restore encounter_id ordering on test before saving, then drop the id.
test['encounter_id'] = test_id
test = test.sort_values('encounter_id').reset_index(drop=True)
fe_name = 'fe_siavrez'
Data.dump(train, f'../input/pickle/X_train_{fe_name}.pkl')
# Data.dump(y, f'../input/pickle/y_train_{fe_name}.pkl')
Data.dump(test.drop('encounter_id', axis=1),
          f'../input/pickle/X_test_{fe_name}.pkl')
def run_train_cv(self) -> None:
    """Train and evaluate all cross-validation folds.

    Saves each fold's model, dumps the out-of-fold predictions in the
    original row order, computes the CV score with the configured
    evaluation metric, and records parameters/metrics to mlflow.

    Raises:
        ValueError: if ``self.evaluation_metric`` is not one of the
            supported metric names.
    """
    # mlflow: open the run that all parameters/metrics below attach to.
    mlflow.set_experiment(self.exp_name)
    mlflow.start_run(run_name=self.run_name)

    logger.info(f'{self.run_name} - start training cv')

    scores = []
    va_idxes = []
    preds = []

    # Adversarial validation: relabel train=0 / test=1 and train a model
    # to separate them, to gauge train/test distribution shift.
    if self.advanced and 'adversarial_validation' in self.advanced:
        X_train = self.X_train
        X_test = self.X_test
        X_train['target'] = 0
        X_test['target'] = 1
        X_train = pd.concat([X_train, X_test], sort=False).reset_index(drop=True)
        y_train = X_train['target']
        X_train.drop('target', axis=1, inplace=True)
        X_test.drop('target', axis=1, inplace=True)
        self.X_train = X_train
        self.y_train = y_train

    # Train each fold.
    for i_fold in range(self.cv.n_splits):
        logger.info(f'{self.run_name} fold {i_fold} - start training')
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        logger.info(
            f'{self.run_name} fold {i_fold} - end training - score {score}'
        )
        # Save the fold model, then keep this fold's results.
        model.save_model()
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Stitch the out-of-fold predictions back into original row order.
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)
    preds = preds[order]

    if self.evaluation_metric == 'log_loss':
        cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
    elif self.evaluation_metric == 'mean_absolute_error':
        cv_score = mean_absolute_error(self.y_train, preds)
    elif self.evaluation_metric == 'rmse':
        cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
    elif self.evaluation_metric == 'auc':
        cv_score = roc_auc_score(self.y_train, preds)
    elif self.evaluation_metric == 'prauc':
        cv_score = average_precision_score(self.y_train, preds)
    else:
        # Bug fix: an unknown metric previously left cv_score unbound and
        # crashed below with a NameError; fail fast with a clear message.
        raise ValueError(
            f'unsupported evaluation_metric: {self.evaluation_metric}')

    logger.info(f'{self.run_name} - end training cv - score {cv_score}')

    # Persist the out-of-fold predictions.
    Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

    # mlflow: record run parameters and the CV score.
    self.run_id = mlflow.active_run().info.run_id
    # Derive the model class name from str(self.model_cls), i.e.
    # "<class 'pkg.ModelX'>" -> "ModelX" (fragile but functional parsing).
    log_param('model_name', str(self.model_cls).split('.')[-1][:-2])
    log_param('fe_name', self.fe_name)
    log_param('train_params', self.params)
    log_param('cv_strategy', str(self.cv))
    log_param('evaluation_metric', self.evaluation_metric)
    log_metric('cv_score', cv_score)
    log_param(
        'fold_scores',
        dict(
            zip([f'fold_{i}' for i in range(len(scores))],
                [round(s, 4) for s in scores])))
    log_param('cols_definition', self.cols_definition)
    log_param('description', self.description)
    mlflow.end_run()
ython select_features.py --n 100 """
# NOTE(review): the line above is the tail of a module usage docstring
# whose opening quotes are outside this excerpt.
import argparse

import pandas as pd

from ayniy.utils import Data

# Parse the number of top features to keep.
parser = argparse.ArgumentParser()
parser.add_argument('--n')
args = parser.parse_args()

fe_id = 'fe005'
run_id = 'run046'  # run whose feature-importance CSV provides the ranking
N_FEATURES = int(args.n)
fe_name = f'fe005_top{N_FEATURES}'

X_train = Data.load(f'../input/X_train_{fe_id}.pkl')
y_train = Data.load(f'../input/y_train_{fe_id}.pkl')
X_test = Data.load(f'../input/X_test_{fe_id}.pkl')

# Keep only the top-N features by importance from the given run.
fi = pd.read_csv(
    f'../output/importance/{run_id}-fi.csv')['Feature'][:N_FEATURES]
X_train = X_train[fi]
X_test = X_test[fi]

Data.dump(X_train, f'../input/X_train_{fe_name}.pkl')
Data.dump(y_train, f'../input/y_train_{fe_name}.pkl')
Data.dump(X_test, f'../input/X_test_{fe_name}.pkl')
import numpy as np
import pandas as pd

from ayniy.utils import Data

# Target for fe000: log1p-transformed 'y' from the raw training data.
train = pd.read_csv('../input/train_data.csv')
y_train = np.log1p(train['y'])
Data.dump(y_train, '../input/y_train_fe000.pkl')