def preprocess(fe_cfg: Config):
    fe_name = fe_cfg.basic.name
    target_col = fe_cfg.column.target
    train_path = f"{DataPath.interim.train}.jbl"
    test_path = f"{DataPath.interim.test}.jbl"
    for path, is_train in zip([train_path, test_path], [True, False]):
        df = Jbl.load(path)
        if "frame" in fe_cfg.__annotations__:
            if "window" in fe_cfg.frame.__annotations__:
                frame_column = fe_cfg.frame.column
                frame_window = fe_cfg.frame.window
                df = _filter_frame_window(df, frame_column, frame_window)
            else:
                frame_start_q = fe_cfg.frame.start
                frame_end_q = fe_cfg.frame.end
                df = _filter_frame(df, frame_start_q, frame_end_q)
        df_processed = _build_features(df, is_train, fe_cfg)
        if is_train:
            X = df_processed.drop(target_col, axis=1)
            y = df_processed[target_col]
        else:
            X = df_processed.copy()
            y = None
        X_save_path = (f"{DataPath.processed.X_train}_{fe_name}.jbl"
                       if is_train else
                       f"{DataPath.processed.X_test}_{fe_name}.jbl")
        Jbl.save(X, X_save_path)
        if is_train:
            y_save_path = f"{DataPath.processed.y_train}_{fe_name}.jbl"
            Jbl.save(y, y_save_path)

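# `Jbl` is used throughout for serialization but is not defined in this excerpt.
# A minimal sketch of such a joblib wrapper (an assumption, not the project's
# actual implementation) could look like this:
import os

import joblib


class Jbl:
    """Hypothetical thin wrapper around joblib.dump / joblib.load."""

    @staticmethod
    def save(obj, path: str) -> None:
        # Ensure the parent directory exists before dumping.
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        joblib.dump(obj, path, compress=3)

    @staticmethod
    def load(path: str):
        return joblib.load(path)
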
def join_data():
    train_files = os.listdir(DataPath.raw.train_dir)
    test_files = os.listdir(DataPath.raw.test_dir)
    for files, is_train in zip([train_files, test_files], [True, False]):
        dfs = []
        for f in files:
            if is_train:
                path = f"{DataPath.raw.train_dir}/{f}"
            else:
                path = f"{DataPath.raw.test_dir}/{f}"
            df_tmp = pd.read_csv(path)
            stem = os.path.splitext(f)[0]
            # Files whose numeric stem is below 400 are the screen-play samples.
            if int(stem) < 400:
                df_tmp["is_screen_play"] = 1
            else:
                df_tmp["is_screen_play"] = 0
            df_tmp["filename"] = stem
            dfs.append(df_tmp)
        df = pd.concat(dfs, axis=0, ignore_index=True)
        df = df.sort_values(["filename", "frame"]).reset_index(drop=True)
        if not is_train:
            # The test set carries no label column.
            df = df.drop("is_screen_play", axis=1)
        del dfs
        gc.collect()
        save_path = (f"{DataPath.interim.train}.jbl"
                     if is_train else f"{DataPath.interim.test}.jbl")
        Jbl.save(df, save_path)

def save_model(self, model_path: str) -> None:
    """Save the model.

    :param model_path: destination path for the model
    """
    model_path_dir = os.path.dirname(model_path)
    mkdir(model_path_dir)
    Jbl.save(self.model, model_path)

def __init__(self, cfgs: Dict[str, Config], logger):
    super().__init__(cfgs, logger)
    self.X_train = Jbl.load(
        f"{DataPath.processed.X_train}_{self.fe_name}.jbl")
    self.y_train = Jbl.load(
        f"{DataPath.processed.y_train}_{self.fe_name}.jbl")
    self.X_test = Jbl.load(
        f"{DataPath.processed.X_test}_{self.fe_name}.jbl")
    self.best_threshold = 0.0

def __init__(self, cfgs: Dict[str, Config], logger: logging.Logger):
    blend_cfg = cfgs["blend"]
    self.description = blend_cfg.basic.description
    self.exp_name = blend_cfg.basic.exp_name
    self.run_name = blend_cfg.basic.name
    self.run_id = None
    self.fe_name = blend_cfg.basic.fe_name
    self.run_cfg = blend_cfg
    self.params = blend_cfg.params
    self.cv = generate_cv(blend_cfg)
    self.column = blend_cfg.column
    self.cat_cols = (blend_cfg.column.categorical
                     if "categorical" in blend_cfg.column.__annotations__
                     else None)
    self.kfold = blend_cfg.kfold
    self.evaluation_metric = blend_cfg.model.eval_metric
    self.logger = logger

    @dataclass
    class advanced:
        PseudoRunner: PseudoRunner = None
        ResRunner: ResRunner = None
        AdversarialValidation: AdversarialValidation = None
        Selector: Selector = None

    self.advanced = advanced

    if blend_cfg.model.name in models_map.keys():
        self.model_cls = models_map[blend_cfg.model.name]
    else:
        raise ValueError(f"model_name {blend_cfg.model.name} not found")

    # Stack each base run's OOF / test predictions column-wise as meta-features.
    trs = []
    tes = []
    for run_name, _ in blend_cfg.result.__annotations__.items():
        tr = Jbl.load(f"{ModelPath.prediction}/{run_name}-train.jbl")
        te = Jbl.load(f"{ModelPath.prediction}/{run_name}-test.jbl")
        trs.append(tr)
        tes.append(te)
    train = pd.DataFrame(trs).T
    train.columns = list(blend_cfg.result.__annotations__.keys())
    test = pd.DataFrame(tes).T
    test.columns = list(blend_cfg.result.__annotations__.keys())
    target = [1] * 400 + [0] * (1528 - 400)
    train["y"] = target
    self.X_train = train.drop("y", axis=1)
    self.y_train = train["y"]
    self.X_test = test.copy()
    self.best_threshold = 0.0

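# Illustrative sketch (made-up values): how the blend runner above turns each
# base run's prediction vector into one meta-feature column via pd.DataFrame(trs).T.
import pandas as pd

oof_run_a = [0.9, 0.2, 0.7]  # hypothetical OOF predictions of run "a"
oof_run_b = [0.8, 0.3, 0.6]  # hypothetical OOF predictions of run "b"
train_meta = pd.DataFrame([oof_run_a, oof_run_b]).T
train_meta.columns = ["run_a", "run_b"]
print(train_meta)
#    run_a  run_b
# 0    0.9    0.8
# 1    0.2    0.3
# 2    0.7    0.6
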
def submission(self):
    if self.advanced and "separate" in self.advanced.__annotations__:
        sub = Jbl.load(
            f"{DataPath.processed.prefix}/X_test_{self.fe_name}.jbl"
        ).loc[:, [self.separate_col]]
        separate_col_uniques = sub[self.separate_col].unique()
        results = {}
        for separate_col_val in separate_col_uniques:
            pred = Jbl.load(
                f"{ModelPath.prediction}/{self.run_name}-{separate_col_val}-test.jbl"
            )
            sub_separate_idx = sub[sub[self.separate_col] ==
                                   separate_col_val].index
            result = {
                idx_: [p_]
                for idx_, p_ in zip(sub_separate_idx, pred)
            }
            results.update(result)
        sub = (pd.DataFrame(results).T.reset_index().rename(columns={
            "index": "id",
            0: self.column.target
        }).sort_values("id").reset_index(drop=True))
        sub.loc[:, "id"] = (
            Jbl.load(f"{DataPath.interim.test}.jbl").loc[:, ["id"]].values)
        pred = sub[self.column.target].values
    else:
        # sub = Jbl.load(f"{DataPath.interim.test}").loc[:, ["id"]]
        # pred = Jbl.load(f"{ModelPath.prediction}/{self.run_name}-test.jbl")
        sub = pd.DataFrame()
        pred = Jbl.load(
            f"{ModelPath.prediction}/{self.run_name}-test-binarized.jbl")
        pred = pred.reshape(-1)
    if self.advanced and "predict_exp" in self.advanced.__annotations__:
        sub[self.column.target] = np.exp(pred)
    else:
        sub[self.column.target] = pred
    # sub.to_csv(
    #     f"{DataPath.submission}/submission_{self.run_name}.csv", index=False,
    # )
    sub.to_csv(
        f"{ModelPath.submission}/submission_{self.run_name}.csv",
        index=False,
        header=None,
    )

def submission(self):
    pred = Jbl.load(
        f"{PATH['prefix']['prediction']}/{self.run_name}-test.jbl")
    sub = Loader().load_test().loc[:, ["id"]]
    if self.advanced and "predict_exp" in self.advanced:
        sub[self.cols_definition["target_col"]] = np.exp(pred)
    else:
        sub[self.cols_definition["target_col"]] = pred
    sub.to_csv(
        f"{PATH['prefix']['submission']}/submission_{self.run_name}.csv",
        index=False,
    )

def __init__(self, config: dict, cv):
    self.exp_name = config["exp_name"]
    self.run_name = config["run_name"]
    self.run_id = None
    self.fe_name = config["fe_name"]
    self.X_train = Jbl.load(
        f"{PATH['prefix']['processed']}/X_train_{config['fe_name']}.jbl")
    self.y_train = Jbl.load(
        f"{PATH['prefix']['processed']}/y_train_{config['fe_name']}.jbl")
    self.X_test = Jbl.load(
        f"{PATH['prefix']['processed']}/X_test_{config['fe_name']}.jbl")
    self.evaluation_metric = config["evaluation_metric"]
    self.params = config["params"]
    self.cols_definition = config["cols_definition"]
    self.kfold = config["kfold"]["method"]
    self.cv = cv
    self.description = config["description"]
    self.advanced = config["advanced"] if "advanced" in config else None
    if config["model_name"] in models_map.keys():
        self.model_cls = models_map[config["model_name"]]
    else:
        raise ValueError(f"model_name {config['model_name']} not found")

def train_fold(self, i_fold: int):
    """Train and evaluate a single cross-validation fold.

    Called from other methods, and also usable on its own for sanity checks
    and parameter tuning.

    :param i_fold: fold index ('all' to train on all data)
    :return: tuple of (model instance, record indices, predictions, evaluation score)
    """
    # Load the training data
    X_train = self.X_train.copy()
    y_train = self.y_train.copy()

    # Attach residuals
    if self.advanced and "ResRunner" in self.advanced:
        oof = Jbl.load(self.advanced["ResRunner"]["oof"])
        X_train["res"] = (y_train - oof).abs()

    # Split into training and validation folds
    tr_idx, va_idx = self.load_index_fold(i_fold)
    X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_val, y_val = X_train.iloc[va_idx], y_train.iloc[va_idx]

    # Downsample by residual
    if self.advanced and "ResRunner" in self.advanced:
        # Compute the mask once so X_tr and y_tr stay aligned.
        res_mask = (
            X_tr["res"] < self.advanced["ResRunner"]["res_threshold"]).values
        X_tr = X_tr.loc[res_mask]
        y_tr = y_tr.loc[res_mask]
        print(X_tr.shape)
        X_tr.drop("res", axis=1, inplace=True)
        X_val.drop("res", axis=1, inplace=True)

    # Pseudo labeling
    if self.advanced and "PseudoRunner" in self.advanced:
        y_test_pred = Jbl.load(self.advanced["PseudoRunner"]["y_test_pred"])
        if "pl_threshold" in self.advanced["PseudoRunner"]:
            pl_threshold = self.advanced["PseudoRunner"]["pl_threshold"]
            # Keep only confidently predicted test rows and hard-label them.
            X_add = self.X_test.loc[(y_test_pred < pl_threshold)
                                    | (y_test_pred > 1 - pl_threshold)]
            y_add = pd.DataFrame(y_test_pred).loc[
                (y_test_pred < pl_threshold)
                | (y_test_pred > 1 - pl_threshold)]
            y_add = pd.DataFrame([1 if ya > 0.5 else 0 for ya in y_add[0]])
        elif "pl_threshold_neg" in self.advanced["PseudoRunner"]:
            pl_threshold_neg = self.advanced["PseudoRunner"]["pl_threshold_neg"]
            pl_threshold_pos = self.advanced["PseudoRunner"]["pl_threshold_pos"]
            X_add = self.X_test.loc[(y_test_pred < pl_threshold_neg)
                                    | (y_test_pred > pl_threshold_pos)]
            y_add = pd.DataFrame(y_test_pred).loc[
                (y_test_pred < pl_threshold_neg)
                | (y_test_pred > pl_threshold_pos)]
            y_add = pd.DataFrame([1 if ya > 0.5 else 0 for ya in y_add[0]])
        else:
            X_add = self.X_test
            y_add = pd.DataFrame(y_test_pred)
        print(f"added X_test: {len(X_add)}")
        X_tr = pd.concat([X_tr, X_add])
        y_tr = pd.concat([y_tr, y_add])

    # Train the model
    model = self.build_model(i_fold)
    model.train(X_tr, y_tr, X_val, y_val, self.X_test)

    # Predict and evaluate on the validation fold
    pred_val = model.predict(X_val)
    # Post-processing
    pred_val = postprocess(pred_val)
    score = self.evaluate(y_val.values, pred_val)

    # Return the model, indices, predictions, and score
    return model, va_idx, pred_val, score

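# Illustrative sketch (not part of the pipeline above, values made up): how the
# pseudo-labeling filter in train_fold selects confident test rows and turns
# them into hard labels.
import numpy as np
import pandas as pd

y_test_pred = np.array([0.02, 0.55, 0.97, 0.40])
pl_threshold = 0.05
confident = (y_test_pred < pl_threshold) | (y_test_pred > 1 - pl_threshold)
pseudo_labels = pd.DataFrame(
    [1 if p > 0.5 else 0 for p in y_test_pred[confident]])
print(confident)      # [ True False  True False]
print(pseudo_labels)  # column 0 holds the hard labels: [0, 1]
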
def run_predict_cv(self) -> None:
    """Predict on the test data by averaging the fold models trained with cross-validation.

    run_train_cv must have been executed beforehand.
    """
    logger.info(f"{self.run_name} - start prediction cv")
    X_test = self.X_test
    preds = []
    show_feature_importance = "LGBM" in str(self.model_cls)
    if show_feature_importance:
        feature_importances = pd.DataFrame()

    # Predict with each fold's model
    for i_fold in range(self.cv.n_splits):
        logger.info(f"{self.run_name} - start prediction fold:{i_fold}")
        model = self.build_model(i_fold)
        model.load_model()
        pred = model.predict(X_test)
        preds.append(pred)
        logger.info(f"{self.run_name} - end prediction fold:{i_fold}")
        if show_feature_importance:
            feature_importances = pd.concat(
                [feature_importances, model.feature_importance(X_test)],
                axis=0)

    # Average the fold predictions
    pred_avg = np.mean(preds, axis=0)

    # Save the predictions
    Jbl.save(pred_avg,
             f"{PATH['prefix']['prediction']}/{self.run_name}-test.jbl")
    logger.info(f"{self.run_name} - end prediction cv")

    # Feature importance
    if show_feature_importance:
        aggs = (feature_importances.groupby("Feature").mean().sort_values(
            by="importance", ascending=False))
        cols = aggs[:200].index
        pd.DataFrame(aggs.index).to_csv(
            f"{PATH['prefix']['importance']}/{self.run_name}-fi.csv",
            index=False)
        best_features = feature_importances.loc[
            feature_importances.Feature.isin(cols)]
        plt.figure(figsize=(14, 26))
        sns.barplot(
            x="importance",
            y="Feature",
            data=best_features.sort_values(by="importance", ascending=False),
        )
        plt.title("LightGBM Features (averaged over folds)")
        plt.tight_layout()
        plt.savefig(f"{PATH['prefix']['importance']}/{self.run_name}-fi.png")
        plt.show()

        # mlflow
        mlflow.start_run(run_id=self.run_id)
        log_artifact(f"{PATH['prefix']['importance']}/{self.run_name}-fi.png")
        mlflow.end_run()

def run_train_cv(self) -> None:
    """Train and evaluate with cross-validation.

    In addition to training and evaluation, this saves each fold's model and
    logs the scores.
    """
    # mlflow
    mlflow.set_experiment(self.exp_name)
    mlflow.start_run(run_name=self.run_name)

    logger.info(f"{self.run_name} - start training cv")
    scores = []
    va_idxes = []
    preds = []

    # Adversarial validation
    if self.advanced and "adversarial_validation" in self.advanced:
        X_train = self.X_train
        X_test = self.X_test
        X_train["target"] = 0
        X_test["target"] = 1
        X_train = pd.concat([X_train, X_test],
                            sort=False).reset_index(drop=True)
        y_train = X_train["target"]
        X_train.drop("target", axis=1, inplace=True)
        X_test.drop("target", axis=1, inplace=True)
        self.X_train = X_train
        self.y_train = y_train

    # Train each fold
    for i_fold in range(self.cv.n_splits):
        logger.info(f"{self.run_name} fold {i_fold} - start training")
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        logger.info(
            f"{self.run_name} fold {i_fold} - end training - score {score}\tbest_iteration: {model.model.best_iteration}"
        )

        # Save the fold model
        model.save_model()

        # Keep the fold results
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Combine the results of all folds
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)
    preds = preds[order]
    cv_score = self.evaluate(self.y_train.values, preds)
    logger.info(
        f"{self.run_name} - end training cv - score {cv_score}\tbest_iteration: {model.model.best_iteration}"
    )

    # Save the OOF predictions
    Jbl.save(preds,
             f"{PATH['prefix']['prediction']}/{self.run_name}-train.jbl")

    # mlflow
    self.run_id = mlflow.active_run().info.run_id
    log_param("model_name", self.model_cls.__class__.__name__)
    log_param("fe_name", self.fe_name)
    log_param("train_params", self.params)
    log_param("cv_strategy", str(self.cv))
    log_param("evaluation_metric", self.evaluation_metric)
    log_metric("cv_score", cv_score)
    log_param(
        "fold_scores",
        dict(
            zip(
                [f"fold_{i}" for i in range(len(scores))],
                [round(s, 4) for s in scores],
            )),
    )
    log_param("cols_definition", self.cols_definition)
    log_param("description", self.description)
    mlflow.end_run()

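# Illustrative sketch (not the project's code): why the adversarial-validation
# relabeling above works. Train rows get target 0 and test rows get target 1;
# if a classifier can separate them, the two distributions differ.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X_train_demo = pd.DataFrame({"f": rng.normal(0.0, 1.0, 500)})
X_test_demo = pd.DataFrame({"f": rng.normal(0.5, 1.0, 500)})  # shifted feature
X_demo = pd.concat([X_train_demo, X_test_demo], ignore_index=True)
y_demo = np.r_[np.zeros(len(X_train_demo)), np.ones(len(X_test_demo))]
auc = cross_val_score(LogisticRegression(), X_demo, y_demo,
                      scoring="roc_auc", cv=5).mean()
print(auc)  # clearly above 0.5 -> train and test are distinguishable
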
def load_model(self, path: str = "models/model"):
    model_path = os.path.join(path, f"{self.run_fold_name}.model")
    self.model = Jbl.load(model_path)

def save_model(self, path: str = "models/model"):
    model_path = os.path.join(path, f"{self.run_fold_name}.model")
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    Jbl.save(self.model, model_path)

def load_model(self, model_path: str) -> None:
    """Load the model.

    :param model_path: path to load the model from
    """
    self.model = Jbl.load(model_path)

def run_train_cv(self) -> None:
    """Train and evaluate with cross-validation.

    In addition to training and evaluation, this saves each fold's model and
    logs the scores.
    """
    # mlflow
    mlflow.set_experiment(self.exp_name)
    mlflow.start_run(run_name=self.run_name)

    self.logger.info(f"{self.run_name} - start training cv")
    scores = []
    va_idxes = []
    preds = []

    # Adversarial validation
    if self.advanced and self.advanced.AdversarialValidation is not None:
        X_train = self.X_train.copy()
        X_test = self.X_test.copy()
        X_train["target"] = 0
        X_test["target"] = 1
        X_train = pd.concat([X_train, X_test],
                            sort=False).reset_index(drop=True)
        y_train = X_train["target"]
        X_train.drop("target", axis=1, inplace=True)
        X_test.drop("target", axis=1, inplace=True)
        self.X_train = X_train
        self.y_train = y_train

    # Feature selection
    if self.advanced and self.advanced.Selector is not None:
        self.logger.info(f"{self.run_name} - start feature_selection")
        self.logger.info(
            f"{self.run_name} - #features before selection: {len(self.X_train.columns.tolist())}"
        )
        selector_params = dataclasses.asdict(self.advanced.Selector)
        selector_name = selector_params.pop("name")
        selector = None
        if selector_name == "GBDTFeatureSelector":
            selector = GBDTFeatureSelector(
                input_cols=self.X_train.columns.tolist(),
                target_col=self.column.target,
                **selector_params,
            )
        else:
            raise ValueError(f"{selector_name} is not implemented")
        self.X_train = selector.fit_transform(
            pd.concat([self.X_train, self.y_train], axis=1))
        self.X_test = selector.transform(self.X_test)
        self.logger.info(
            f"{self.run_name} - #features after selection: {len(self.X_train.columns.tolist())}"
        )
        self.logger.info(f"{self.run_name} - end feature_selection")
        os.makedirs(f"{ModelPath.selector}", exist_ok=True)
        Jbl.save(selector, f"{ModelPath.selector}/{self.run_name}.selector")

    # Train each fold
    for i_fold in range(self.cv.n_splits):
        self.logger.info(f"{self.run_name} fold {i_fold} - start training")
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        fold_score_log = (
            f"{self.run_name} fold {i_fold} - end training - score {score}")
        if hasattr(model.model, "best_iteration"):
            fold_score_log += f"\tbest_iteration: {model.model.best_iteration}"
        self.logger.info(fold_score_log)
        self.logger.info(
            f"{self.run_name} fold {i_fold} - best threshold - {self.best_threshold}"
        )

        # Save the fold model
        model.save_model()

        # Keep the fold results
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Combine the results of all folds
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)
    preds = preds[order]
    cv_score = self.evaluate(self.y_train.values, preds)
    preds_binarized = np.where(preds > self.best_threshold, 1, 0)
    self.logger.info(f"{self.run_name} - end training cv - score {cv_score}")
    self.logger.info(
        f"{self.run_name} - best threshold - {self.best_threshold}")

    # Save the OOF predictions
    Jbl.save(preds, f"{ModelPath.prediction}/{self.run_name}-train.jbl")
    Jbl.save(
        preds_binarized,
        f"{ModelPath.prediction}/{self.run_name}-train-binarized.jbl",
    )
    Jbl.save(
        self.best_threshold,
        f"{ModelPath.prediction}/{self.run_name}-best-threshold.jbl",
    )

    # mlflow
    self.mlflow(cv_score, scores)

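# How self.best_threshold is set is not shown in this excerpt. A common approach,
# sketched here purely as an illustration (hypothetical helper, not necessarily
# this project's method), is to scan thresholds over the out-of-fold predictions
# and keep the one that maximizes the evaluation metric (F1 here):
import numpy as np
from sklearn.metrics import f1_score


def search_best_threshold(y_true: np.ndarray, y_prob: np.ndarray) -> float:
    """Grid-search a binarization threshold that maximizes F1 (illustrative only)."""
    best_threshold, best_score = 0.5, -1.0
    for threshold in np.arange(0.01, 1.0, 0.01):
        score = f1_score(y_true, (y_prob > threshold).astype(int))
        if score > best_score:
            best_threshold, best_score = float(threshold), score
    return best_threshold
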
def load_model(self):
    model_path = os.path.join(f"{ModelPath.model}",
                              f"{self.run_fold_name}.model")
    self.model = Jbl.load(model_path)

def save_model(self):
    model_path = os.path.join(f"{ModelPath.model}",
                              f"{self.run_fold_name}.model")
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    Jbl.save(self.model, model_path)

def save_model(self, path: str = "models/model"):
    model_path = os.path.join(path, f"{self.run_fold_name}.model")
    Jbl.save(self.model, model_path)
    print(f"{model_path} is saved")

def run_predict_cv(self) -> None:
    """Predict on the test data by averaging the fold models trained with cross-validation.

    run_train_cv must have been executed beforehand.
    """
    self.logger.info(f"{self.run_name} - start prediction cv")
    X_test = self.X_test.copy()
    if self.advanced and self.advanced.Selector is not None:
        selector = Jbl.load(f"{ModelPath.selector}/{self.run_name}.selector")
        X_test = selector.transform(X_test)
    preds = []
    show_feature_importance = "LGBM" in str(self.model_cls)
    feature_importances = pd.DataFrame()

    # Predict with each fold's model
    for i_fold in range(self.cv.n_splits):
        self.logger.info(f"{self.run_name} - start prediction fold:{i_fold}")
        if self.pretrain is None:
            model = self.build_model(i_fold)
        else:
            model = self.model_cls(f"{self.pretrain.run_name}-{i_fold}",
                                   self.run_cfg, self.cat_cols)
        model.load_model()
        pred = model.predict(X_test)
        # Post-processing
        pred = postprocess_prediction(pred)
        preds.append(pred)
        self.logger.info(f"{self.run_name} - end prediction fold:{i_fold}")
        if show_feature_importance:
            feature_importances = pd.concat(
                [feature_importances, model.feature_importance(X_test)],
                axis=0)

    # Average the fold predictions
    pred_avg = np.mean(preds, axis=0)

    # Binarize with the best threshold
    if self.pretrain is None:
        best_threshold = Jbl.load(
            f"{ModelPath.prediction}/{self.run_name}-best-threshold.jbl")
    else:
        best_threshold = Jbl.load(
            f"{ModelPath.prediction}/{self.pretrain.run_name}-best-threshold.jbl"
        )
    pred_avg_binarized = np.where(pred_avg > best_threshold, 1, 0)

    # Save the predictions
    Jbl.save(pred_avg, f"{ModelPath.prediction}/{self.run_name}-test.jbl")
    Jbl.save(
        pred_avg_binarized,
        f"{ModelPath.prediction}/{self.run_name}-test-binarized.jbl",
    )
    self.logger.info(f"{self.run_name} - end prediction cv")

    # Feature importance
    if show_feature_importance:
        aggs = (feature_importances.groupby("Feature").mean().sort_values(
            by="importance", ascending=False))
        cols = aggs[:200].index
        pd.DataFrame(aggs.index).to_csv(
            f"{ModelPath.importance}/{self.run_name}-fi.csv", index=False)
        best_features = feature_importances.loc[
            feature_importances.Feature.isin(cols)]
        plt.figure(figsize=(14, 26))
        sns.barplot(
            x="importance",
            y="Feature",
            data=best_features.sort_values(by="importance", ascending=False),
        )
        plt.title("LightGBM Features (averaged over folds)")
        plt.tight_layout()
        plt.savefig(f"{ModelPath.importance}/{self.run_name}-fi.png")
        plt.show()

        # mlflow
        mlflow.start_run(run_id=self.run_id)
        log_artifact(f"{ModelPath.importance}/{self.run_name}-fi.png")
        mlflow.end_run()
