def load_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if to_rank:
        oof = rankdata(oof) / len(oof)
        pred = rankdata(pred) / len(pred)
    return (oof, pred)
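A minimal usage sketch for blending two runs on a rank scale; the run ids are placeholders and both prediction pickles are assumed to already exist under ../output/pred:

# Hypothetical example: average two runs after converting to ranks,
# so models with different output scales can be blended safely.
oof_a, pred_a = load_from_run_id('run001', to_rank=True)  # run ids are placeholders
oof_b, pred_b = load_from_run_id('run002', to_rank=True)
blend_oof = (oof_a + oof_b) / 2
blend_pred = (pred_a + pred_b) / 2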
def run_predict_cv(self) -> None:
    """Predict on the test data by averaging the models trained in each CV fold.

    run_train_cv must be executed beforehand.
    """
    logger.info(f'{self.run_name} - start prediction cv')
    X_test = self.X_test
    preds = []
    show_feature_importance = 'LGBM' in str(self.model_cls)
    if show_feature_importance:
        feature_importances = pd.DataFrame()

    # Predict with each fold's model
    for i_fold in range(self.cv.n_splits):
        logger.info(f'{self.run_name} - start prediction fold:{i_fold}')
        model = self.build_model(i_fold)
        model.load_model()
        pred = model.predict(X_test)
        preds.append(pred)
        logger.info(f'{self.run_name} - end prediction fold:{i_fold}')
        if show_feature_importance:
            feature_importances = pd.concat(
                [feature_importances, model.feature_importance(X_test)],
                axis=0)

    # Average the fold predictions
    pred_avg = np.mean(preds, axis=0)

    # Save the predictions
    Data.dump(pred_avg, f'../output/pred/{self.run_name}-test.pkl')
    logger.info(f'{self.run_name} - end prediction cv')

    # Feature importance
    if show_feature_importance:
        aggs = feature_importances.groupby('Feature').mean().sort_values(
            by="importance", ascending=False)
        cols = aggs[:200].index
        pd.DataFrame(aggs.index).to_csv(
            f'../output/importance/{self.run_name}-fi.csv', index=False)

        best_features = feature_importances.loc[
            feature_importances.Feature.isin(cols)]
        plt.figure(figsize=(14, 26))
        sns.barplot(x="importance",
                    y="Feature",
                    data=best_features.sort_values(by="importance",
                                                   ascending=False))
        plt.title('LightGBM Features (averaged over folds)')
        plt.tight_layout()
        plt.savefig(f'../output/importance/{self.run_name}-fi.png')
        plt.show()

        # mlflow
        mlflow.start_run(run_id=self.run_id)
        log_artifact(f'../output/importance/{self.run_name}-fi.png')
        mlflow.end_run()
def load(self):
    X_train = Data.load(
        join(self.output_dir, f"X_train_{self.run_name}.pkl"))
    y_train = Data.load(
        join(self.output_dir, f"y_train_{self.run_name}.pkl"))
    X_test = Data.load(
        join(self.output_dir, f"X_test_{self.run_name}.pkl"))
    return X_train, X_test, y_train
def make_predictions(data: list, weights: list):
    pred = 0
    for i, d in enumerate(data):
        if i < len(weights):
            pred += d[1] * weights[i]
        else:
            # The last weight is implied so that all weights sum to 1
            pred += d[1] * (1 - sum(weights))
    # run_name is a module-level global defined in the script below
    Data.dump(pred, f'../output/pred/{run_name}-test.pkl')
    return pred
def load_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if run_id in ('run015',):  # note: ('run015') without the comma is a plain string
        oof = oof.reshape(-1, )
        pred = pred.reshape(-1, )
    if to_rank:
        oof = rankdata(oof) / len(oof)
        pred = rankdata(pred) / len(pred)
    return (oof, pred)
def f(x):
    pred = 0
    for i, d in enumerate(data):
        if i < len(x):
            pred += d[0] * x[i]
        else:
            pred += d[0] * (1 - sum(x))
    score = np.sqrt(mean_squared_error(y_train, pred))
    Data.dump(pred, f'../output/pred/{run_name}-train.pkl')
    return score
def f(x):
    pred = 0
    for i, d in enumerate(data):
        if i < len(x):
            pred += d[0] * x[i]
        else:
            pred += d[0] * (1 - sum(x))
    # Negate AUC because the optimizer minimizes the objective
    score = -1 * roc_auc_score(y_train, pred)
    Data.dump(pred, f'../output/pred/{run_name}-train.pkl')
    return score
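This objective is written for a minimizer, which is why the AUC is negated. A minimal sketch of how it is driven, mirroring the weight-averaging script later in this section; `data` and `y_train` are assumed to be loaded as they are there:

from scipy.optimize import minimize

# Optimize N-1 free weights; the last weight is implied as 1 - sum(x).
init_state = [round(1 / len(data), 3) for _ in range(len(data) - 1)]
result = minimize(f, init_state, method='Nelder-Mead')
print('optimized CV: ', result['fun'])
print('w: ', result['x'])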
def submission(self) -> None:
    pred = Data.load(f"../output/pred/{self.run_name}-test.pkl")
    sub = pd.read_csv(self.sample_submission)
    if self.evaluation_metric == "log_loss":
        sub[self.cols_definition["target_col"]] = np.argmax(pred, axis=1)
    else:
        oof = Data.load(f"../output/pred/{self.run_name}-train.pkl")
        # convert() is a post-processing helper defined elsewhere in the script
        oof = np.array([convert(v) for v in oof])
        pred = np.array([convert(v) for v in pred])
        sub[self.cols_definition["target_col"]] = pred
        sub[self.cols_definition["target_col"]] = sub[
            self.cols_definition["target_col"]].astype(float)
    sub.to_csv(f"../output/submissions/submission_{self.run_name}.csv",
               index=False)
def load_oof_from_run_id(run_id: str, to_rank: bool = False):
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    if run_id in ('run091', 'run092', 'run097'):
        oof = oof.reshape(-1, )
    if to_rank:
        oof = rankdata(oof) / len(oof)
    return oof
def load_pred_from_run_id(run_id: str, to_rank: bool = False):
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if run_id in ('run091', 'run092', 'run097'):
        pred = pred.reshape(-1, )
    if to_rank:
        pred = rankdata(pred) / len(pred)
    return pred
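As a quick numeric check of the rank normalization used in the two loaders above (the input values are illustrative):

from scipy.stats import rankdata

pred = [0.2, 0.9, 0.5]
# ranks are [1, 3, 2]; dividing by len maps them into (0, 1]
print(rankdata(pred) / len(pred))  # [0.33333333 1.         0.66666667]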
def submission(self):
    pred = Data.load(f'../output/pred/{self.run_name}-test.pkl')
    sub = pd.read_csv(self.sample_submission)
    if self.advanced and 'predict_exp' in self.advanced:
        # Invert the log1p transform applied to the target at training time
        sub[self.cols_definition['target_col']] = np.expm1(pred)
    else:
        sub[self.cols_definition['target_col']] = pred
    sub.to_csv(f'../output/submissions/submission_{self.run_name}.csv',
               index=False)
def submission(self) -> None:
    pred = Data.load(f"../output/pred/{self.run_name}-test.pkl")
    sub = pd.read_csv(self.sample_submission)
    if self.advanced and "predict_exp" in self.advanced:
        sub[self.cols_definition["target_col"]] = np.exp(pred)
    else:
        sub[self.cols_definition["target_col"]] = pred
    sub.to_csv(f"../output/submissions/submission_{self.run_name}.csv",
               index=False)
def run_train_cv(self) -> None:
    """Train and evaluate with cross-validation.

    Alongside training and evaluation, each fold's model is saved
    and the scores are logged.
    """
    logger.info(f'{self.run_name} - start training cv')
    scores = []
    va_idxes = []
    preds = []

    # Train on each fold
    for i_fold in range(self.n_fold):
        logger.info(f'{self.run_name} fold {i_fold} - start training')
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        logger.info(
            f'{self.run_name} fold {i_fold} - end training - score {score}'
        )

        # Save the model
        model.save_model()

        # Keep the results
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Aggregate the out-of-fold predictions back into the original row order
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)
    preds = preds[order]

    logger.info(
        f'{self.run_name} - end training cv - score {np.mean(scores)}')

    # Save the predictions
    Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

    # Save the evaluation scores
    logger.result_scores(self.run_name, scores)
def __init__(self, configs: Dict, cv) -> None:  # type: ignore
    self.exp_name = configs["exp_name"]
    self.run_name = configs["run_name"]
    self.run_id = None
    self.fe_name = configs["fe_name"]
    self.X_train = Data.load(f"../input/X_train_{configs['fe_name']}.pkl")
    self.y_train = Data.load(f"../input/y_train_{configs['fe_name']}.pkl")
    self.X_test = Data.load(f"../input/X_test_{configs['fe_name']}.pkl")
    self.evaluation_metric = configs["evaluation_metric"]
    self.params = configs["params"]
    self.cols_definition = configs["cols_definition"]
    self.cv = cv
    self.sample_submission = configs["data"]["sample_submission"]
    self.description = configs["description"]
    self.advanced = configs["advanced"] if "advanced" in configs else None
    if configs["model_name"] in models_map:
        self.model_cls = models_map[configs["model_name"]]
    else:
        raise ValueError(f"unknown model_name: {configs['model_name']}")
def __init__(self, configs: dict, cv):
    self.exp_name = configs['exp_name']
    self.run_name = configs['run_name']
    self.run_id = None
    self.fe_name = configs['fe_name']
    self.X_train = Data.load(f"../input/X_train_{configs['fe_name']}.pkl")
    self.y_train = Data.load(f"../input/y_train_{configs['fe_name']}.pkl")
    self.X_test = Data.load(f"../input/X_test_{configs['fe_name']}.pkl")
    self.evaluation_metric = configs['evaluation_metric']
    self.params = configs['params']
    self.cols_definition = configs['cols_definition']
    self.cv = cv
    self.sample_submission = configs['data']['sample_submission']
    self.description = configs['description']
    self.advanced = configs['advanced'] if 'advanced' in configs else None
    if configs['model_name'] in models_map:
        self.model_cls = models_map[configs['model_name']]
    else:
        raise ValueError(f"unknown model_name: {configs['model_name']}")
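A hypothetical construction sketch showing the configs shape this __init__ reads; the model name, feature id, and CV object are placeholders, and the fe000 pickles are assumed to already exist under ../input:

from sklearn.model_selection import KFold

# Hypothetical configs dict; keys mirror what __init__ reads above.
configs = {
    'exp_name': 'exp001',
    'run_name': 'run001',
    'fe_name': 'fe000',           # expects ../input/X_train_fe000.pkl etc.
    'model_name': 'ModelLGBM',    # placeholder; must be a key of models_map
    'evaluation_metric': 'rmse',
    'params': {'learning_rate': 0.1},
    'cols_definition': {'target_col': 'target'},
    'data': {'sample_submission': '../input/sample_submission.csv'},
    'description': 'baseline',
}
runner = Runner(configs, cv=KFold(n_splits=5, shuffle=True, random_state=7))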
def save_as_pickle(
        train: pd.DataFrame,
        test: pd.DataFrame,
        target_col: str,
        exp_id: str,
        output_dir: str = '../input') -> None:
    """Save X_train, X_test and y_train in pickle format.

    Args:
        train (pd.DataFrame): train
        test (pd.DataFrame): test
        target_col (str): target column
        exp_id (str): experiment id
        output_dir (str, optional): output directory. Defaults to '../input'.
    """
    X_train = train.drop(target_col, axis=1)
    y_train = train[target_col]
    if target_col in test.columns:
        X_test = test.drop(target_col, axis=1)
    else:
        X_test = test
    Data.dump(X_train, join(output_dir, f"X_train_{exp_id}.pkl"))
    Data.dump(y_train, join(output_dir, f"y_train_{exp_id}.pkl"))
    Data.dump(X_test, join(output_dir, f"X_test_{exp_id}.pkl"))
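A minimal usage sketch, assuming a train.csv/test.csv pair with a 'target' column; the file names and experiment id are placeholders:

import pandas as pd

# Placeholder file names and experiment id
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
save_as_pickle(train, test, target_col='target', exp_id='fe000')
# Produces ../input/X_train_fe000.pkl, y_train_fe000.pkl and X_test_fe000.pkl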
def save_as_pickle(train: pd.DataFrame,
                   test: pd.DataFrame,
                   col_definition: dict,
                   option: dict):
    """
    col_definition: target_col
    option: exp_id
    """
    X_train = train.drop(col_definition['target_col'], axis=1)
    y_train = train[col_definition['target_col']]
    if col_definition['target_col'] in test.columns:
        X_test = test.drop(col_definition['target_col'], axis=1)
    else:
        X_test = test
    Data.dump(X_train, join('../input', f"X_train{option['exp_id']}.pkl"))
    Data.dump(y_train, join('../input', 'y_train.pkl'))
    Data.dump(X_test, join('../input', f"X_test{option['exp_id']}.pkl"))
def save_as_pickle(train: pd.DataFrame,
                   test: pd.DataFrame,
                   col_definition: dict,
                   option: dict):
    """
    col_definition: target_col
    option: output_dir, exp_id
    """
    X_train = train.drop(col_definition['target_col'], axis=1)
    y_train = train[col_definition['target_col']]
    if col_definition['target_col'] in test.columns:
        X_test = test.drop(col_definition['target_col'], axis=1)
    else:
        X_test = test
    Data.dump(X_train,
              join(option['output_dir'], f"X_train_{option['exp_id']}.pkl"))
    Data.dump(y_train,
              join(option['output_dir'], f"y_train_{option['exp_id']}.pkl"))
    Data.dump(X_test,
              join(option['output_dir'], f"X_test_{option['exp_id']}.pkl"))
import japanize_matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

from ayniy.utils import Data

if __name__ == '__main__':
    oof = Data.load('../output/pred/run003-train.pkl')
    train_player = pd.read_csv('../input/train_player.csv')
    train_pitch = pd.read_csv('../input/train_pitch.csv')
    train_pitch = train_pitch[
        train_pitch['試合種別詳細'] != 'パ・リーグ公式戦'].reset_index(drop=True)

    # Join pitcher information
    train = pd.merge(train_pitch,
                     train_player,
                     left_on=['年度', '投手ID'],
                     right_on=['年度', '選手ID'],
                     how='inner')

    # Join batter information
    train = pd.merge(train,
                     train_player,
                     left_on=['年度', '打者ID'],
                     right_on=['年度', '選手ID'],
                     how='inner',
                     suffixes=('_p', '_b'))

    X_train, _, _, _ = train_test_split(train.drop('試合種別詳細', axis=1),
def make_submission(pred, run_name: str):
    sub = pd.read_csv('../input/solafune-light/UploadFileTemplate.csv')
    sub['LandPrice'] = np.expm1(pred)
    sub.to_csv(f'../output/submissions/submission_{run_name}.csv',
               index=False)


run_ids = [
    'run004',
    'run005',
]
run_name = 'weight001'

if __name__ == '__main__':
    y_train = Data.load('../input/pickle/y_train_fe000.pkl')
    data = [load_from_run_id(ri, to_rank=False) for ri in run_ids]

    for d in data:
        print(np.sqrt(mean_squared_error(y_train, d[0])))

    # N-1 free weights; the last weight is implied as 1 - sum(weights)
    init_state = [round(1 / len(data), 3) for _ in range(len(data) - 1)]
    result = minimize(f, init_state, method='Nelder-Mead')
    print('optimized CV: ', result['fun'])
    print('w: ', result['x'])

    make_submission(make_predictions(data, result['x']), run_name)
def train_fold(self, i_fold: int) -> Tuple[Any, Any, Any, Any]:
    """Train and evaluate on the specified cross-validation fold.

    Called from other methods, and also usable standalone
    for sanity checks and parameter tuning.

    :param i_fold: fold number ('all' to use all data)
    :return: tuple of (model instance, record indices, predictions, score)
    """
    # Load the training data
    X_train = self.X_train
    y_train = self.y_train

    # Set up the residuals
    if self.advanced and "ResRunner" in self.advanced:
        oof = Data.load(self.advanced["ResRunner"]["oof"])
        X_train["res"] = (y_train - oof).abs()

    # Set up the training and validation data
    tr_idx, va_idx = self.load_index_fold(i_fold)
    X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_val, y_val = X_train.iloc[va_idx], y_train.iloc[va_idx]

    # Downsample by residual
    if self.advanced and "ResRunner" in self.advanced:
        # Keep only rows whose residual is below the threshold
        mask = (X_tr["res"] <
                self.advanced["ResRunner"]["res_threshold"]).values
        X_tr = X_tr.loc[mask]
        y_tr = y_tr.loc[mask]
        print(X_tr.shape)
        X_tr.drop("res", axis=1, inplace=True)
        X_val.drop("res", axis=1, inplace=True)

    # Pseudo labeling
    if self.advanced and "PseudoRunner" in self.advanced:
        y_test_pred = Data.load(
            self.advanced["PseudoRunner"]["y_test_pred"])
        if "pl_threshold" in self.advanced["PseudoRunner"]:
            pl_threshold = self.advanced["PseudoRunner"]["pl_threshold"]
            # Add only confidently predicted test rows
            confident = ((y_test_pred < pl_threshold) |
                         (y_test_pred > 1 - pl_threshold))
            X_add = self.X_test.loc[confident]
            y_add = pd.DataFrame(y_test_pred).loc[confident]
            y_add = pd.DataFrame([1 if ya > 0.5 else 0 for ya in y_add[0]])
        elif "pl_threshold_neg" in self.advanced["PseudoRunner"]:
            confident = (
                (y_test_pred <
                 self.advanced["PseudoRunner"]["pl_threshold_neg"]) |
                (y_test_pred >
                 self.advanced["PseudoRunner"]["pl_threshold_pos"]))
            X_add = self.X_test.loc[confident]
            y_add = pd.DataFrame(y_test_pred).loc[confident]
            y_add = pd.DataFrame([1 if ya > 0.5 else 0 for ya in y_add[0]])
        else:
            X_add = self.X_test
            y_add = pd.DataFrame(y_test_pred)
        print(f"added X_test: {len(X_add)}")
        X_tr = pd.concat([X_tr, X_add])
        y_tr = pd.concat([y_tr, y_add])

    # Train
    model = self.build_model(i_fold)
    model.train(X_tr, y_tr, X_val, y_val, self.X_test)  # type: ignore

    # Predict and evaluate on the validation data
    pred_val = model.predict(X_val)
    if self.evaluation_metric == "log_loss":
        score = log_loss(y_val, pred_val, eps=1e-15, normalize=True)
    elif self.evaluation_metric == "mean_absolute_error":
        score = mean_absolute_error(y_val, pred_val)
    elif self.evaluation_metric == "rmse":
        score = np.sqrt(mean_squared_error(y_val, pred_val))
    elif self.evaluation_metric == "auc":
        score = roc_auc_score(y_val, pred_val)
    elif self.evaluation_metric == "prauc":
        score = average_precision_score(y_val, pred_val)

    # Return the model, indices, predictions and score
    return model, va_idx, pred_val, score
def run_train_cv(self) -> None:
    """Train and evaluate with cross-validation.

    Alongside training and evaluation, each fold's model is saved
    and the scores are logged.
    """
    # mlflow
    mlflow.set_experiment(self.exp_name)
    mlflow.start_run(run_name=self.run_name)

    logger.info(f"{self.run_name} - start training cv")
    scores = []
    va_idxes = []
    preds = []

    # Adversarial validation
    if self.advanced and "adversarial_validation" in self.advanced:
        X_train = self.X_train
        X_test = self.X_test
        X_train["target"] = 0
        X_test["target"] = 1
        X_train = pd.concat([X_train, X_test],
                            sort=False).reset_index(drop=True)
        y_train = X_train["target"]
        X_train.drop("target", axis=1, inplace=True)
        X_test.drop("target", axis=1, inplace=True)
        self.X_train = X_train
        self.y_train = y_train

    # Train on each fold
    for i_fold in range(self.cv.n_splits):
        logger.info(f"{self.run_name} fold {i_fold} - start training")
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        logger.info(
            f"{self.run_name} fold {i_fold} - end training - score {score}"
        )

        # Save the model
        model.save_model()

        # Keep the results
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Aggregate the out-of-fold predictions back into the original row order
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)
    preds = preds[order]

    if self.evaluation_metric == "log_loss":
        cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
    elif self.evaluation_metric == "mean_absolute_error":
        cv_score = mean_absolute_error(self.y_train, preds)
    elif self.evaluation_metric == "rmse":
        cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
    elif self.evaluation_metric == "auc":
        cv_score = roc_auc_score(self.y_train, preds)
    elif self.evaluation_metric == "prauc":
        cv_score = average_precision_score(self.y_train, preds)

    logger.info(f"{self.run_name} - end training cv - score {cv_score}")

    # Save the predictions
    Data.dump(preds, f"../output/pred/{self.run_name}-train.pkl")

    # mlflow
    self.run_id = mlflow.active_run().info.run_id
    log_param("model_name", self.model_cls.__name__)
    log_param("fe_name", self.fe_name)
    log_param("train_params", self.params)
    log_param("cv_strategy", str(self.cv))
    log_param("evaluation_metric", self.evaluation_metric)
    log_metric("cv_score", cv_score)
    log_param(
        "fold_scores",
        dict(
            zip([f"fold_{i}" for i in range(len(scores))],
                [round(s, 4) for s in scores])),
    )
    log_param("cols_definition", self.cols_definition)
    log_param("description", self.description)
    mlflow.end_run()
# Inside a loop over categorical columns `col`
lbl = OrdinalEncoder(dtype='int')
train[col] = lbl.fit_transform(
    train[col].astype('str').fillna('-1').values.reshape(-1, 1))
test[col] = lbl.transform(
    test[col].astype('str').fillna('-1').values.reshape(-1, 1))

# Rank frequency encoding: map each category to its relative frequency,
# then rank the frequencies
temp = pd.concat([train[[col]], test[[col]]], axis=0)
temp_mapping = temp.groupby(col).size() / len(temp)
temp['enc'] = temp[col].map(temp_mapping)
temp['enc'] = stats.rankdata(temp['enc'])
temp = temp.reset_index(drop=True)
train[f'rank_frqenc_{col}'] = temp[['enc']].values[:train.shape[0]]
test[f'rank_frqenc_{col}'] = temp[['enc']].values[train.shape[0]:]

test[col] = test[col].astype('category')
train[col] = train[col].astype('category')

drop_cols = list(set(drop_cols))
print(len(drop_cols))
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
gc.collect()
print(train.shape, test.shape)

test['encounter_id'] = test_id
test = test.sort_values('encounter_id').reset_index(drop=True)

fe_name = 'fe_siavrez'
Data.dump(train, f'../input/pickle/X_train_{fe_name}.pkl')
# Data.dump(y, f'../input/pickle/y_train_{fe_name}.pkl')
Data.dump(test.drop('encounter_id', axis=1),
          f'../input/pickle/X_test_{fe_name}.pkl')
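Isolated from the surrounding script, the rank frequency encoding above reduces to the following self-contained sketch (the 'city' column and toy frames are illustrative):

import pandas as pd
from scipy import stats

# Illustrative data; `city` stands in for any categorical column
train = pd.DataFrame({'city': ['a', 'a', 'b', 'c']})
test = pd.DataFrame({'city': ['b', 'c', 'c']})

temp = pd.concat([train[['city']], test[['city']]], axis=0)
freq = temp.groupby('city').size() / len(temp)        # category -> relative frequency
temp['enc'] = stats.rankdata(temp['city'].map(freq))  # rank the frequencies
train['rank_frqenc_city'] = temp['enc'].values[:train.shape[0]]
test['rank_frqenc_city'] = temp['enc'].values[train.shape[0]:]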
def run_train_cv(self) -> None:
    """Train and evaluate with cross-validation.

    Alongside training and evaluation, each fold's model is saved
    and the scores are logged.
    """
    # mlflow
    mlflow.set_experiment(self.exp_name)
    mlflow.start_run(run_name=self.run_name)

    logger.info(f'{self.run_name} - start training cv')
    scores = []
    va_idxes = []
    preds = []

    # Adversarial validation
    if self.advanced and 'adversarial_validation' in self.advanced:
        X_train = self.X_train
        X_test = self.X_test
        X_train['target'] = 0
        X_test['target'] = 1
        X_train = pd.concat([X_train, X_test],
                            sort=False).reset_index(drop=True)
        y_train = X_train['target']
        X_train.drop('target', axis=1, inplace=True)
        X_test.drop('target', axis=1, inplace=True)
        self.X_train = X_train
        self.y_train = y_train

    # Train on each fold
    for i_fold in range(self.cv.n_splits):
        logger.info(f'{self.run_name} fold {i_fold} - start training')
        model, va_idx, va_pred, score = self.train_fold(i_fold)
        logger.info(
            f'{self.run_name} fold {i_fold} - end training - score {score}'
        )

        # Save the model
        model.save_model()

        # Keep the results
        va_idxes.append(va_idx)
        scores.append(score)
        preds.append(va_pred)

    # Aggregate the out-of-fold predictions back into the original row order
    va_idxes = np.concatenate(va_idxes)
    order = np.argsort(va_idxes)
    preds = np.concatenate(preds, axis=0)
    preds = preds[order]

    if self.evaluation_metric == 'log_loss':
        cv_score = log_loss(self.y_train, preds, eps=1e-15, normalize=True)
    elif self.evaluation_metric == 'mean_absolute_error':
        cv_score = mean_absolute_error(self.y_train, preds)
    elif self.evaluation_metric == 'rmse':
        cv_score = np.sqrt(mean_squared_error(self.y_train, preds))
    elif self.evaluation_metric == 'auc':
        cv_score = roc_auc_score(self.y_train, preds)
    elif self.evaluation_metric == 'prauc':
        cv_score = average_precision_score(self.y_train, preds)

    logger.info(f'{self.run_name} - end training cv - score {cv_score}')

    # Save the predictions
    Data.dump(preds, f'../output/pred/{self.run_name}-train.pkl')

    # mlflow
    self.run_id = mlflow.active_run().info.run_id
    # Extract the class name from str(type), e.g. "<class 'x.ModelLGBM'>"
    log_param('model_name', str(self.model_cls).split('.')[-1][:-2])
    log_param('fe_name', self.fe_name)
    log_param('train_params', self.params)
    log_param('cv_strategy', str(self.cv))
    log_param('evaluation_metric', self.evaluation_metric)
    log_metric('cv_score', cv_score)
    log_param(
        'fold_scores',
        dict(
            zip([f'fold_{i}' for i in range(len(scores))],
                [round(s, 4) for s in scores])))
    log_param('cols_definition', self.cols_definition)
    log_param('description', self.description)
    mlflow.end_run()
def load_model(self) -> None:
    model_path = os.path.join("../output/model",
                              f"{self.run_fold_name}.model")
    self.model = Data.load(model_path)
import numpy as np
import pandas as pd
import yaml

from ayniy.model.model_cat import ModelCatRegressor
from ayniy.model.runner import Runner
from ayniy.utils import Data

X_train = Data.load('../input/X_train_00.pkl')
y_train = Data.load('../input/y_train.pkl')
X_test = Data.load('../input/X_test_00.pkl')

X_train.drop(['fiscal_year'], axis=1, inplace=True)
X_test.drop(['fiscal_year'], axis=1, inplace=True)
y_train = np.log(np.sqrt(y_train))

with open("configs/fe_00.yml", "r") as f:
    configs = yaml.safe_load(f)
categorical_cols = configs['cols_definition']['categorical_col']

params_cat = {
    'depth': 6,
    'learning_rate': 0.1,
    'iterations': 10000,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 777,
    'allow_writing_files': False,
    'task_type': "CPU",
    'early_stopping_rounds': 50
}
def save_model(self) -> None:
    model_path = os.path.join("../output/model",
                              f"{self.run_fold_name}.model")
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    Data.dump(self.model, model_path)
        'agg': ['mean', 'sum', 'median', 'min', 'max', 'var', 'std']
    },
],
    nunique_dict=[
        {
            'key': ['Sex'],
            'var': ['SibSp'],
            'agg': ['nunique']
        },
        {
            'key': ['Sex'],
            'var': ['Cabin'],
            'agg': ['nunique']
        },
    ])
print(X_train.shape, X_test.shape)

unique_cols, duplicated_cols, high_corr_cols = detect_delete_cols(
    X_train, X_test, escape_col=categorical_cols, threshold=0.99)
X_train.drop(unique_cols + duplicated_cols + high_corr_cols,
             axis=1, inplace=True)
X_test.drop(unique_cols + duplicated_cols + high_corr_cols,
            axis=1, inplace=True)
print(X_train.shape, X_test.shape)

Data.dump(X_train, output_dir + 'X_train_fe000.pkl')
Data.dump(X_test, output_dir + 'X_test_fe000.pkl')
Data.dump(y_train, output_dir + 'y_train_fe000.pkl')
"""Usage:
python select_features.py --n 100
"""
import argparse

import pandas as pd

from ayniy.utils import Data

parser = argparse.ArgumentParser()
parser.add_argument('--n')
args = parser.parse_args()

fe_id = 'fe005'
run_id = 'run046'
N_FEATURES = int(args.n)
fe_name = f'fe005_top{N_FEATURES}'

X_train = Data.load(f'../input/X_train_{fe_id}.pkl')
y_train = Data.load(f'../input/y_train_{fe_id}.pkl')
X_test = Data.load(f'../input/X_test_{fe_id}.pkl')

# Keep only the top-N features by importance from a previous run
fi = pd.read_csv(
    f'../output/importance/{run_id}-fi.csv')['Feature'][:N_FEATURES]
X_train = X_train[fi]
X_test = X_test[fi]

Data.dump(X_train, f'../input/X_train_{fe_name}.pkl')
Data.dump(y_train, f'../input/y_train_{fe_name}.pkl')
Data.dump(X_test, f'../input/X_test_{fe_name}.pkl')
def save_model(self) -> None:
    model_path = os.path.join("../output/model",
                              f"{self.run_fold_name}.model")
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    # Save with pickle so that best_ntree_limit is not lost
    Data.dump(self.model, model_path)
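Data.dump and Data.load appear throughout this section; a minimal sketch of the assumed interface follows, not the actual ayniy.utils implementation:

import os
import pickle


class Data:
    """Minimal sketch of the pickle wrapper assumed throughout this section."""

    @classmethod
    def dump(cls, value, path: str) -> None:
        # Create the parent directory if the path has one
        d = os.path.dirname(path)
        if d:
            os.makedirs(d, exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(value, f)

    @classmethod
    def load(cls, path: str):
        with open(path, 'rb') as f:
            return pickle.load(f)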