def load_from_run_id(run_id: str, to_rank: bool = False):
    """Load the saved OOF (train) and test predictions for a run.

    BUG FIX: the original signature was ``to_rank: False`` — the literal
    ``False`` used as a type annotation, with no default value.  Annotate
    as ``bool`` and default to ``False`` (backward-compatible).

    :param run_id: run identifier used in the prediction file names
    :param to_rank: if True, convert predictions to normalized ranks in (0, 1]
    :return: tuple of (oof, pred) prediction arrays
    """
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if to_rank:
        # Rank-normalize so predictions from different models are comparable
        oof = rankdata(oof) / len(oof)
        pred = rankdata(pred) / len(pred)
    return (oof, pred)
def load(self):
    """Load the cached train/test feature matrices and training target
    for this run from the output directory."""
    base = self.output_dir
    X_train = Data.load(join(base, f"X_train_{self.run_name}.pkl"))
    y_train = Data.load(join(base, f"y_train_{self.run_name}.pkl"))
    X_test = Data.load(join(base, f"X_test_{self.run_name}.pkl"))
    return X_train, X_test, y_train
def load_from_run_id(run_id: str, to_rank: bool = False):
    """Load OOF and test predictions for a run, flattening known 2-D runs.

    BUG FIXES vs. the original:
    - ``run_id in ('run015')`` tested substring membership in a plain
      string (parentheses do not make a tuple); use ``('run015',)``.
    - ``to_rank: False`` used the literal ``False`` as an annotation;
      annotate as ``bool`` with default ``False``.

    :param run_id: run identifier used in the prediction file names
    :param to_rank: if True, convert predictions to normalized ranks in (0, 1]
    :return: tuple of (oof, pred) prediction arrays
    """
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if run_id in ('run015',):
        # This run saved 2-D arrays; flatten to 1-D
        oof = oof.reshape(-1, )
        pred = pred.reshape(-1, )
    if to_rank:
        oof = rankdata(oof) / len(oof)
        pred = rankdata(pred) / len(pred)
    return (oof, pred)
def submission(self) -> None:
    """Write a submission CSV from this run's saved test predictions.

    For log-loss (multiclass) runs the predicted class index is used;
    otherwise predictions are passed through ``convert`` first.
    """
    pred = Data.load(f"../output/pred/{self.run_name}-test.pkl")
    sub = pd.read_csv(self.sample_submission)
    if self.evaluation_metric == "log_loss":
        # Multiclass probabilities: pick the most probable class per row
        sub[self.cols_definition["target_col"]] = np.argmax(pred, axis=1)
    else:
        # NOTE(review): `oof` is loaded and converted but never used
        # afterwards — looks like dead code; confirm `convert` has no
        # required side effects before removing.
        oof = Data.load(f"../output/pred/{self.run_name}-train.pkl")
        oof = np.array([convert(v) for v in oof])
        pred = np.array([convert(v) for v in pred])
        sub[self.cols_definition["target_col"]] = pred
    # Ensure a numeric dtype in the submission column
    sub[self.cols_definition["target_col"]] = sub[
        self.cols_definition["target_col"]].astype(float)
    sub.to_csv(f"../output/submissions/submission_{self.run_name}.csv",
               index=False)
def load_oof_from_run_id(run_id: str, to_rank: bool = False):
    """Load the out-of-fold (train) predictions saved for a run.

    BUG FIX: the original signature was ``to_rank: False`` — the literal
    ``False`` used as an annotation with no default; annotate as ``bool``
    and default to ``False`` (backward-compatible).

    :param run_id: run identifier used in the prediction file name
    :param to_rank: if True, convert predictions to normalized ranks in (0, 1]
    :return: OOF prediction array (flattened to 1-D for known 2-D runs)
    """
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    # These runs saved 2-D arrays; flatten to 1-D for downstream blending
    if run_id in ('run091', 'run092', 'run097'):
        oof = oof.reshape(-1, )
    if to_rank:
        oof = rankdata(oof) / len(oof)
    return oof
def load_pred_from_run_id(run_id: str, to_rank: bool = False):
    """Load the test predictions saved for a run.

    BUG FIX: the original signature was ``to_rank: False`` — the literal
    ``False`` used as an annotation with no default; annotate as ``bool``
    and default to ``False`` (backward-compatible).

    :param run_id: run identifier used in the prediction file name
    :param to_rank: if True, convert predictions to normalized ranks in (0, 1]
    :return: test prediction array (flattened to 1-D for known 2-D runs)
    """
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    # These runs saved 2-D arrays; flatten to 1-D for downstream blending
    if run_id in ('run091', 'run092', 'run097'):
        pred = pred.reshape(-1, )
    if to_rank:
        pred = rankdata(pred) / len(pred)
    return pred
def submission(self):
    """Write a submission CSV from this run's saved test predictions,
    optionally undoing a log1p target transform via expm1."""
    pred = Data.load(f'../output/pred/{self.run_name}-test.pkl')
    sub = pd.read_csv(self.sample_submission)
    target = self.cols_definition['target_col']
    use_exp = self.advanced and 'predict_exp' in self.advanced
    sub[target] = np.expm1(pred) if use_exp else pred
    sub.to_csv(f'../output/submissions/submission_{self.run_name}.csv',
               index=False)
def submission(self) -> None:
    """Write a submission CSV from this run's saved test predictions,
    optionally exponentiating to undo a log target transform."""
    pred = Data.load(f"../output/pred/{self.run_name}-test.pkl")
    sub = pd.read_csv(self.sample_submission)
    if self.advanced and "predict_exp" in self.advanced:
        # NOTE(review): uses np.exp here, while a sibling variant in this
        # file uses np.expm1 — confirm which inverse of the target
        # transform (log vs. log1p) is intended.
        sub[self.cols_definition["target_col"]] = np.exp(pred)
    else:
        sub[self.cols_definition["target_col"]] = pred
    sub.to_csv(f"../output/submissions/submission_{self.run_name}.csv",
               index=False)
def __init__(self, configs: dict, cv):
    """Initialize the runner from a configuration dict and a CV splitter.

    IMPROVEMENT: the original raised a bare ``ValueError`` with no message
    when the model name was unknown; keep the exception type but add a
    diagnostic message.

    :param configs: experiment configuration (names, params, data paths)
    :param cv: cross-validation splitter used by the runner
    """
    self.exp_name = configs['exp_name']
    self.run_name = configs['run_name']
    self.run_id = None
    self.fe_name = configs['fe_name']
    # Feature matrices and target are pre-built per feature-engineering id
    self.X_train = Data.load(f"../input/X_train_{configs['fe_name']}.pkl")
    self.y_train = Data.load(f"../input/y_train_{configs['fe_name']}.pkl")
    self.X_test = Data.load(f"../input/X_test_{configs['fe_name']}.pkl")
    self.evaluation_metric = configs['evaluation_metric']
    self.params = configs['params']
    self.cols_definition = configs['cols_definition']
    self.cv = cv
    self.sample_submission = configs['data']['sample_submission']
    self.description = configs['description']
    # 'advanced' (pseudo-labeling, residual runner, ...) is optional
    self.advanced = configs['advanced'] if 'advanced' in configs else None
    if configs['model_name'] in models_map.keys():
        self.model_cls = models_map[configs['model_name']]
    else:
        raise ValueError(f"unknown model_name: {configs['model_name']!r}")
def __init__(self, configs: Dict, cv) -> None:  # type: ignore
    """Initialize the runner from a configuration dict and a CV splitter.

    IMPROVEMENT: the original raised a bare ``ValueError`` with no message
    when the model name was unknown; keep the exception type but add a
    diagnostic message.

    :param configs: experiment configuration (names, params, data paths)
    :param cv: cross-validation splitter used by the runner
    """
    self.exp_name = configs["exp_name"]
    self.run_name = configs["run_name"]
    self.run_id = None
    self.fe_name = configs["fe_name"]
    # Feature matrices and target are pre-built per feature-engineering id
    self.X_train = Data.load(f"../input/X_train_{configs['fe_name']}.pkl")
    self.y_train = Data.load(f"../input/y_train_{configs['fe_name']}.pkl")
    self.X_test = Data.load(f"../input/X_test_{configs['fe_name']}.pkl")
    self.evaluation_metric = configs["evaluation_metric"]
    self.params = configs["params"]
    self.cols_definition = configs["cols_definition"]
    self.cv = cv
    self.sample_submission = configs["data"]["sample_submission"]
    self.description = configs["description"]
    # 'advanced' (pseudo-labeling, residual runner, ...) is optional
    self.advanced = configs["advanced"] if "advanced" in configs else None
    if configs["model_name"] in models_map.keys():
        self.model_cls = models_map[configs["model_name"]]
    else:
        raise ValueError(f"unknown model_name: {configs['model_name']!r}")
import japanize_matplotlib import matplotlib.pyplot as plt import pandas as pd from sklearn.model_selection import train_test_split from ayniy.utils import Data if __name__ == '__main__': oof = Data.load('../output/pred/run003-train.pkl') train_player = pd.read_csv('../input/train_player.csv') train_pitch = pd.read_csv('../input/train_pitch.csv') train_pitch = train_pitch[train_pitch['試合種別詳細'] != 'パ・リーグ公式戦'].reset_index( drop=True) # 投手情報の紐付け train = pd.merge(train_pitch, train_player, left_on=['年度', '投手ID'], right_on=['年度', '選手ID'], how='inner') # 打者情報の紐付け train = pd.merge(train, train_player, left_on=['年度', '打者ID'], right_on=['年度', '選手ID'], how='inner', suffixes=('_p', '_b')) X_train, _, _, _ = train_test_split(train.drop('試合種別詳細', axis=1),
import numpy as np import pandas as pd import yaml from ayniy.model.model_cat import ModelCatRegressor from ayniy.model.runner import Runner from ayniy.utils import Data X_train = Data.load('../input/X_train_00.pkl') y_train = Data.load('../input/y_train.pkl') X_test = Data.load('../input/X_test_00.pkl') X_train.drop(['fiscal_year'], axis=1, inplace=True) X_test.drop(['fiscal_year'], axis=1, inplace=True) y_train = np.log(np.sqrt(y_train)) f = open("configs/fe_00.yml", "r+") configs = yaml.load(f) categorical_cols = configs['cols_definition']['categorical_col'] params_cat = { 'depth': 6, 'learning_rate': 0.1, 'iterations': 10000, 'loss_function': 'RMSE', 'eval_metric': 'RMSE', 'random_seed': 777, 'allow_writing_files': False, 'task_type': "CPU", 'early_stopping_rounds': 50
from sklearn.metrics import roc_auc_score, confusion_matrix

from ayniy.utils import Data

if __name__ == '__main__':
    # Evaluate run003's saved test predictions against the held-out labels
    pred = Data.load('../output/pred/run003-test.pkl')
    y_test = Data.load('../input/y_test_fe003.pkl')
    print(roc_auc_score(y_test, pred))
    # Binarize probabilities at 0.5 for the confusion matrix
    print(confusion_matrix(y_test, (pred > 0.5).astype(int)))
else: pred += d[1] * (1 - sum(weights)) Data.dump(pred, f'../output/pred/{run_name}-test.pkl') return pred def make_submission(pred, run_name: str): sub = pd.read_csv('../input/solafune-light/UploadFileTemplate.csv') sub['LandPrice'] = np.expm1(pred) sub.to_csv(f'../output/submissions/submission_{run_name}.csv', index=False) run_ids = [ 'run004', 'run005', ] run_name = 'weight001' if __name__ == '__main__': y_train = Data.load('../input/pickle/y_train_fe000.pkl') data = [load_from_run_id(ri, to_rank=False) for ri in run_ids] for d in data: print(np.sqrt(mean_squared_error(y_train, d[0]))) init_state = [round(1 / len(data), 3) for _ in range(len(data) - 1)] result = minimize(f, init_state, method='Nelder-Mead') print('optimized CV: ', result['fun']) print('w: ', result['x']) make_submission(make_predictions(data, result['x']), run_name)
def load_model(self):
    """Restore this fold's trained model object from the model directory."""
    path = os.path.join('../output/model', f'{self.run_fold_name}.model')
    self.model = Data.load(path)
"../input/feather/count_encoding_interact.ftr", "../input/feather/aggregation.ftr", "../input/feather/target_encoding.ftr", ], target_col=target_col, ) X_train_u = features.X_train y_train = features.y_train X_test_u = features.X_test fe_id_u = 'fe006' run_id = 'run021' N_FEATURES = 300 X_train_u = Data.load(f'../input/pickle/X_train_{fe_id_u}.pkl') X_test_u = Data.load(f'../input/pickle/X_test_{fe_id_u}.pkl') fi = pd.read_csv(f'../output/importance/{run_id}-fi.csv')['Feature'][:N_FEATURES] X_train_u = X_train_u[fi] X_test_u = X_test_u[fi].reset_index(drop=True) X_train_u.columns = [f'u_{c}' for c in fi] X_test_u.columns = [f'u_{c}' for c in fi] fe_id = 'fe_siavrez' X_train = Data.load(f'../input/pickle/X_train_{fe_id}.pkl') X_test = Data.load(f'../input/pickle/X_test_{fe_id}.pkl') print(X_train.shape, X_train_u.shape) print(X_test.shape, X_test_u.shape) X_train = pd.concat([X_train, X_train_u], axis=1)
def load_oof_from_run_id(run_id: str, to_rank: bool = False):
    """Load the out-of-fold (train) predictions saved for a run.

    BUG FIX: the original signature was ``to_rank: False`` — the literal
    ``False`` used as an annotation with no default; annotate as ``bool``
    and default to ``False`` (backward-compatible).

    :param run_id: run identifier used in the prediction file name
    :param to_rank: if True, convert predictions to normalized ranks in (0, 1]
    :return: OOF prediction array
    """
    oof = Data.load(f'../output/pred/{run_id}-train.pkl')
    if to_rank:
        oof = rankdata(oof) / len(oof)
    return oof
def load_pred_from_run_id(run_id: str, to_rank: bool = False):
    """Load the test predictions saved for a run.

    BUG FIX: the original signature was ``to_rank: False`` — the literal
    ``False`` used as an annotation with no default; annotate as ``bool``
    and default to ``False`` (backward-compatible).

    :param run_id: run identifier used in the prediction file name
    :param to_rank: if True, convert predictions to normalized ranks in (0, 1]
    :return: test prediction array
    """
    pred = Data.load(f'../output/pred/{run_id}-test.pkl')
    if to_rank:
        pred = rankdata(pred) / len(pred)
    return pred
def load_pred_from_run_id(run_id: str):
    """Return the saved test predictions for *run_id*, flattening the
    runs known to have stored 2-D arrays."""
    preds = Data.load(f'../output/pred/{run_id}-test.pkl')
    flatten_required = run_id in ('run013', 'run014', 'run015')
    return preds.reshape(-1, ) if flatten_required else preds
def load_model(self) -> None:
    """Restore this fold's trained model object from the model directory."""
    path = os.path.join("../output/model", f"{self.run_fold_name}.model")
    self.model = Data.load(path)
def train_fold(self, i_fold: int) -> Tuple[Any, Any, Any, Any]:
    """Train and evaluate on a single cross-validation fold.

    Called from other methods, and also usable standalone for sanity
    checks and parameter tuning.

    BUG FIXES vs. the original:
    - Residual down-sampling recomputed the boolean mask from the
      already-filtered ``X_tr``, so the indexer no longer matched
      ``y_tr``'s length; the mask is now computed once before filtering.
    - An unknown ``evaluation_metric`` left ``score`` unbound (NameError
      at return); an explicit ``ValueError`` is raised instead.

    :param i_fold: fold number ('all' to train on all data)
    :return: tuple of (model instance, validation record indices,
             predicted values, evaluation score)
    """
    # Load the training data
    X_train = self.X_train
    y_train = self.y_train

    # Residual setup: attach |y - oof| so high-error rows can be dropped
    if self.advanced and "ResRunner" in self.advanced:
        oof = Data.load(self.advanced["ResRunner"]["oof"])
        X_train["res"] = (y_train - oof).abs()

    # Split into training / validation folds
    tr_idx, va_idx = self.load_index_fold(i_fold)
    X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_val, y_val = X_train.iloc[va_idx], y_train.iloc[va_idx]

    # Down-sample training rows by residual magnitude
    if self.advanced and "ResRunner" in self.advanced:
        res_mask = (
            X_tr["res"] < self.advanced["ResRunner"]["res_threshold"]).values
        X_tr = X_tr.loc[res_mask]
        y_tr = y_tr.loc[res_mask]
        print(X_tr.shape)
        X_tr.drop("res", axis=1, inplace=True)
        X_val.drop("res", axis=1, inplace=True)

    # Pseudo labeling: add confidently-predicted test rows to training
    if self.advanced and "PseudoRunner" in self.advanced:
        y_test_pred = Data.load(
            self.advanced["PseudoRunner"]["y_test_pred"])
        if "pl_threshold" in self.advanced["PseudoRunner"]:
            # Symmetric threshold: keep rows predicted close to 0 or 1
            pl = self.advanced["PseudoRunner"]["pl_threshold"]
            confident = (y_test_pred < pl) | (y_test_pred > 1 - pl)
            X_add = self.X_test.loc[confident]
            y_add = pd.DataFrame(y_test_pred).loc[confident]
            y_add = pd.DataFrame(
                ([1 if ya > 0.5 else 0 for ya in y_add[0]]))
        elif "pl_threshold_neg" in self.advanced["PseudoRunner"]:
            # Asymmetric thresholds for the negative / positive side
            neg = self.advanced["PseudoRunner"]["pl_threshold_neg"]
            pos = self.advanced["PseudoRunner"]["pl_threshold_pos"]
            confident = (y_test_pred < neg) | (y_test_pred > pos)
            X_add = self.X_test.loc[confident]
            y_add = pd.DataFrame(y_test_pred).loc[confident]
            y_add = pd.DataFrame(
                ([1 if ya > 0.5 else 0 for ya in y_add[0]]))
        else:
            # No threshold configured: use every test row with its soft label
            X_add = self.X_test
            y_add = pd.DataFrame(y_test_pred)
        print(f"added X_test: {len(X_add)}")
        X_tr = pd.concat([X_tr, X_add])
        y_tr = pd.concat([y_tr, y_add])

    # Train the model
    model = self.build_model(i_fold)
    model.train(X_tr, y_tr, X_val, y_val, self.X_test)  # type: ignore

    # Predict and score on the validation fold
    pred_val = model.predict(X_val)
    if self.evaluation_metric == "log_loss":
        score = log_loss(y_val, pred_val, eps=1e-15, normalize=True)
    elif self.evaluation_metric == "mean_absolute_error":
        score = mean_absolute_error(y_val, pred_val)
    elif self.evaluation_metric == "rmse":
        score = np.sqrt(mean_squared_error(y_val, pred_val))
    elif self.evaluation_metric == "auc":
        score = roc_auc_score(y_val, pred_val)
    elif self.evaluation_metric == "prauc":
        score = average_precision_score(y_val, pred_val)
    else:
        raise ValueError(
            f"unknown evaluation_metric: {self.evaluation_metric}")

    # Return the model, indices, predictions and score
    return model, va_idx, pred_val, score
# sub = pd.read_csv('../input/atmaCup5__sample_submission.csv') # train = pd.read_csv('../input/train.csv') # test = pd.read_csv('../input/test.csv') # fitting = pd.read_csv('../input/fitting.csv') # train = pd.merge(train, fitting, on='spectrum_id', how='inner') # test = pd.merge(test, fitting, on='spectrum_id', how='inner') # train.to_csv('../input/train_fitting.csv', index=False) # test.to_csv('../input/test_fitting.csv', index=False) add_tr = pd.read_csv('../input/additional_features_train.csv') add_te = pd.read_csv('../input/additional_features_test.csv') fe005_tr = Data.load('../input/X_train_fe005.pkl') fe005_te = Data.load('../input/X_test_fe005.pkl') # fe001_tr = Data.load('../input/X_train_fe001.pkl') # fe001_te = Data.load('../input/X_test_fe001.pkl') # top10_tr = Data.load('../input/X_train_fe004_top10.pkl') # top10_te = Data.load('../input/X_test_fe004_top10.pkl') # top10_tr, top10_te = standerize(top10_tr, top10_te, {'encode_col': top10_tr.columns}) # print(top10_tr.head()) train_fitting_ef_add = pd.concat([fe005_tr, add_tr], axis=1) test_fitting_ef_add = pd.concat([fe005_te, add_te], axis=1) fe_name = 'fe005_add' Data.dump(train_fitting_ef_add, f'../input/X_train_{fe_name}.pkl')
def load_oof_from_run_id(run_id: str):
    """Return the saved out-of-fold predictions for *run_id*, flattening
    the runs known to have stored 2-D arrays."""
    preds = Data.load(f'../output/pred/{run_id}-train.pkl')
    flatten_required = run_id in ('run013', 'run014', 'run015')
    return preds.reshape(-1, ) if flatten_required else preds
from ayniy.utils import Data
from sklearn.preprocessing import StandardScaler

if __name__ == "__main__":
    # Build a small standardized feature set for NN experiments
    fe_id = "fe000"
    fe_name = f"{fe_id}_nn_small"
    X_train = Data.load(f"../input/pickle/X_train_{fe_id}.pkl")
    y_train = Data.load(f"../input/pickle/y_train_{fe_id}.pkl")
    X_test = Data.load(f"../input/pickle/X_test_{fe_id}.pkl")
    # Standardize every column; columns that cannot be scaled raise
    # ValueError and are collected for removal
    del_col = []
    for c in X_train.columns:
        X_train[c].fillna(-1, inplace=True)
        X_test[c].fillna(-1, inplace=True)
        try:
            # Fit on train only, then apply the same scaling to test
            prep = StandardScaler()
            X_train[c] = prep.fit_transform(X_train[[c]])
            X_test[c] = prep.transform(X_test[[c]])
        except ValueError:
            del_col.append(c)
    print(del_col)
    print(len(del_col))
    X_train.drop(del_col, axis=1, inplace=True)
    X_test.drop(del_col, axis=1, inplace=True)
    print(X_train.shape)
    # Keep only the first rows (labels 0..100) for a small debug dataset
    X_train = X_train.loc[:100]
    y_train = y_train.loc[:100]
ython select_features.py --n 100 """ import argparse import pandas as pd from ayniy.utils import Data parser = argparse.ArgumentParser() parser.add_argument('--n') args = parser.parse_args() fe_id = 'fe005' run_id = 'run046' N_FEATURES = int(args.n) fe_name = f'fe005_top{N_FEATURES}' X_train = Data.load(f'../input/X_train_{fe_id}.pkl') y_train = Data.load(f'../input/y_train_{fe_id}.pkl') X_test = Data.load(f'../input/X_test_{fe_id}.pkl') fi = pd.read_csv( f'../output/importance/{run_id}-fi.csv')['Feature'][:N_FEATURES] X_train = X_train[fi] X_test = X_test[fi] Data.dump(X_train, f'../input/X_train_{fe_name}.pkl') Data.dump(y_train, f'../input/y_train_{fe_name}.pkl') Data.dump(X_test, f'../input/X_test_{fe_name}.pkl')
import pandas as pd

from ayniy.utils import Data

if __name__ == '__main__':
    # Append the EfficientNet-derived features to the fe001 top-500 set
    fe_name = 'fe001_top500_ef'
    base_tr = Data.load('../input/X_train_fe001_top500.pkl')
    base_te = Data.load('../input/X_test_fe001_top500.pkl')
    ef_tr = pd.read_csv('../input/efficient_tr.csv')
    ef_te = pd.read_csv('../input/efficient_te.csv')
    # Column-wise concat, then persist under the combined feature name
    Data.dump(pd.concat([base_tr, ef_tr], axis=1),
              f'../input/X_train_{fe_name}.pkl')
    Data.dump(pd.concat([base_te, ef_te], axis=1),
              f'../input/X_test_{fe_name}.pkl')