def main():
    """Train the configured classifier over several seeds, average the
    out-of-fold / test predictions, and write OOF and submission CSVs.

    Relies on module-level globals initialized elsewhere in the file:
    ``config``, ``options``, ``feats``, ``model_params``, ``model_name``,
    ``target_name``, ``ID_name``, ``config_filename`` and ``logger``.
    """
    logger.debug('config: {}'.format(options.config))
    logger.debug(feats)
    logger.debug(model_params)

    # Load the features named in the config, plus the target.
    X_train_all, X_test = load_datasets(feats)
    y_train_all = load_target(target_name)
    cols = X_train_all.columns

    # Stacking: append OOF / test predictions from earlier models as
    # extra feature columns. (`is True` keeps the original strict
    # `== True` semantics for boolean configs.)
    if config.get("stacking") is True:
        oof_df, test_df = stack_load_df(config["stacking_name"])
        X_train_all = pd.concat([X_train_all, oof_df], axis=1)
        X_test = pd.concat([X_test, test_df], axis=1)

    # Rank-Gauss scaling: LightGBM is scale-invariant, but other models
    # (and the resampling methods) need scaled, NaN-free features.
    if (model_name != "lightgbm") or ("sampling" in config):
        logger.debug("rank gauss")
        scaler = QuantileTransformer(n_quantiles=100,
                                     random_state=model_params["seed"],
                                     output_distribution="normal")
        # NOTE(review): train and test are imputed/scaled together, which
        # leaks test statistics into the transform — fine for competition
        # use, not for production.
        all_df = pd.concat([X_train_all, X_test])
        all_df = all_df.fillna(all_df.median())  # impute missing values
        all_df[cols] = scaler.fit_transform(all_df[cols])
        X_train_all = all_df[:X_train_all.shape[0]].reset_index(drop=True)
        X_test = all_df[X_train_all.shape[0]:].reset_index(drop=True)

    logger.debug("X_train_all shape: {}".format(X_train_all.shape))
    print(X_train_all.info())

    # Average predictions over `seed_num` independent runs.
    class_cols = list(range(model_params["num_class"]))
    oof_df = pd.DataFrame(index=range(X_train_all.shape[0]),
                          columns=class_cols)
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    oof_df[class_cols] = 0
    sub[target_name] = 0
    for seed_num in range(config["seed_num"]):
        logger.debug(f"SEED: {seed_num}")
        one_oof_df, one_sub = train_and_predict(X_train_all, y_train_all,
                                                X_test, seed_num=seed_num)
        oof_df[class_cols] += one_oof_df[class_cols] / config["seed_num"]
        sub[target_name] += one_sub[target_name] / config["seed_num"]

    # OOF CV scores; column 1 is assumed to hold the positive-class
    # probability (binary setup).
    auc_score = evaluate_score(y_train_all.values, oof_df.values[:, 1], "auc")
    acc_score = evaluate_score(y_train_all.values,
                               oof_df.values.argmax(axis=1), "acc")
    logloss_score = evaluate_score(y_train_all.values, oof_df.values[:, 1],
                                   "logloss")
    logger.debug('=== OOF CV scores ===')
    logger.debug(
        f"\t auc:{auc_score}, acc: {acc_score}, logloss: {logloss_score}")

    sub = sub.rename(columns={ID_name: 'Id', target_name: "label"})
    oof_df.to_csv(f'./data/output/oof_{config_filename}.csv', index=False)
    sub.to_csv(f'./data/output/sub_{config_filename}.csv', index=False)
def train_and_predict_linear(X_train_all, y_train_all, X_test):
    """Seed-and-fold train a regressor chosen by ``config['model']`` and
    write a timestamped submission CSV.

    The target is trained on log(y + 1) and the averaged test predictions
    are inverted with exp(y) - 1. Folds are stratified on a quantile
    binning of the raw target so the continuous target can be used with
    ``StratifiedKFold``.
    """
    # Quantile-bin the continuous target to allow stratified splitting.
    qcut_target = pd.qcut(y_train_all, SK_NUM, labels=False)
    # Train on log(y + 1) (np.log1p equivalent); inverted before submission.
    y_train_all = np.log(y_train_all + 1)

    # config['model'] -> wrapper factory (replaces a 10-branch if/elif
    # chain; an unknown name used to fail later with UnboundLocalError).
    regressors = {
        "LinearRegression": LinearRegressionWrapper,
        "Lasso": LassoWrapper,
        "Ridge": RidgeWrapper,
        "ElasticNet": ElasticNetWrapper,
        "KernelRidge": KernelRidgeWrapper,
        "SVR": SVRWrapper,
        "XGBoost": XGBoost,
        "RandomForest": RandomForestWrapper,
        "GradientBoosting": GradientBoostingRegressorWrapper,
        "CatBoost": CatBoost,
    }

    y_preds = []
    scores = []  # per-fold CV scores
    for seed in SEED:
        kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=seed)
        for train_index, valid_index in kf.split(X_train_all, qcut_target):
            X_train, X_valid = (X_train_all.iloc[train_index, :],
                                X_train_all.iloc[valid_index, :])
            y_train, y_valid = (y_train_all.iloc[train_index],
                                y_train_all.iloc[valid_index])

            try:
                lr = regressors[config['model']]()
            except KeyError:
                raise ValueError(f"unknown model: {config['model']}")

            # Returned trained model is unused here, hence `_`.
            y_pred, y_valid_pred, _ = lr.train_and_predict(
                X_train, X_valid, y_train, y_valid, X_test, params)

            y_preds.append(y_pred)

            rmse_valid = evaluate_score(y_valid, y_valid_pred, config['loss'])
            logging.debug(f"\tscore: {rmse_valid}")
            scores.append(rmse_valid)

    score = sum(scores) / len(scores)
    print('===CV scores===')
    print(scores)
    print(score)
    logging.debug('===CV scores===')
    logging.debug(scores)
    logging.debug(score)

    # Build the submission file from the seed/fold-averaged predictions.
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = sum(y_preds) / len(y_preds)
    y_sub = np.exp(y_sub) - 1  # invert the log(y + 1) transform
    sub[target_name] = y_sub
    sub.to_csv('./data/output/sub_{0}_{1:%Y%m%d%H%M%S}_{2}.csv'.format(
        config['model'], now, score), index=False)
def train_and_predict(X_train_all, y_train_all, X_test, seed_num):
    """Run one seed of stratified K-fold training for the classifier
    selected by the module-level ``model_name``.

    Args:
        X_train_all: full training feature DataFrame.
        y_train_all: training target Series (class labels).
        X_test: test feature DataFrame.
        seed_num: offset added to the base ``seed`` for this run.

    Returns:
        (oof_df, sub): per-class out-of-fold probability DataFrame and a
        submission DataFrame with the fold-averaged positive-class
        probability for the test set.
    """
    model_params["seed"] = seed + seed_num
    oof_df = pd.DataFrame(index=range(X_train_all.shape[0]),
                          columns=range(model_params["num_class"]))
    y_preds = []
    models = []
    auc_scores = []
    acc_scores = []
    logloss_scores = []

    kf = StratifiedKFold(n_splits=config["fold"], shuffle=True,
                         random_state=model_params["seed"])
    for fold_num, (train_index, valid_index) in enumerate(
            kf.split(X_train_all, y_train_all)):
        logger.debug(f"FOLD: {fold_num}")
        X_train, X_valid = (X_train_all.iloc[train_index, :],
                            X_train_all.iloc[valid_index, :])
        y_train, y_valid = (y_train_all.iloc[train_index],
                            y_train_all.iloc[valid_index])

        # Build the classifier for this fold.
        if model_name == "lightgbm":
            classifier = LightGBM()
        elif model_name == "nn":
            classifier = NeuralNet(seed_num, fold_num)
        elif model_name == "cnn1d":
            classifier = CNN1d(seed_num, fold_num)
        elif model_name == "logistic_regression":
            classifier = LogisticRegressionClassifier()
        else:
            # BUG FIX: was `raise Exception` with no message after only
            # logging "No such model name".
            raise ValueError(f"No such model name: {model_name}")

        # Optional resampling of the training fold only; the validation
        # fold stays untouched so scores remain honest.
        if "sampling" in config:
            if config["sampling"] == "SMOTE":
                X_train, y_train = SMOTE().fit_resample(X_train, y_train)
            elif config["sampling"] == "ADASYN":
                X_train, y_train = ADASYN().fit_resample(X_train, y_train)
            elif config["sampling"] == "RandomOverSampler":
                X_train, y_train = RandomOverSampler().fit_resample(
                    X_train, y_train)
            else:
                # BUG FIX: a bare `raise` outside an except block raises
                # RuntimeError("No active exception to re-raise").
                raise ValueError(
                    f"Unknown sampling method: {config['sampling']}")

        y_pred, y_valid_pred, model = classifier.train_and_predict(
            X_train, X_valid, y_train, y_valid, X_test, model_params)

        # Store this fold's results.
        y_preds.append(y_pred)
        oof_df.iloc[valid_index, :] = y_valid_pred
        models.append(model)

        # Per-fold scores; column 1 holds the positive-class probability.
        auc_valid = evaluate_score(y_valid, y_valid_pred[:, 1], "auc")
        acc_valid = evaluate_score(y_valid, y_valid_pred.argmax(axis=1),
                                   "acc")
        logloss_valid = evaluate_score(y_valid, y_valid_pred[:, 1], "logloss")
        logger.debug(
            f"\t auc:{auc_valid}, acc: {acc_valid}, logloss: {logloss_valid}")
        auc_scores.append(auc_valid)
        acc_scores.append(acc_valid)
        logloss_scores.append(logloss_valid)

    # For LightGBM, log and plot fold-averaged feature importances.
    if model_name == "lightgbm":
        feature_imp_np = np.zeros(X_train_all.shape[1])
        for model in models:
            feature_imp_np += model.feature_importance() / len(models)
        feature_imp = pd.DataFrame(
            sorted(zip(feature_imp_np, X_train_all.columns)),
            columns=['Value', 'Feature'])
        logger.debug(feature_imp)
        plt.figure(figsize=(20, 10))
        sns.barplot(x="Value", y="Feature",
                    data=feature_imp.sort_values(by="Value", ascending=False))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        plt.savefig(f'./logs/plots/features_{config_filename}.png')

    # Fold-averaged CV scores.
    auc_score = sum(auc_scores) / len(auc_scores)
    acc_score = sum(acc_scores) / len(acc_scores)
    logloss_score = sum(logloss_scores) / len(logloss_scores)
    logger.debug('=== CV scores ===')
    logger.debug(
        f"\t auc:{auc_score}, acc: {acc_score}, logloss: {logloss_score}")

    # Submission frame: fold-averaged positive-class probability.
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = sum(y_preds) / len(y_preds)
    sub[target_name] = y_sub[:, 1]
    return oof_df, sub
def stacking(X_train_all, y_train_all, X_test):
    """Two-level stacking for a regression target.

    Level 1: each base model in ``config['base_models']`` is trained with
    seed/fold CV; its out-of-fold predictions become new train columns and
    its averaged test predictions become new test columns. A simple blend
    of the base models is written to ``sub_blend.csv``.

    Level 2: the meta model named by ``config['meta_model']`` is trained
    on the original features plus the base-model OOF columns
    (use_features_in_secondary = True) and writes a timestamped
    submission CSV.
    """

    def build_model(name):
        # Shared factory for base and meta models — replaces two
        # duplicated 11-branch if/elif chains; an unknown name used to
        # fall through and crash later with an UnboundLocalError.
        factories = {
            "LightGBM": LightGBM,
            "LinearRegression": LinearRegressionWrapper,
            "Lasso": LassoWrapper,
            "Ridge": RidgeWrapper,
            "ElasticNet": ElasticNetWrapper,
            "KernelRidge": KernelRidgeWrapper,
            "SVR": SVRWrapper,
            "XGBoost": XGBoost,
            "RandomForest": RandomForestWrapper,
            "GradientBoosting": GradientBoostingRegressorWrapper,
            "CatBoost": CatBoost,
        }
        try:
            return factories[name]()
        except KeyError:
            raise ValueError(f"unknown model: {name}")

    # Quantile-bin the continuous target to allow stratified splitting.
    qcut_target = pd.qcut(y_train_all, SK_NUM, labels=False)
    print(qcut_target)
    # Train on log(y + 1); inverted with exp(y) - 1 before submission.
    y_train_all = np.log(y_train_all + 1)

    base_models = config['base_models']
    # Empty frames aligned with the train/test row counts: OOF columns
    # become extra meta-model train features, averaged test predictions
    # become extra meta-model test features.
    oof_df = pd.DataFrame(index=range(X_train_all.shape[0]))
    y_preds_df = pd.DataFrame(index=range(X_test.shape[0]))

    # --- Level 1: seed/fold CV for every base model ---
    for name, json_name in base_models.items():
        # BUG FIX: `json.load(open(...))` leaked the file handle.
        with open(f"./configs/{json_name}") as fp:
            one_config = json.load(fp)
        oof = np.zeros((X_train_all.shape[0], 1))
        y_preds = []
        scores = []
        for seed in SEED:
            kf = StratifiedKFold(n_splits=BASE_FOLDS, shuffle=True,
                                 random_state=seed)
            for train_index, valid_index in kf.split(X_train_all,
                                                     qcut_target):
                X_train, X_valid = (X_train_all.iloc[train_index, :],
                                    X_train_all.iloc[valid_index, :])
                y_train, y_valid = (y_train_all.iloc[train_index],
                                    y_train_all.iloc[valid_index])

                model = build_model(name)
                y_pred, y_valid_pred, _ = model.train_and_predict(
                    X_train, X_valid, y_train, y_valid, X_test,
                    one_config["params"])

                # Accumulate OOF predictions, averaged over the seeds.
                oof[valid_index, :] += y_valid_pred.reshape(
                    len(y_valid_pred), 1) / len(SEED)
                y_preds.append(y_pred)

                rmse_valid = evaluate_score(y_valid, y_valid_pred,
                                            config['loss'])
                logging.debug(f"\tmodel:{name}, score: {rmse_valid}")
                scores.append(rmse_valid)

        score = sum(scores) / len(scores)
        print('===CV scores===')
        print(f"\tmodel: {name}, scores: {scores}")
        print(f"\tmodel: {name}, score: {score}")
        logging.debug('===CV scores===')
        logging.debug(f"\tmodel: {name}, scores: {scores}")
        logging.debug(f"\tmodel: {name}, score: {score}")

        oof_df[name] = oof
        y_preds_df[name] = sum(y_preds) / len(y_preds)

    # Simple blend of the base models as a submission of its own.
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = y_preds_df.mean(axis=1)
    y_sub = np.exp(y_sub) - 1  # invert the log(y + 1) transform
    sub[target_name] = y_sub
    sub.to_csv('./data/output/sub_blend.csv', index=False)

    # --- Level 2: meta model on original features + OOF columns ---
    oof_df = pd.concat([X_train_all, oof_df], axis=1)
    y_preds_df = pd.concat([X_test, y_preds_df], axis=1)

    y_preds = []
    scores = []
    for seed in SEED:
        kf = StratifiedKFold(n_splits=META_FOLDS, shuffle=True,
                             random_state=seed)
        for train_index, valid_index in kf.split(X_train_all, qcut_target):
            X_train, X_valid = (oof_df.iloc[train_index, :],
                                oof_df.iloc[valid_index, :])
            y_train, y_valid = (y_train_all.iloc[train_index],
                                y_train_all.iloc[valid_index])

            model = build_model(config['meta_model'])
            # y_preds_df plays the role of X_test for the meta model.
            y_pred, y_valid_pred, _ = model.train_and_predict(
                X_train, X_valid, y_train, y_valid, y_preds_df, params)

            y_preds.append(y_pred)

            rmse_valid = evaluate_score(y_valid, y_valid_pred, config['loss'])
            logging.debug(f"\tscore: {rmse_valid}")
            scores.append(rmse_valid)

    score = sum(scores) / len(scores)
    print('===CV scores===')
    print(scores)
    print(score)
    logging.debug('===CV scores===')
    logging.debug(scores)
    logging.debug(score)

    # Meta-model submission.
    ID_name = config['ID_name']
    sub = pd.DataFrame(pd.read_feather('data/interim/test.feather')[ID_name])
    y_sub = sum(y_preds) / len(y_preds)
    y_sub = np.exp(y_sub) - 1  # invert the log(y + 1) transform
    sub[target_name] = y_sub
    sub.to_csv('./data/output/sub_{0}_{1:%Y%m%d%H%M%S}_{2}.csv'.format(
        config['model'], now, score), index=False)