def do_cv_pred(train, test, files):
    """Calibrate a single OOF prediction column with per-fold isotonic
    regressions and average the calibrated test predictions.

    Writes the averaged submission to path_const.OUTPUT_SUB and the clipped
    out-of-fold predictions to path_const.OUTPUT_OOF, printing fold and
    overall RMSE diagnostics along the way.

    NOTE(review): assumes `files` yields exactly one column name --
    IsotonicRegression is univariate and the `.reshape(-1)` calls flatten the
    matrix, so two or more columns would silently misalign rows.
    """
    print("------- do preds --------")
    ensemble_col = [f[1] for f in files]
    train_x = train[ensemble_col]
    test_x = test[ensemble_col].values.reshape(-1)
    train_y = train["target"]
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0
    # Stratify folds on the outlier flag so each fold sees the same share of
    # the extreme (< -30) targets.
    outliers = (train["target"] < -30).astype(int).values
    split_num = 5
    skf = model_selection.StratifiedKFold(n_splits=split_num, shuffle=True, random_state=4590)
    train_preds = []
    for idx, (train_index, test_index) in enumerate(skf.split(train, outliers)):
        X_train, X_test = train_x.iloc[train_index], train_x.iloc[test_index]
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
        # BUG FIX: sklearn's default out_of_bounds="nan" returns NaN for any
        # value outside the fold's training range, which would leak NaN into
        # both the OOF file and the averaged submission. Clip instead.
        reg = IsotonicRegression(out_of_bounds="clip")
        X_train = X_train.values.reshape(-1)
        X_test = X_test.values.reshape(-1)
        reg.fit(X_train, y_train)
        valid_set_pred = reg.predict(X_test)
        print(y_test.describe())
        temp = pd.DataFrame(valid_set_pred)
        print(temp.describe())
        score = evaluator.rmse(y_test, valid_set_pred)
        print(score)
        # Accumulate fold predictions on the test set; divided by split_num below.
        y_pred = reg.predict(test_x)
        submission["target"] = submission["target"] + y_pred
        train_id = train.iloc[test_index]
        train_cv_prediction = pd.DataFrame()
        train_cv_prediction["card_id"] = train_id["card_id"]
        train_cv_prediction["cv_pred"] = valid_set_pred
        train_preds.append(train_cv_prediction)
    train_output = pd.concat(train_preds, axis=0)
    submission["target"] = submission["target"] / split_num
    submission.to_csv(path_const.OUTPUT_SUB, index=False)
    # Clip OOF predictions; -33.219281 is presumably the known outlier target
    # value and 18.0 the upper target bound -- confirm against the data.
    train_output["cv_pred"] = np.clip(train_output["cv_pred"], -33.219281, 18.0)
    train_output.to_csv(path_const.OUTPUT_OOF, index=False)
    df_pred = pd.merge(train[["card_id", "target"]], train_output, on="card_id")
    rmse_score = evaluator.rmse(df_pred["target"], df_pred["cv_pred"])
    print(rmse_score)
def do_cv_pred(train, test, files, use_cols=10, verbose=False):
    """Blend a subset of the OOF prediction columns with per-fold
    BayesianRidge models under stratified 5-fold CV.

    Writes the averaged test predictions to path_const.OUTPUT_SUB and the
    clipped out-of-fold predictions to path_const.OUTPUT_OOF, printing the
    fold RMSEs and the overall OOF RMSE.
    """
    print("------- do preds --------")
    # Keep a file's column only when its position modulo 20 is small enough.
    ensemble_col = []
    for pos, entry in enumerate(files):
        if pos % 20 <= use_cols:
            ensemble_col.append(entry[1])
    if use_cols == 2:
        print(ensemble_col)
    train_x = train[ensemble_col]
    test_x = test[ensemble_col]
    train_y = train["target"]
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0
    # Stratify folds on the extreme-target flag.
    outliers = (train["target"] < -30).astype(int).values
    split_num = 5
    skf = model_selection.StratifiedKFold(n_splits=split_num, shuffle=True, random_state=4590)
    train_preds = []
    for fold, (fit_idx, val_idx) in enumerate(skf.split(train, outliers)):
        fold_x, val_x = train_x.iloc[fit_idx], train_x.iloc[val_idx]
        fold_y, val_y = train_y.iloc[fit_idx], train_y.iloc[val_idx]
        reg = BayesianRidge().fit(fold_x, fold_y)
        valid_set_pred = reg.predict(val_x)
        score = evaluator.rmse(val_y, valid_set_pred)
        if verbose:
            print(reg.coef_)
        print(score)
        # Accumulate this fold's test predictions; averaged after the loop.
        submission["target"] = submission["target"] + reg.predict(test_x)
        fold_oof = pd.DataFrame()
        fold_oof["card_id"] = train.iloc[val_idx]["card_id"]
        fold_oof["cv_pred"] = valid_set_pred
        train_preds.append(fold_oof)
    train_output = pd.concat(train_preds, axis=0)
    submission["target"] = submission["target"] / split_num
    submission.to_csv(path_const.OUTPUT_SUB, index=False)
    train_output["cv_pred"] = np.clip(train_output["cv_pred"], -33.219281, 18.0)
    train_output.to_csv(path_const.OUTPUT_OOF, index=False)
    df_pred = pd.merge(train[["card_id", "target"]], train_output, on="card_id")
    print(evaluator.rmse(df_pred["target"], df_pred["cv_pred"]))
def do_cv_pred(train, test, files):
    """Stratified 5-fold blend of the OOF prediction columns plus the
    "has_new" flag using a shallow LightGBM model.

    Writes the averaged test predictions to path_const.OUTPUT_SUB and the
    clipped out-of-fold predictions to path_const.OUTPUT_OOF.
    """
    print("------- do preds --------")
    # Blend inputs: one OOF column per source file, plus the has_new flag.
    ensemble_col = [f[1] for f in files] + ["has_new"]
    train_x = train[ensemble_col]
    test_x = test[ensemble_col]
    train_y = train["target"]
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0
    # Stratify folds on the extreme-target flag (target < -30).
    outliers = (train["target"] < -30).astype(int).values
    split_num = 5
    skf = model_selection.StratifiedKFold(n_splits=split_num, shuffle=True, random_state=4590)
    train_preds = []
    lgb = pocket_lgb.ShallowLgb()
    for idx, (train_index, test_index) in enumerate(skf.split(train, outliers)):
        X_train, X_test = train_x.iloc[train_index], train_x.iloc[
            test_index]
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[
            test_index]
        # NOTE(review): do_train_direct is a project helper; presumably it
        # uses (X_test, y_test) as the validation set -- confirm in pocket_lgb.
        model = lgb.do_train_direct(X_train, X_test, y_train, y_test)
        valid_set_pred = model.predict(X_test)
        score = evaluator.rmse(y_test, valid_set_pred)
        print(score)
        # Accumulate fold predictions on the test set; averaged below.
        y_pred = model.predict(test_x)
        submission["target"] = submission["target"] + y_pred
        train_id = train.iloc[test_index]
        train_cv_prediction = pd.DataFrame()
        train_cv_prediction["card_id"] = train_id["card_id"]
        train_cv_prediction["cv_pred"] = valid_set_pred
        train_preds.append(train_cv_prediction)
        # Feature importance is printed once, from the first fold's model.
        if idx == 0:
            lgb.show_feature_importance(model)
    train_output = pd.concat(train_preds, axis=0)
    submission["target"] = submission["target"] / split_num
    submission.to_csv(path_const.OUTPUT_SUB, index=False)
    # Clip OOF predictions; -33.219281 is presumably the known outlier target
    # value and 18.0 the upper target bound -- confirm against the data.
    train_output["cv_pred"] = np.clip(train_output["cv_pred"], -33.219281, 18.0)
    train_output.to_csv(path_const.OUTPUT_OOF, index=False)
    df_pred = pd.merge(train[["card_id", "target"]], train_output, on="card_id")
    rmse_score = evaluator.rmse(df_pred["target"], df_pred["cv_pred"])
    print(rmse_score)
def do_preds(train, test, files):
    """Fit one isotonic regression mapping a single OOF prediction column to
    the target and write the calibrated test predictions to
    path_const.OUTPUT_ENS.

    NOTE(review): assumes `files` yields exactly one column -- the
    `.values.reshape(-1)` flattening would misalign rows for 2+ columns.
    """
    # "bin" is rescaled by the outlier magnitude and rows sorted by it; the
    # sort only affects the head/tail prints below, not the isotonic fit.
    train = train.sort_values(by="bin", ascending=False)
    test = test.sort_values(by="bin", ascending=False)
    train["bin"] = train["bin"] * -33.219281
    test["bin"] = test["bin"] * -33.219281
    print(train.head())
    print(train.tail())
    print("------- do preds --------")
    ensemble_col = [f[1] for f in files]
    train_x = train[ensemble_col].values.reshape(-1)
    # BUG FIX: sklearn's default out_of_bounds="nan" returns NaN for test
    # values outside the training range, which would write NaN rows into the
    # submission CSV. Clip to the training range instead.
    reg = IsotonicRegression(out_of_bounds="clip")
    reg.fit(train_x, train["target"])
    y_pred = reg.predict(train_x)
    score = evaluator.rmse(train["target"], y_pred)
    print(score)
    test_x = test[ensemble_col].values.reshape(-1)
    y_pred = reg.predict(test_x)
    sub = pd.DataFrame()
    sub["card_id"] = test["card_id"]
    sub["target"] = y_pred
    print(train["target"].describe())
    # print(train["big"].describe())
    print(sub["target"].describe())
    sub.to_csv(path_const.OUTPUT_ENS, index=False)
def do_preds(train, files):
    """Fit a Ridge blender over the OOF columns and return the positions of
    coefficients greater than 0.15; prints coefficients, the selected
    positions and the in-sample RMSE."""
    print("------- do preds --------")
    ensemble_col = [f[1] for f in files]
    blend_x = train[ensemble_col]
    model = Ridge().fit(blend_x, train["target"])
    print(model.coef_)
    # Positions of the "significant" coefficients.
    sig_idx = [pos for pos, weight in enumerate(model.coef_) if weight > 0.15]
    print(sig_idx)
    fitted = model.predict(blend_x)
    score = evaluator.rmse(train["target"], fitted)
    print(score)
    return sig_idx
def do_preds(train, files):
    """Fit a Ridge blender over the OOF columns and return
    (position, coefficient) pairs sorted by descending absolute weight;
    prints the pairs and the in-sample RMSE."""
    print("------- do preds --------")
    ensemble_col = [f[1] for f in files]
    blend_x = train[ensemble_col]
    model = Ridge().fit(blend_x, train["target"])
    print(model.coef_)
    # enumerate() already yields the (index, coefficient) tuples we rank.
    sorted_idx = sorted(enumerate(model.coef_), key=lambda pair: abs(pair[1]), reverse=True)
    for pair in sorted_idx:
        print(pair)
    fitted = model.predict(blend_x)
    score = evaluator.rmse(train["target"], fitted)
    print(score)
    return sorted_idx
def do_preds(train, test, files):
    """Blend the OOF columns with a single BayesianRidge fit on all of train
    and write the ensembled test predictions to path_const.OUTPUT_ENS."""
    print("------- do preds --------")
    cols = [f[1] for f in files]
    model = BayesianRidge().fit(train[cols], train["target"])
    print(model.coef_)
    in_sample = model.predict(train[cols])
    score = evaluator.rmse(train["target"], in_sample)
    print(score)
    sub = pd.DataFrame()
    sub["card_id"] = test["card_id"]
    sub["target"] = model.predict(test[cols])
    print(train["target"].describe())
    # print(train["big"].describe())
    print(sub["target"].describe())
    sub.to_csv(path_const.OUTPUT_ENS, index=False)
def get_cv_score(param):
    """Objective for LightGBM hyper-parameter search: run the module-level
    stratified CV split with `param` and return the overall OOF RMSE.

    Relies on module-level state defined elsewhere in this script: `skf`,
    `train`, `train_x`, `train_y`, `outliers`, `logger`.
    """
    local_timer = pocket_timer.GoldenTimer(logger)
    lgb = pocket_lgb.OptLgb(param)
    # BUG FIX: collect fold predictions in a local list. The original
    # appended to the module-level `train_preds`, so repeated calls (one per
    # search trial) mixed folds from earlier parameter sets into the score.
    train_preds = []
    for train_index, test_index in skf.split(train, outliers):
        X_train, X_test = train_x.iloc[train_index], train_x.iloc[test_index]
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
        model = lgb.do_train_direct(X_train, X_test, y_train, y_test)
        valid_set_pred = model.predict(X_test)
        train_id = train.iloc[test_index]
        train_cv_prediction = pd.DataFrame()
        train_cv_prediction["card_id"] = train_id["card_id"]
        train_cv_prediction["cv_pred"] = valid_set_pred
        train_preds.append(train_cv_prediction)
    train_output = pd.concat(train_preds, axis=0)
    local_timer.time("end train in ")
    # Re-attach the true target by card_id, then score the OOF predictions.
    train_output = pd.merge(train_output, train, on="card_id", how="left")
    score = evaluator.rmse(train_output["target"], train_output["cv_pred"])
    return score
def do_preds(train, test, files):
    """Fit a Ridge blender over the OOF columns, write the ensembled test
    predictions to path_const.OUTPUT_ENS, and return nothing; prints the
    coefficients, the positions of coefficients above 0.15, and the
    in-sample RMSE."""
    print("------- do preds --------")
    cols = [f[1] for f in files]
    model = Ridge().fit(train[cols], train["target"])
    print(model.coef_)
    # Positions of the "significant" coefficients.
    sig_idx = [pos for pos, weight in enumerate(model.coef_) if weight > 0.15]
    print(sig_idx)
    in_sample = model.predict(train[cols])
    score = evaluator.rmse(train["target"], in_sample)
    print(score)
    sub = pd.DataFrame()
    sub["card_id"] = test["card_id"]
    sub["target"] = model.predict(test[cols])
    print(train["target"].describe())
    # print(train["big"].describe())
    print(sub["target"].describe())
    sub.to_csv(path_const.OUTPUT_ENS, index=False)
def print_score(train, files):
    """Print the RMSE of each file's OOF column (f[1]) against the target."""
    for entry in files:
        print(evaluator.rmse(train["target"], train[entry[1]]))
# --- load data and set up CV bookkeeping -------------------------------------
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()
# load_small_pred_new() returns (train, test, train_x, train_y, test_x);
# exact contents are defined in input_loader.
data = input_loader.GoldenLoader().load_small_pred_new()
train, test, train_x, train_y, test_x = data
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)
print(train_y.describe())
# NOTE(review): missing targets are filled with 60 -- presumably a sentinel
# far outside the normal target range; confirm why 60 specifically.
train_y = train_y.fillna(60)
print(train_x.columns)
timer.time("load csv")
# Baseline: RMSE of predicting a single constant for every row.
mean_val = 6.040527
train_x["mean_val"] = mean_val
rmse_score = evaluator.rmse(train_y, train_x["mean_val"])
print(rmse_score)
pred_col_name = "pred_new"
submission = pd.DataFrame()
submission["card_id"] = test["card_id"]
submission[pred_col_name] = 0
train_cv = pd.DataFrame()
train_cv["card_id"] = train["card_id"]
train_cv[pred_col_name] = 0
# Stratify CV folds on the extreme-target flag (target < -30).
outliers = (train["target"] < -30).astype(int).values
bagging_num = 1
split_num = 5
random_state = 4590
# The bagging-loop body continues beyond this chunk.
for bagging_index in range(bagging_num):
def merge_it(org, df):
    """Left-join `df` onto `org` by card_id and return the merged frame."""
    ret_df = pd.merge(org, df, on="card_id", how="left")
    return ret_df


# --- score the four time-series-based prediction files against the target ---
new = csv_io.read_file(path_const.FEAT_FROM_TS_NEW)
old = csv_io.read_file(path_const.FEAT_FROM_TS_OLD)
new2 = csv_io.read_file(path_const.FEAT_FROM_TS_NEW2)
old2 = csv_io.read_file(path_const.FEAT_FROM_TS_OLD2)
train = csv_io.read_file(path_const.ORG_TRAIN)[["card_id", "target"]]
train = merge_it(train, new)
train = merge_it(train, old)
train = merge_it(train, new2)
train = merge_it(train, old2)
eval_cols = [
    'pred_from_new_ts_mean',
    'pred_from_old_ts_mean',
    'pred_from_new_ts2_mean',
    'pred_from_old_ts2_mean',
]
print(train.describe())
print(train.shape)
# Keep only rows where the second new-ts prediction exists so every column is
# scored over the same rows.
train = train[train["pred_from_new_ts2_mean"].notnull()]
print(train.shape)
for c in eval_cols:
    score = evaluator.rmse(train["target"], train[c])
    print(c, "=", score)
# --- submission post-processing (fragment: `index`, `sub_with_outlier`,
# `logistic_test`, `outlier_value` and `train` are defined earlier, outside
# this chunk) ----------------------------------------------------------------
# Shift predictions down by 5 for rows selected by the previously built mask.
sub_with_outlier.loc[index, 'target'] = sub_with_outlier.loc[index, 'target'] - 5
# Milder shift for moderately negative target2 values not already pinned to
# the outlier value.
index = (sub_with_outlier['target2'] < -10) & (sub_with_outlier['target2'] > -15) & (sub_with_outlier['target'] != outlier_value)
sub_with_outlier.loc[index, 'target'] = sub_with_outlier.loc[index, 'target'] - 1
# Rows the logistic outlier classifier is very confident about (> 0.94) are
# forced to the outlier value.
index = (logistic_test['target'] > 0.94) & (sub_with_outlier['target'] != outlier_value)
sub_with_outlier.loc[index, 'target'] = outlier_value
# Floor every prediction at the outlier value.
index = sub_with_outlier['target'] <= outlier_value
sub_with_outlier.loc[index, 'target'] = outlier_value
sub_with_outlier[['card_id', 'target']].to_csv("../post/first_pp.csv", index=False)
print(sub_with_outlier.shape)
print(sub_with_outlier.describe())
# Score the post-processed predictions against known targets; the inner join
# keeps only card_ids present in train.
_train = train[["card_id", "target"]].copy()
_train.columns = ["card_id", "y_true"]
sub_with_outlier = pd.merge(sub_with_outlier, _train, on="card_id", how="inner")
print(sub_with_outlier.shape)
score = evaluator.rmse(sub_with_outlier["target"], sub_with_outlier["y_true"])
print(score)
def do_cv_pred(train, test, files):
    """Stratified 5-fold BayesianRidge blend, fit separately for cards with
    new-merchant history ("has_new" == 1) and without, then stacked into one
    submission (path_const.OUTPUT_SUB) and one OOF file
    (path_const.OUTPUT_OOF).

    NOTE(review): the positional fold indices from skf.split are intersected
    with `.index` labels below -- this lines up only if train/test carry the
    default RangeIndex. Confirm at the call site.
    """
    train_has_new_idx = train[train["has_new"] == 1].index
    train_no_new_idx = train[train["has_new"] == 0].index
    test_has_new_idx = test[test["has_new"] == 1].index
    test_no_new_idx = test[test["has_new"] == 0].index
    # PERF FIX: O(1) membership sets. The original tested `i in <Index>`
    # inside each per-fold list comprehension, making every fold O(rows *
    # subset) instead of O(rows).
    has_new_set = set(train_has_new_idx)
    no_new_set = set(train_no_new_idx)
    print("------- do preds --------")
    ensemble_col = [f[1] for f in files]
    train_x = train[ensemble_col]
    test_x = test[ensemble_col]
    has_new_test_x = test_x.iloc[test_has_new_idx]
    no_new_test_x = test_x.iloc[test_no_new_idx]
    train_y = train["target"]
    has_new_sub = pd.DataFrame()
    has_new_sub["card_id"] = test[test["has_new"] == 1]["card_id"]
    has_new_sub["target"] = 0
    no_new_sub = pd.DataFrame()
    no_new_sub["card_id"] = test[test["has_new"] == 0]["card_id"]
    no_new_sub["target"] = 0
    # Stratify folds on the extreme-target flag.
    outliers = (train["target"] < -30).astype(int).values
    split_num = 5
    skf = model_selection.StratifiedKFold(n_splits=split_num, shuffle=True, random_state=4590)
    train_preds = []
    for idx, (train_index, test_index) in enumerate(skf.split(train, outliers)):
        # --- cards WITH new-merchant history ---
        _train_idx = [i for i in train_index if i in has_new_set]
        _test_idx = [i for i in test_index if i in has_new_set]
        X_train, X_test = train_x.iloc[_train_idx], train_x.iloc[_test_idx]
        y_train, y_test = train_y.iloc[_train_idx], train_y.iloc[_test_idx]
        reg = BayesianRidge().fit(X_train, y_train)
        print(reg.coef_)
        has_new_v_pred = reg.predict(X_test)
        score = evaluator.rmse(y_test, has_new_v_pred)
        print(score)
        has_new_y_pred = reg.predict(has_new_test_x)
        has_new_sub["target"] = has_new_sub["target"] + has_new_y_pred
        train_id = train.iloc[_test_idx]
        train_cv_prediction = pd.DataFrame()
        train_cv_prediction["card_id"] = train_id["card_id"]
        train_cv_prediction["cv_pred"] = has_new_v_pred
        train_preds.append(train_cv_prediction)
        # --- cards WITHOUT new-merchant history ---
        _train_idx = [i for i in train_index if i in no_new_set]
        _test_idx = [i for i in test_index if i in no_new_set]
        X_train, X_test = train_x.iloc[_train_idx], train_x.iloc[_test_idx]
        y_train, y_test = train_y.iloc[_train_idx], train_y.iloc[_test_idx]
        reg = BayesianRidge().fit(X_train, y_train)
        print(reg.coef_)
        no_new_v_pred = reg.predict(X_test)
        score = evaluator.rmse(y_test, no_new_v_pred)
        print(score)
        no_new_y_pred = reg.predict(no_new_test_x)
        no_new_sub["target"] = no_new_sub["target"] + no_new_y_pred
        train_id = train.iloc[_test_idx]
        train_cv_prediction = pd.DataFrame()
        train_cv_prediction["card_id"] = train_id["card_id"]
        train_cv_prediction["cv_pred"] = no_new_v_pred
        train_preds.append(train_cv_prediction)
    train_output = pd.concat(train_preds, axis=0)
    has_new_sub["target"] = has_new_sub["target"] / split_num
    no_new_sub["target"] = no_new_sub["target"] / split_num
    submission = pd.concat([has_new_sub, no_new_sub], axis=0)
    submission.to_csv(path_const.OUTPUT_SUB, index=False)
    train_output["cv_pred"] = np.clip(train_output["cv_pred"], -33.219281, 18.0)
    train_output.to_csv(path_const.OUTPUT_OOF, index=False)
    df_pred = pd.merge(train[["card_id", "target"]], train_output, on="card_id")
    rmse_score = evaluator.rmse(df_pred["target"], df_pred["cv_pred"])
    print(rmse_score)
# --- merge model OOF files and fit a linear blend ----------------------------
train3.columns = ["card_id", "mlp"]
# train4.columns = ["card_id", "mlp4"]
train5.columns = ["card_id", "mlp_rank"]
train = pd.merge(train, train1, on="card_id", how="inner")
train = pd.merge(train, train2, on="card_id", how="inner")
train = pd.merge(train, train3, on="card_id", how="inner")
# train = pd.merge(train, train4, on="card_id", how="inner")
train = pd.merge(train, train5, on="card_id", how="inner")
print(train.shape)
print("-----")
print("co-eff...")
# BUG FIX: "mlp4" removed from every selection below -- the train4 merge is
# commented out above, so `train` has no "mlp4" column and the original
# lookups raised KeyError at runtime.
print(train[["target", "big", "small", "mlp", "mlp_rank"]].corr())
print("before score..")
score = evaluator.rmse(train["target"], train["big"])
print(score)
score = evaluator.rmse(train["target"], train["small"])
print(score)
score = evaluator.rmse(train["target"], train["mlp"])
print(score)
score = evaluator.rmse(train["target"], train["mlp_rank"])
print(score)
print("-----")
ensemble_col = ["big", "small", "mlp", "mlp_rank"]
train_x = train[ensemble_col]
reg = LinearRegression().fit(train_x, train["target"])
print(reg.coef_)
# (fragment: continues a fold loop whose header is outside this chunk;
# `train_non_out_pred`, `outlier_pred`, `_outlier`, `models`, `total_score`,
# `lgb`, `split_num`, `submission`, `train`, `train_preds` are defined there)
})
train_out_pred = pd.DataFrame({
    "card_id": _outlier["card_id"],
    "cv_pred": outlier_pred
})
# Per fold: stack the non-outlier and outlier OOF predictions.
train_cv_prediction = pd.concat([train_non_out_pred, train_out_pred], axis=0)
train_preds.append(train_cv_prediction)
timer.time("done one set in")
# After the loop: report importance from the first fold's model only.
lgb.show_feature_importance(models[0], path_const.FEATURE_GAIN)
avg_score = str(total_score / split_num)
logger.print("average score= " + avg_score)
timer.time("end train in ")
submission["target"] = submission["target"] / split_num
submission.to_csv(path_const.OUTPUT_SUB, index=False)
train_output = pd.concat(train_preds, axis=0)
train_output.to_csv(path_const.OUTPUT_OOF, index=False)
# NOTE(review): y_true and y_pred are passed as separate Series here; they
# align correctly only if train_output's index matches train's -- confirm.
y_true = train["target"]
y_pred = train_output["cv_pred"]
rmse_score = evaluator.rmse(y_true, y_pred)
logger.print("evaluator rmse score= " + str(rmse_score))
print(train["target"].describe())
logger.print(train_output.describe())
logger.print(submission.describe())
timer.time("done submission in ")
# BUG FIX: `pandas` is used below (pd.merge) but was never imported in this
# script; imports regrouped stdlib/third-party/local per convention.
import pandas as pd

from sklearn import model_selection

from elo.common import pocket_timer, pocket_logger, pocket_file_io, path_const
from elo.common import pocket_lgb, evaluator
from elo.utils import drop_col_util

logger = pocket_logger.get_my_logger()
timer = pocket_timer.GoldenTimer(logger)
csv_io = pocket_file_io.GoldenCsv()
# Join the "big" and "small" model OOF predictions onto the true targets.
train = csv_io.read_file(path_const.ORG_TRAIN)[["card_id", "target"]]
train1 = csv_io.read_file("../sub/big_oof.csv")
test1 = csv_io.read_file("../sub/big_sub.csv")
train2 = csv_io.read_file("../sub/small_oof.csv")
test2 = csv_io.read_file("../sub/small_sub.csv")
timer.time("load csv in ")
train1.columns = ["card_id", "big"]
train2.columns = ["card_id", "small"]
train = pd.merge(train, train1, on="card_id", how="inner")
train = pd.merge(train, train2, on="card_id", how="inner")
# Simple 50/50 average blend of the two models.
train["avg"] = (train["big"] + train["small"]) / 2
score = evaluator.rmse(train["target"], train["big"])
print(score)
score = evaluator.rmse(train["target"], train["small"])
print(score)
score = evaluator.rmse(train["target"], train["avg"])
print(score)
def try_some(train, test):
    """Run bagged stratified-KFold LightGBM CV over the module-level
    `pred_col` feature list; writes the averaged submission
    (path_const.OUTPUT_SUB) and OOF predictions (path_const.OUTPUT_OOF)
    and prints diagnostics.

    Relies on module-level `pred_col`, `logger` and `timer`.
    """
    train_x, test_x = train[pred_col], test[pred_col]
    train_y = train["target"]
    print(train_x.shape)
    print(train_y.shape)
    print(test_x.shape)
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0
    train_cv = pd.DataFrame()
    train_cv["card_id"] = train["card_id"]
    train_cv["cv_pred"] = 0
    # Stratify folds on the extreme-target flag (target < -30).
    outliers = (train["target"] < -30).astype(int).values
    bagging_num = 1
    split_num = 5
    random_state = 4590
    for bagging_index in range(bagging_num):
        skf = model_selection.StratifiedKFold(n_splits=split_num, shuffle=True, random_state=random_state)
        logger.print("random_state=" + str(random_state))
        lgb = pocket_lgb.GoldenLgb()
        total_score = 0
        models = []
        train_preds = []
        for train_index, test_index in skf.split(train, outliers):
            X_train, X_test = train_x.iloc[train_index], train_x.iloc[
                test_index]
            y_train, y_test = train_y.iloc[train_index], train_y.iloc[
                test_index]
            model = lgb.do_train_direct(X_train, X_test, y_train, y_test)
            # LightGBM's own best validation RMSE for this fold.
            score = model.best_score["valid_0"]["rmse"]
            total_score += score
            y_pred = model.predict(test_x)
            valid_set_pred = model.predict(X_test)
            models.append(model)
            # Accumulate fold predictions on the test set; averaged below.
            submission["target"] = submission["target"] + y_pred
            train_id = train.iloc[test_index]
            train_cv_prediction = pd.DataFrame()
            train_cv_prediction["card_id"] = train_id["card_id"]
            train_cv_prediction["cv_pred"] = valid_set_pred
            train_preds.append(train_cv_prediction)
            timer.time("done one set in")
        train_output = pd.concat(train_preds, axis=0)
        # NOTE(review): this addition aligns by index; it covers every train
        # row only because the fold pieces keep their original labels --
        # assumes a default RangeIndex on `train`. Confirm.
        train_cv["cv_pred"] += train_output["cv_pred"]
        # Feature importance from the first fold's model only.
        lgb.show_feature_importance(models[0], path_const.FEATURE_GAIN)
        avg_score = str(total_score / split_num)
        logger.print("average score= " + avg_score)
    timer.time("end train in ")
    submission["target"] = submission["target"] / (bagging_num * split_num)
    submission.to_csv(path_const.OUTPUT_SUB, index=False)
    train_cv["cv_pred"] = train_cv["cv_pred"] / bagging_num
    train_cv.to_csv(path_const.OUTPUT_OOF, index=False)
    y_true = train_y
    y_pred = train_cv["cv_pred"]
    rmse_score = evaluator.rmse(y_true, y_pred)
    logger.print("evaluator rmse score= " + str(rmse_score))
    print(train["target"].describe())
    logger.print(train_cv.describe())
    logger.print(submission.describe())
    timer.time("done submission in ")
def do_cv(self, data):
    """Run bagged stratified-KFold CV with the MLP from pocket_network2;
    writes the averaged submission (path_const.OUTPUT_SUB) and clipped OOF
    predictions (path_const.OUTPUT_OOF).

    data: (train, test, train_x, train_y, test_x) tuple.
    Uses self.logger, self.epochs and self.batch_size.
    """
    for d in data:
        print(d.shape)
    train, test, train_x, train_y, test_x = data
    timer = pocket_timer.GoldenTimer(self.logger)
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0
    train_cv = pd.DataFrame()
    train_cv["card_id"] = train["card_id"]
    train_cv["cv_pred"] = 0
    # Stratify folds on the extreme-target flag (target < -30).
    outliers = (train["target"] < -30).astype(int).values
    bagging_num = 1
    split_num = 5
    for bagging_index in range(bagging_num):
        skf = model_selection.StratifiedKFold(n_splits=split_num, shuffle=True, random_state=4590)
        total_score = 0
        train_preds = []
        for idx, (train_index, test_index) in enumerate(skf.split(train, outliers)):
            # Fresh network per fold, with a cosine-annealed learning rate.
            lr_schedule = learning_rate.GoldenLearningRate(0.1, 50).cosine_annealing_scheduler()
            mlp = pocket_network2.GoldenMlp2(self.epochs, self.batch_size, lr_schedule)
            network = mlp.build_model(train_x.shape[1])
            X_train, X_test = train_x.iloc[train_index], train_x.iloc[test_index]
            y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
            print("start train")
            model, history = mlp.do_train_direct(str(idx), network, X_train, X_test, y_train, y_test)
            mlp.save_history(history, str(idx))
            print('Loading Best Model')
            # Restore the best checkpoint saved during training for this fold.
            model.load_weights(path_const.get_weight_file(str(idx)))
            y_pred = model.predict(test_x, batch_size=self.batch_size)
            y_pred = np.reshape(y_pred, -1)
            y_pred = np.clip(y_pred, -33.219281, 18.0)
            # NOTE(review): unlike y_pred above, valid_set_pred is not
            # flattened before being stored/scored -- confirm its shape and
            # downstream handling.
            valid_set_pred = model.predict(X_test, batch_size=self.batch_size)
            score = evaluator.rmse(y_test, valid_set_pred)
            print(score)
            total_score += score
            # Accumulate fold predictions on the test set; averaged below.
            submission["target"] = submission["target"] + y_pred
            train_id = train.iloc[test_index]
            train_cv_prediction = pd.DataFrame()
            train_cv_prediction["card_id"] = train_id["card_id"]
            train_cv_prediction["cv_pred"] = valid_set_pred
            train_preds.append(train_cv_prediction)
            timer.time("done one set in")
        train_output = pd.concat(train_preds, axis=0)
        # NOTE(review): index-aligned addition; assumes a default RangeIndex
        # on `train` so the fold pieces cover every row exactly once.
        train_cv["cv_pred"] += train_output["cv_pred"]
        avg_score = str(total_score / split_num)
        self.logger.print("average score= " + avg_score)
    timer.time("end train in ")
    submission["target"] = submission["target"] / (bagging_num * split_num)
    # submission["target"] = np.clip(submission["target"], -33.219281, 18.0)
    submission.to_csv(path_const.OUTPUT_SUB, index=False)
    train_cv["cv_pred"] = train_cv["cv_pred"] / bagging_num
    train_cv["cv_pred"] = np.clip(train_cv["cv_pred"], -33.219281, 18.0)
    train_cv.to_csv(path_const.OUTPUT_OOF, index=False)
    y_true = train_y
    y_pred = train_cv["cv_pred"]
    rmse_score = evaluator.rmse(y_true, y_pred)
    self.logger.print("evaluator rmse score= " + str(rmse_score))
    print(train["target"].describe())
    self.logger.print(train_cv.describe())
    self.logger.print(submission.describe())
    timer.time("done submission in ")