def run_model(mat_path, meta_path, model_instance, predictions_path, model_path, val, logger):
    """Fit ``model_instance`` on a train split and score it on a validation/test split.

    Args:
        mat_path: path to an h5sparse file holding the feature matrix under key "matrix".
        meta_path: path to an HDF store (key "data") with per-row metadata
            (``is_val``, ``is_test``, ``was_clicked``, ``clickout_id`` columns are read).
        model_instance: ranker exposing ``fit(X, y, group=...)`` and ``predict(X)``.
        predictions_path: CSV destination for the validation rows + ``click_proba``.
        model_path: where the fitted model is dumped via joblib.
        val: if truthy, split train/val inside the non-test rows; otherwise
            train on all non-test rows and predict on the test rows.
        logger: logger used for AUC/MRR reporting.
    """
    with timer("read data"):
        meta = pd.read_hdf(meta_path, key="data")
        mat = h5sparse.File(mat_path, mode="r")["matrix"]
    with timer("split data"):
        if val:
            train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0]
            val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
        else:
            train_ind = np.where(meta.is_test == 0)[0]
            val_ind = np.where(meta.is_test == 1)[0]
        logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
        meta_train = meta.iloc[train_ind]
        # .copy() so the click_proba assignment below does not write into a view
        # of `meta` (pandas SettingWithCopyWarning).
        meta_val = meta.iloc[val_ind].copy()
        # NOTE(review): slicing min()..max()+1 assumes train_ind / val_ind are
        # contiguous ranges — TODO confirm against how is_val/is_test are laid out.
        X_train = mat[train_ind.min() : (train_ind.max() + 1)]
        X_val = mat[val_ind.min() : (val_ind.max() + 1)]
        del mat
        gc.collect()
    with timer("fit model"):
        model_instance.fit(
            X_train,
            meta_train["was_clicked"].values,
            group=group_lengths(meta_train["clickout_id"].values),
        )
        joblib.dump(model_instance, model_path)
        val_pred = model_instance.predict(X_val)
        train_pred = model_instance.predict(X_train)
        logger.info("Train AUC {:.4f}".format(roc_auc_score(meta_train["was_clicked"].values, train_pred)))
        meta_val["click_proba"] = val_pred
        # Val metrics only make sense when the holdout has labels (val split);
        # the two `if val:` checks from the original are merged into one.
        if val:
            logger.info("Val AUC {:.4f}".format(roc_auc_score(meta_val["was_clicked"].values, val_pred)))
            logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
        meta_val.to_csv(predictions_path, index=False)
def read_prediction_val(fn):
    """Load a validation-prediction CSV and compute its MRR.

    Args:
        fn: path to a ``predictions.csv``-style file with ``user_id``,
            ``session_id``, ``step`` and ``click_proba`` columns.

    Returns:
        Tuple ``(mrr, dataframe, config)`` where ``config`` is the text of the
        sibling ``config.json`` when one exists, otherwise the filename itself.
    """
    p = pd.read_csv(fn)
    p.sort_values(["user_id", "session_id", "step"], inplace=True)
    p.reset_index(inplace=True, drop=True)
    mrr = mrr_fast(p, "click_proba")
    config_file = fn.replace("predictions.csv", "config.json")
    # Cheap suffix check first (guards against fn not containing
    # "predictions.csv", in which case config_file == fn), then hit the FS.
    if config_file.endswith("config.json") and os.path.exists(config_file):
        # Use a context manager instead of the original bare open() so the
        # file handle is closed deterministically.
        with open(config_file) as f:
            config = f.read()
    else:
        config = fn
    return mrr, p, config
def validate_models(self, n_users, n_debug=None):
    """Fit every sub-model, then optimize ensemble coefficients on validation MRR.

    Each model's validation predictions become a column of ``preds_mat``; two
    scipy optimizers (Nelder-Mead then Powell) search for blending weights
    that maximize MRR, starting from the models' configured weights.

    Args:
        n_users: passed through to ``load_train_val`` to size the data split.
        n_debug: optional row cap for quick debugging runs.
    """
    df_train, df_val = self.load_train_val(n_users, n_debug=n_debug)
    preds_mat = np.vstack(
        [model.fit_and_predict(df_train, df_val, validate=True) for model in self.models]
    ).T

    def opt_coefs(coefs):
        # Objective: negated MRR, since fmin/fmin_powell minimize.
        preds = preds_mat.dot(coefs)
        df_val["preds"] = preds
        mrr = mrr_fast(df_val, "preds")
        print(mrr, coefs)
        return -mrr

    best_coefs = fmin(opt_coefs, [model.weight for model in self.models])
    best_coefs = fmin_powell(opt_coefs, best_coefs)
    preds = preds_mat.dot(best_coefs)
    df_val["click_proba"] = preds
    # Fixed format spec: original "{:4f}" is a width-4 field with default
    # precision; "{:.4f}" matches the 4-decimal reporting used elsewhere.
    print("MRR {:.4f}".format(mrr_fast(df_val, "click_proba")))
    print("Best coefs: ", best_coefs)
train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0][:split_idx] # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0] val_ind = np.arange(split_idx, 4868466) print("train_ind: {} / val_ind: {}".format(train_ind, val_ind)) logger.info( f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}") meta_train = meta.iloc[train_ind] meta_val = meta.iloc[val_ind] X_train = mat[train_ind.min():(train_ind.max() + 1)] X_val = mat[val_ind.min():(val_ind.max() + 1)] del mat gc.collect() with timer("model fitting"): model = LGBMRanker(**BEST_PARAMS) model.fit(X_train, meta_train["was_clicked"].values, group=group_lengths(meta_train["clickout_id"].values)) val_pred = model.predict(X_val) train_pred = model.predict(X_train) logger.info("Train AUC {:.4f}".format( roc_auc_score(meta_train["was_clicked"].values, train_pred))) logger.info("Val AUC {:.4f}".format( roc_auc_score(meta_val["was_clicked"].values, val_pred))) meta_val["click_proba"] = val_pred logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba"))) githash = get_git_hash() meta_val.to_csv(f"predictions/model_val_{githash}.csv", index=False) joblib.dump(model, "model_val.joblib")
def opt_coefs(coefs):
    """Objective for scipy minimizers: negated validation MRR of the blend.

    Closes over ``preds_mat`` (models x rows, transposed) and ``df_val``
    from the enclosing scope.
    """
    blended = preds_mat.dot(coefs)
    df_val["preds"] = blended
    score = mrr_fast(df_val, "preds")
    print(score, coefs)
    return -score
def evaluate(self, df_train, df_val, train_pred, val_pred):
    """Report train/val AUC and validation MRR for the given predictions.

    Mutates ``df_val`` by storing ``val_pred`` in a ``click_proba`` column.
    """
    train_auc = roc_auc_score(df_train["was_clicked"].values, train_pred)
    print("Train AUC {:.4f}".format(train_auc))
    val_auc = roc_auc_score(df_val["was_clicked"].values, val_pred)
    print("Val AUC {:.4f}".format(val_auc))
    df_val["click_proba"] = val_pred
    val_mrr = mrr_fast(df_val, "click_proba")
    print("Val MRR {:.4f}".format(val_mrr))
# Module-level ensembling script: blends per-model validation predictions.
# `val_predictions` appears to hold 4-tuples (mrr, ?, dataframe, config) —
# TODO confirm against where it is built.
final = val_predictions[-1][2].copy()
lengths = group_lengths(final["clickout_id"])
# rows x models matrix of click probabilities, one column per model.
preds_stack = np.vstack([df["click_proba"] for _, _, df, _ in val_predictions]).T

def opt(v):
    # Objective for fmin: negated MRR of the weighted blend (fmin minimizes).
    preds_ens = preds_stack.dot(v)
    mrr = mrr_fast_v3(final["was_clicked"].values, preds_ens, lengths)
    print(f"MRR {mrr}")
    return -mrr

# Two-stage search: coarse start from zeros, then refine with tighter ftol.
coefs = fmin(opt, [0] * preds_stack.shape[1])
coefs = fmin(opt, coefs, ftol=0.000_001)
final["click_proba"] = preds_stack.dot(coefs)
mrr = mrr_fast(final, "click_proba")
# e.g. 0.6789 -> "6789": decimal digits only, embedded in the summary filename.
mrr_str = f"{mrr:.4f}"[2:]
print(mrr)
mrrs, _, _, configs = list(zip(*val_predictions))
summary_df = pd.DataFrame({"config": configs, "mrr": mrrs, "coef": coefs})
print(summary_df)
summary_df.to_csv(f"model_summary_{mrr_str}.csv")
# read submission models
with Pool(32) as pool:
    sub_predictions_dfs = pool.map(read_prediction, [fn for _, fn in preds_subs_all])
# Keep only submission predictions whose hash also appeared in validation.
sub_predictions = [(hsh, df) for ((hsh, fn), df) in zip(preds_subs_all, sub_predictions_dfs) if hsh in val_hashes]
for coef, (hsh, df) in zip(coefs, sub_predictions):
    print(coef, hsh, df["click_proba"].min(), df["click_proba"].max())
def mrr_metric(train_data, preds):
    # Custom LightGBM eval metric; returns (name, value, is_higher_better).
    # NOTE(review): the group ids come from the enclosing `df_val`, not from
    # `train_data`, so this metric is only meaningful on the eval_set split.
    mrr = mrr_fast_v2(train_data, preds, df_val["clickout_id"].values)
    return "error", mrr, True

model = LGBMRanker(learning_rate=0.05, n_estimators=900, min_child_samples=5, min_child_weight=0.00001, n_jobs=-2)
model.fit(
    mat_train,
    df_train["was_clicked"],
    # per-clickout list lengths for the ranking objective
    group=group_lengths(df_train["clickout_id"]),
    # sample_weight=np.where(df_train["clickout_step_rev"]==1,2,1),
    verbose=True,
    eval_set=[(mat_val, df_val["was_clicked"])],
    eval_group=[group_lengths(df_val["clickout_id"])],
    eval_metric=mrr_metric,
)
df_train["click_proba"] = model.predict(mat_train)
df_val["click_proba"] = model.predict(mat_val)
print(mrr_fast(df_val, "click_proba"))
# Break MRR down by reverse clickout step (1 = last step of the session).
print("By rank")
for n in range(1, 10):
    print(n, mrr_fast(df_val[df_val["clickout_step_rev"] == n], "click_proba"))