def fit_and_predict(self, df_train, df_val, validate=False):
    with timer("vectorizing train"):
        mat_train = self.vectorizer.fit_transform(df_train)
        print("Train shape", mat_train.shape)
    with timer("vectorizing val"):
        mat_val = self.vectorizer.transform(df_val)
        print("Val shape", mat_val.shape)
    with timer("fitting model"):
        if isinstance(self.model, LGBMRanker):
            # Listwise ranking: pass the number of rows per clickout as groups.
            self.model.fit(
                mat_train,
                df_train["was_clicked"].values,
                group=group_lengths(df_train["clickout_id"].values),
            )
        else:
            self.model.fit(mat_train, df_train["was_clicked"].values)
    if self.is_prob:
        # Classifier: use the probability of the positive (clicked) class as the score.
        val_pred = self.model.predict_proba(mat_val)[:, 1]
        if validate:
            train_pred = self.model.predict_proba(mat_train)[:, 1]
            self.evaluate(df_train, df_val, train_pred, val_pred)
    else:
        print("Predicting validation")
        val_pred = self.model.predict(mat_val)
        if validate:
            print("Predicting train")
            train_pred = self.model.predict(mat_train)
            self.evaluate(df_train, df_val, train_pred, val_pred)
    self.save_predictions(df_val, val_pred, validate)
    return val_pred
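# The `timer` context manager and `group_lengths` helper used above come from
# elsewhere in the project. A minimal sketch of what they are assumed to do
# (not the project's actual implementations):
import time
from contextlib import contextmanager
from itertools import groupby

@contextmanager
def timer(name):
    # Print how long the wrapped block took.
    start = time.time()
    yield
    print(f"{name} took {time.time() - start:.1f}s")

def group_lengths(group_ids):
    # Lengths of consecutive runs of equal ids, in order of appearance; this is
    # the `group` format LGBMRanker expects, so rows of one clickout must be adjacent.
    return [len(list(g)) for _, g in groupby(group_ids)]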
def run_model(mat_path, meta_path, model_instance, predictions_path, model_path, val, logger):
    with timer("read data"):
        meta = pd.read_hdf(meta_path, key="data")
        mat = h5sparse.File(mat_path, mode="r")["matrix"]
    with timer("split data"):
        if val:
            train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0]
            val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
        else:
            train_ind = np.where(meta.is_test == 0)[0]
            val_ind = np.where(meta.is_test == 1)[0]
        logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
        meta_train = meta.iloc[train_ind]
        meta_val = meta.iloc[val_ind]
        X_train = mat[train_ind.min() : (train_ind.max() + 1)]
        X_val = mat[val_ind.min() : (val_ind.max() + 1)]
        del mat
        gc.collect()
    with timer("fit model"):
        model_instance.fit(
            X_train,
            meta_train["was_clicked"].values,
            group=group_lengths(meta_train["clickout_id"].values),
        )
        joblib.dump(model_instance, model_path)
        val_pred = model_instance.predict(X_val)
        train_pred = model_instance.predict(X_train)
        logger.info("Train AUC {:.4f}".format(roc_auc_score(meta_train["was_clicked"].values, train_pred)))
        if val:
            logger.info("Val AUC {:.4f}".format(roc_auc_score(meta_val["was_clicked"].values, val_pred)))
        meta_val["click_proba"] = val_pred
        if val:
            logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
        meta_val.to_csv(predictions_path, index=False)
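# Illustrative call only - the file paths, ranker parameters, and logger setup
# below are assumptions for the sketch, not values from the original pipeline.
import logging
from lightgbm import LGBMRanker

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("run_model")

run_model(
    mat_path="features.h5",                 # h5sparse file holding the feature matrix
    meta_path="meta.h5",                    # HDF file with clickout_id / was_clicked / is_val / is_test
    model_instance=LGBMRanker(n_estimators=1600, learning_rate=0.1, n_jobs=-2),
    predictions_path="predictions/val_predictions.csv",
    model_path="model_val.joblib",
    val=True,                               # score the held-out validation split
    logger=logger,
)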
# Validation split by position: the first `split_idx` training rows are used for
# fitting, the remaining rows (up to the hard-coded index bound) for validation.
train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0][:split_idx]
# val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
val_ind = np.arange(split_idx, 4868466)
print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))
logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
meta_train = meta.iloc[train_ind]
meta_val = meta.iloc[val_ind]
X_train = mat[train_ind.min() : (train_ind.max() + 1)]
X_val = mat[val_ind.min() : (val_ind.max() + 1)]
del mat
gc.collect()

with timer("model fitting"):
    model = LGBMRanker(**BEST_PARAMS)
    model.fit(
        X_train,
        meta_train["was_clicked"].values,
        group=group_lengths(meta_train["clickout_id"].values),
    )

val_pred = model.predict(X_val)
train_pred = model.predict(X_train)
logger.info("Train AUC {:.4f}".format(roc_auc_score(meta_train["was_clicked"].values, train_pred)))
logger.info("Val AUC {:.4f}".format(roc_auc_score(meta_val["was_clicked"].values, val_pred)))
meta_val["click_proba"] = val_pred
logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))

githash = get_git_hash()
meta_val.to_csv(f"predictions/model_val_{githash}.csv", index=False)
joblib.dump(model, "model_val.joblib")
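# `mrr_fast` (and the v2/v3 variants used below) are assumed to compute the mean
# reciprocal rank of the clicked item within each clickout, ranking items by the
# given score column. A plain (slow) pandas reference of that assumption:
def mrr_reference(df, score_col):
    def reciprocal_rank(group):
        # Rank the clickout's items by descending score and take 1/rank of the clicked one.
        ranks = group[score_col].rank(ascending=False, method="first")
        clicked = group["was_clicked"] == 1
        return (1.0 / ranks[clicked]).max() if clicked.any() else 0.0
    return df.groupby("clickout_id").apply(reciprocal_rank).mean()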
with Pool(32) as pool:
    val_predictions_dfs = pool.map(read_prediction_val, [fn for _, fn in preds_vals_all])

# Keep only predictions that cover the full validation set, score well enough,
# and do not come from the two excluded runs.
val_predictions = [
    (mrr, hsh, df, config)
    for ((hsh, fn), (mrr, df, config)) in zip(preds_vals_all, val_predictions_dfs)
    if (df.shape[0] == 3_077_674) and (mrr > 0.68) and ("160357" not in fn) and ("59629" not in fn)
]
val_hashes = [p[1] for p in val_predictions]

print("Debugging click probas")
for mrr, hsh, df, _ in val_predictions:
    print(mrr, hsh, df["click_proba"].min(), df["click_proba"].max())

final = val_predictions[-1][2].copy()
lengths = group_lengths(final["clickout_id"])
preds_stack = np.vstack([df["click_proba"] for _, _, df, _ in val_predictions]).T

def opt(v):
    # Negative MRR of the weighted blend (fmin minimizes).
    preds_ens = preds_stack.dot(v)
    mrr = mrr_fast_v3(final["was_clicked"].values, preds_ens, lengths)
    print(f"MRR {mrr}")
    return -mrr

coefs = fmin(opt, [0] * preds_stack.shape[1])
coefs = fmin(opt, coefs, ftol=0.000_001)
final["click_proba"] = preds_stack.dot(coefs)
mrr = mrr_fast(final, "click_proba")
mrr_str = f"{mrr:.4f}"[2:]
print(mrr)
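# `fmin` here is presumably scipy.optimize.fmin (Nelder-Mead), so the blend
# weights are unconstrained and may go negative. Sketch of applying the fitted
# weights to test-set predictions; `read_prediction_test` and `preds_tests_all`
# are assumed counterparts of the validation objects above, not verified names.
test_dfs = [read_prediction_test(fn) for _, fn in preds_tests_all]
test_stack = np.vstack([df["click_proba"] for df in test_dfs]).T
submission = test_dfs[-1].copy()
submission["click_proba"] = test_stack.dot(coefs)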
def mrr_metric(train_data, preds):
    # Custom LightGBM eval metric: returns (name, value, is_higher_better).
    mrr = mrr_fast_v2(train_data, preds, df_val["clickout_id"].values)
    return "mrr", mrr, True

model = LGBMRanker(
    learning_rate=0.05,
    n_estimators=900,
    min_child_samples=5,
    min_child_weight=0.00001,
    n_jobs=-2,
)
model.fit(
    mat_train,
    df_train["was_clicked"],
    group=group_lengths(df_train["clickout_id"]),
    # sample_weight=np.where(df_train["clickout_step_rev"] == 1, 2, 1),
    verbose=True,
    eval_set=[(mat_val, df_val["was_clicked"])],
    eval_group=[group_lengths(df_val["clickout_id"])],
    eval_metric=mrr_metric,
)

df_train["click_proba"] = model.predict(mat_train)
df_val["click_proba"] = model.predict(mat_val)
print(mrr_fast(df_val, "click_proba"))
print("By rank")
for n in range(1, 10):
    print(n, mrr_fast(df_val[df_val["clickout_step_rev"] == n], "click_proba"))
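# To turn the per-row click probabilities into a ranked item list per clickout,
# one could sort within groups. Sketch only - assumes df_val also has an
# `item_id` column, which is not shown in this snippet.
recs = (
    df_val.sort_values(["clickout_id", "click_proba"], ascending=[True, False])
    .groupby("clickout_id")["item_id"]
    .apply(list)
    .reset_index()
    .rename(columns={"item_id": "item_recommendations"})
)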