# Example 1 ("Beispiel #1", scraped vote count: 0)
def run_model(mat_path, meta_path, model_instance, predictions_path, model_path, val, logger):
    """Train `model_instance` on a sparse feature matrix, score the held-out rows,
    and persist both the fitted model and the per-row predictions.

    Parameters
    ----------
    mat_path : path to an h5sparse file holding the feature matrix under key "matrix"
    meta_path : path to an HDF store (key "data") with row metadata aligned to the matrix
    model_instance : ranker exposing fit(X, y, group=...) / predict(X)
    predictions_path : CSV destination for the validation rows plus "click_proba"
    model_path : where the fitted model is dumped via joblib
    val : if truthy, evaluate on the is_val split; otherwise predict the is_test rows
    logger : logger used for shape/AUC/MRR reporting
    """
    with timer("read data"):
        meta = pd.read_hdf(meta_path, key="data")
        # Lazy h5sparse dataset; rows are only materialized when sliced below.
        mat = h5sparse.File(mat_path, mode="r")["matrix"]

    with timer("split data"):
        if val:
            # Train on rows that are neither validation nor test.
            train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0]
            val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
        else:
            # Final run: train on all non-test rows, "validate" on the test rows
            # (their labels may be meaningless — hence the `if val` guards below).
            train_ind = np.where(meta.is_test == 0)[0]
            val_ind = np.where(meta.is_test == 1)[0]

        logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
        meta_train = meta.iloc[train_ind]
        meta_val = meta.iloc[val_ind]
        # NOTE(review): slicing min()..max()+1 assumes each index set occupies one
        # contiguous block of rows; if splits interleave, extra rows leak in.
        # TODO confirm the row ordering guarantees contiguity.
        X_train = mat[train_ind.min() : (train_ind.max() + 1)]
        X_val = mat[val_ind.min() : (val_ind.max() + 1)]
        # Drop the matrix handle before fitting to reduce peak memory.
        del mat
        gc.collect()

    with timer("fit model"):
        # Learning-to-rank fit: `group` carries the per-clickout list lengths.
        model_instance.fit(
            X_train, meta_train["was_clicked"].values, group=group_lengths(meta_train["clickout_id"].values)
        )
        joblib.dump(model_instance, model_path)
        val_pred = model_instance.predict(X_val)
        train_pred = model_instance.predict(X_train)
        logger.info("Train AUC {:.4f}".format(roc_auc_score(meta_train["was_clicked"].values, train_pred)))
        if val:
            # Label-based metrics only make sense when true labels exist (val split).
            logger.info("Val AUC {:.4f}".format(roc_auc_score(meta_val["was_clicked"].values, val_pred)))
        meta_val["click_proba"] = val_pred
        if val:
            logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
        meta_val.to_csv(predictions_path, index=False)
# Example 2 ("Beispiel #2", scraped vote count: 0)
def read_prediction_val(fn):
    """Load one validation-predictions CSV and score it.

    Parameters
    ----------
    fn : path to a "...predictions.csv" file with user_id/session_id/step
         columns and a "click_proba" column

    Returns
    -------
    (mrr, predictions_df, config) where `config` is the text of the sibling
    ``config.json`` when it exists, otherwise the file name itself (so the
    caller can still identify the model).
    """
    p = pd.read_csv(fn)
    # Deterministic row order so the MRR is reproducible across runs.
    p.sort_values(["user_id", "session_id", "step"], inplace=True)
    p.reset_index(inplace=True, drop=True)
    mrr = mrr_fast(p, "click_proba")
    config_file = fn.replace("predictions.csv", "config.json")
    if os.path.exists(config_file) and config_file.endswith("config.json"):
        # Fix: the original `open(config_file).read()` never closed the
        # handle; use a context manager for deterministic cleanup.
        with open(config_file) as f:
            config = f.read()
    else:
        config = fn
    return mrr, p, config
# Example 3 ("Beispiel #3", scraped vote count: 0)
    def validate_models(self, n_users, n_debug=None):
        """Fit every model on the train split, then search for ensemble
        coefficients that maximize validation MRR.

        Parameters
        ----------
        n_users : sample size, passed through to ``load_train_val``
        n_debug : optional row cap for quick debugging runs
        """
        df_train, df_val = self.load_train_val(n_users, n_debug=n_debug)

        # One column of predictions per model -> shape (n_rows, n_models).
        preds_mat = np.vstack([model.fit_and_predict(df_train, df_val, validate=True) for model in self.models]).T

        def opt_coefs(coefs):
            # Negative MRR so scipy's minimizers effectively maximize MRR.
            preds = preds_mat.dot(coefs)
            df_val["preds"] = preds
            mrr = mrr_fast(df_val, "preds")
            print(mrr, coefs)
            return -mrr

        # Warm-start from each model's configured weight, then refine with Powell.
        best_coefs = fmin(opt_coefs, [model.weight for model in self.models])
        best_coefs = fmin_powell(opt_coefs, best_coefs)

        preds = preds_mat.dot(best_coefs)
        df_val["click_proba"] = preds
        # Fix: the original used "{:4f}" (field width 4 with the default 6-digit
        # precision); "{:.4f}" gives 4 decimal places, matching every other
        # MRR/AUC report in this file.
        print("MRR {:.4f}".format(mrr_fast(df_val, "click_proba")))
        print("Best coefs: ", best_coefs)
    # NOTE(review): this fragment sits inside an enclosing block that is not
    # visible here; `meta`, `mat`, `split_idx` and `logger` are defined upstream.
    # Train on the first `split_idx` rows that are neither validation nor test.
    train_ind = np.where((meta.is_val == 0)
                         & (meta.is_test == 0))[0][:split_idx]
    # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
    # Validation rows are taken positionally from split_idx up to a hard-coded
    # bound (4868466) — presumably the total row count; verify against the data.
    val_ind = np.arange(split_idx, 4868466)
    print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))
    logger.info(
        f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
    meta_train = meta.iloc[train_ind]
    meta_val = meta.iloc[val_ind]
    # Slicing min..max+1 assumes each index set is one contiguous range.
    X_train = mat[train_ind.min():(train_ind.max() + 1)]
    X_val = mat[val_ind.min():(val_ind.max() + 1)]
    # Drop the (possibly file-backed) matrix handle and reclaim memory.
    del mat
    gc.collect()

with timer("model fitting"):
    # Train an LGBMRanker with the tuned hyper-parameters; `group` carries the
    # per-clickout list lengths required by learning-to-rank objectives.
    model = LGBMRanker(**BEST_PARAMS)
    model.fit(X_train,
              meta_train["was_clicked"].values,
              group=group_lengths(meta_train["clickout_id"].values))
    val_pred = model.predict(X_val)
    train_pred = model.predict(X_train)
    logger.info("Train AUC {:.4f}".format(
        roc_auc_score(meta_train["was_clicked"].values, train_pred)))
    logger.info("Val AUC {:.4f}".format(
        roc_auc_score(meta_val["was_clicked"].values, val_pred)))
    meta_val["click_proba"] = val_pred
    logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
    # Tag the prediction file with the current git commit for traceability.
    githash = get_git_hash()
    meta_val.to_csv(f"predictions/model_val_{githash}.csv", index=False)
    joblib.dump(model, "model_val.joblib")
# Example 5 ("Beispiel #5", scraped vote count: 0)
 def opt_coefs(coefs):
     """Ensemble objective: blend per-model predictions with `coefs` and
     return negative validation MRR (so a minimizer maximizes MRR).

     NOTE(review): `preds_mat` and `df_val` are closed over from an
     enclosing scope that is not visible in this fragment.
     """
     preds = preds_mat.dot(coefs)
     df_val["preds"] = preds
     mrr = mrr_fast(df_val, "preds")
     print(mrr, coefs)
     return -mrr
# Example 6 ("Beispiel #6", scraped vote count: 0)
 def evaluate(self, df_train, df_val, train_pred, val_pred):
     """Report train/val AUC and validation MRR.

     Side effect: stores `val_pred` in df_val["click_proba"] so that
     `mrr_fast` can score the validation frame.
     """
     train_auc = roc_auc_score(df_train["was_clicked"].values, train_pred)
     print(f"Train AUC {train_auc:.4f}")
     val_auc = roc_auc_score(df_val["was_clicked"].values, val_pred)
     print(f"Val AUC {val_auc:.4f}")
     df_val["click_proba"] = val_pred
     val_mrr = mrr_fast(df_val, "click_proba")
     print(f"Val MRR {val_mrr:.4f}")
# Example 7 ("Beispiel #7", scraped vote count: 0)
    # NOTE(review): fragment of a larger function — `val_predictions`,
    # `preds_subs_all`, `val_hashes` and `read_prediction` come from outside
    # this view.
    # Start from the last validation frame; it supplies the ids/labels used
    # for scoring the blend.
    final = val_predictions[-1][2].copy()

    lengths = group_lengths(final["clickout_id"])
    # One column per model: stack the "click_proba" columns -> (n_rows, n_models).
    preds_stack = np.vstack([df["click_proba"] for _, _, df, _ in val_predictions]).T

    def opt(v):
        # Negative MRR of the blended predictions (minimize -> maximize MRR).
        preds_ens = preds_stack.dot(v)
        mrr = mrr_fast_v3(final["was_clicked"].values, preds_ens, lengths)
        print(f"MRR {mrr}")
        return -mrr

    # Coarse search starting from all-zero coefficients, then a second
    # pass with a tighter function tolerance.
    coefs = fmin(opt, [0] * preds_stack.shape[1])
    coefs = fmin(opt, coefs, ftol=0.000_001)

    final["click_proba"] = preds_stack.dot(coefs)
    mrr = mrr_fast(final, "click_proba")
    # e.g. 0.6789 -> "6789": digits after the decimal point, used in file names.
    mrr_str = f"{mrr:.4f}"[2:]
    print(mrr)

    mrrs, _, _, configs = list(zip(*val_predictions))
    summary_df = pd.DataFrame({"config": configs, "mrr": mrrs, "coef": coefs})
    print(summary_df)
    summary_df.to_csv(f"model_summary_{mrr_str}.csv")

    # read submission models
    with Pool(32) as pool:
        sub_predictions_dfs = pool.map(read_prediction, [fn for _, fn in preds_subs_all])

    # Keep only submission models whose hash also appeared in validation,
    # then report each one's coefficient and prediction range.
    sub_predictions = [(hsh, df) for ((hsh, fn), df) in zip(preds_subs_all, sub_predictions_dfs) if hsh in val_hashes]
    for coef, (hsh, df) in zip(coefs, sub_predictions):
        print(coef, hsh, df["click_proba"].min(), df["click_proba"].max())
# Example 8 ("Beispiel #8", scraped vote count: 0)

def mrr_metric(train_data, preds):
    """Custom eval metric returning the ("error", value, higher_is_better) triple
    expected by LightGBM's `eval_metric` hook.

    NOTE(review): group ids come from the module-level `df_val`, so this metric
    is only meaningful when evaluated on the validation set; also confirm the
    (train_data, preds) argument order matches the callback contract in use.
    """
    mrr = mrr_fast_v2(train_data, preds, df_val["clickout_id"].values)
    return "error", mrr, True


# Tuned LGBMRanker; n_jobs=-2 leaves one CPU core free for the rest of the box.
model = LGBMRanker(learning_rate=0.05,
                   n_estimators=900,
                   min_child_samples=5,
                   min_child_weight=0.00001,
                   n_jobs=-2)
# Ranker fit: `group`/`eval_group` carry the per-clickout list lengths, and
# validation progress is scored with the custom `mrr_metric` callback.
model.fit(
    mat_train,
    df_train["was_clicked"],
    group=group_lengths(df_train["clickout_id"]),
    # sample_weight=np.where(df_train["clickout_step_rev"]==1,2,1),
    verbose=True,
    eval_set=[(mat_val, df_val["was_clicked"])],
    eval_group=[group_lengths(df_val["clickout_id"])],
    eval_metric=mrr_metric,
)

df_train["click_proba"] = model.predict(mat_train)
df_val["click_proba"] = model.predict(mat_val)

print(mrr_fast(df_val, "click_proba"))
print("By rank")
# MRR broken down by reverse clickout step (1 = last clickout in the session).
for n in range(1, 10):
    print(n, mrr_fast(df_val[df_val["clickout_step_rev"] == n], "click_proba"))