# Example #1
# 0
    def fit_and_predict(self, df_train, df_val, validate=False):
        """Vectorize both frames, fit the model on train, and score validation.

        df_train / df_val: dataframes with at least "was_clicked" and
            "clickout_id" columns; fed to self.vectorizer for feature extraction.
        validate: when True, also score the train matrix and call
            self.evaluate(df_train, df_val, train_pred, val_pred).

        Returns the validation predictions (positive-class probabilities when
        self.is_prob is set, raw model output otherwise). Predictions are
        always persisted via self.save_predictions.
        """
        with timer("vectorizing train"):
            mat_train = self.vectorizer.fit_transform(df_train)
            print("Train shape", mat_train.shape)
        # typo fixed: label previously read "vectorinzg val"
        with timer("vectorizing val"):
            mat_val = self.vectorizer.transform(df_val)
            print("Val shape", mat_val.shape)

        with timer("fitting model"):
            if isinstance(self.model, LGBMRanker):
                # rankers need per-query group sizes (one group per clickout)
                self.model.fit(
                    mat_train,
                    df_train["was_clicked"].values,
                    group=group_lengths(df_train["clickout_id"].values),
                )
            else:
                self.model.fit(mat_train, df_train["was_clicked"].values)

        # Pick the scoring function once instead of duplicating both branches.
        if self.is_prob:
            def _score(mat):
                # probability of the positive class
                return self.model.predict_proba(mat)[:, 1]
        else:
            print("Predicting validation")
            _score = self.model.predict

        val_pred = _score(mat_val)
        if validate:
            if not self.is_prob:
                print("Predicting train")
            train_pred = _score(mat_train)
            self.evaluate(df_train, df_val, train_pred, val_pred)

        self.save_predictions(df_val, val_pred, validate)
        return val_pred
# Example #2
# 0
def run_model(mat_path, meta_path, model_instance, predictions_path, model_path, val, logger):
    """Train a ranker on a sparse feature matrix and write scored predictions.

    mat_path: HDF5 file holding an h5sparse "matrix" dataset, row-aligned with meta.
    meta_path: HDF5 file (key "data") with is_val / is_test / was_clicked /
        clickout_id columns.
    model_instance: estimator supporting fit(X, y, group=...) and predict(X).
    predictions_path: CSV output path for the scored held-out rows.
    model_path: destination for the fitted model (joblib.dump).
    val: True -> train on non-val/non-test rows and report AUC/MRR on is_val==1;
         False -> train on all non-test rows and score the test rows.
    logger: logger used for shape/metric reporting.
    """
    with timer("read data"):
        meta = pd.read_hdf(meta_path, key="data")
        mat = h5sparse.File(mat_path, mode="r")["matrix"]

    with timer("split data"):
        if val:
            train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0]
            val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
        else:
            train_ind = np.where(meta.is_test == 0)[0]
            val_ind = np.where(meta.is_test == 1)[0]

        logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
        meta_train = meta.iloc[train_ind]
        # .copy() so the click_proba assignment below writes to an owned frame
        # instead of a view of meta (avoids pandas chained-assignment warning).
        meta_val = meta.iloc[val_ind].copy()
        # NOTE(review): min()..max()+1 slicing assumes each split occupies a
        # contiguous row range in the matrix — confirm rows are ordered that way.
        X_train = mat[train_ind.min() : (train_ind.max() + 1)]
        X_val = mat[val_ind.min() : (val_ind.max() + 1)]
        del mat
        gc.collect()

    with timer("fit model"):
        model_instance.fit(
            X_train, meta_train["was_clicked"].values, group=group_lengths(meta_train["clickout_id"].values)
        )
        joblib.dump(model_instance, model_path)
        val_pred = model_instance.predict(X_val)
        train_pred = model_instance.predict(X_train)
        logger.info("Train AUC {:.4f}".format(roc_auc_score(meta_train["was_clicked"].values, train_pred)))
        meta_val["click_proba"] = val_pred
        if val:
            logger.info("Val AUC {:.4f}".format(roc_auc_score(meta_val["was_clicked"].values, val_pred)))
            logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
        meta_val.to_csv(predictions_path, index=False)
# Example #3
# 0
 def load_train_val(self, n_users, n_debug=None):
     """Read the clickout CSV and split it into (df_train, df_val) on is_val.

     n_users: if truthy, randomly sample that many non-test users and keep
         only their rows, plus every is_val==1 row (the frozen validation
         clickouts).
     n_debug: if truthy, read only the first n_debug rows; in that mode both
         memory reduction and user sampling are skipped.
     """
     with timer("Reading training data"):
         if n_debug:
             data = pd.read_csv(self.datapath, nrows=n_debug)
         else:
             data = pd.read_csv(self.datapath)
             if self.reduce_df_memory:
                 data = reduce_mem_usage(data)
             if n_users:
                 candidates = data[data["is_test"] == 0].user_id.unique()
                 sampled = set(np.random.choice(candidates, n_users, replace=False))
                 # select a frozen set of users' clickouts for validation
                 data = data[data.user_id.isin(sampled) | (data.is_val == 1)]
         print("Training on {} users".format(data["user_id"].nunique()))
         print("Training data shape", data.shape)
     with timer("splitting timebased"):
         df_train = data[data["is_val"] == 0]
         df_val = data[data["is_val"] == 1]
         print("df_train shape", df_train.shape)
         print("df_val shape", df_val.shape)
     return df_train, df_val
# Example #4
# 0
 def load_train_test(self, n_users):
     """Read the full dataset and return (train_frame, df_test) split on is_test.

     n_users: if truthy, restrict the returned train frame to a random sample
         of that many non-test users (users appearing in the test set are
         always retained) and drop test rows from it; if falsy, the train
         frame is the whole dataset, test rows included.
     """
     with timer("Reading training and testing data"):
         frame = pd.read_csv(self.datapath)
         if self.reduce_df_memory:
             frame = reduce_mem_usage(frame)
         df_test = frame[frame["is_test"] == 1]
         if n_users:
             pool = frame[frame["is_test"] == 0].user_id.unique()
             keep = set(np.random.choice(pool, n_users, replace=False))
             # always include all the users from the test set
             keep |= set(frame[frame["is_test"] == 1].user_id.unique())
             frame = frame[frame.user_id.isin(keep) & (frame.is_test == 0)]
         print("Training on {} users".format(frame["user_id"].nunique()))
         print("Training data shape", frame.shape)
     return frame, df_test
import sys

sys.path.append('/home/janice/el/recsys2019/src')

import numpy as np
import pandas as pd
from lightgbm import LGBMRanker
from sklearn.metrics import roc_auc_score

from recsys.config import BEST_PARAMS
from recsys.log_utils import get_logger
from recsys.metric import mrr_fast
from recsys.utils import group_lengths, timer, get_git_hash

logger = get_logger()

# typo fixed: previously read "Staring validation"
print("Starting validation")

# Load the precomputed sparse feature matrix and its row-aligned metadata.
with timer("reading data"):
    meta = pd.read_hdf("../../data/proc/vectorizer_1/meta.h5", key="data")
    mat = h5sparse.File("../../data/proc/vectorizer_1/Xcsr.h5", mode="r")["matrix"]

with timer("splitting data"):
    # Fixed cut: the first 4M non-val/non-test rows train, the remainder validates.
    split_idx = 4000000
    train_ind = np.where((meta.is_val == 0)
                         & (meta.is_test == 0))[0][:split_idx]
    # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
    # NOTE(review): 4868466 looks like a hard-coded total row count — confirm it
    # equals len(meta). val_ind spans ALL rows after split_idx, not just
    # is_val==1 rows (the original selector is the commented-out line above).
    val_ind = np.arange(split_idx, 4868466)
    # Prints the index arrays themselves (numpy truncates large arrays);
    # the sizes are logged on the next line.
    print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))
    logger.info(
        f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
    meta_train = meta.iloc[train_ind]
    meta_val = meta.iloc[val_ind]