def fit_and_predict(self, df_train, df_val, validate=False):
    """Vectorize train/val frames, fit the model, and return validation predictions.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Must contain "was_clicked" and "clickout_id" columns.
    validate : bool
        When True, also score the training set and run self.evaluate.

    Returns
    -------
    numpy.ndarray of per-row scores for df_val (probabilities when
    self.is_prob, raw model scores otherwise).
    """
    with timer("vectorizing train"):
        mat_train = self.vectorizer.fit_transform(df_train)
        print("Train shape", mat_train.shape)
    # Fixed typo in the timer label ("vectorinzg" -> "vectorizing")
    with timer("vectorizing val"):
        mat_val = self.vectorizer.transform(df_val)
        print("Val shape", mat_val.shape)
    with timer("fitting model"):
        if isinstance(self.model, LGBMRanker):
            # LGBMRanker needs per-query group sizes (rows grouped by clickout)
            self.model.fit(
                mat_train,
                df_train["was_clicked"].values,
                group=group_lengths(df_train["clickout_id"].values),
            )
        else:
            self.model.fit(mat_train, df_train["was_clicked"].values)
    if self.is_prob:
        # classifiers expose probabilities; take the positive-class column
        val_pred = self.model.predict_proba(mat_val)[:, 1]
        if validate:
            train_pred = self.model.predict_proba(mat_train)[:, 1]
            self.evaluate(df_train, df_val, train_pred, val_pred)
    else:
        print("Predicting validation")
        val_pred = self.model.predict(mat_val)
        if validate:
            print("Predicting train")
            train_pred = self.model.predict(mat_train)
            self.evaluate(df_train, df_val, train_pred, val_pred)
    self.save_predictions(df_val, val_pred, validate)
    return val_pred
def run_model(mat_path, meta_path, model_instance, predictions_path, model_path, val, logger):
    """Train a ranker on a precomputed sparse matrix, persist it, and write predictions.

    Parameters
    ----------
    mat_path : str
        Path to an h5sparse file holding the feature matrix under key "matrix".
    meta_path : str
        Path to an HDF5 file (key "data") with per-row metadata, including
        is_val, is_test, was_clicked and clickout_id columns.
    model_instance : fitted-able model supporting fit(X, y, group=...) and predict.
    predictions_path : str
        CSV output path for the validation/test rows with a click_proba column.
    model_path : str
        Where the fitted model is dumped via joblib.
    val : bool
        True -> evaluate on the held-out validation split; False -> predict the test split.
    logger : logging.Logger
    """
    with timer("read data"):
        meta = pd.read_hdf(meta_path, key="data")
        mat = h5sparse.File(mat_path, mode="r")["matrix"]
    with timer("split data"):
        if val:
            train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0]
            val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
        else:
            train_ind = np.where(meta.is_test == 0)[0]
            val_ind = np.where(meta.is_test == 1)[0]
        logger.info(f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
        meta_train = meta.iloc[train_ind]
        # .copy() so the later meta_val["click_proba"] = ... assignment writes to an
        # owned frame instead of a view (avoids SettingWithCopyWarning / lost writes)
        meta_val = meta.iloc[val_ind].copy()
        # NOTE(review): this slicing assumes each split occupies a contiguous row
        # range in the matrix — confirm the vectorizer writes rows in that order
        X_train = mat[train_ind.min() : (train_ind.max() + 1)]
        X_val = mat[val_ind.min() : (val_ind.max() + 1)]
        del mat
        gc.collect()
    with timer("fit model"):
        model_instance.fit(
            X_train,
            meta_train["was_clicked"].values,
            group=group_lengths(meta_train["clickout_id"].values),
        )
        joblib.dump(model_instance, model_path)
        val_pred = model_instance.predict(X_val)
        train_pred = model_instance.predict(X_train)
        logger.info("Train AUC {:.4f}".format(roc_auc_score(meta_train["was_clicked"].values, train_pred)))
        if val:
            logger.info("Val AUC {:.4f}".format(roc_auc_score(meta_val["was_clicked"].values, val_pred)))
        meta_val["click_proba"] = val_pred
        if val:
            logger.info("Val MRR {:.4f}".format(mrr_fast(meta_val, "click_proba")))
        meta_val.to_csv(predictions_path, index=False)
def load_train_val(self, n_users, n_debug=None):
    """Read the prepared CSV at self.datapath and return (df_train, df_val).

    Parameters
    ----------
    n_users : int or None
        When truthy, subsample this many distinct non-test users for training
        (validation rows are always kept in full).
    n_debug : int or None
        When truthy, read only the first n_debug rows of the CSV.
    """
    with timer("Reading training data"):
        read_kwargs = {"nrows": n_debug} if n_debug else {}
        df_all = pd.read_csv(self.datapath, **read_kwargs)
        if self.reduce_df_memory:
            df_all = reduce_mem_usage(df_all)
        if n_users:
            candidates = df_all[df_all["is_test"] == 0].user_id.unique()
            sampled = set(np.random.choice(candidates, n_users, replace=False))
            # keep the sampled users' rows plus the frozen validation clickouts
            keep_mask = df_all.user_id.isin(sampled) | (df_all.is_val == 1)
            df_all = df_all[keep_mask]
        print("Training on {} users".format(df_all["user_id"].nunique()))
        print("Training data shape", df_all.shape)
    with timer("splitting timebased"):
        df_train = df_all[df_all["is_val"] == 0]
        df_val = df_all[df_all["is_val"] == 1]
        print("df_train shape", df_train.shape)
        print("df_val shape", df_val.shape)
    return df_train, df_val
def load_train_test(self, n_users):
    """Read the prepared CSV at self.datapath and return (df_train, df_test).

    Parameters
    ----------
    n_users : int or None
        When truthy, subsample this many distinct non-test users for training;
        users appearing in the test split are always retained.
    """
    with timer("Reading training and testing data"):
        df_all = pd.read_csv(self.datapath)
        if self.reduce_df_memory:
            df_all = reduce_mem_usage(df_all)
        df_test = df_all[df_all["is_test"] == 1]
        if n_users:
            train_pool = df_all[df_all["is_test"] == 0].user_id.unique()
            keep_users = set(np.random.choice(train_pool, n_users, replace=False))
            # every user seen in the test split must survive the subsample
            keep_users.update(df_all[df_all["is_test"] == 1].user_id.unique())
            df_all = df_all[df_all.user_id.isin(keep_users) & (df_all.is_test == 0)]
        print("Training on {} users".format(df_all["user_id"].nunique()))
        print("Training data shape", df_all.shape)
    return df_all, df_test
# Validation script: loads the vectorized dataset and splits it time-wise.
import sys

# Missing in the original: pd / np / h5sparse were used below but never
# imported, which would raise NameError at runtime.
import h5sparse
import numpy as np
import pandas as pd
from lightgbm import LGBMRanker
from sklearn.metrics import roc_auc_score

# project sources must be on the path before the recsys imports below
sys.path.append('/home/janice/el/recsys2019/src')

from recsys.config import BEST_PARAMS
from recsys.log_utils import get_logger
from recsys.metric import mrr_fast
from recsys.utils import group_lengths, timer, get_git_hash

logger = get_logger()

# Fixed typo in the original message ("Staring" -> "Starting")
print("Starting validation")

with timer("reading data"):
    meta = pd.read_hdf("../../data/proc/vectorizer_1/meta.h5", key="data")
    mat = h5sparse.File("../../data/proc/vectorizer_1/Xcsr.h5", mode="r")["matrix"]

with timer("splitting data"):
    # time-ordered split: first split_idx eligible rows train, the rest validate
    split_idx = 4000000
    train_ind = np.where((meta.is_val == 0) & (meta.is_test == 0))[0][:split_idx]
    # val_ind = np.where((meta.is_val == 1) & (meta.is_test == 0))[0]
    # NOTE(review): 4868466 is a hard-coded total row count — confirm it matches meta
    val_ind = np.arange(split_idx, 4868466)
    print("train_ind: {} / val_ind: {}".format(train_ind, val_ind))
    logger.info(
        f"Train shape {train_ind.shape[0]} Val shape {val_ind.shape[0]}")
    meta_train = meta.iloc[train_ind]
    meta_val = meta.iloc[val_ind]