def test_user_level_split_val_fixed_n() -> None:
    """Split with a fixed number of held-out validation interactions (n_heldout_val=1).

    Exercises invalid-argument handling of ``UserTrainTestInteractionPair``,
    consistency of ``concat``, the at-most-one-interaction cap on the
    validation ground truth, and the test-split holdout ratio.
    """
    test_holdout_ratio = 0.3
    dataset, item_id_list = split_dataframe_partial_user_holdout(
        df,
        user_column="userId",
        item_column="movieId",
        n_test_user=30,
        n_val_user=30,
        n_heldout_val=1,
        heldout_ratio_test=test_holdout_ratio,
    )
    # Every unique item must appear in the returned item-id list.
    assert len(item_id_list) == len(set(df.movieId))

    train = dataset["train"]

    # A pair whose train matrix has a different column count cannot be
    # concatenated with another pair.
    broken_pair = UserTrainTestInteractionPair(
        train.user_ids, train.X_train[:, :-1], None
    )
    with pytest.raises(ValueError):
        broken_pair.concat(train)

    # Mismatched user (row) dimension between train and test matrices is rejected.
    with pytest.raises(ValueError):
        UserTrainTestInteractionPair(
            train.user_ids, train.X_train, train.X_train[1:]
        )

    val = dataset["val"]
    test = dataset["test"]

    # Train-only users have no held-out interactions.
    assert train.X_test.count_nonzero() == 0

    merged = train.concat(val)
    # The first rows come from train (empty ground truth) ...
    assert merged.X_test[: train.n_users].count_nonzero() == 0
    # ... and the remaining rows reproduce the validation ground truth.
    assert (merged.X_test[train.n_users :] - val.X_test).count_nonzero() == 0
    assert (
        merged.X_train - sps.vstack([train.X_all, val.X_train])
    ).count_nonzero() == 0

    # n_heldout_val=1: each validation user holds out at most one interaction.
    assert np.all(val.X_test.sum(axis=1).A1 <= 1)

    learn_matrix = test.X_train
    predict_matrix = test.X_test
    assert predict_matrix is not None
    # Learn and predict interactions must be disjoint.
    assert learn_matrix.multiply(predict_matrix).count_nonzero() == 0

    # Spot-check the holdout ratio for a random sample of test users.
    for user_index in RNS.choice(np.arange(test.n_users), size=10):
        nnz_learn = learn_matrix[user_index].nonzero()[1].shape[0]
        nnz_predict = predict_matrix[user_index].nonzero()[1].shape[0]
        total = nnz_learn + nnz_predict
        # The realized ratio may deviate from the target by at most one interaction.
        assert test_holdout_ratio >= (nnz_predict - 1) / total
        assert test_holdout_ratio <= (nnz_predict + 1) / total
def test_extreme_case() -> None:
    """Degenerate split: every user goes to validation, none to train or test."""
    holdout_ratio = 0.3
    dataset, item_id_list = split_dataframe_partial_user_holdout(
        df,
        user_column="userId",
        item_column="movieId",
        n_heldout_val=1,
        val_user_ratio=1.0,
        test_user_ratio=0,
        heldout_ratio_test=holdout_ratio,
    )
    # All unique items are still reported.
    assert len(item_id_list) == len(set(df.movieId))
    # With val_user_ratio=1.0 and test_user_ratio=0, train and test are empty ...
    assert dataset["train"].n_users == 0
    assert dataset["test"].n_users == 0
    # ... and validation absorbs every user and every interaction.
    assert dataset["val"].n_users == len(set(df.userId))
    assert dataset["val"].X_all.nnz == df.shape[0]
def test_user_level_split() -> None:
    """Fixed-count user split with ratio holdout: 0.3 for val, 0.5 for test."""
    dataset, _ = split_dataframe_partial_user_holdout(
        df,
        user_column="userId",
        item_column="movieId",
        n_test_user=30,
        n_val_user=30,
        heldout_ratio_val=0.3,
        heldout_ratio_test=0.5,
    )
    train = dataset["train"]

    # Column-count mismatch makes concat fail.
    broken_pair = UserTrainTestInteractionPair(
        train.user_ids, train.X_train[:, :-1], None
    )
    with pytest.raises(ValueError):
        broken_pair.concat(train)
    # Row-count mismatch between train and test matrices is rejected.
    with pytest.raises(ValueError):
        UserTrainTestInteractionPair(
            train.user_ids, train.X_train, train.X_train[1:]
        )

    val = dataset["val"]
    test = dataset["test"]
    # Train-only users carry no held-out interactions.
    assert train.X_test.count_nonzero() == 0

    merged = train.concat(val)
    # Train rows keep an empty ground truth; val rows keep theirs.
    assert merged.X_test[: train.n_users].count_nonzero() == 0
    assert (merged.X_test[train.n_users :] - val.X_test).count_nonzero() == 0
    assert (
        merged.X_train - sps.vstack([train.X_all, val.X_train])
    ).count_nonzero() == 0

    for split, target_ratio in ((val, 0.3), (test, 0.5)):
        learn_matrix = split.X_train
        predict_matrix = split.X_test
        assert predict_matrix is not None
        # Learn and predict interactions must not overlap.
        assert learn_matrix.multiply(predict_matrix).count_nonzero() == 0
        # Spot-check the holdout ratio for a random sample of users.
        for user_index in RNS.choice(np.arange(split.n_users), size=10):
            nnz_learn = learn_matrix[user_index].nonzero()[1].shape[0]
            nnz_predict = predict_matrix[user_index].nonzero()[1].shape[0]
            total = nnz_learn + nnz_predict
            # Realized ratio may differ from the target by at most one interaction.
            assert target_ratio >= (nnz_predict - 1) / total
            assert target_ratio <= (nnz_predict + 1) / total
os.environ["OMP_NUM_THREADS"] = "8" os.environ["IRSPACK_NUM_THREADS_DEFAULT"] = "8" if __name__ == "__main__": BASE_CUTOFF = 20 data_manager = MovieLens1MDataManager() df_all = data_manager.read_interaction() data_all, _ = split_dataframe_partial_user_holdout( df_all, "userId", "movieId", test_user_ratio=0.2, val_user_ratio=0.2, heldout_ratio_test=0.5, heldout_ratio_val=0.5, ) data_train = data_all["train"] data_val = data_all["val"] data_test = data_all["test"] X_train_all: sps.csr_matrix = sps.vstack( [data_train.X_train, data_val.X_train, data_test.X_train], format="csr") X_train_val_all: sps.csr_matrix = sps.vstack( [data_train.X_all, data_val.X_all, data_test.X_train], format="csr") valid_evaluator = Evaluator(
BASE_CUTOFF = 100 # We follow the preprocessing of Mult-VAE implementation (https://github.com/dawenl/vae_cf) data_manager = MovieLens20MDataManager() df_all = data_manager.read_interaction() df_all = df_all[df_all.rating >= 4] user_cnt = df_all.userId.value_counts() user_cnt = user_cnt[user_cnt >= 5] df_all = df_all[df_all.userId.isin(user_cnt.index)] data_all, _ = split_dataframe_partial_user_holdout( df_all, "userId", "movieId", n_test_user=10000, n_val_user=10000, heldout_ratio_val=0.2, heldout_ratio_test=0.2, ) data_train = data_all["train"] data_val = data_all["val"] data_test = data_all["test"] X_train_val_all: sps.csr_matrix = sps.vstack( [data_train.X_all, data_val.X_all], format="csr") valid_evaluator = EvaluatorWithColdUser( input_interaction=data_val.X_train, ground_truth=data_val.X_test, cutoff=BASE_CUTOFF,
def test_user_level_split(
    n_val_user: Optional[int],
    n_test_user: Optional[int],
    val_user_ratio: float,
    test_user_ratio: float,
    time_colname: Optional[str],
) -> None:
    """Parametrized user-level split.

    Covers count- vs ratio-based user selection, invalid-argument handling of
    ``UserTrainTestInteractionPair``, optional time-ordered holdout, ``concat``
    consistency, and per-user holdout ratios (0.3 val / 0.5 test).
    """
    total_user_count = len(set(df.userId))
    dataset, item_id_list = split_dataframe_partial_user_holdout(
        df,
        user_column="userId",
        item_column="movieId",
        time_column=time_colname,
        val_user_ratio=val_user_ratio,
        test_user_ratio=test_user_ratio,
        n_val_user=n_val_user,
        n_test_user=n_test_user,
        heldout_ratio_val=0.3,
        heldout_ratio_test=0.5,
    )
    # Every unique item must appear in the returned item-id list.
    assert len(item_id_list) == len(set(df.movieId))

    train = dataset["train"]
    # Column-count mismatch makes concat fail.
    broken_pair = UserTrainTestInteractionPair(
        train.user_ids, train.X_train[:, :-1], None
    )
    with pytest.raises(ValueError):
        broken_pair.concat(train)
    # Row-count mismatch between train and test matrices is rejected.
    with pytest.raises(ValueError):
        UserTrainTestInteractionPair(
            train.user_ids, train.X_train, train.X_train[1:]
        )
    # An item-id list shorter than the matrix width is also invalid.
    with pytest.raises(ValueError):
        UserTrainTestInteractionPair(
            train.user_ids, train.X_train, train.X_train, item_id_list[:-1]
        )

    def expected_user_count(ratio: float, fixed_n: Optional[int]) -> int:
        # An explicit user count takes precedence over the ratio.
        return fixed_n if fixed_n is not None else int(total_user_count * ratio)

    val = dataset["val"]
    assert val.n_users == expected_user_count(val_user_ratio, n_val_user)
    test = dataset["test"]
    assert test.n_users == expected_user_count(test_user_ratio, n_test_user)

    if time_colname is not None:
        # With a time column, each user's held-out interactions must be no
        # earlier than any of their training interactions.
        timestamped = df[["userId", "movieId", "timestamp"]].rename(
            columns={"userId": "user_id", "movieId": "item_id"}
        )
        for split in (val, test):
            learn_df = split.df_train().merge(timestamped)
            heldout_df = split.df_test().merge(timestamped)
            latest_learn = learn_df.groupby("user_id").timestamp.max()
            earliest_heldout = heldout_df.groupby("user_id").timestamp.min()
            shared_users = np.intersect1d(
                latest_learn.index, earliest_heldout.index
            )
            assert shared_users.shape[0] > 0
            assert np.all(
                latest_learn.reindex(shared_users)
                <= earliest_heldout.reindex(shared_users)
            )

    # Train-only users carry no held-out interactions.
    assert train.X_test.count_nonzero() == 0
    merged = train.concat(val)
    # Train rows keep an empty ground truth; val rows keep theirs.
    assert merged.X_test[: train.n_users].count_nonzero() == 0
    assert (merged.X_test[train.n_users :] - val.X_test).count_nonzero() == 0
    assert (
        merged.X_train - sps.vstack([train.X_all, val.X_train])
    ).count_nonzero() == 0

    for split, target_ratio in ((val, 0.3), (test, 0.5)):
        learn_matrix = split.X_train
        predict_matrix = split.X_test
        assert predict_matrix is not None
        # Learn and predict interactions must not overlap.
        assert learn_matrix.multiply(predict_matrix).count_nonzero() == 0
        # Spot-check the holdout ratio for a random sample of users.
        for user_index in RNS.choice(np.arange(split.n_users), size=10):
            nnz_learn = learn_matrix[user_index].nonzero()[1].shape[0]
            nnz_predict = predict_matrix[user_index].nonzero()[1].shape[0]
            total = nnz_learn + nnz_predict
            # Realized ratio may differ from the target by at most one interaction.
            assert target_ratio >= (nnz_predict - 1) / total
            assert target_ratio <= (nnz_predict + 1) / total