Ejemplo n.º 1
0
def test_user_level_split_val_fixed_n() -> None:
    """Check the split when validation users hold out a fixed item count.

    The split is requested with 30 validation and 30 test users,
    ``n_heldout_val=1`` and a test held-out ratio of 0.3.  The test then
    verifies that:

    * the returned item list has one entry per distinct ``movieId``;
    * building or concatenating pairs with mismatched shapes raises
      ``ValueError``;
    * ``train.concat(val)`` keeps train rows free of test interactions and
      stacks ``train.X_all`` on top of ``val.X_train``;
    * no validation user has a held-out row sum above 1;
    * for sampled test users, learn/predict interactions are disjoint and
      the held-out share is within one interaction of the requested ratio.
    """
    ratio = 0.3
    dataset, mid_list = split_dataframe_partial_user_holdout(
        df,
        user_column="userId",
        item_column="movieId",
        n_test_user=30,
        n_val_user=30,
        n_heldout_val=1,
        heldout_ratio_test=ratio,
    )
    n_distinct_items = len(set(df.movieId))
    assert len(mid_list) == n_distinct_items

    train = dataset["train"]

    # A pair whose matrix lacks the last item column cannot be concatenated
    # with the original pair.
    narrower_pair = UserTrainTestInteractionPair(
        train.user_ids, train.X_train[:, :-1], None
    )
    with pytest.raises(ValueError):
        narrower_pair.concat(train)

    # An X_test with one row fewer than X_train is rejected at construction.
    with pytest.raises(ValueError):
        UserTrainTestInteractionPair(
            train.user_ids, train.X_train, train.X_train[1:]
        )

    val = dataset["val"]
    test = dataset["test"]

    # Train users hold out nothing.
    assert train.X_test.count_nonzero() == 0

    train_val = train.concat(val)
    n_train_users = train.n_users
    train_part_test = train_val.X_test[:n_train_users]
    val_part_test = train_val.X_test[n_train_users:]
    assert train_part_test.count_nonzero() == 0
    assert (val_part_test - val.X_test).count_nonzero() == 0

    expected_X_train = sps.vstack([train.X_all, val.X_train])
    assert (train_val.X_train - expected_X_train).count_nonzero() == 0

    # Row sums of the validation hold-out matrix never exceed 1.
    heldout_per_val_user = val.X_test.sum(axis=1).A1
    assert np.all(heldout_per_val_user <= 1)

    learn = test.X_train
    heldout = test.X_test
    assert heldout is not None
    # The element-wise product is non-zero only where both matrices are.
    assert learn.multiply(heldout).count_nonzero() == 0

    for row in RNS.choice(np.arange(test.n_users), size=10):
        n_learn = learn[row].nonzero()[1].shape[0]
        n_heldout = heldout[row].nonzero()[1].shape[0]
        n_total = n_learn + n_heldout
        # The realised share may be off by at most one interaction.
        assert ratio >= (n_heldout - 1) / n_total
        assert ratio <= (n_heldout + 1) / n_total
Ejemplo n.º 2
0
def test_extreme_case() -> None:
    """Check the split when every user is assigned to validation.

    With ``val_user_ratio=1.0`` and ``test_user_ratio=0`` the train and test
    parts must contain no users, and the validation part must contain every
    distinct ``userId`` with as many stored interactions as ``df`` has rows.
    """
    dataset, mid_list = split_dataframe_partial_user_holdout(
        df,
        user_column="userId",
        item_column="movieId",
        n_heldout_val=1,
        val_user_ratio=1.0,
        test_user_ratio=0,
        heldout_ratio_test=0.3,
    )
    # One item id per distinct movie.
    assert len(mid_list) == len(set(df.movieId))

    train_part = dataset["train"]
    test_part = dataset["test"]
    val_part = dataset["val"]

    assert train_part.n_users == 0
    assert test_part.n_users == 0

    n_distinct_users = len(set(df.userId))
    assert val_part.n_users == n_distinct_users
    # Every row of df appears as a stored entry of the validation matrix.
    assert val_part.X_all.nnz == df.shape[0]
def test_user_level_split() -> None:
    """Check a ratio-based partial user hold-out split of ``df``.

    The split is requested with 30 validation and 30 test users and held-out
    ratios of 0.3 (validation) and 0.5 (test).  The test verifies that:

    * building or concatenating pairs with mismatched shapes raises
      ``ValueError``;
    * train users hold out nothing;
    * ``train.concat(val)`` keeps train rows free of test interactions and
      stacks ``train.X_all`` on top of ``val.X_train``;
    * for sampled validation/test users, learn and predict interactions are
      disjoint and the held-out share is within one interaction of the
      requested ratio.
    """
    # The item-id list is not inspected here, so it is discarded explicitly.
    dataset, _ = split_dataframe_partial_user_holdout(
        df,
        user_column="userId",
        item_column="movieId",
        n_test_user=30,
        n_val_user=30,
        heldout_ratio_val=0.3,
        heldout_ratio_test=0.5,
    )
    train = dataset["train"]

    # A pair whose matrix lacks the last item column cannot be concatenated
    # with the original pair.
    train_invalid = UserTrainTestInteractionPair(
        train.user_ids, train.X_train[:, :-1], None
    )
    with pytest.raises(ValueError):
        train_invalid.concat(train)

    # An X_test with one row fewer than X_train is rejected at construction.
    # The instance is never used, so it is bound to `_` as in the sibling
    # tests.
    with pytest.raises(ValueError):
        _ = UserTrainTestInteractionPair(
            train.user_ids, train.X_train, train.X_train[1:]
        )

    val = dataset["val"]
    test = dataset["test"]
    assert train.X_test.count_nonzero() == 0
    train_val = train.concat(val)
    assert train_val.X_test[: train.n_users].count_nonzero() == 0
    assert (train_val.X_test[train.n_users :] - val.X_test).count_nonzero() == 0

    assert (
        train_val.X_train - sps.vstack([train.X_all, val.X_train])
    ).count_nonzero() == 0

    for user_data, ratio in [(val, 0.3), (test, 0.5)]:
        X_learn = user_data.X_train
        X_predict = user_data.X_test
        assert X_predict is not None
        # The element-wise product is non-zero only where both matrices are.
        intersect = X_learn.multiply(X_predict)
        assert intersect.count_nonzero() == 0
        index = RNS.choice(np.arange(user_data.n_users), size=10)
        for i in index:
            nnz_learn = X_learn[i].nonzero()[1].shape[0]
            nnz_predict = X_predict[i].nonzero()[1].shape[0]
            nnz_total = nnz_learn + nnz_predict
            # The realised share may be off by at most one interaction.
            assert ratio >= (nnz_predict - 1) / nnz_total
            assert ratio <= (nnz_predict + 1) / nnz_total
Ejemplo n.º 4
0
# Pin both thread-count environment variables to 8 before the work below
# runs.  NOTE(review): presumably read by OpenMP and by irspack respectively
# — confirm against their documentation.
os.environ["OMP_NUM_THREADS"] = "8"
os.environ["IRSPACK_NUM_THREADS_DEFAULT"] = "8"

if __name__ == "__main__":

    # NOTE(review): presumably the ranking cutoff handed to the evaluators
    # constructed below — its use is not visible in this excerpt.
    BASE_CUTOFF = 20

    # Load the MovieLens 1M interactions as a DataFrame.
    data_manager = MovieLens1MDataManager()
    df_all = data_manager.read_interaction()

    # User-level split: user ratios of 0.2 each for test and validation, and
    # a held-out ratio of 0.5 for both groups.  The second return value is
    # discarded.
    data_all, _ = split_dataframe_partial_user_holdout(
        df_all,
        "userId",
        "movieId",
        test_user_ratio=0.2,
        val_user_ratio=0.2,
        heldout_ratio_test=0.5,
        heldout_ratio_val=0.5,
    )

    data_train = data_all["train"]
    data_val = data_all["val"]
    data_test = data_all["test"]

    # Row-wise stack of the X_train matrices of all three user groups.
    X_train_all: sps.csr_matrix = sps.vstack(
        [data_train.X_train, data_val.X_train, data_test.X_train],
        format="csr")
    # Same stack, but train and validation users contribute X_all; test
    # users still contribute only X_train.
    X_train_val_all: sps.csr_matrix = sps.vstack(
        [data_train.X_all, data_val.X_all, data_test.X_train], format="csr")
    valid_evaluator = Evaluator(
Ejemplo n.º 5
0
    # Cutoff value passed as `cutoff=` to the evaluator constructed below.
    BASE_CUTOFF = 100

    # We follow the preprocessing of the Mult-VAE implementation
    # (https://github.com/dawenl/vae_cf).
    data_manager = MovieLens20MDataManager()
    df_all = data_manager.read_interaction()
    # Keep only interactions with a rating of at least 4.
    df_all = df_all[df_all.rating >= 4]
    # Keep only users that still have at least 5 interactions after the
    # rating filter.
    user_cnt = df_all.userId.value_counts()
    user_cnt = user_cnt[user_cnt >= 5]
    df_all = df_all[df_all.userId.isin(user_cnt.index)]

    # User-level split with fixed counts: 10000 validation and 10000 test
    # users, and a held-out ratio of 0.2 for both groups.  The second return
    # value is discarded.
    data_all, _ = split_dataframe_partial_user_holdout(
        df_all,
        "userId",
        "movieId",
        n_test_user=10000,
        n_val_user=10000,
        heldout_ratio_val=0.2,
        heldout_ratio_test=0.2,
    )

    data_train = data_all["train"]
    data_val = data_all["val"]
    data_test = data_all["test"]

    # Row-wise stack of the X_all matrices of train and validation users;
    # test users are not included.
    X_train_val_all: sps.csr_matrix = sps.vstack(
        [data_train.X_all, data_val.X_all], format="csr")
    valid_evaluator = EvaluatorWithColdUser(
        input_interaction=data_val.X_train,
        ground_truth=data_val.X_test,
        cutoff=BASE_CUTOFF,
Ejemplo n.º 6
0
def test_user_level_split(
    n_val_user: Optional[int],
    n_test_user: Optional[int],
    val_user_ratio: float,
    test_user_ratio: float,
    time_colname: Optional[str],
) -> None:
    """Check the partial user hold-out split for one parameter combination.

    Args:
        n_val_user: Explicit number of validation users, or ``None`` to
            derive it from ``val_user_ratio``.
        n_test_user: Explicit number of test users, or ``None`` to derive it
            from ``test_user_ratio``.
        val_user_ratio: Validation user share, used for the expected count
            when ``n_val_user`` is ``None``.
        test_user_ratio: Test user share, used for the expected count when
            ``n_test_user`` is ``None``.
        time_colname: Time column forwarded to the split; when not ``None``
            the chronological ordering of the hold-out is checked as well.

    The held-out ratios are fixed at 0.3 (validation) and 0.5 (test).
    """
    total_users = len(set(df.userId))
    dataset, mid_list = split_dataframe_partial_user_holdout(
        df,
        user_column="userId",
        item_column="movieId",
        time_column=time_colname,
        val_user_ratio=val_user_ratio,
        test_user_ratio=test_user_ratio,
        n_val_user=n_val_user,
        n_test_user=n_test_user,
        heldout_ratio_val=0.3,
        heldout_ratio_test=0.5,
    )
    assert len(mid_list) == len(set(df.movieId))

    train = dataset["train"]

    # A pair whose matrix lacks the last item column cannot be concatenated
    # with the original pair.
    narrower_pair = UserTrainTestInteractionPair(
        train.user_ids, train.X_train[:, :-1], None
    )
    with pytest.raises(ValueError):
        narrower_pair.concat(train)

    # An X_test with one row fewer than X_train is rejected.
    with pytest.raises(ValueError):
        UserTrainTestInteractionPair(
            train.user_ids, train.X_train, train.X_train[1:]
        )

    # An item-id list that is one entry short is rejected.
    with pytest.raises(ValueError):
        UserTrainTestInteractionPair(
            train.user_ids, train.X_train, train.X_train, mid_list[:-1]
        )

    def expected_user_count(share: float, fixed: Optional[int]) -> int:
        """Return ``fixed`` when given, else the truncated share of all users."""
        return fixed if fixed is not None else int(total_users * share)

    val = dataset["val"]
    assert val.n_users == expected_user_count(val_user_ratio, n_val_user)
    test = dataset["test"]
    assert test.n_users == expected_user_count(test_user_ratio, n_test_user)

    if time_colname is not None:
        # Timestamp lookup keyed by the column names used by df_train() and
        # df_test().
        timestamps = df[["userId", "movieId", "timestamp"]].rename(
            columns={"userId": "user_id", "movieId": "item_id"}
        )
        for pair in (val, test):
            learn_times = pair.df_train().merge(timestamps)
            heldout_times = pair.df_test().merge(timestamps)
            last_learn = learn_times.groupby("user_id").timestamp.max()
            first_heldout = heldout_times.groupby("user_id").timestamp.min()
            shared_users = np.intersect1d(last_learn.index, first_heldout.index)
            assert shared_users.shape[0] > 0
            # No learn interaction is later than the earliest held-out one.
            assert np.all(
                last_learn.reindex(shared_users)
                <= first_heldout.reindex(shared_users)
            )

    # Train users hold out nothing.
    assert train.X_test.count_nonzero() == 0

    train_val = train.concat(val)
    n_train_users = train.n_users
    assert train_val.X_test[:n_train_users].count_nonzero() == 0
    assert (train_val.X_test[n_train_users:] - val.X_test).count_nonzero() == 0

    expected_X_train = sps.vstack([train.X_all, val.X_train])
    assert (train_val.X_train - expected_X_train).count_nonzero() == 0

    for pair, ratio in ((val, 0.3), (test, 0.5)):
        learn = pair.X_train
        heldout = pair.X_test
        assert heldout is not None
        # The element-wise product is non-zero only where both matrices are.
        assert learn.multiply(heldout).count_nonzero() == 0

        for row in RNS.choice(np.arange(pair.n_users), size=10):
            n_learn = learn[row].nonzero()[1].shape[0]
            n_heldout = heldout[row].nonzero()[1].shape[0]
            n_total = n_learn + n_heldout
            # The realised share may be off by at most one interaction.
            assert ratio >= (n_heldout - 1) / n_total
            assert ratio <= (n_heldout + 1) / n_total