Beispiel #1
0
def split_train_test_userwise_random(
    df_: pd.DataFrame,
    user_colname: str,
    item_colname: str,
    item_ids: List[Any],
    heldout_ratio: float,
    n_heldout: Optional[int],
    rns: np.random.RandomState,
    rating_column: Optional[str] = None,
) -> UserTrainTestInteractionPair:
    """Turn an interaction data frame into a sparse matrix and split it row-wise.

    Rows of ``df_`` whose item does not appear in ``item_ids`` are discarded
    before the user x item matrix is assembled.

    Parameters
    ----------
    df_:
        Data frame of interactions. It must contain the columns named by
        ``user_colname`` and ``item_colname`` (and ``rating_column`` when given).
    user_colname:
        The column name for the users.
    item_colname:
        The column name for the items.
    item_ids:
        The item ids defining the matrix columns; the column index of an item
        is its position in this list.
    heldout_ratio:
        The ratio of items (per-user) to be held out as test (validation) ones.
        Forwarded to ``rowwise_train_test_split``.
    n_heldout:
        The maximal number of items (per-user) to be held out as test
        (validation) ones. Forwarded to ``rowwise_train_test_split``.
    rns:
        The random state. A single integer seed is drawn from it and handed
        to ``rowwise_train_test_split``.
    rating_column:
        The column holding the rating values. If None, every interaction gets
        the value 1, by default None.

    Returns
    -------
    UserTrainTestInteractionPair
        Built from the unique user ids, the learn matrix, the predict matrix
        (both converted to CSR) and ``item_ids``.
    """
    known = df_[df_[item_colname].isin(item_ids)]

    # Column index of each record: position of its item within ``item_ids``.
    col_idx = pd.Categorical(known[item_colname], categories=item_ids).codes
    # Row index of each record: position of its user among the unique user ids.
    user_ids, row_idx = np.unique(known[user_colname], return_inverse=True)

    if rating_column is None:
        values = np.ones(known.shape[0], dtype=np.int32)
    else:
        values = known[rating_column].values

    n_users, n_items = len(user_ids), len(item_ids)
    X_all = sps.csr_matrix((values, (row_idx, col_idx)), shape=(n_users, n_items))

    # Exactly one draw from ``rns`` seeds the downstream split.
    seed = rns.randint(-(2**31), 2**31 - 1)
    X_learn, X_predict = rowwise_train_test_split(
        X_all, heldout_ratio, n_heldout, random_seed=seed
    )

    return UserTrainTestInteractionPair(
        user_ids, X_learn.tocsr(), X_predict.tocsr(), item_ids
    )
Beispiel #2
0
def test_split() -> None:
    """Check that splitting ``X`` with ``test_ratio=0.5`` yields two disjoint parts summing to ``X``."""
    warnings.simplefilter("always")

    first_part, second_part = rowwise_train_test_split(
        X, test_ratio=0.5, random_seed=1
    )

    # Subtracting both parts from the original must leave nothing behind.
    residual = (X - first_part - second_part).toarray()
    assert np.all(residual == 0)

    # The parts should have no overlap: no entry is non-zero in both.
    overlap = first_part.multiply(second_part).toarray()
    assert np.all(overlap == 0)
Beispiel #3
0
def split_train_test_userwise_random(
    df_: pd.DataFrame,
    user_colname: str,
    item_colname: str,
    item_ids: Union[List[Any], np.ndarray],
    heldout_ratio: float,
    n_heldout: Optional[int],
    rns: np.random.RandomState,
    rating_column: Optional[str] = None,
    ceil_n_heldout: bool = False,
) -> UserTrainTestInteractionPair:
    r"""Convert an interaction data frame to a sparse matrix and split it row-wise.

    The conversion is delegated to ``df_to_sparse`` and the split to
    ``rowwise_train_test_split``.

    Parameters
    ----------
    df_:
        Data frame of user x item interactions.
    user_colname:
        The column name for the users.
    item_colname:
        The column name for the items.
    item_ids:
        The item ids, forwarded to ``df_to_sparse`` and stored in the result.
    heldout_ratio:
        The ratio of items (per-user) to be held out as test (validation) ones.
    n_heldout:
        The maximal number of items (per-user) to be held out as test
        (validation) ones.
    rns:
        The random state, passed on to the splitter as ``random_state``.
    rating_column:
        The column for the rating values, forwarded to ``df_to_sparse`` as
        ``rating_colname``. If None, the rating values will be all equal (1),
        by default None.
    ceil_n_heldout:
        If this is `True` and `n_heldout` is `None`, the number of test
        interactions for a given user `u` will be `ceil(N_u * heldout_ratio)`
        where `N_u` is the number of interactions of `u`.
        If this is `False`, `floor(N_u * heldout_ratio)` will be used instead.
        Defaults to `False`.

    Returns
    -------
    UserTrainTestInteractionPair
        Built from the user ids returned by ``df_to_sparse``, the learn and
        predict matrices (both converted to CSR) and ``item_ids``.
    """
    # The third value returned by ``df_to_sparse`` is not needed here.
    interactions, user_ids, _ = df_to_sparse(
        df_,
        user_colname=user_colname,
        item_colname=item_colname,
        item_ids=item_ids,
        rating_colname=rating_column,
    )

    learn_part, predict_part = rowwise_train_test_split(
        interactions,
        heldout_ratio,
        n_heldout,
        random_state=rns,
        ceil_n_heldout=ceil_n_heldout,
    )

    return UserTrainTestInteractionPair(
        user_ids, learn_part.tocsr(), predict_part.tocsr(), item_ids
    )
Beispiel #4
0
def test_split_fixed_n() -> None:
    """Check that ``n_test=1`` leaves at most one stored entry per row in the second part."""
    first_part, second_part = rowwise_train_test_split(
        X, test_ratio=0.5, n_test=1, random_seed=1
    )

    # The two parts must add back up to the original matrix.
    recombined = (first_part + second_part).toarray()
    np.testing.assert_allclose(X.toarray(), recombined)

    # The parts should have no overlap: no entry is non-zero in both.
    overlap = first_part.multiply(second_part).toarray()
    assert np.all(overlap == 0)

    # Overwrite stored values with 1 so that row sums count stored entries.
    second_part.data[:] = 1
    per_row_counts = second_part.sum(axis=1)
    assert per_row_counts.max() <= 1
Beispiel #5
0
def test_split() -> None:
    """Check that a ``test_ratio=0.5`` split is disjoint, complete and holds out at most half of each row."""
    first_part, second_part = rowwise_train_test_split(
        X, test_ratio=0.5, random_seed=1
    )

    # The two parts must add back up to the original matrix.
    recombined = (first_part + second_part).toarray()
    np.testing.assert_allclose(X.toarray(), recombined)

    # The parts should have no overlap: no entry is non-zero in both.
    overlap = first_part.multiply(second_part).toarray()
    assert np.all(overlap == 0)

    # Stored entries per row of X, read off consecutive indptr offsets.
    row_starts, row_ends = X.indptr[:-1], X.indptr[1:]
    stored_per_row = row_ends - row_starts

    # Count stored entries per row of the second part on a binarised copy,
    # leaving the split result itself untouched.
    second_binary = second_part.copy()
    second_binary.data[:] = 1.0
    heldout_per_row = second_binary.sum(axis=1).A1

    assert np.all(heldout_per_row <= (stored_per_row * 0.5))