Python numpy_stratified_split Examples

Programming Language: Python

Namespace/Package Name: reco_utils.dataset.python_splitters

Method/Function: numpy_stratified_split

Examples at hotexamples.com: 4

Python numpy_stratified_split - 4 examples found. These are the top rated real world Python examples of reco_utils.dataset.python_splitters.numpy_stratified_split extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def affinity_matrix(test_specs):
    """Build a random user/item affinity matrix and split it into train and test parts.

    Zero entries stand for unrated items. Making zeros likely mimics the highly
    sparse input that is typical of recommendation problems.

    Args:
        test_specs (dict): generation settings, looked up by key:
            "users" (int): number of users (rows).
            "items" (int): number of items (columns).
            "ratings" (int): rating scale, e.g. 5 means ratings run from 1 to 5.
            "spars" (float): probability of drawing a zero; roughly the sparseness
                of the generated matrix. With 0 the matrix is dense.
            "ratio": forwarded to numpy_stratified_split as ``ratio``.
            "seed": seeds NumPy's global random state and is also forwarded to
                numpy_stratified_split as ``seed``.

    Returns:
        tuple: ``(Xtr, Xtst)``, the two matrices produced by numpy_stratified_split.

    """
    # Seeding comes first so that the draw below is reproducible.
    np.random.seed(test_specs["seed"])

    zero_prob = test_specs["spars"]
    n_ratings = test_specs["ratings"]

    # Index 0 (unrated) gets `zero_prob`; the remaining mass is shared equally
    # among the rating values 1..n_ratings.
    rating_prob = (1 - zero_prob) / n_ratings
    probabilities = [zero_prob] + [rating_prob] * n_ratings

    # Draw every cell independently from {0, 1, ..., n_ratings}.
    shape = (test_specs["users"], test_specs["items"])
    affinity = np.random.choice(n_ratings + 1, shape, p=probabilities)

    train, test = numpy_stratified_split(
        affinity, ratio=test_specs["ratio"], seed=test_specs["seed"]
    )
    return (train, test)

Example #2

Show file

File: test_python_splitter.py Project: Globalync/globalynk-r

def test_int_numpy_stratified_splitter(test_specs, python_int_dataset):
    """Check shape preservation and split ratios of numpy_stratified_split.

    Args:
        test_specs: mapping providing "ratio", "seed", "tolerance" and "fluctuation".
        python_int_dataset: synthetic user/item affinity matrix where 0 marks an
            unrated item.
    """
    X = python_int_dataset
    ratio = test_specs["ratio"]

    # The splitter returns the train and the test affinity matrices, in that order.
    Xtr, Xtst = numpy_stratified_split(X, ratio=ratio, seed=test_specs["seed"])

    # Both outputs must keep the dimensions of the input matrix.
    for part in (Xtr, Xtst):
        assert part.shape[0] == X.shape[0] and part.shape[1] == X.shape[1]

    # Per-user counts of rated (non-zero) items in the full, train and test matrices.
    total_rated = np.sum(X != 0, axis=1)
    train_rated = np.sum(Xtr != 0, axis=1)
    test_rated = np.sum(Xtst != 0, axis=1)

    # Global split: the whole dataset is divided in the requested ratio,
    # up to the relative tolerance given in test_specs.
    tolerance = test_specs["tolerance"]
    assert train_rated.sum() / total_rated.sum() == pytest.approx(ratio, rel=tolerance)
    assert test_rated.sum() / total_rated.sum() == pytest.approx(
        1 - ratio, rel=tolerance
    )

    # The splitter works user by user, so the ratio must also hold for every
    # single user. Per-user ratios fluctuate more than the global one because
    # each user's split is random, hence the wider "fluctuation" band.
    fluctuation = test_specs["fluctuation"]

    train_share = train_rated / total_rated
    assert (train_share <= ratio + fluctuation).all() and (
        train_share >= ratio - fluctuation
    ).all()

    test_share = test_rated / total_rated
    assert (test_share <= (1 - ratio) + fluctuation).all() and (
        test_share >= (1 - ratio) - fluctuation
    ).all()

Example #3

Show file

File: RBMmodel.py Project: GayatriReddiar/INFO7374-Algorithmic-Digital-Marketing

def RBMtrain():
    """Train an RBM on the ratings in SnacksData100.csv and dump its top-k output.

    Reads the CSV, turns it into a user/item affinity matrix, splits that matrix
    with numpy_stratified_split (default arguments), fits the model, and writes
    the recommendations, mapped back through the affinity-matrix helper, to the
    file 'testdata' with joblib. Nothing is returned.
    """
    # Column names of the ratings file, passed through to AffinityMatrix.
    column_names = dict(
        col_user="******",
        col_item="Product_Id",
        col_rating="Ratings",
    )
    ratings = pd.read_csv("SnacksData100.csv")

    affinity = AffinityMatrix(DF=ratings, **column_names)
    train, test = numpy_stratified_split(affinity.gen_affinity_matrix())

    rbm_settings = {
        "hidden_units": 600,
        "training_epoch": 30,
        "minibatch_size": 60,
        "keep_prob": 0.9,
        "with_metrics": True,
    }
    rbm = RBM(**rbm_settings)
    rbm.fit(train, test)

    # recommend_k_items returns a pair; only its first element is used here.
    recommendations, _elapsed = rbm.recommend_k_items(test)
    recommendations_df = affinity.map_back_sparse(recommendations, kind="prediction")

    # NOTE(review): this result is never used; the call is kept so the function
    # performs exactly the same calls as before.
    _test_ratings_df = affinity.map_back_sparse(test, kind="ratings")

    joblib.dump(recommendations_df, "testdata")

Example #4

Show file

# Example script: build an affinity matrix from `data`, split it into train and
# test matrices, report their shapes, and instantiate an RBM model.
# NOTE(review): `data` is not defined in this snippet; it must be created earlier.

# Column names of `data`, passed to AffinityMatrix as keyword arguments.
header = {
    "col_user": "******",
    "col_item": "MovieID",
    "col_rating": "Rating",
}

# Use a sparse matrix representation rather than a pandas data frame
# for a significant performance gain.

am = AffinityMatrix(DF=data, **header)
X = am.gen_affinity_matrix()

# Construct the training and test datasets (splitter called with its defaults).

Xtr, Xtst = numpy_stratified_split(X)

print('\nTraining matrix size (users, movies) is:', Xtr.shape)
print('Testing matrix size is: ', Xtst.shape)

# Initialize the model class. Note that through random variation we
# can get a much better performing model with seed=1!

model = RBM(
    hidden_units=600,
    training_epoch=30,
    minibatch_size=60,
    keep_prob=0.9,
    with_metrics=True,
    # seed=1,  # left disabled; see the note above about seed=1
)