Example #1
def test_deprecated_way():
    """Test all Dataset constructors without passing rating_scale as a
    parameter. Make sure we revert back to the Reader object, with a warning
    message.

    Also, make sure ValueError is raised if reader has no rating_scale in this
    context.

    Not using dataset fixtures here for more control.
    """

    # test load_from_file
    toy_data_path = (os.path.dirname(os.path.realpath(__file__)) +
                     '/custom_dataset')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                        rating_scale=(1, 5))
        data = Dataset.load_from_file(file_path=toy_data_path,
                                      reader=reader)

    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                        rating_scale=None)
        data = Dataset.load_from_file(file_path=toy_data_path,
                                      reader=reader)

    # test load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=(1, 5))
        data = Dataset.load_from_folds([(train_file, test_file)], reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=None)
        data = Dataset.load_from_folds([(train_file, test_file)],
                                       reader=reader)
    # test load_from_df
    ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                    'userID': [9, 32, 2, 45, '10000'],
                    'rating': [3, 2, 4, 3, 1]}
    df = pd.DataFrame(ratings_dict)

    with pytest.warns(UserWarning):
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                    reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(rating_scale=None)
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],  # noqa
                                    reader=reader)
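
For comparison, the non-deprecated path used in the later examples passes rating_scale to the Dataset constructor itself and leaves it off the Reader entirely. A minimal sketch, using the load_from_df signature shown in Examples 2-5:

import pandas as pd

from amaze import Dataset

# Same toy ratings as above.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, '10000'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# rating_scale is given to the Dataset constructor, so no Reader (and no
# deprecation warning) is involved.
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                            rating_scale=(1, 5))
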
Example #2
def test_zero_rating_canary():

    ratings_dict = {'itemID': [0, 0, 0, 0, 1, 1],
                    'userID': [0, 1, 2, 3, 3, 4],
                    'rating': [-10, 10, 0, -5, 0, 5]}
    df = pd.DataFrame(ratings_dict)
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(-10, 10))
    trainset = data.build_full_trainset()

    # Test the ur and ir fields directly. Only partly reliable, since the
    # point of this canary test is precisely to catch zero ratings being
    # dropped...
    assert trainset.ir[0] == [(0, -10), (1, 10), (2, 0), (3, -5)]
    assert trainset.ir[1] == [(3, 0), (4, 5)]

    assert trainset.ur[0] == [(0, -10)]
    assert trainset.ur[1] == [(0, 10)]
    assert trainset.ur[2] == [(0, 0)]
    assert trainset.ur[3] == [(0, -5), (1, 0)]
    assert trainset.ur[4] == [(1, 5)]
    print(trainset.ur)

    # ... so also test all_ratings which should be more reliable.
    all_ratings = list(trainset.all_ratings())
    assert (0, 0, -10) in all_ratings
    assert (1, 0, 10) in all_ratings
    assert (2, 0, 0) in all_ratings
    assert (3, 0, -5) in all_ratings
    assert (3, 1, 0) in all_ratings
    assert (4, 1, 5) in all_ratings
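
The inner ids returned by all_ratings() can be mapped back to the raw ids of the original dataframe. A short sketch, assuming the trainset keeps the Surprise-style to_raw_uid/to_raw_iid helpers:

import pandas as pd

from amaze import Dataset

df = pd.DataFrame({'itemID': [0, 0, 1],
                   'userID': [0, 1, 1],
                   'rating': [-10, 10, 5]})
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                            rating_scale=(-10, 10))
trainset = data.build_full_trainset()

# Convert (inner_uid, inner_iid, rating) triples back to raw ids.
# to_raw_uid/to_raw_iid are assumed to behave as in Surprise's Trainset.
for inner_uid, inner_iid, rating in trainset.all_ratings():
    print(trainset.to_raw_uid(inner_uid),
          trainset.to_raw_iid(inner_iid),
          rating)
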
Example #3
def test_load_from_df():
    """Ensure reading a dataset from a pandas dataframe is OK."""

    # DF creation.
    ratings_dict = {
        'itemID': [1, 1, 1, 2, 2],
        'userID': [9, 32, 2, 45, '10000'],
        'rating': [3, 2, 4, 3, 1]
    }
    df = pd.DataFrame(ratings_dict)

    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))

    # Assert split and folds can be used without problems
    with pytest.warns(UserWarning):
        data.split(2)
        assert sum(1 for _ in data.folds()) == 2

    # assert users and items are correctly mapped
    trainset = data.build_full_trainset()
    assert trainset.knows_user(trainset.to_inner_uid(9))
    assert trainset.knows_user(trainset.to_inner_uid('10000'))
    assert trainset.knows_item(trainset.to_inner_iid(2))

    # assert r(9, 1) = 3 and r(2, 1) = 4
    uid9 = trainset.to_inner_uid(9)
    uid2 = trainset.to_inner_uid(2)
    iid1 = trainset.to_inner_iid(1)
    assert trainset.ur[uid9] == [(iid1, 3)]
    assert trainset.ur[uid2] == [(iid1, 4)]

    # mess up the column ordering and assert that users are not correctly
    # mapped
    data = Dataset.load_from_df(df[['rating', 'itemID', 'userID']],
                                rating_scale=(1, 5))
    trainset = data.build_full_trainset()
    with pytest.raises(ValueError):
        trainset.to_inner_uid('10000')
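
Since to_inner_uid raises ValueError for ids the trainset has never seen (the behavior the test above relies on), unknown users can be detected with a plain try/except. A hedged sketch; is_known_raw_user is not part of the library:

import pandas as pd

from amaze import Dataset


def is_known_raw_user(trainset, raw_uid):
    """Return True if raw_uid appears in the data used to build trainset."""
    try:
        trainset.to_inner_uid(raw_uid)
        return True
    except ValueError:
        return False


df = pd.DataFrame({'itemID': [1, 1],
                   'userID': [9, '10000'],
                   'rating': [3, 1]})
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                            rating_scale=(1, 5))
trainset = data.build_full_trainset()

print(is_known_raw_user(trainset, 9))         # True
print(is_known_raw_user(trainset, 'nobody'))  # False
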
Example #4
def test_build_anti_testset():
    ratings_dict = {
        'itemID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
        'userID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
        'rating': [1, 2, 3, 4, 5, 6, 7, 8, 9]
    }
    df = pd.DataFrame(ratings_dict)

    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))
    with pytest.warns(UserWarning):
        data.split(2)
        trainset, __testset = next(data.folds())
    # fill with some specific value
    for fillvalue in (0, 42., -1):
        anti = trainset.build_anti_testset(fill=fillvalue)
        for (u, i, r) in anti:
            assert r == fillvalue
    # fill with global_mean
    anti = trainset.build_anti_testset(fill=None)
    for (u, i, r) in anti:
        assert r == trainset.global_mean
    expect = trainset.n_users * trainset.n_items
    assert trainset.n_ratings + len(anti) == expect
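
The anti-testset is typically fed to a fitted algorithm so that every (user, item) pair missing from the trainset gets a predicted rating, e.g. for top-N recommendation. A minimal sketch, assuming the predictors keep Surprise's fit()/test() interface:

import pandas as pd

from amaze import Dataset, NormalPredictor

df = pd.DataFrame({'itemID': [1, 2, 3, 4],
                   'userID': [1, 2, 3, 4],
                   'rating': [1, 2, 4, 5]})
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                            rating_scale=(1, 5))
trainset = data.build_full_trainset()

# fit()/test() are assumed to follow Surprise's AlgoBase interface.
algo = NormalPredictor()
algo.fit(trainset)

# Predict a rating for every (user, item) pair not in the trainset.
predictions = algo.test(trainset.build_anti_testset())
print(len(predictions))  # n_users * n_items - n_ratings = 16 - 4 = 12
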
Example #5
"""
This module describes how to load a dataset from a pandas dataframe.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from amaze import NormalPredictor
from amaze import Dataset
from amaze.model_selection import cross_validate

# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {
    'itemID': [1, 1, 1, 2, 2],
    'userID': [9, 32, 2, 45, 'user_foo'],
    'rating': [3, 2, 4, 3, 1]
}
df = pd.DataFrame(ratings_dict)

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                            rating_scale=(1, 5))

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(NormalPredictor(), data, cv=2)
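
cross_validate returns a dict of per-fold scores. Continuing the script above, a short follow-up sketch, assuming the measures and verbose parameters (and the 'test_rmse' result key) follow Surprise's cross_validate API:

# measures/verbose and the result keys are assumed to match Surprise.
results = cross_validate(NormalPredictor(), data,
                         measures=['RMSE', 'MAE'], cv=2, verbose=True)
print(results['test_rmse'].mean())  # mean RMSE over the 2 folds
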