Example #1
0
def test_build_full_trainset():
    """Check the trainset that build_full_trainset makes from custom_dataset."""

    # The dataset file sits in the same directory as this test module.
    here = os.path.dirname(os.path.realpath(__file__))
    data = Dataset.load_from_file(file_path=here + '/custom_dataset',
                                  reader=reader)

    full_trainset = data.build_full_trainset()

    # Sizes of the rating containers.
    assert len(full_trainset.rm) == 5
    assert len(full_trainset.ur) == 5
    assert len(full_trainset.ir) == 2

    # User and item counts.
    assert full_trainset.n_users == 5
    assert full_trainset.n_items == 2
Example #2
0
def test_trainset_testset():
    """Check the trainset built when the data is loaded from fold files."""

    here = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(here + '/custom_train', here + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # Run through the folds only to get trainset and testset bound; the
    # assertions below use whatever the last iteration left behind.
    for trainset, testset in data.folds():
        pass

    # rm, indexed by a (user, item) pair of inner ids
    ratings = trainset.rm
    assert ratings[0, 0] == 4
    assert ratings[1, 0] == 4
    assert ratings[3, 1] == 5
    assert ratings[40, 20000] == 0  # this pair is not in the trainset

    # ur, indexed by a user inner id
    by_user = trainset.ur
    assert by_user[0] == [(0, 4)]
    assert by_user[1] == [(0, 4), (1, 2)]
    assert by_user[40] == []  # this user is not in the trainset

    # ir, indexed by an item inner id
    by_item = trainset.ir
    assert by_item[0] == [(0, 4), (1, 4), (2, 1)]
    assert by_item[1] == [(1, 2), (2, 1), (3, 5)]
    assert by_item[20000] == []  # this item is not in the trainset

    # counts and rating bounds
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.r_min == 1
    assert trainset.r_max == 5

    # raw -> inner id mappings: inner ids must follow the order user0,
    # user1, ... and item0, item1, ...
    user_ids = trainset._raw2inner_id_users
    for inner_id in range(4):
        assert user_ids['user%d' % inner_id] == inner_id

    item_ids = trainset._raw2inner_id_items
    for inner_id in range(2):
        assert item_ids['item%d' % inner_id] == inner_id
Example #3
0
def test_split():
    """Test the split method.

    Covers the n_folds parameter (one accepted value and two rejected ones),
    the shuffle parameter, and the stability of the folds when split is not
    called again between two iterations.
    """

    custom_dataset_path = (os.path.dirname(os.path.realpath(__file__)) +
                           '/custom_dataset')
    data = Dataset.load_from_file(file_path=custom_dataset_path, reader=reader)

    # Test n_folds parameter
    data.split(5)
    assert len(list(data.folds())) == 5

    # The folds are consumed inside the `with` block so that the ValueError
    # is caught whether it comes from split() or from iterating the folds.
    for bad_n_folds in (10, 1):
        with pytest.raises(ValueError):
            data.split(bad_n_folds)
            for _ in data.folds():
                pass

    # Test the shuffle parameter: without shuffling, two successive splits
    # must give the same testsets.
    data.split(n_folds=3, shuffle=False)
    testsets_a = [testset for (_, testset) in data.folds()]
    data.split(n_folds=3, shuffle=False)
    testsets_b = [testset for (_, testset) in data.folds()]
    assert testsets_a == testsets_b

    # We'll shuffle and check that folds are now different. There's a chance
    # that they're still the same, just by lack of luck, so we retry. The
    # number of tries is bounded: if the folds are still the same after
    # max_tries shuffles, our code is most likely faulty and the test fails
    # instead of looping forever.
    max_tries = 10000
    tries = 0
    while testsets_a == testsets_b and tries < max_tries:
        data.split(n_folds=3, shuffle=True)
        testsets_b = [testset for (_, testset) in data.folds()]
        tries += 1
    assert testsets_a != testsets_b

    # Ensure that folds are the same if split is not called again
    testsets_a = [testset for (_, testset) in data.folds()]
    testsets_b = [testset for (_, testset) in data.folds()]
    assert testsets_a == testsets_b
Example #4
0
from recsys import AlgoBase
from recsys import Dataset
from recsys import evaluate


class MyOwnAlgorithm(AlgoBase):
    """Toy algorithm that always predicts the mean training rating."""

    def __init__(self):
        # The base initialiser has to run before anything else.
        AlgoBase.__init__(self)

    def train(self, trainset):
        """Compute and store the average of all ratings of the trainset."""

        # Same rule as in __init__: the base method goes first.
        # NOTE(review): presumably this is what sets self.trainset, which is
        # read just below -- confirm in AlgoBase.
        AlgoBase.train(self, trainset)

        # The trainset.global_mean attribute might be used here as well.
        ratings = [rating for (_, _, rating) in self.trainset.all_ratings()]
        self.the_mean = np.mean(ratings)

    def estimate(self, u, i):
        """Return the stored mean, whatever the values of u and i."""
        return self.the_mean


# Load the built-in 'ml-100k' dataset and create an instance of the
# algorithm defined above.
data = Dataset.load_builtin('ml-100k')
algo = MyOwnAlgorithm()

# Run the evaluation of the algorithm on the dataset.
evaluate(algo, data)
Example #5
0
if it were not built-in.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import os

from recsys import BaselineOnly
from recsys import Dataset
from recsys import evaluate
from recsys import Reader

# Path to the dataset folder. '~' is expanded to the home directory of the
# current user, so the script is not tied to one particular account. Change
# this if your copy of ml-100k is stored somewhere else.
files_dir = os.path.expanduser('~/.recsys_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)

# We'll use an algorithm that predicts baseline estimates.
algo = BaselineOnly()

# Evaluate performances of our algorithm on the dataset.
evaluate(algo, data)
Example #6
0
from recsys import evaluate
from recsys import Reader
from recsys import NormalPredictor
from recsys import KNNWithMeans
from recsys import SVD
from recsys import KNNBasic

# Path to the dataset file.
# The required libraries can be downloaded with `pip install recsys`.

file_path = './train.data0'  # change this

reader = Reader(line_format='user item rating', sep=' ',
                rating_scale=(0, 100))

data = Dataset.load_from_file(file_path, reader=reader)

data.split(n_folds=10)
# trainset = data.build_full_trainset()

# NOTE(review): sim_options is built here but is not passed to KNNBasic()
# below.
sim_options = {'name': 'cosine', 'user_based': False}

# Algorithm to train. Alternatives left commented out:
# algo = BaselineOnly()
# algo = NormalPredictor()
# algo = SVD()
algo = KNNBasic()

from recsys.prediction_algorithms.predictions import Prediction
from recsys import accuracy

# Train the algorithm on the trainset of each fold.
for trainset, testset in data.folds():
    algo.train(trainset)
Example #7
0
def test_wrong_file_name():
    """Loading a (custom) Dataset from bad file names must raise ValueError."""
    missing_folds = [('does_not_exist', 'does_not_either')]

    with pytest.raises(ValueError):
        Dataset.load_from_folds(folds_files=missing_folds, reader=reader)