Example #1
def test_old_style_algo(small_ml):
    '''Test that old-style algorithms (i.e. algorithms that only define
    train()) can support both calls to fit() and to train().
    - Supporting algo.fit() is needed so that custom algorithms that only
    define train() can still use up-to-date tools (such as evaluate, which has
    been updated to use fit()).
    - algo.train() is the old way, and must still be supported for custom
    algorithms and tools.
    '''
    class CustomAlgoTrain(AlgoBase):
        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def train(self, trainset):

            AlgoBase.train(self, trainset)
            self.est = 3
            self.bu, self.bi = 1, 1
            self.cnt += 1

        def estimate(self, u, i):
            return self.est

    with pytest.warns(UserWarning):
        algo = CustomAlgoTrain()

    kf = KFold(n_splits=2)
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.fit(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoTrain.train has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoTrain.train
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of train() is called exactly once per fold
        assert algo.cnt == i

    with pytest.warns(UserWarning):
        algo = CustomAlgoTrain()
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.train(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoTrain.train has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoTrain.train
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of train() is called exactly once per fold
        assert algo.cnt == i
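
In practice, an old-style algorithm like the one above can still be plugged into the modern tooling. A minimal sketch (assuming the `amaze` package imported later on this page also exposes the usual AlgoBase, Dataset and cross_validate API):

from amaze import AlgoBase, Dataset
from amaze.model_selection import cross_validate


class GlobalMeanOldStyle(AlgoBase):
    '''Hypothetical old-style algorithm: only train() is defined.'''

    def train(self, trainset):
        AlgoBase.train(self, trainset)
        self.the_mean = trainset.global_mean

    def estimate(self, u, i):
        return self.the_mean


data = Dataset.load_builtin('ml-100k')
# cross_validate() calls fit(), which falls back to train() and emits the
# deprecation warning checked in the test above.
cross_validate(GlobalMeanOldStyle(), data, measures=['RMSE'], cv=2, verbose=True)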
Example #2
def test_new_style_algo(small_ml):
    '''Test that new-style algorithms (i.e. algorithms that only define fit())
    can support both calls to fit() and to train()
    - algo.fit() is the new way of doing things
    - supporting algo.train() is needed for the (unlikely?) case where a user
    has defined custom tools that use algo.train().
    '''
    class CustomAlgoFit(AlgoBase):
        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def fit(self, trainset):

            AlgoBase.fit(self, trainset)
            self.est = 3
            self.bu, self.bi = 1, 1
            self.cnt += 1

        def estimate(self, u, i):
            return self.est

    algo = CustomAlgoFit()
    kf = KFold(n_splits=2)
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        algo.fit(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.fit has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoFit.fit
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of fit() is called exactly once per fold
        assert algo.cnt == i

    algo = CustomAlgoFit()
    for i, (trainset, testset) in enumerate(kf.split(small_ml)):
        with pytest.warns(UserWarning):
            algo.train(trainset)
        predictions = algo.test(testset)

        # Make sure AlgoBase.fit has been called
        assert hasattr(algo, 'trainset')
        # Make sure CustomAlgoFit.fit has been called
        assert all(est == 3 for (_, _, _, est, _) in predictions)
        # Make sure AlgoBase.fit is finished before CustomAlgoFit.fit
        assert (algo.bu, algo.bi) == (1, 1)
        # Make sure the rest of fit() is called exactly once per fold
        assert algo.cnt == i
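
Conversely, a new-style algorithm only needs to define fit(). A short usage sketch (again assuming the amaze package mirrors the usual AlgoBase/train_test_split API; fit() returns self here so that calls can be chained):

from amaze import AlgoBase, Dataset
from amaze.model_selection import train_test_split


class ConstantNewStyle(AlgoBase):
    '''Hypothetical new-style algorithm: only fit() is defined.'''

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.est = 3
        return self  # returning self allows fit(...).test(...) chaining

    def estimate(self, u, i):
        return self.est


data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.25)
predictions = ConstantNewStyle().fit(trainset).test(testset)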
Example #3
def test_KFold(toy_data):

    # Test the n_splits parameter
    kf = KFold(n_splits=5)
    assert len(list(kf.split(toy_data))) == 5

    with pytest.raises(ValueError):
        kf = KFold(n_splits=10)
        next(kf.split(toy_data))  # Too big (greater than number of ratings)

    with pytest.raises(ValueError):
        kf = KFold(n_splits=1)
        next(kf.split(toy_data))  # Too low (must be >= 2)

    # Make sure data has not been shuffled. If not shuffled, the users in the
    # testsets are 0, 1, 2... 4 (in that order).
    kf = KFold(n_splits=5, shuffle=False)
    users = [int(testset[0][0][-1]) for (_, testset) in kf.split(toy_data)]
    assert users == list(range(5))

    # Make sure that when called two times without shuffling, folds are the
    # same.
    kf = KFold(n_splits=5, shuffle=False)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b
    # test once again with another KFold instance
    kf = KFold(n_splits=5, shuffle=False)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b

    # We'll now shuffle b and check that folds are different.
    # (this relies on the random seed set at the beginning of the file)
    kf = KFold(n_splits=5, random_state=None, shuffle=True)
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a != testsets_b
    # test once again: two calls to kf.split make different splits when
    # random_state=None
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a != testsets_b

    # Make sure that folds are the same when the same KFold instance is used
    # with shuffle=True and random_state set to some value
    kf = KFold(n_splits=5, random_state=1, shuffle=True)
    testsets_a = [testset for (_, testset) in kf.split(toy_data)]
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b

    # Make sure raw ratings are not shuffled by KFold
    old_raw_ratings = copy(toy_data.raw_ratings)
    kf = KFold(n_splits=5, shuffle=True)
    next(kf.split(toy_data))
    assert old_raw_ratings == toy_data.raw_ratings

    # Make sure kf.split() and the old toy_data.split() have the same folds.
    np.random.seed(3)
    with pytest.warns(UserWarning):
        toy_data.split(2, shuffle=True)
        testsets_a = [testset for (_, testset) in toy_data.folds()]
    kf = KFold(n_splits=2, random_state=3, shuffle=True)
    testsets_b = [testset for (_, testset) in kf.split(toy_data)]
    assert testsets_a == testsets_b
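
Outside the test, the same properties matter in user code: with shuffle=True, a fixed random_state makes the folds reproducible across KFold instances, while random_state=None yields fresh splits on every call. A short sketch (reusing the Dataset/KFold names assumed elsewhere on this page):

from amaze import Dataset
from amaze.model_selection import KFold

data = Dataset.load_builtin('ml-100k')
kf_a = KFold(n_splits=3, shuffle=True, random_state=42)
kf_b = KFold(n_splits=3, shuffle=True, random_state=42)
folds_a = [testset for (_, testset) in kf_a.split(data)]
folds_b = [testset for (_, testset) in kf_b.split(data)]
assert folds_a == folds_b  # identical folds thanks to the fixed seed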
Example #4
from collections import defaultdict

# Assumed imports: the other snippets on this page import from the `amaze`
# package, so Dataset, SVD and KFold are taken from there as well.
from amaze import Dataset, SVD
from amaze.model_selection import KFold


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls


data = Dataset.load_builtin('ml-100k')
kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))
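
    # A small follow-up sketch: the averaged precision and recall can be
    # combined into a single F1 score per fold (variable names follow the
    # loop above).
    avg_prec = sum(precisions.values()) / len(precisions)
    avg_rec = sum(recalls.values()) / len(recalls)
    # Harmonic mean of precision and recall (guard against division by zero).
    f1 = 2 * avg_prec * avg_rec / (avg_prec + avg_rec) if (avg_prec + avg_rec) else 0
    print('F1@5:', f1)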

# The following is a separate, standalone script: it trains on the full
# dataset and then evaluates on ratings taken from that same trainset, to
# show how optimistically biased the resulting RMSE is compared to a proper
# cross-validation estimate.
from amaze import accuracy
from amaze import Dataset, SVD  # assumed importable from the same package as accuracy
from amaze.model_selection import KFold

data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low, since we are evaluating on the same ratings we trained on
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end='  ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)

    print('On trainset,', end=' ')
    predictions = algo.test(trainset_cv.build_testset())
    accuracy.rmse(predictions, verbose=True)
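
For the held-out numbers alone, the manual loop above can also be written with the cross_validate helper (a sketch, assuming amaze provides the usual cross_validate; it only reports test-set metrics, so the trainset vs. testset comparison still needs the explicit loop):

from amaze.model_selection import cross_validate

cross_validate(SVD(), data, measures=['RMSE'], cv=3, verbose=True)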