コード例 #1
0
class WA4Test(unittest.TestCase):
    """Regression tests for Pearson similarity and user-based kNN scoring.

    Every test runs against the ratings in ``testdata/ratings-ma4.csv`` and
    compares results with hard-coded reference values. Reference floats are
    compared through fixed-precision string formatting so that noise below
    the quoted precision is ignored.
    """

    def setUp(self):
        """Load the ratings fixture and build an unnormalized user model."""
        self.data = DataIO(verbose=False)
        self.data.load('testdata/ratings-ma4.csv')
        self.model = UserModel(normalize=False, verbose=False)
        self.model.build(self.data)

    def _assert_top_recs(self, user, expected, **knn_kwargs):
        """Check the best recommendations produced for ``user``.

        ``expected`` is a list of ``(original item id, score)`` pairs in
        ranking order; scores are compared at 3 decimal places. Extra
        keyword arguments are forwarded to ``user_based_knn``.
        """
        R = user_based_knn(self.model,
                           5, [self.data.new_user_idx(user)],
                           range(self.data.num_items()),
                           pearson,
                           promote_users=False,
                           **knn_kwargs)
        recs = top_ns([R], len(expected), keep_order=True)
        # Compare formatted strings so the check is on 3-decimal precision,
        # and compare lists so a failure shows which entry differs.
        actual = ['%d,%.3f' % (self.data.old_item_idx(item), score)
                  for (item, score) in recs[0]]
        self.assertEqual(actual, ['%d,%.3f' % pair for pair in expected])

    def test_pearson(self):
        """Check the range, diagonal and sample values of the correlations."""
        S = pearson(self.model.R(), self.model.R()).todense()

        # 1. The largest coefficient must be 1 (compared at 2 decimals to
        #    avoid float representation noise).
        self.assertEqual('%.2f' % S.max(), '1.00')

        # 2. Every diagonal entry must be 1. Each entry is checked on its
        #    own: comparing the sum of the diagonal would let offsetting
        #    errors cancel out and relies on exact float equality.
        for i in range(S.shape[0]):
            self.assertAlmostEqual(
                S[i, i], 1.0, places=5,
                msg='diagonal entry %d is not 1' % i)

        # 3. Check a couple of reference correlation coefficients.
        corr_test = [(1648, 5136, 0.40298), (918, 2824, -0.31706)]
        for (u1, u2, c) in corr_test:
            i1 = self.data.new_user_idx(u1)
            i2 = self.data.new_user_idx(u2)
            wanted = '%.5f' % c
            label = 'users %d and %d' % (u1, u2)

            # Value taken from the precomputed full matrix.
            self.assertEqual('%.5f' % S[i1, i2], wanted, msg=label)

            # Value computed directly from the two rating rows. The dense
            # result is a 1x1 matrix; take its single element explicitly
            # rather than relying on implicit matrix-to-float conversion.
            pair = pearson(self.model.R()[i1, :],
                           self.model.R()[i2, :]).todense()
            self.assertEqual('%.5f' % pair[0, 0], wanted, msg=label)

    def test_5nn(self):
        """Check the five nearest neighbours of user 3712."""
        u = 3712
        nns = [(2824, 0.46291), (3867, 0.400275), (5062, 0.247693),
               (442, 0.22713), (3853, 0.19366)]
        S = pearson(self.model.R(), self.model.R())
        # Keep 6 entries: the user itself (corr = 1) plus 5 neighbours.
        leave_top_n(S, 6)

        col = self.data.new_user_idx(u)
        top_neighbours = [(self.data.old_user_idx(i), S[i, col])
                          for i in S[:, col].nonzero()[0]]
        top_neighbours.sort(key=lambda a: a[1], reverse=True)

        # Skip the first element (corr = 1, the user itself).
        actual = ['%d,%.6f' % a for a in top_neighbours[1:]]
        self.assertEqual(actual, ['%d,%.6f' % a for a in nns])

    # consider moving this test to test_recsys.py
    def test_unnormalized(self):
        """Check the top-3 recommendations for user 3712, default scoring."""
        self._assert_top_recs(
            3712, [(641, 5.000), (603, 4.856), (105, 4.739)])

    # consider moving this test to test_recsys.py
    def test_normalized(self):
        """Check the top-3 recommendations with ``normalize='normalize'``."""
        self._assert_top_recs(
            3712, [(641, 5.900), (603, 5.546), (105, 5.501)],
            normalize='normalize')