class WA4Test(unittest.TestCase):
    """Check Pearson similarity and user-based kNN output on the MA4 ratings.

    All expected numbers below are hard-coded reference values for
    ``testdata/ratings-ma4.csv``. Floats are compared through fixed-precision
    string formatting so that representation noise beyond the reference
    precision does not fail a test.
    """

    def setUp(self):
        """Load the ratings file and build an unnormalized user model."""
        self.data = DataIO(verbose=False)
        self.data.load('testdata/ratings-ma4.csv')
        self.model = UserModel(normalize=False, verbose=False)
        self.model.build(self.data)

    def _assert_top_recs(self, R, expected):
        """Assert that the top-3 recommendations in ``R`` equal ``expected``.

        ``R`` is whatever ``user_based_knn`` returned for a single user;
        ``expected`` is a list of ``(original item id, score)`` pairs in
        ranking order. Scores are compared at three decimal places.
        """
        recs = top_ns([R], 3, keep_order=True)
        actual = ','.join(
            '%d,%.3f' % (self.data.old_item_idx(item), score)
            for (item, score) in recs[0])
        wanted = ','.join('%d,%.3f' % pair for pair in expected)
        self.assertEqual(actual, wanted)

    def test_pearson(self):
        """Pearson matrix is bounded by 1, has a unit diagonal, and matches
        two reference coefficients."""
        # test correlation
        S = pearson(self.model.R(), self.model.R()).todense()

        # 1. check we don't have numbers more than 1
        # use string comparison to avoid float nuances
        self.assertEqual('%.2f' % S.max(), '1.00')

        # 2. check there are only '1' on the diagonal
        n_users = S.shape[0]
        self.assertEqual(sum([S[i, i] for i in range(n_users)]), n_users)

        # 3. check a couple of correlation coefficients
        corr_test = [(1648, 5136, 0.40298), (918, 2824, -0.31706)]
        for (u1, u2, c) in corr_test:
            wanted = '%.5f' % c
            label = 'users %d and %d' % (u1, u2)

            # translate the ids from the data file into matrix indices
            i1 = self.data.new_user_idx(u1)
            i2 = self.data.new_user_idx(u2)

            # check what's in the full (precomputed) matrix
            self.assertEqual('%.5f' % S[i1, i2], wanted, label)

            # compute the same coefficient here from the two rating rows
            pair = pearson(self.model.R()[i1, :],
                           self.model.R()[i2, :]).todense()
            # Pull the scalar out explicitly: formatting a 1x1 matrix with
            # '%f' relies on an implicit array-to-float conversion that
            # NumPy has deprecated.
            self.assertEqual(pair.shape, (1, 1), label)
            self.assertEqual('%.5f' % pair[0, 0], wanted, label)

    def test_5nn(self):
        """The five nearest neighbours of user 3712 match the reference."""
        u = 3712
        nns = [(2824, 0.46291), (3867, 0.400275), (5062, 0.247693),
               (442, 0.22713), (3853, 0.19366)]

        S = pearson(self.model.R(), self.model.R())
        # The return value is unused, so leave_top_n presumably prunes S in
        # place; 6 = the user itself plus 5 neighbours -- TODO confirm.
        leave_top_n(S, 6)

        col = self.data.new_user_idx(u)
        top_neighbours = [
            (self.data.old_user_idx(i), S[i, col])
            for i in S[:, col].nonzero()[0]
        ]
        top_neighbours.sort(key=lambda a: a[1], reverse=True)

        # skip the first element (corr = 1)
        actual = ','.join('%d,%.6f' % a for a in top_neighbours[1:])
        wanted = ','.join('%d,%.6f' % a for a in nns)
        self.assertEqual(actual, wanted)

    # consider moving this test to test_recsys.py
    def test_unnormalized(self):
        """Top-3 kNN recommendations for user 3712 without normalization."""
        u = 3712
        expected = [(641, 5.000), (603, 4.856), (105, 4.739)]
        R = user_based_knn(self.model, 5, [self.data.new_user_idx(u)],
                           range(self.data.num_items()), pearson,
                           promote_users=False)
        self._assert_top_recs(R, expected)

    # consider moving this test to test_recsys.py
    def test_normalized(self):
        """Top-3 kNN recommendations for user 3712 with normalize='normalize'."""
        u = 3712
        expected = [(641, 5.900), (603, 5.546), (105, 5.501)]
        R = user_based_knn(self.model, 5, [self.data.new_user_idx(u)],
                           range(self.data.num_items()), pearson,
                           promote_users=False, normalize='normalize')
        self._assert_top_recs(R, expected)