Example #1
0
    def test_pa3(self):
        """Compare ``user_based_knn`` scores with pre-computed answer lines.

        Every expected line has the form ``user,item,score,title`` with the
        score printed to four decimals.  The same line is rebuilt here from
        the model output and compared as a string.
        """
        pairs = [(1024, 77), (1024, 268), (1024, 462), (1024, 393),
                 (1024, 36955), (2048, 77), (2048, 36955), (2048, 788)]
        expected_lines = [
            "1024,77,4.3848,Memento (2000)",
            "1024,268,2.8646,Batman (1989)",
            "1024,462,3.1082,Erin Brockovich (2000)",
            "1024,393,3.8722,Kill Bill: Vol. 2 (2004)",
            "1024,36955,2.3524,True Lies (1994)",
            "2048,77,4.8493,Memento (2000)",
            "2048,36955,3.9698,True Lies (1994)",
            "2048,788,3.8509,Mrs. Doubtfire (1993)",
        ]

        data = DataIO(verbose=False)
        data.load('testdata/ratings.csv',
                  items_file='testdata/movie-titles.csv')
        model = UserModel(verbose=False, normalize=True)
        model.build(data)

        for (u, i), expected in zip(pairs, expected_lines):
            # The ids in the test data are translated with new_user_idx /
            # new_item_idx before being handed to the scorer.
            score = user_based_knn(model,
                                   30, [data.new_user_idx(u)],
                                   [data.new_item_idx(i)],
                                   cosine,
                                   promote_users=True,
                                   normalize='centered')
            actual = '%d,%d,%.4f,%s' % (u, i, score, data.title(i))
            # assertEqual reports both strings on a mismatch, which
            # assertTrue(a == b) does not.
            self.assertEqual(expected, actual)
Example #2
0
class TestUserModel(unittest.TestCase):
    """Checks ``UserModel`` output for a model built from ``DummyDataset``."""

    def setUp(self):
        # A fresh dataset and model are built before every test method.
        self.data = DummyDataset()
        self.model = UserModel(verbose=False)
        self.model.build(self.data)

    def test_mean(self):
        """``model.mean()`` matches the expected 3x1 matrix."""
        expected = np.matrix([[3.2],
                              [3.94],
                              [4.13333333]])
        # Matrices are compared through stringify_matrix (defined elsewhere);
        # assertEqual shows both renderings if they differ.
        self.assertEqual(stringify_matrix(self.model.mean()),
                         stringify_matrix(expected))

    def test_users(self):
        """``model.R()`` (densified) matches the expected 3x5 matrix."""
        expected = sparse.csr_matrix(
            [[0.8, -2.2, 0.0, 1.4, 0.0],
             [0.56, 0.06, -0.44, -0.24, 0.06],
             [0.86666667, 0.0, -0.73333333, 0.0, -0.13333333]])
        self.assertEqual(stringify_matrix(self.model.R().todense()),
                         stringify_matrix(expected.todense()))
Example #3
0
    def test_pa3(self):
        """Verify predicted scores for fixed (user, item) pairs.

        The expected strings are ``user,item,score,title`` lines with the
        score formatted to four decimals; each one is regenerated from
        ``user_based_knn`` and ``data.title`` and compared verbatim.
        """
        cases = [
            ((1024, 77), "1024,77,4.3848,Memento (2000)"),
            ((1024, 268), "1024,268,2.8646,Batman (1989)"),
            ((1024, 462), "1024,462,3.1082,Erin Brockovich (2000)"),
            ((1024, 393), "1024,393,3.8722,Kill Bill: Vol. 2 (2004)"),
            ((1024, 36955), "1024,36955,2.3524,True Lies (1994)"),
            ((2048, 77), "2048,77,4.8493,Memento (2000)"),
            ((2048, 36955), "2048,36955,3.9698,True Lies (1994)"),
            ((2048, 788), "2048,788,3.8509,Mrs. Doubtfire (1993)"),
        ]

        data = DataIO(verbose=False)
        data.load('testdata/ratings.csv', items_file='testdata/movie-titles.csv')
        model = UserModel(verbose=False, normalize=True)
        model.build(data)

        for (u, i), expected in cases:
            prediction = user_based_knn(
                model, 30, [data.new_user_idx(u)], [data.new_item_idx(i)],
                cosine, promote_users=True, normalize='centered')
            # assertEqual prints expected and actual text when they differ;
            # assertTrue(a == b) would only report "False is not true".
            self.assertEqual(
                expected,
                '%d,%d,%.4f,%s' % (u, i, prediction, data.title(i)))
Example #4
0
from model import UserModel
from suggest import top_ns

# Input/output locations and parameters for both parts of the script.
ratings_file = "ratings.csv"
given_users = [3867, 860]
NN = 5
n = 3
part_1_file = "part_1.csv"
part_2_file = "part_2.csv"

# part 1

data = DataIO()
data.load(ratings_file)
model = UserModel(normalize=False)
model.build(data)

# Translate the user ids with the dataset's own mapping and score every item.
given_users = data.translate_users(given_users)
given_items = range(data.num_items())

R = user_based_knn(model, NN, given_users, given_items, pearson, promote_users=False)
recs = top_ns(R, n, keep_order=True)

# One "item score" line per recommendation, for every user in turn.
# The context manager closes the file even if formatting or writing raises;
# the handle keeps the name `file`, so that name stays bound (to a closed
# file) afterwards exactly as it was with the explicit close().
with open(part_1_file, "w") as file:
    file.write("\n".join(["%d %.3f" % (data.old_item_idx(i), s) for u in recs for (i, s) in u]))

# part 2

R = user_based_knn(
    model, NN, given_users, given_items, pearson, promote_users=False, exclude_seen=False, normalize=True
Example #5
0
from score import user_based_knn, cosine
from dataset import DataIO
from model import UserModel

# File locations and scoring parameter.
ratings_file = '../data/ratings.csv'
items_file = '../data/movie-titles.csv'
answer_file = 'part_1.csv'
NN = 30

# part 1

data = DataIO()
data.load(ratings_file, items_file=items_file)

model = UserModel(normalize=True)
model.build(data)

# (user id, item id) pairs to score, listed per user in output order.
inputs = [
    (user, item)
    for user, items in (
        (4169, (161, 36955, 453, 857, 238)),
        (5399, (1891, 14, 187, 602, 629)),
        (3613, (329, 604, 134, 1637, 278)),
        (1873, (786, 2502, 550, 1894, 1422)),
        (4914, (268, 36658, 786, 161, 854)),
    )
    for item in items
]

file = open(answer_file, 'w')
file.write('\n'.join([
    '%d,%d,%.4f,%s' %
    (u, i,
     user_based_knn(model,
                    NN, [data.new_user_idx(u)], [data.new_item_idx(i)],
                    cosine,
                    promote_users=True,
Example #6
0
class WA4Test(unittest.TestCase):
    """Tests for ``pearson``, ``leave_top_n`` and ``user_based_knn``.

    All tests run against a ``UserModel`` (``normalize=False``) built from
    'testdata/ratings-ma4.csv'.
    """

    def setUp(self):
        # Load the ratings and build a fresh model before every test.
        self.data = DataIO(verbose=False)
        self.data.load('testdata/ratings-ma4.csv')
        self.model = UserModel(normalize=False, verbose=False)
        self.model.build(self.data)

    def test_pearson(self):
        """Check the range, the diagonal and two known entries of ``pearson``."""
        # test correlation
        S = pearson(self.model.R(), self.model.R()).todense()
        # 1. check we don't have numbers more than 1
        # use string comparison to avoid float nuances
        self.assertEqual('%.2f' % S.max(), '1.00')

        # 2. check there are only '1' on the diagonal
        # Each entry is checked on its own: comparing the sum of the diagonal
        # with its length would let offsetting errors cancel out.
        for k in range(S.shape[0]):
            self.assertAlmostEqual(S[k, k], 1.0)

        # 3. check a couple of correlation coefficients
        corr_test = [(1648, 5136, 0.40298),
                     (918, 2824, -0.31706)]
        for (u1, u2, c) in corr_test:
            row1 = self.data.new_user_idx(u1)
            row2 = self.data.new_user_idx(u2)
            wanted = '%.5f' % c
            # check what's in the full (precomputed) matrix
            self.assertEqual('%.5f' % S[row1, row2], wanted)
            # compute the same coefficient from the two rows alone; the single
            # element is extracted explicitly because formatting a 1x1 matrix
            # with %f relies on a deprecated array-to-scalar conversion.
            pair = pearson(self.model.R()[row1, :],
                           self.model.R()[row2, :]).todense()
            self.assertEqual('%.5f' % pair[0, 0], wanted)

    def test_5nn(self):
        """Check the five strongest correlations left for user 3712."""
        u = 3712
        nns = [(2824, 0.46291), (3867, 0.400275), (5062, 0.247693),
               (442, 0.22713), (3853, 0.19366)]
        S = pearson(self.model.R(), self.model.R())
        # The return value is not used; S is read afterwards, so leave_top_n
        # presumably prunes S in place (defined elsewhere -- confirm).
        leave_top_n(S, 6)
        col = self.data.new_user_idx(u)
        top_neighbours = [(self.data.old_user_idx(i), S[i, col])
                          for i in S[:, col].nonzero()[0]]
        top_neighbours.sort(key=lambda a: a[1], reverse=True)
        # skip the first element (corr = 1)
        self.assertEqual(
            ','.join(['%d,%.6f' % a for a in top_neighbours[1:]]),
            ','.join(['%d,%.6f' % a for a in nns]))

    # consider moving this test to test_recsys.py
    def test_unnormalized(self):
        """Check the top-3 items for user 3712 with default normalization."""
        u = 3712
        expected = [(641, 5.000), (603, 4.856), (105, 4.739)]
        R = user_based_knn(self.model, 5, [self.data.new_user_idx(u)],
                           range(self.data.num_items()),
                           pearson, promote_users=False)
        recs = top_ns([R], 3, keep_order=True)
        self.assertEqual(
            ','.join(['%d,%.3f' % (self.data.old_item_idx(a), b)
                      for (a, b) in recs[0]]),
            ','.join(['%d,%.3f' % a for a in expected]))

    # consider moving this test to test_recsys.py
    def test_normalized(self):
        """Check the top-3 items for user 3712 with ``normalize='normalize'``."""
        u = 3712
        expected = [(641, 5.900), (603, 5.546), (105, 5.501)]
        R = user_based_knn(self.model, 5, [self.data.new_user_idx(u)],
                           range(self.data.num_items()),
                           pearson, promote_users=False,
                           normalize='normalize')
        recs = top_ns([R], 3, keep_order=True)
        self.assertEqual(
            ','.join(['%d,%.3f' % (self.data.old_item_idx(a), b)
                      for (a, b) in recs[0]]),
            ','.join(['%d,%.3f' % a for a in expected]))
Example #7
0
class WA4Test(unittest.TestCase):
    """Correlation and recommendation checks on 'testdata/ratings-ma4.csv'.

    ``setUp`` builds an unnormalized ``UserModel``; the tests then exercise
    ``pearson``, ``leave_top_n``, ``user_based_knn`` and ``top_ns``.
    """

    def setUp(self):
        self.data = DataIO(verbose=False)
        self.data.load('testdata/ratings-ma4.csv')
        self.model = UserModel(normalize=False, verbose=False)
        self.model.build(self.data)

    def _format_top_recs(self, u, count, **knn_options):
        """Return the top ``count`` recommendations for ``u`` as text.

        Runs ``user_based_knn`` with 5, ``pearson`` and
        ``promote_users=False`` over all items (extra keyword options are
        passed through) and renders the result of ``top_ns`` as
        ``item,score,item,score,...`` with scores to three decimals.
        """
        R = user_based_knn(self.model,
                           5, [self.data.new_user_idx(u)],
                           range(self.data.num_items()),
                           pearson,
                           promote_users=False,
                           **knn_options)
        recs = top_ns([R], count, keep_order=True)
        return ','.join(['%d,%.3f' % (self.data.old_item_idx(a), b)
                         for (a, b) in recs[0]])

    def test_pearson(self):
        """Check the maximum, the diagonal and two known coefficients."""
        # test correlation
        S = pearson(self.model.R(), self.model.R()).todense()
        # 1. check we don't have numbers more than 1
        # use string comparison to avoid float nuances
        self.assertEqual('%.2f' % S.max(), '1.00')

        # 2. check there are only '1' on the diagonal
        # Entries are tested one by one; a sum-equals-size comparison cannot
        # tell a correct diagonal from one whose errors cancel.
        for k in range(S.shape[0]):
            self.assertAlmostEqual(S[k, k], 1.0)

        # 3. check a couple of correlation coefficients
        corr_test = [(1648, 5136, 0.40298), (918, 2824, -0.31706)]
        for (u1, u2, c) in corr_test:
            idx1 = self.data.new_user_idx(u1)
            idx2 = self.data.new_user_idx(u2)
            expected = '%.5f' % c
            # check precomputed
            self.assertEqual('%.5f' % S[idx1, idx2], expected)
            # compute here; index the 1x1 result explicitly instead of
            # relying on implicit (deprecated) matrix-to-scalar conversion
            # inside the %-format.
            single = pearson(self.model.R()[idx1, :],
                             self.model.R()[idx2, :]).todense()
            self.assertEqual('%.5f' % single[0, 0], expected)

    def test_5nn(self):
        """Check the five strongest neighbours kept for user 3712."""
        u = 3712
        nns = [(2824, 0.46291), (3867, 0.400275), (5062, 0.247693),
               (442, 0.22713), (3853, 0.19366)]
        S = pearson(self.model.R(), self.model.R())
        # Result is discarded and S is read below, so this presumably edits
        # S in place (leave_top_n is defined elsewhere -- confirm).
        leave_top_n(S, 6)
        column = self.data.new_user_idx(u)
        top_neighbours = [
            (self.data.old_user_idx(i), S[i, column])
            for i in S[:, column].nonzero()[0]
        ]
        top_neighbours.sort(key=lambda a: a[1], reverse=True)
        # skip the first element (corr = 1)
        self.assertEqual(
            ','.join(['%d,%.6f' % a for a in top_neighbours[1:]]),
            ','.join(['%d,%.6f' % a for a in nns]))

    # consider moving this test to test_recsys.py
    def test_unnormalized(self):
        """Top-3 recommendations for user 3712 without a normalize option."""
        expected = [(641, 5.000), (603, 4.856), (105, 4.739)]
        self.assertEqual(self._format_top_recs(3712, 3),
                         ','.join(['%d,%.3f' % a for a in expected]))

    # consider moving this test to test_recsys.py
    def test_normalized(self):
        """Top-3 recommendations for user 3712 with ``normalize='normalize'``."""
        expected = [(641, 5.900), (603, 5.546), (105, 5.501)]
        self.assertEqual(
            self._format_top_recs(3712, 3, normalize='normalize'),
            ','.join(['%d,%.3f' % a for a in expected]))