def _convert_hash(self, dataset):
    """Flatten a nested hash ``{row: {col: value}}`` into a ``Data`` object.

    Every inner (col, value) pair becomes a ``(value, row, col)`` tuple,
    appended to the returned ``Data`` instance via ``set(..., extend=True)``.
    """
    converted = Data()
    for row_key in dataset:
        record = dataset[row_key]
        tuples = [(value, row_key, col_key) for col_key, value in record.items()]
        converted.set(tuples, extend=True)
    return converted
def test_data_extend():
    """``Data.set`` with ``extend=True`` appends instead of replacing."""
    # NOTE(review): an identical test_data_extend definition appears later
    # in this file and shadows this one under test discovery.
    first_batch = [(1, 2, 3), (4, 5, 6)]
    second_batch = [(7, 8, 9), (10, 11, 12)]
    data = Data()
    data.set(first_batch)
    assert_equal(len(data), 2)
    # Extending must keep the first two tuples and add two more.
    data.set(second_batch, extend=True)
    assert_equal(len(data), 4)
def test_data_extend():
    """Verify that ``extend=True`` grows an already-populated Data object."""
    initial = [(1, 2, 3), (4, 5, 6)]
    extra = [(7, 8, 9), (10, 11, 12)]
    d = Data()
    d.set(initial)
    assert_equal(len(d), 2)
    d.set(extra, extend=True)
    # 2 original tuples + 2 appended ones.
    assert_equal(len(d), 4)
def update(self, USER_ID, baseline, path, pred_items):
    """Fold a user's newly recommended items back into the persisted model.

    Adds ``pred_items`` to the user's total play count, renormalizes every
    existing (item, user) occurrence frequency by the new total, updates the
    in-memory matrix on ``baseline`` and rewrites the pickled artifacts
    stored under ``path``.

    :param USER_ID: id of the user being updated (compared as utf-8 str)
    :param baseline: model exposing get_data()/set_data()/save_data()/_matrix
    :param path: directory prefix containing the pickle files
    :param pred_items: iterable of (item_id, relevance) pairs to add
    """
    print("Loading tweet occurrences pickle...")
    baseline.get_data()._load_pickle(path=path + "tweet_occurrences.p")
    tweet_occurrences = baseline.get_data().get()

    # Fix: pickles are written with protocol 2 (binary), so they must be
    # read in binary mode; also use `with` so handles are always closed
    # (the originals were opened and never closed).
    print("Loading count_dict pickle...")
    with open(path + "count_dict.p", "rb") as f:
        count_dict = cPickle.load(f)
    print("Loading occurrences pickle...")
    with open(path + "occurrences.p", "rb") as f:
        occurrences = cPickle.load(f)

    total_count = count_dict[USER_ID]
    upd_total_count = int(total_count) + len(pred_items)
    count_dict[USER_ID] = int(upd_total_count)
    print("Dumping count_dict pickle...")
    with open(path + "count_dict.p", "wb") as f:
        cPickle.dump(count_dict, f, 2)

    print("Updating counts for known artists...")
    # Renormalize each of this user's existing occurrences by the new total.
    for index, (count, item_id, user_id) in enumerate(tweet_occurrences):
        if str(user_id).encode('utf-8') == USER_ID:
            item_id = str(item_id).encode('utf-8')
            count = occurrences[(item_id, USER_ID)]
            upd_count = float(count) / float(upd_total_count)
            occurrences[(item_id, USER_ID)] = float(upd_count)
            baseline._matrix.set_value(item_id, USER_ID, float(upd_count))
            tweet_occurrences[index] = (float(upd_count), item_id, user_id)

    print("Updating counts for recommended artists...")
    # Each newly recommended item starts with one (normalized) play.
    for item_id, relevance in pred_items:
        count = 1.0 / float(upd_total_count)
        baseline._matrix.set_value(item_id, USER_ID, float(count))
        occurrences[(item_id, USER_ID)] = float(count)
        tweet_occurrences.append((float(count), item_id, USER_ID))

    print("Dumping tweet occurrences pickle...")
    data_tweet_occurrences = Data()
    data_tweet_occurrences.set(tweet_occurrences)
    baseline.set_data(data_tweet_occurrences)
    baseline.save_data(filename=path + "tweet_occurrences.p", pickle=True)

    print("Dumping occurrence pickle...")
    with open(path + "occurrences.p", "wb") as f:
        cPickle.dump(occurrences, f, protocol=2)
    print("Dumping sparse matrix pickle...")
    # Fix: was opened with mode "w" (text) — protocol-2 pickles are binary.
    with open(path + "sparse_matrix.p", "wb") as f:
        cPickle.dump(baseline._matrix.get(), f, protocol=2)
def get_movie(movie_id):
    """Render the detail page for one movie.

    Fetches the movie row and its credits/genres from SQLite, the current
    session user's rating (when logged in), and a list of similar movies
    produced by factorizing the full ratings table with SVD.
    """
    movie = {}
    rating = 0
    with sqlite3.connect('data/data100.db') as con:
        cur = con.cursor()
        cur.execute("SELECT * FROM movies WHERE movie_id = ?", (movie_id,))
        movie_result = cur.fetchone()
        cur.execute("SELECT director FROM movie_directors WHERE movie_id = ?", (movie_id,))
        directors = cur.fetchall()
        cur.execute("SELECT actor FROM movie_actors WHERE movie_id = ?", (movie_id,))
        actors = cur.fetchall()
        cur.execute("SELECT writer FROM movie_writers WHERE movie_id = ?", (movie_id,))
        writers = cur.fetchall()
        cur.execute("SELECT genre FROM movie_genres WHERE movie_id = ?", (movie_id,))
        genres = cur.fetchall()
        if 'session_user' in request.cookies:
            # Element [0] of the signed cookie is used as the user id —
            # TODO confirm the cookie payload layout.
            cur.execute("SELECT * FROM ratings WHERE user_id = ? AND movie_id = ?", (request.get_cookie('session_user', secret='recsys')[0], movie_id,))
            rating = cur.fetchone()
        # NOTE(review): the SVD is rebuilt from the whole ratings table on
        # every request — consider caching the factorized model.
        cur.execute("SELECT * FROM ratings")
        rating_results = cur.fetchall()
        d = Data()
        d.set(rating_results)
        # with open('data/tmp.dat', 'a') as f:
        #     for l in rating_results:
        #         f.write('%d,%d,%d\n' % (l[0], l[1], l[2]))
        svd = SVD()
        # svd.load_data(filename='data/tmp.dat', sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids':int})
        svd.set_data(d)
        similar_list = [str(s[0]) for s in svd.similar(int(movie_id))]
        # NOTE(review): the IN (...) list is built by string interpolation;
        # the ids come from svd.similar(), not user input, but a
        # parameterized query would still be safer.
        cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % (', '.join(similar_list)))
        similar_movies = cur.fetchall()
        movie = {
            'mid': movie_result[0],
            'title': movie_result[1],
            'description': movie_result[2],
            'image': movie_result[3],
            'year': movie_result[4],
            'directors': [d[0] for d in directors],
            'writers': [w[0] for w in writers],
            'actors': [a[0] for a in actors],
            'genres': [g[0] for g in genres],
            # 0 for anonymous visitors, otherwise the full ratings row.
            'rating': rating,
            'similar_movies': similar_movies,
        }
    session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
    return template('static/movie.html', movie=movie, session_user=session_user)
def get_feeds():
    """Render the feeds page.

    If the session user has rated at least one movie, recommend movies via
    SVD over the full ratings table; otherwise list every movie keyed by
    its average rating.
    """
    movielist = {}
    with sqlite3.connect('data/data100.db') as con:
        cur = con.cursor()
        # Element [0] of the signed cookie is used as the user id —
        # TODO confirm the cookie payload layout.
        cur.execute("SELECT * FROM ratings WHERE user_id = ?", (request.get_cookie('session_user', secret='recsys')[0],))
        if cur.fetchone():
            # Column order is (value, row, col) for Data/SVD — presumably
            # (rating, movie_id, user_id); verify against the schema.
            cur.execute("SELECT ratings, movie_id, user_id FROM ratings")
            rating_results = cur.fetchall()
            d = Data()
            d.set(rating_results)
            # with open('data/tmp.dat', 'a') as f:
            #     for l in rating_results:
            #         f.write('%d,%d,%d\n' % (l[0], l[1], l[2]))
            svd = SVD()
            # svd.load_data(filename='data/tmp.dat', sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids':int})
            svd.set_data(d)
            recommendations = [str(s[0]) for s in svd.recommend(request.get_cookie('session_user', secret='recsys')[0], is_row=False)]
            # NOTE(review): string-built IN (...) clause; ids come from
            # svd.recommend(), not user input, but parameterizing is safer.
            cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % (', '.join(recommendations)))
            similar_movies = cur.fetchall()
            # Here movielist is keyed by the full movie row tuple.
            for m in similar_movies:
                movielist[m] = {
                    'mid': m[0],
                    'title': m[1],
                    'description': m[2],
                    'image': m[3],
                    'year': m[4]
                }
        else:
            cur.execute("SELECT * FROM movies")
            movies = cur.fetchall()
            for m in movies:
                cur.execute("SELECT AVG(ratings) FROM ratings WHERE movie_id = ?", (m[0],))
                avg = cur.fetchone()[0]
                # NOTE(review): keying by the average rating means movies
                # sharing the same average silently overwrite each other —
                # confirm this is intended.
                movielist[avg] = {
                    'mid': m[0],
                    'title': m[1],
                    'description': m[2],
                    'image': m[3],
                    'year': m[4]
                }
    session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
    return template('static/feeds.html', movielist=movielist, session_user=session_user)
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

# Load (col, row) pairs from the user/channel-favourite file and factorize.
svd = SVD()
data = Data()
data.load(path='../data/userchlfav',  # force=True,
          sep=',',
          format={'col': 0, 'row': 1, 'ids': int},  # 'value': 2,
          pickle=False)
print(len(data._data))
# Fix: removed a dead no-op loop ("for rate in data._data: rate[0]") that
# evaluated an expression and discarded it.
# Keep only the tuples whose row id is below 1000.
data.set([rate for rate in data._data if rate[1] < 1000])
print(len(data._data))

svd.set_data(data)
k = 100
svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True,
            post_normalize=True)
# ITEMID1 = 1     # Toy Story (1995)
# ITEMID2 = 2355  # A bug's life (1998)
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

# Tiny worked example: four explicit ratings in (value, row, col) form.
data = [
    (4.0, 'user1', 'item1'),
    (2.0, 'user1', 'item3'),
    (1.0, 'user2', 'item1'),
    (5.0, 'user2', 'item4'),
]

d = Data()
d.set(data)

svd = SVD()
svd.set_data(d)
m = svd.get_matrix()
# Factorize with two latent factors, then query the model.
svd.compute(k=2)

print(svd.similar('user1'))
print(svd.predict('user1', 'item1'))