class Algorithm(object):
    """Base recommender algorithm: holds a Data set and basic plumbing
    (load, add tuples, repr/len) shared by concrete algorithms."""

    def __init__(self):
        self._data = Data()

    def __repr__(self):
        s = '%d rows.' % len(self.get_data())
        if len(self.get_data()):
            s += '\nE.g: %s' % str(self.get_data()[0])
        return s

    def __len__(self):
        return len(self.get_data())

    def get_data(self):
        """:returns: the underlying Data instance."""
        return self._data

    def set_data(self, data):
        """Replace the underlying Data instance.

        :param data: a Data instance (tuples of <value, row, col>)
        """
        self._data = data

    def add_tuple(self, tuple):
        """Append one <value, row, col> tuple to the dataset."""
        self.get_data().add_tuple(tuple)

    def load_data(self, filename, sep='\t', format=None):
        """Load a dataset file into the internal Data instance.

        :param filename: path of the dataset file
        :param sep: field separator
        :param format: column layout; defaults to {'value':0, 'row':1, 'col':2}
        """
        # FIX: the default for `format` was a mutable dict literal shared
        # across all calls; use a None sentinel instead.
        if format is None:
            format = {'value': 0, 'row': 1, 'col': 2}
        self._data.load_file(filename, sep, format)

    def compute(self):
        # Fail fast when no tuples have been loaded/added.
        if not self._data.get():
            raise ValueError('No data set. Matrix is empty!')
def get_preference(user_List): #generate list of users
    """Fetch each Steam user's owned games and build play-time preferences.

    For every user id in user_List, queries the Steam GetOwnedGames API and
    records log(playtime) per owned game, both in a per-user dict and in a
    recsys Data object that is saved to 'rating.dat'.

    NOTE(review): the function builds `preference_dict` and `user_map` but
    never returns them — presumably it is used only for the side effect of
    writing 'rating.dat'; confirm against callers.
    """
    preference_dict={}
    user_map={}   # maps dense integer index i -> steam user id
    data = Data() #saving rating data
    i=1           # 1-based dense user index used as the matrix column
    for user in user_List:
        user_id=(str(user))  # NOTE(review): assigned but never used
        # SECURITY NOTE(review): API key is hard-coded in the URL; move it to
        # configuration/environment instead of source control.
        url = "http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/?" \
              "key=147CBF377C6B648EC3DC73499CE73D32&steamid="+user+"&format=json"
        response = urllib2.urlopen(url)
        # utf-8-sig strips a possible BOM from the API response
        owned_gameData = json.loads(response.read().decode('utf-8-sig'))
        user_Pref={}
        #print (user)
        try:
            if owned_gameData['response']['game_count']!=0:
                user_Pref={}
                for games in owned_gameData['response']['games']:
                    if games['playtime_forever']>0:
                        # NOTE(review): natural log here vs base-10 log below —
                        # the two stores use different log bases; confirm intended.
                        user_Pref[games['appid']]= math.log(games['playtime_forever'])
                        data.add_tuple((math.log(games['playtime_forever'], 10), games['appid'], i))
                user_map[i]=user
        except:
            # NOTE(review): bare except also swallows KeyboardInterrupt, and
            # `continue` skips the `i=i+1` below, so a failed user's index is
            # reused by the next user — verify this is intended.
            continue
        i=i+1
        preference_dict[user]=user_Pref
        # Saves after every user — acts as checkpointing, but re-writes the
        # whole file each iteration.
        data.save('rating.dat')
def ex1(dat_file='./ml-1m/ratings.dat', pct_train=0.5): data = Data() data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2,'ids':int}) # create train/test split train, test = data.split_train_test(percent=pct_train) # create svd K=100 svd = SVD() svd.set_data(train) svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True) # evaluate performance rmse = RMSE() mae = MAE() for rating, item_id, user_id in test.get(): try: pred_rating = svd.predict(item_id, user_id) rmse.add(rating, pred_rating) mae.add(rating, pred_rating) except KeyError: continue print 'RMSE=%s' % rmse.compute() print 'MAE=%s' % mae.compute()
def train_and_save(filename):
    """Train an SVD on `filename` (user::item::value rows) and save it as
    svdn_model_<step>.zip, where <step> is the filename's last extension.

    If a model zip for this step already exists it is loaded instead of
    retrained.
    """
    step = filename.split('.')[-1]
    data = Data()
    format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    data.load(filename, sep='::', format=format)
    # `test` is unused here; the split only carves out the training portion.
    train, test = data.split_train_test(percent=80)
    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Already exists: svdn_model_{step}.zip'.format(step=step))
    except Exception:
        # FIX: narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed. Any failure to load falls back to training.
        svd = SVD()
        svd.set_data(train)
        svd.compute(
            k=100, min_values=2, pre_normalize=False, mean_center=True,
            post_normalize=True, savefile='svdn_model_{step}'.format(step=step)
        )
        print('Saved svdn_model_{step}.zip'.format(step=step))
def _convert_hash(self, dataset):
    """Flatten a nested {row_key: {col_key: value}} mapping into a Data
    instance of <value, row, col> tuples."""
    converted = Data()
    for row_key in dataset:
        record = dataset[row_key]
        tuples = [(record[col_key], row_key, col_key) for col_key in record]
        converted.set(tuples, extend=True)
    return converted
def build_model(self, uids, kn):
    """Fit an SVD with `kn` latent factors on implicit (value=1) user->song
    listen data and store it on self.model."""
    listens = Data()
    for listener, songs in uids.items():
        for track in songs:
            listens.add_tuple((1, track, listener))
    factorizer = SVD()
    factorizer.set_data(listens)
    factorizer.compute(k=kn, min_values=1)
    self.model = factorizer
def test_data_extend():
    """Data.set(..., extend=True) must append to the existing tuples
    instead of replacing them."""
    first_batch = [(1, 2, 3), (4, 5, 6)]
    second_batch = [(7, 8, 9), (10, 11, 12)]
    container = Data()
    container.set(first_batch)
    assert_equal(len(container), 2)
    container.set(second_batch, extend=True)
    assert_equal(len(container), 4)
def load_ratings(filename):
    """Load a comma-separated ratings file (item, user, value with integer
    ids) into a recsys Data instance."""
    ratings = Data()
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': 'int'}
    ratings.load(filename, sep=',', format=layout)
    return ratings
def getAverageRating(ITEMID): averageRating = 0 totalUsers = 0 data = Data() data.load('./data/movielens/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids':int}) for rating, item_id, user_id in data.get(): if(item_id == ITEMID): totalUsers += 1 averageRating += rating print averageRating/totalUsers
def get_data_model_matrix(data):
    """
    This method process raw data and store rating/users/movies in a matrix
    <value/row/column> respectively using recsys library

    :return: data object (recsys.datamodel.Data()) )
    """
    matrix_data = Data()
    for user_id, reviews in data.items():
        for movie_id, rating in reviews.items():
            matrix_data.add_tuple((rating, user_id, movie_id))
    return matrix_data
def calculate_stats_features(pct_train):
    """Load feature_matrix.csv, split it, and fit a 100-factor SVD on the
    training portion. Returns (svd, train, test)."""
    source = Data()
    source.load('feature_matrix.csv', sep=',',
                format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = source.split_train_test(percent=pct_train)
    svd = SVD()
    svd.set_data(train)
    # No normalization/centering: the feature matrix is used as-is.
    svd.compute(k=100, min_values=0, pre_normalize=None,
                mean_center=False, post_normalize=False)
    return svd, train, test
def setup_svd(self, vote_list):
    """Build a Data set from vote_list and (re)fit the cached SVD on it.

    vote_list items look like (user, item_id, value), where vote[0] carries an
    `.id` attribute — presumably a user model object; verify against callers.
    """
    if self.svd is None:
        # NOTE(review): only creation is guarded by `self.svd is None`, but the
        # set_data/compute calls below always target self.cache['svd'] — if
        # self.svd is non-None while the cache entry is absent this raises
        # KeyError. Presumably the `svd` property reads self.cache['svd'];
        # confirm.
        self.cache['svd'] = SVD()
    data = Data()
    for vote in vote_list:
        user_id = vote[0].id
        item_id = vote[1]
        value = float(vote[2])
        data.add_tuple((value, item_id, user_id)) # Tuple format is: <value, row, column>
    self.cache['svd'].set_data(data)
    self.cache['svd'].compute(k=self.k, min_values=1)
    return self.svd
def get_friend_matrix(u_ids, raw_data):
    """Build a Data matrix of ratings for the given users, re-indexing users
    and their items to dense 1-based integer indices.

    Item indices restart at 1 for every user (as in the original layout)."""
    matrix = Data()
    user_pos = 0
    for u_id in u_ids:
        user_pos += 1
        item_pos = 0
        for i_id in raw_data[u_id].keys():
            item_pos += 1
            rate, ts = raw_data[u_id][i_id]
            matrix.add_tuple((float(rate), user_pos, item_pos))
    return matrix
def prepare_data(raw_data):
    """Convert {user: {item: (rate, timestamp)}} into a Data instance with
    dense 1-based user/item integer indices (item index restarts per user)."""
    converted = Data()
    user_index = 0
    for user_key in raw_data.keys():
        user_index += 1
        item_index = 0
        per_user = raw_data[user_key]
        for item_key in per_user.keys():
            item_index += 1
            rate, _ = per_user[item_key]
            converted.add_tuple((float(rate), user_index, item_index))
    return converted
def test_save_n_load(percent_train, modelKlass = SVD, dataFname ='/Users/jennyyuejin/recommender/Data/movieData/u.data', dataFormat = {'col':0, 'row':1, 'value':2, 'ids':int}): data = Data() data.load(dataFname, sep='\t', format=dataFormat) print '------ evaluating original' train, test = data.split_train_test(percent=percent_train, shuffle_data=False) print len(train), 'training data points;', len(test), 'testing data points' #Create SVD K=100 svd = modelKlass() svd.set_data(train) svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True) evaluate(svd, test) svd.save_model('./model/svd.obj.zip', {'k': K, 'min_values': 5, 'pre_normalize': None, 'mean_center': True, 'post_normalize': True}) print '------ evaluating copy' data2 = Data() data2.load(dataFname, sep='\t', format=dataFormat) _, test2 = data2.split_train_test(percent=percent_train, shuffle_data=False) # reload data print len(test2), 'testing data points' svd_pred = modelKlass() svd_pred.load_model('./model/svd.obj.zip') evaluate(svd_pred, test2)
def setUp(self):
    """Build the test fixtures: a Data set from the module-level `ratings`
    tuples and an Item catalogue from `movie_genres`."""
    rating_data = Data()
    for stars, item_id, user_id in ratings:
        rating_data.add_tuple((stars, item_id, user_id))
    catalogue = dict()
    for movie_id, title, genre_list in movie_genres:
        entry = Item(movie_id)
        entry.add_data({'name': title, 'genres': genre_list})
        catalogue[movie_id] = entry
    self.ratings = rating_data
    self.movies = catalogue
def read_user_data_from_ratings(data_file):
    """Load '::'-separated ratings from `data_file` and group them into
    User objects.

    :param data_file: ratings file path (col 0 = item, col 1 = user, col 2 = value)
    :returns: dict mapping user_id -> User holding all (item, rating) pairs
    """
    data = Data()
    format = {'col':0, 'row':1, 'value':2, 'ids': 'int'}
    # BUG FIX: the original referenced an undefined name `dat_file` instead of
    # the `data_file` parameter, raising NameError at runtime.
    data.load(data_file, sep='::', format=format)
    userdict = {}
    for d in data.get():
        # d is (value, item_id, user_id); reuse the User if already seen.
        if d[2] in userdict:
            user = userdict[d[2]]
        else:
            user = User(d[2])
        user.add_item(d[1], d[0])
        userdict[d[2]] = user
    return userdict
def update(self, USER_ID, baseline, path, pred_items):
    """Fold newly recommended items for USER_ID back into the persisted
    occurrence counts, renormalizing the user's existing frequencies.

    Loads three pickles from `path` (tweet_occurrences, count_dict,
    occurrences), bumps the user's total count by len(pred_items), rescales the
    user's existing per-item frequencies to the new total, inserts the new
    items with frequency 1/new_total, and writes everything (plus the baseline
    sparse matrix) back to disk.
    """
    print "Loading tweet occurrences pickle..."
    baseline.get_data()._load_pickle(path=path + "tweet_occurrences.p")
    tweet_occurrences = baseline.get_data().get()
    print "Loading count_dict pickle..."
    count_dict = cPickle.load(open(path + "count_dict.p"))
    print "Loading occurrences pickle..."
    occurrences = cPickle.load(open(path + "occurrences.p"))
    # New denominator: previous total plus the recommended items.
    total_count = count_dict[USER_ID]
    upd_total_count = int(total_count) + len(pred_items)
    count_dict[USER_ID] = int(upd_total_count)
    print "Dumping count_dict pickle..."
    cPickle.dump(count_dict, open(path + "count_dict.p", "wb"), 2)
    print "Updating counts for known artists..."
    for index, (count, item_id, user_id) in enumerate(tweet_occurrences):
        if str(user_id).encode('utf-8') == USER_ID:
            item_id = str(item_id).encode('utf-8')
            # NOTE(review): the tuple's own `count` is discarded and replaced
            # by the occurrences-dict value before rescaling — presumably
            # occurrences holds the raw counts; confirm.
            count = occurrences[(item_id, USER_ID)]
            upd_count = float(count) / float(upd_total_count)
            occurrences[(item_id, USER_ID)] = float(upd_count)
            baseline._matrix.set_value(item_id, USER_ID, float(upd_count))
            tweet_occurrences[index] = (float(upd_count), item_id, user_id)
    print "Updating counts for recommended artists..."
    for item_id, relevance in pred_items:
        # Each newly recommended item gets a single synthetic occurrence.
        count = (1.0 / float(upd_total_count))
        baseline._matrix.set_value(item_id, USER_ID, float(count))
        occurrences[(item_id, USER_ID)] = float(count)
        tweet_occurrences.append((float(count), item_id, USER_ID))
    print "Dumping tweet occurrences pickle..."
    data_tweet_occurrences = Data()
    data_tweet_occurrences.set(tweet_occurrences)
    baseline.set_data(data_tweet_occurrences)
    baseline.save_data(filename=path + "tweet_occurrences.p", pickle=True)
    print "Dumping occurrence pickle..."
    cPickle.dump(occurrences, open(path + "occurrences.p", "wb"), protocol=2)
    print "Dumping sparse matrix pickle..."
    # NOTE(review): opened in text mode "w" while protocol=2 is binary — on
    # Windows this can corrupt the pickle; "wb" is likely intended.
    cPickle.dump(baseline._matrix.get(), open(path + "sparse_matrix.p", "w"), protocol=2)
def get_movie(movie_id):
    """Bottle route handler: render the detail page for one movie, with
    credits, genres, the logged-in user's rating (if any) and SVD-based
    'similar movies'."""
    movie = {}
    rating = 0
    with sqlite3.connect('data/data100.db') as con:
        cur = con.cursor()
        cur.execute("SELECT * FROM movies WHERE movie_id = ?", (movie_id,))
        movie_result = cur.fetchone()
        cur.execute("SELECT director FROM movie_directors WHERE movie_id = ?", (movie_id,))
        directors = cur.fetchall()
        cur.execute("SELECT actor FROM movie_actors WHERE movie_id = ?", (movie_id,))
        actors = cur.fetchall()
        cur.execute("SELECT writer FROM movie_writers WHERE movie_id = ?", (movie_id,))
        writers = cur.fetchall()
        cur.execute("SELECT genre FROM movie_genres WHERE movie_id = ?", (movie_id,))
        genres = cur.fetchall()
        if 'session_user' in request.cookies:
            # Logged-in user: fetch their personal rating for this movie.
            cur.execute("SELECT * FROM ratings WHERE user_id = ? AND movie_id = ?", (request.get_cookie('session_user', secret='recsys')[0], movie_id,))
            rating = cur.fetchone()
        cur.execute("SELECT * FROM ratings")
        rating_results = cur.fetchall()
        # NOTE(review): the SVD is rebuilt from the full ratings table on every
        # request — consider caching the factorization.
        d = Data()
        d.set(rating_results)
        # with open('data/tmp.dat', 'a') as f:
        #     for l in rating_results:
        #         f.write('%d,%d,%d\n' % (l[0], l[1], l[2]))
        svd = SVD()
        # svd.load_data(filename='data/tmp.dat', sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids':int})
        svd.set_data(d)
        similar_list = [str(s[0]) for s in svd.similar(int(movie_id))]
        # NOTE(review): ids interpolated into the IN(...) clause; they come from
        # our own ratings table so are presumably numeric, but a parameterized
        # query would be safer.
        cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % (', '.join(similar_list)))
        similar_movies = cur.fetchall()
        movie = {
            'mid': movie_result[0],
            'title': movie_result[1],
            'description': movie_result[2],
            'image': movie_result[3],
            'year': movie_result[4],
            'directors': [d[0] for d in directors],
            'writers': [w[0] for w in writers],
            'actors': [a[0] for a in actors],
            'genres': [g[0] for g in genres],
            'rating': rating,
            'similar_movies': similar_movies,
        }
    session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
    return template('static/movie.html', movie=movie, session_user=session_user)
def build_svd_item_based(user_op_item_cnt, item_op_users, user_idx, item_idx, min_nonzero):
    """Assemble an (unfitted) SVD over per-user item counts.

    Filters out users with fewer than min_nonzero items, items seen by fewer
    than min_nonzero users, and zero counts. Returns (svd, unique item list).
    """
    svd = SVD()
    counts = Data()
    kept_items = []
    for user in user_op_item_cnt:
        if len(user_op_item_cnt[user]) < min_nonzero:
            continue  # user touches too few distinct items
        for item in user_op_item_cnt[user]:
            if item_op_users[item] < min_nonzero:
                continue  # item seen by too few users
            if 1.0 * user_op_item_cnt[user][item] < 1:
                continue  # effectively zero count
            kept_items.append(item)
            counts.add_tuple((1.0 * user_op_item_cnt[user][item],
                              item_idx[item], user_idx[user]))
    item_lst = list(set(kept_items))
    svd.set_data(counts)
    return svd, item_lst
def test_utf8_data():
    """Saving and reloading a Data set containing non-ASCII item ids must
    preserve the number of tuples."""
    original = Data()
    original.add_tuple([69, u'Bj\xf6rk', USERID1])
    original.add_tuple([34, 'Björk', USERID2])
    saved_path = os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.saved.utf8')
    original.save(saved_path)
    reloaded = Data()
    reloaded.load(saved_path)
    assert_equal(len(original), len(reloaded))
def get_mae_rmse(step):
    """Evaluate a previously saved SVD model against the 20% test split of
    'second_train_test.dat.<step>'.

    :returns: (mae, rmse) floats (np.nan if no predictions succeeded), or
        None if the model zip cannot be loaded.
    """
    data = Data()
    format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    filename = 'second_train_test.dat.{step}'.format(step=step)
    data.load(filename, sep='::', format=format)
    train, test = data.split_train_test(percent=80)
    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Loading model... {step}'.format(step=step))
    except Exception:
        # FIX: narrowed from a bare except; missing/corrupt model -> no result.
        return
    # FIX: the original kept two identical (rating, prediction) lists for MAE
    # and RMSE; one list feeds both metrics.
    predicted = []
    for rating, item_id, user_id in test:
        try:
            predicted.append((rating, svd.predict(item_id, user_id)))
        except Exception:
            # user/item unseen by the model: skip this pair
            pass
    mae_value, rmse_value = np.nan, np.nan
    if len(predicted) > 0:
        mae_value = MAE(predicted).compute()
        rmse_value = RMSE(predicted).compute()
    return mae_value, rmse_value
def build_svd_cat_based(user_op_cat_cnt, cat_op_users, user_idx, cat_idx, min_nonzero): svd = SVD() data = Data() cat_lst = [] for ui in user_op_cat_cnt: if len(user_op_cat_cnt[ui]) < min_nonzero: continue for ci in user_op_cat_cnt[ui]: if cat_op_users[ci] < min_nonzero: continue if 1.0*user_op_cat_cnt[ui][ci] < 1: continue cat_lst.append(ci) data.add_tuple(((1.0*user_op_cat_cnt[ui][ci]), cat_idx[ci], user_idx[ui])) cat_lst = list(set(cat_lst)) print 'cat =', len(cat_lst) svd.set_data(data) return svd, cat_lst
def similar_users(user):
    """Return ids of users most similar to `user`, via an SVD over the shared
    user::file matrix accumulated in './dc_recom.dat'."""
    if not type(user) is str:
        user = unidecode.unidecode(user)  # normalize unicode names to ASCII
    if db.done_users.find_one({'user':user})['recommended']==False:
        # First time this user is seen: append their file list to the shared
        # ratings file and mark them processed.
        # NOTE(review): the file is opened in append mode and never
        # deduplicated, so it grows across runs.
        user_files = db.user_list.find({'user':user})
        f = open('./dc_recom.dat','a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {'user':user, 'recommended': True})
    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col':1,'row':0})
    svd = SVD()
    svd.set_data(data)
    # NOTE(review): full k=1000 factorization recomputed on every call —
    # expensive; consider persisting the model.
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    return [i[0] for i in svd.similar(user)]
def get_feeds():
    """Bottle route handler: build the user's movie feed.

    If the logged-in user has at least one rating, recommend movies via an SVD
    over the full ratings table; otherwise fall back to listing all movies
    keyed by their average rating.
    """
    movielist = {}
    with sqlite3.connect('data/data100.db') as con:
        cur = con.cursor()
        cur.execute("SELECT * FROM ratings WHERE user_id = ?", (request.get_cookie('session_user', secret='recsys')[0],))
        if cur.fetchone():
            # Personalized branch: factorize all ratings on each request.
            # NOTE(review): expensive; consider caching the model.
            cur.execute("SELECT ratings, movie_id, user_id FROM ratings")
            rating_results = cur.fetchall()
            d = Data()
            d.set(rating_results)
            # with open('data/tmp.dat', 'a') as f:
            #     for l in rating_results:
            #         f.write('%d,%d,%d\n' % (l[0], l[1], l[2]))
            svd = SVD()
            # svd.load_data(filename='data/tmp.dat', sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids':int})
            svd.set_data(d)
            recommendations = [str(s[0]) for s in svd.recommend(request.get_cookie('session_user', secret='recsys')[0], is_row=False)]
            # NOTE(review): ids interpolated into the IN(...) clause; they come
            # from our own ratings table so are presumably numeric, but a
            # parameterized query would be safer.
            cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % (', '.join(recommendations)))
            similar_movies = cur.fetchall()
            for m in similar_movies:
                # NOTE(review): the dict is keyed by the whole row tuple here,
                # but by the average rating in the fallback branch — confirm the
                # template only iterates values.
                movielist[m] = {
                    'mid': m[0],
                    'title': m[1],
                    'description': m[2],
                    'image': m[3],
                    'year': m[4]
                }
        else:
            # Cold-start branch: order all movies by their average rating.
            cur.execute("SELECT * FROM movies")
            movies = cur.fetchall()
            for m in movies:
                cur.execute("SELECT AVG(ratings) FROM ratings WHERE movie_id = ?", (m[0],))
                avg = cur.fetchone()[0]
                movielist[avg] = {
                    'mid': m[0],
                    'title': m[1],
                    'description': m[2],
                    'image': m[3],
                    'year': m[4]
                }
    session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
    return template('static/feeds.html', movielist=movielist, session_user=session_user)
def load_data(self, filename, force=True, sep='\t', format=None, pickle=False):
    """
    Loads a dataset file

    See params definition in *datamodel.Data.load()*

    :param format: column layout; defaults to {'value':0, 'row':1, 'col':2}
    """
    # FIX: the default for `format` was a mutable dict literal shared across
    # all calls; use a None sentinel instead.
    if format is None:
        format = {'value': 0, 'row': 1, 'col': 2}
    if force:
        # Discard any previous data and the cached similarity matrix.
        self._data = Data()
        self._matrix_similarity = None
    self._data.load(filename, force, sep, format, pickle)
def calculate_stats_users(pct_train): dat_file = 'user_data_working.csv' data = Data() data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2,'ids':int}) train, test = data.split_train_test(percent=pct_train) svd = SVD() svd.set_data(train) svd.compute(k=100, min_values=2, pre_normalize=None, mean_center=True, post_normalize=False) rmse = RMSE() mae = MAE() for rating, item_id, user_id in test.get(): try: pred_rating = svd.predict(item_id, user_id) rmse.add(rating, pred_rating) mae.add(rating, pred_rating) except KeyError: continue print 'RMSE=%s' % rmse.compute() print 'MAE=%s\n' % mae.compute()
def parse_data():
    """Load the MovieLens 1M ratings, perform an 80/20 split, and pickle the
    full dataset under the add directory."""
    ratings_path = '../data/ml-1m/ratings.dat'
    ratings = Data()
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    ratings.load(ratings_path, sep='::', format=layout)
    train, test = ratings.split_train_test(percent=80) # 80% train, 20% test
    # Note: the split results are not returned; only the full dataset is saved.
    ratings.save(os.path.join(utils.get_add_dir(), 'ratings'), pickle=True)
def main(): svd = SVD() train = Data() test = Data() train.load('randUser/rate1.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int}) test.load('randUser/rate1.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int}) svd.set_data(train) svd.compute(k=100, min_values=0.5, pre_normalize=False, mean_center=True, post_normalize=True) # rmse = RMSE() # mae = MAE() # for rating, item_id, user_id in test.get(): # try: # pred_rating = svd.predict(item_id, user_id) # rmse.add(rating, pred_rating) # mae.add(rating, pred_rating) # except KeyError: # continue # print 'RMSE=%s' % rmse.compute() # print 'MAE=%s' % mae.compute() # test = make_test() # print precision_and_recall(test, svd) # rec_list = svd.recommend(200, n=5, only_unknowns=False, is_row=False) print svd.recommend(1, n=5, only_unknowns=False, is_row=False)
def recommended_files(user):
    """Recommend up to ~10 magnet links for `user`.

    Pipeline: (1) register the user's files in './dc_recom.dat' if unseen,
    (2) SVD over user::file data to find the 10 most similar users, (3) second
    SVD over those users' files to recommend unknown files, (4) deduplicate by
    fuzzy filename similarity and format results as magnet URIs.
    """
    if not type(user) is str:
        user = unidecode.unidecode(user)  # normalize unicode names to ASCII
    if db.done_users.find_one({'user':user})['recommended']==False:
        # First time this user is seen: append their file list to the shared
        # ratings file (append mode: file grows across runs) and mark done.
        user_files = db.user_list.find({'user':user})
        f = open('./dc_recom.dat','a')
        for u in user_files:
            f.write(u['user'] + '::' + u['tth'])
            f.write('\n')
        f.close()
        db.done_users.update({'user': user}, {'user':user, 'recommended': True})
    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col':1,'row':0})
    svd = SVD()
    svd.set_data(data)
    # NOTE(review): two full k=1000 factorizations per call — expensive.
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    similar_users = [i[0] for i in svd.similar(user,n=10)]
    # Second pass: implicit (value=1.0) user x file matrix restricted to the
    # similar users' files.
    newdata = Data()
    for i in range(0,len(similar_users),1):
        files = db.user_list.find({'user':similar_users[i]})
        for f in files:
            newdata.add_tuple((1.0,similar_users[i],f['tth']))
    svd.set_data(newdata)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    recoms = svd.recommend(user,is_row=True,only_unknowns=True,n=100)
    res = []
    c_res = 0
    for p in recoms:
        # Deduplicate against already-kept results by fuzzy filename match.
        flag=0
        for r in res:
            if similar(db.tths.find_one({'tth':p[0]})['name'],db.tths.find_one({'tth':r[0]})['name']):
                flag = 1
                break
        if flag == 0:
            res.append(p)
            c_res += 1
        if c_res > 10:
            # Early exit once enough unique results are collected.
            k = []
            for i in res:
                try:
                    # Append the display name when the tth is known.
                    j = 'magnet:?xt=urn:tree:tiger:'+i[0] + "&dn=" + unidecode.unidecode(db.tths.find_one({'tth': i[0]})['name'])
                except:
                    # NOTE(review): bare except — presumably guards a missing
                    # db entry (TypeError on None); confirm.
                    j = 'magnet:?xt=urn:tree:tiger:'+i[0]
                k.append(j)
            return k
    # Fewer than the cutoff found: return whatever was collected.
    # NOTE(review): this magnet-building loop duplicates the one above.
    k = []
    for i in res:
        try:
            j = 'magnet:?xt=urn:tree:tiger:'+i[0] + "&dn=" + unidecode.unidecode(db.tths.find_one({'tth': i[0]})['name'])
        except:
            j = 'magnet:?xt=urn:tree:tiger:'+i[0]
        k.append(j)
    return k
from recsys.algorithm.factorize import SVD from recsys.datamodel.data import Data data = [(4.0, 'user1', 'item1'), (2.0, 'user1', 'item3'), (1.0, 'user2', 'item1'), (5.0, 'user2', 'item4')] d = Data() d.set(data) svd = SVD() svd.set_data(d) m = svd.get_matrix() svd.compute(k=2) print svd.similar('user1') print svd.predict('user1', 'item1')
class Collaborative_filtering(object):
    """SVD-based collaborative filtering over a ratings CSV using python-recsys.

    `movies` is expected to be a pandas DataFrame with at least `movie_id` and
    `title` columns. NOTE(review): several methods below read a module-level
    `movies` instead of `self.movies` — confirm both refer to the same frame.
    """
    def __init__(self, ratings_file, movies):
        #No need to pass as ,will be provided in views.py
        #self.users = users
        self.movies = movies
        self.K = 100               # number of latent factors
        self.PERCENT_TRAIN = 85    # train split percentage
        #Need to provide a default file location for ratings.csv instead of loading everytime.run below 2lines only once
        #or just provide this file instead.
        #self.users.to_csv("/home/sourabhkondapaka/Desktop/ratingsss.csv",index= False)
        self.ratings_file = ratings_file #Give your path to ratings.csv created from above 2 lines.
        self.data = None
        self.svd = None
        self.recommend_movies_list = None
        self.recommend_movies_ids = None
        self.similar_movies_list = None
        self.similar_movies_ids = None
        self.movie_id = None
        self.train = None
        self.test = None

    def compute_svd(self):
        '''
        ratings = pd.read_csv("/home/sourabhkondapaka/Desktop/ratingsss.csv",index_col= False)
        ratings = ratings.ix[1:]
        ratings.to_csv("/home/sourabhkondapaka/Desktop/ratingsss.csv",index = False)
        self.data = Data()
        self.data.load(self.ratings_file, sep=',', format={'col':0, 'row':1 ,'value':2, 'ids':float})
        self.train , self.test = self.data.split_train_test(percent=self.PERCENT_TRAIN)
        self.svd = SVD()
        self.svd.set_data(self.train)
        self.svd.compute(k=self.K, min_values=1, pre_normalize=None, mean_center=True, post_normalize=True)'''
        # (The docstring above is a dead alternative implementation kept verbatim.)
        self.data = Data()
        self.data.load(self.ratings_file, sep=',', format={
            'col': 0,
            'row': 1,
            'value': 2,
            'ids': float
        })
        # NOTE(review): hard-codes 85 and k=100 instead of using
        # self.PERCENT_TRAIN / self.K declared in __init__, and uses
        # SVDNeighbourhood rather than plain SVD — confirm intended.
        self.train, self.test = self.data.split_train_test(percent=85)
        self.svd = SVDNeighbourhood()
        self.svd.set_data(self.train)
        self.svd.compute(k=100, min_values=1, pre_normalize=None,
                         mean_center=False, post_normalize=True)

    def similarity_measure(self, movie1, movie2): #gives a similarity measure value between -1 to 1
        return round(self.svd.similarity(movie1, movie2), 4)

    def recommend_movies(self, user_id):
        """Top-10 unknown-movie recommendations for user_id.

        :returns: (list of movie titles, list of movie ids)
        """
        l = self.svd.recommend(user_id, n=10, only_unknowns=True, is_row=False)
        self.recommend_movies_list = []
        self.recommend_movies_ids = []
        for p in l:
            #movie names
            # NOTE(review): extracts the title by parsing str(Series) output and
            # relies on the deprecated pandas `.ix` indexer (removed in pandas
            # 1.0) and the module-level `movies`; fragile — consider .loc and
            # direct value access.
            bb = str(movies.ix[movies['movie_id'] == p[0]]['title']).split()
            q = bb.index('Name:')
            bb = ' '.join(bb[1:q])
            self.recommend_movies_list.append(bb)
            #movie ids
            gg = movies.ix[movies['movie_id'] == p[0]]
            gg = gg.reset_index()
            del gg['index']
            # .as_matrix is deprecated in newer pandas (use .values) — noted,
            # left unchanged.
            gg = gg.ix[:, 0:2].as_matrix(columns=None).tolist()
            self.recommend_movies_ids.append(gg[0][0])
        return self.recommend_movies_list, self.recommend_movies_ids

    def get_similar_movies(self, movie1): #Returns a PYTHON list for similar movies.
        movie1 = int(movie1)
        l = self.svd.similar(movie1)
        self.similar_movies_list = []
        self.similar_movies_ids = []
        # Drop the first entry: a movie is always most similar to itself.
        l = l[1:]
        for p in l:
            #getting movie names
            bb = str(movies.ix[movies['movie_id'] == p[0]]['title']).split()
            q = bb.index('Name:')
            bb = ' '.join(bb[1:q])
            self.similar_movies_list.append(bb)
            #getting movie id's
            self.similar_movies_ids.append(p[0])
        return self.similar_movies_list, self.similar_movies_ids
class Algorithm(object):
    """
    Base class Algorithm

    It has the basic methods to load a dataset, get the matrix and the raw
    input data, add more data (tuples), etc.

    Any other Algorithm derives from this base class
    """
    def __init__(self):
        self._data = Data()
        self._matrix = SparseMatrix()
        self._matrix_similarity = None #self-similarity matrix (only for the input Matrix rows)
        self._matrix_and_data_aligned = False #both Matrix and Data contain the same info?

    def __len__(self):
        return len(self.get_data())

    def __repr__(self):
        s = '%d rows.' % len(self.get_data())
        if len(self.get_data()):
            s += '\nE.g: %s' % str(self.get_data()[0])
        return s

    def get_matrix(self):
        """
        :returns: matrix *M*
        """
        # Lazily build the matrix from the raw data on first access.
        if not self._matrix.get():
            self.create_matrix()
        return self._matrix

    def get_matrix_similarity(self):
        """
        :returns: the self-similarity matrix
        """
        return self._matrix_similarity

    def set_data(self, data):
        """
        Sets the raw dataset (input for matrix *M*)

        :param data: a Dataset class (list of tuples <value, row, col>)
        :type data: Data
        """
        #self._data = Data()
        #self._data.set(data)
        self._data = data
        self._matrix_and_data_aligned = False

    def get_data(self):
        """
        :returns: An instance of Data class. The raw dataset (input for matrix *M*).
        """
        return self._data

    def add_tuple(self, tuple):
        """
        Add a tuple in the dataset

        :param tuple: a tuple containing <rating, user, item> information. Or,
            more general: <value, row, col>
        """
        self.get_data().add_tuple(tuple)
        self._matrix_and_data_aligned = False

    def load_data(self, filename, force=True, sep='\t', format={'value':0, 'row':1, 'col':2}, pickle=False):
        """
        Loads a dataset file

        See params definition in *datamodel.Data.load()*
        """
        # NOTE(review): `format` default is a mutable dict literal shared
        # across calls; safe only because it is never mutated here.
        if force:
            self._data = Data()
            self._matrix_similarity = None
        self._data.load(filename, force, sep, format, pickle)

    def save_data(self, filename, pickle=False):
        """
        Saves the dataset in divisi2 matrix format (i.e: value <tab> row <tab> col)

        :param filename: file to store the data
        :type filename: string
        :param pickle: save in pickle format?
        :type filename: boolean
        """
        self._data.save(filename, pickle)

    def create_matrix(self):
        # Build self._matrix from the raw data; accepts either a Data object
        # (has .get()) or a plain list of tuples (AttributeError fallback).
        if VERBOSE:
            sys.stdout.write('Creating matrix (%s tuples)\n' % len(self._data))
        try:
            self._matrix.create(self._data.get())
        except AttributeError:
            self._matrix.create(self._data)
        if VERBOSE:
            sys.stdout.write("Matrix density is: %s%%\n" % self._matrix.density())
        self._matrix_and_data_aligned = True

    def compute(self, min_values=None):
        # Guard: raise when there is no usable input at all (empty matrix AND
        # empty data, whether data is a Data object or a plain list).
        if self._matrix.empty() and (not isinstance(self._data, list) and not self._data.get()):
            raise ValueError('No data set. Matrix is empty!')
        if self._matrix.empty() and (isinstance(self._data, list) and not self._data):
            raise ValueError('No data set. Matrix is empty!')
        # NOTE(review): this condition rebuilds the matrix when it is
        # NON-empty or out of sync with the data; verify `not
        # self._matrix.empty()` is intended rather than `self._matrix.empty()`.
        if not self._matrix.empty() or not self._matrix_and_data_aligned:
            self.create_matrix()
        if min_values:
            if VERBOSE:
                sys.stdout.write('Updating matrix: squish to at least %s values\n' % min_values)
            # squish() drops rows/cols with fewer than min_values entries.
            self._matrix.set(self._matrix.get().squish(min_values))

    def _get_row_similarity(self, i):
        # Ensure the similarity matrix exists before looking up row i.
        if not self.get_matrix_similarity() or self.get_matrix_similarity().get() is None:
            self.compute()
        try:
            return self.get_matrix_similarity().get_row(i)
        except KeyError:
            raise KeyError("%s not found!" % i)

    def similar(self, i, n=10):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param n: number of similar elements
        :type n: int
        :returns: the most similar elements of *i*
        """
        if not self.get_matrix_similarity() or self.get_matrix_similarity().get() is None:
            self.compute()
        return self._get_row_similarity(i).top_items(n)

    def similarity(self, i, j):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param j: a row in *M*
        :type j: user or item id
        :returns: the similarity between the two elements *i* and *j*
        """
        if not self.get_matrix_similarity() or self.get_matrix_similarity().get() is None:
            self.compute()
        return self.get_matrix_similarity().value(i, j)

    def predict(self, i, j, MIN_VALUE=None, MAX_VALUE=None):
        # Abstract: concrete algorithms must implement prediction.
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def recommend(self, i, n=10):
        # Abstract: concrete algorithms must implement recommendation.
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    ### OTHER METHODS ###
    def _cosine(self, v1, v2):
        # Cosine similarity between two divisi2 vectors.
        return float(divisi2.dot(v1,v2) / (norm(v1) * norm(v2)))

    def centroid(self, ids, are_rows=True):
        """Return the mean point (centroid) of the given row (or col) ids."""
        if VERBOSE:
            sys.stdout.write('Computing centroid for ids=%s\n' % str(ids))
        points = []
        for id in ids:
            if are_rows:
                point = self.get_matrix().get_row(id)
            else:
                point = self.get_matrix().get_col(id)
            points.append(point)
        M = divisi2.SparseMatrix(points)
        return M.col_op(sum)/len(points) #TODO numpy.sum seems slower?

    def _kinit(self, X, k):
        #Init k seeds according to kmeans++
        n = X.shape[0]
        #Choose the 1st seed randomly, and store D(x)^2 in D[]
        centers = [X[randint(0, n-1)]]
        D = [norm(x-centers[0])**2 for x in X]
        for _ in range(k-1):
            bestDsum = bestIdx = -1
            for i in range(n):
                #Dsum = sum_{x in X} min(D(x)^2,||x-xi||^2)
                Dsum = reduce(lambda x,y:x+y, (min(D[j], norm(X[j]-X[i])**2) for j in xrange(n)))
                if bestDsum < 0 or Dsum < bestDsum:
                    bestDsum, bestIdx = Dsum, i
            centers.append(X[bestIdx])
            # Update each point's distance to its nearest chosen seed.
            D = [min(D[i], norm(X[i]-X[bestIdx])**2) for i in xrange(n)]
        return array(centers)

    def kmeans(self, id, k=5, is_row=True):
        """
        K-means clustering. http://en.wikipedia.org/wiki/K-means_clustering

        Clusterizes the (cols) values of a given row, or viceversa

        :param id: row (or col) id to cluster its values
        :param k: number of clusters
        :param is_row: is param *id* a row (or a col)?
        :type is_row: Boolean
        """
        # TODO: switch to Pycluster?
        # http://pypi.python.org/pypi/Pycluster
        if VERBOSE:
            sys.stdout.write('Computing k-means, k=%s, for id %s\n' % (k, id))
        point = None
        if is_row:
            point = self.get_matrix().get_row(id)
        else:
            point = self.get_matrix().get_col(id)
        points = []
        points_id = []
        # Collect the opposite-axis vectors for every nonzero entry of `point`.
        for i in point.nonzero_entries():
            label = point.label(i)
            points_id.append(label)
            if not is_row:
                points.append(self.get_matrix().get_row(label))
            else:
                points.append(self.get_matrix().get_col(label))
        #return kmeans(array(points), k)
        if VERBOSE:
            sys.stdout.write('id %s has %s points\n' % (id, len(points)))
        M = array(points)
        MAX_POINTS = 150
        # Only apply Matrix initialization if num. points is not that big!
        if len(points) <= MAX_POINTS:
            # kmeans++ seeding is O(k*n^2); only worth it for small point sets.
            centers = self._kinit(array(points), k)
            centroids, labels = kmeans2(M, centers, minit='matrix')
        else:
            centroids, labels = kmeans2(M, k, minit='random')
        i = 0
        clusters = dict()
        for cluster in labels:
            if not clusters.has_key(cluster):
                clusters[cluster] = dict()
                clusters[cluster]['centroid'] = centroids[cluster]
                clusters[cluster]['points'] = []
            clusters[cluster]['points'].append(points_id[i])
            i += 1
        return clusters
import sqlite3
import recsys.algorithm
recsys.algorithm.VERBOSE = True
from recsys.algorithm.factorize import SVD
from recsys.evaluation.prediction import RMSE, MAE
from recsys.datamodel.data import Data
from recsys.datamodel.item import Item
from recsys.datamodel.user import User

# Load '|'-separated beer ratings: col 0 = beer id, row 1 = user, value 2 = float rating.
data = Data()
data.load("../data/ratings.tsv", sep='|', format={
    'col': 0,
    'row': 1,
    'value': 2,
    'ids': float
})

# Factorize with 100 latent factors, mean-centered.
K = 100
svd = SVD()
svd.set_data(data)
svd.compute(k=K, min_values=0.1, pre_normalize=None, mean_center=True, post_normalize=True)

# NOTE(review): the expression below is truncated in this chunk (trailing line
# continuation); `beers` is presumably a dict of Item objects defined elsewhere.
[(beers[b].get_data()['name'], b, val) for b, val in svd.similar(1502, 50)\
import sys #To show some messages: import recsys.algorithm #recsys.algorithm.VERBOSE = True from recsys.algorithm.factorize import SVD from recsys.datamodel.data import Data from recsys.evaluation.prediction import RMSE, MAE from recsys.evaluation.decision import PrecisionRecallF1 from recsys.evaluation.ranking import SpearmanRho, KendallTau #Dataset PERCENT_TRAIN = 70 data = Data() data.load('./data/dataset-recsys.csv', sep=',', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': int }) #Train & Test data train, test = data.split_train_test(percent=PERCENT_TRAIN) #Create SVD K = 100 svd = SVD() svd.set_data(train)
def to_sparse_matrix(self, sep='\t', format=None):
    """Write the dataset as an SVDLIBC sparse-text matrix plus row/col id files.

    Output format (header 'rows cols nonzeros', then per-column blocks):
    # http://tedlab.mit.edu/~dr/SVDLIBC/SVD_F_ST.html
    Row/col ids are written (utf-8) in matrix order to
    <prefix>.ids.rows / <prefix>.ids.cols.
    """
    data = Data()
    data.load(self._data_file, sep=sep, format=format)
    f = open(self._matrix_file, 'w')
    f_row_ids = codecs.open('%s.ids.rows' % self._svd_prefix, 'w', 'utf8')
    f_col_ids = codecs.open('%s.ids.cols' % self._svd_prefix, 'w', 'utf8')
    # Header: number of distinct rows, distinct cols, and nonzero entries.
    num_rows = len(set(map(itemgetter(1), data)))
    num_cols = len(set(map(itemgetter(2), data)))
    non_zero = len(data)
    f.write('%s %s %s\n' % (num_rows, num_cols, non_zero))
    #print 'sorting data by col'
    l = data.get()
    #l.sort(key=itemgetter(2, 1)) #by col, and then row
    l.sort(key=itemgetter(2))
    rows = dict()  # row id -> dense row index
    cols = dict()  # col id -> dense col index
    prev_col_id = None
    col_values = []
    row, col = (0, 0)
    for value, row_id, col_id in l:
        #if not row_id or not col_id or not value:
        #    if VERBOSE:
        #        sys.stdout.write('Skipping: %s, %s, %s\n' % (value, row_id, col_id))
        #    continue
        if col_id != prev_col_id:
            # New column: flush the previous column's buffered values.
            if col_values:
                f.write('%s\n' % len(col_values))
                for col_row_id, col_value in col_values:
                    _row = rows[col_row_id]
                    f.write('%s %s\n' % (_row, col_value))
                col_values = []
            cols[col_id] = col
            col += 1
        if not rows.has_key(row_id):
            rows[row_id] = row
            row += 1
        col_values.append((row_id, value))
        prev_col_id = col_id
    # Flush the final column.
    # NOTE(review): if the dataset is empty, `col_id` is unbound here and the
    # tail assignment would raise NameError — confirm empty input never occurs.
    if col_values:
        f.write('%s\n' % len(col_values))
        for col_row_id, col_value in col_values:
            row = rows[col_row_id]
            f.write('%s %s\n' % (row, col_value))
        cols[col_id] = col
    f.close()
    # Now write f_row_ids and f_col_ids
    rows = rows.items()
    rows.sort(key=itemgetter(1))  # order by dense index so line N = index N
    for row_id, _ in rows:
        if row_id == '':
            continue
        if isinstance(row_id, int):
            row_id = str(row_id)
        f_row_ids.write(row_id + '\n')
    f_row_ids.close()
    cols = cols.items()
    cols.sort(key=itemgetter(1))
    for col_id, _ in cols:
        if col_id == '':
            continue
        if isinstance(col_id, int):
            col_id = str(col_id)
        f_col_ids.write(col_id + '\n')
    f_col_ids.close()
def test_utf8_data():
    """Round-trip a Data set with non-ASCII item ids through save/load and
    check the tuple count survives."""
    data_in = Data()
    for plays, item in ((69, u'Bj\xf6rk'), (34, 'Björk')):
        pass  # placeholder removed below; kept explicit per-tuple adds
    data_in.add_tuple([69, u'Bj\xf6rk', USERID1])
    data_in.add_tuple([34, 'Björk', USERID2])
    target = os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.saved.utf8')
    data_in.save(target)
    data_saved = Data()
    data_saved.load(target)
    assert_equal(len(data_in), len(data_saved))
#Load a dataset and split it into train/test
# FIX: the original created an SVD(), loaded the whole ratings file into it,
# and then immediately discarded it by rebinding `svd` after re-loading the
# same file through Data(); it also repeated `k = 100` at the end.  The dead
# duplicate load and assignment are removed; all surviving names
# (filename, data, format, train_80, test_20, svd, k) are unchanged.
filename = './data/ratings.dat'
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

data = Data()
data.load(filename, sep='::', format=format)
train_80, test_20 = data.split_train_test(percent=80)  # 80% train, 20% test

svd = SVD()
svd.set_data(train_80)

#Parameters for computing the factorization
k = 100  # number of latent factors
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)
#To show some messages: import recsys.algorithm recsys.algorithm.VERBOSE = True from recsys.algorithm.factorize import SVD, SVDNeighbourhood from recsys.datamodel.data import Data from recsys.evaluation.prediction import RMSE, MAE # Create SVD K = 100 svd = SVD() svd_neig = SVDNeighbourhood() #Dataset PERCENT_TRAIN = int(sys.argv[2]) data = Data() data.load(sys.argv[1], sep='::', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': int }) rmse_svd_all = [] mae_svd_all = [] rmse_svd_neig_all = [] mae_svd_neig_all = [] RUNS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    choice = input("Enter your choice: ")
    return choice


if __name__ == "__main__":
    # Load data from custom path (first CLI argument), defaulting to /data.
    try:
        data_path = sys.argv[1]
    except IndexError:
        data_path = '/data'
    #Load data
    ratings = Data()
    # Prefer a previously saved ratings matrix; otherwise build from the CSV.
    if os.path.isfile(data_path + '/myratings.data'):
        ratings.load(data_path + '/myratings.data')
    else:
        try:
            ratings = load_ratings(data_path + '/ratings.csv')
        except IOError:
            # NOTE(review): this format string contains no '%s' placeholder,
            # so '... % data_path' raises TypeError instead of this Exception.
            raise Exception('Data not found. Please specify it.' % data_path)
    movies = load_movies(data_path + '/movies.csv')
    tags = load_tags(data_path + '/tags.csv')
    os.system('clear')  # clear the terminal before printing the banner/menu
    print """
#####################################################
####           COMMAND LINE RECOMMENDER          ####
# 3.4:
def recommend(user_id, top_n):
    """Return the top_n (item, predicted_rating) pairs for user_id,
    considering only items the user has not already reviewed."""
    # Predict a rating for every not-yet-reviewed item.
    candidates = [(item, predict_rating(user_id, item))
                  for item in itemdict.keys()
                  if int(item) not in items_reviewed(int(user_id), userdict)]
    # Highest predicted rating first.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:top_n]


#3.3:
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': 'int'}
# About format parameter:
#   'row': 1 -> Rows in matrix come from column 1 in ratings.dat file
#   'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat file
#   'ids': int -> Ids (row and col ids) are integers (not strings)
# NOTE(review): here 'ids' is the string 'int', while sibling scripts pass the
# int type itself — confirm which form recsys expects.
data.load(dat_file, sep='::', format=format)

similarity_matrix = SimilarityMatrix()

# Demo calls for a few users (results are not captured).
for demo_user in (0, 1, 2):
    recommend(demo_user, 10)

##################
#Now we do SVD
##################
    },
    "dino": {
        "women", "games", "xbox", "x-men", "assassin's creed", "pop", "rap",
        "opera", "need for speed", "jeans"
    },
    "priya": {
        "heart", "mountaineering", "sky diving", "sony", "apple", "pop",
        "perfumes", "luxury", "eminem", "lil wayne"
    },
    "brenda": {
        "cute guys", "xbox", "shower", "beach", "summer", "english", "french",
        "country music", "office", "birds"
    }
}

# Build the recsys dataset: one tuple per (user, interest) pair, all with the
# same weight since a "like" is binary.
data = Data()
VALUE = 1.0
for username in likes:
    for user_likes in likes[username]:
        data.add_tuple((VALUE, username, user_likes))  # Tuple format is: <value, row, column>

svd = SVD()
svd.set_data(data)
k = 5  # Usually, in a real dataset, you should set a higher number, e.g. 100
# min_values=3 presumably drops rows/cols with fewer than 3 entries before
# factorizing — confirm against the recsys SVD.compute docs.
svd.compute(k=k,
            min_values=3,
            pre_normalize=None,
            mean_center=False,
            post_normalize=True)
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

filename = "./data/ratings.dat"
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
# About format parameter:
#   'row': 1   -> Rows in matrix come from second column in ratings.dat file
#   'col': 0   -> Cols in matrix come from first column in ratings.dat file
#   'value': 2 -> Values (Mij) in matrix come from third column in ratings.dat file
#   'ids': int -> Ids (row and col ids) are integers (not strings)
data.load(filename, sep="::", format=format)
train, test = data.split_train_test(percent=80)  # 80% train ,20%test

svd = SVD()
svd.set_data(train)
# FIX: the factorization must actually be computed before predict()/recommend()
# can be used (see the other scripts in this file, which all call compute()
# after set_data()); the original skipped this step.
svd.compute(k=100,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)

print(svd.predict(22, 22, MIN_VALUE=0.0, MAX_VALUE=5.0))  # the prediction for user loving item
print(svd.recommend(1, n=10, only_unknowns=True, is_row=False))  #item recomended for user ,only from known
print(svd.recommend(1, n=10, only_unknowns=False, is_row=False))  #item recomended for user
class RecommendSystem(object): def __init__(self, filename, sep, **format): self.filename = filename self.sep = sep self.format = format # 训练参数 self.k = 100 self.min_values = 10 self.post_normalize = True self.svd = SVD() # 判断是否加载 self.is_load = False # 添加数据处理 self.data = Data() # 添加模型评估 self.rmse = RMSE() def get_data(self): """ 获取数据 :return: None """ # 如果模型不存在 if not os.path.exists(tmpfile): # 如果数据文件不存在 if not os.path.exists(self.filename): sys.exit() # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format) # 使用Data()来获取数据 self.data.load(self.filename, sep=self.sep, format=self.format) train, test = self.data.split_train_test(percent=80) return train, test else: self.svd.load_model(tmpfile) self.is_load = True return None, None def train(self, train): """ 训练模型 :param train: 训练数据 :return: None """ if not self.is_load: self.svd.set_data(train) self.svd.compute(k=self.k, min_values=self.min_values, post_normalize=self.post_normalize, savefile=tmpfile[:-4]) return None def rs_predict(self, itemid, userid): """ 评分预测 :param itemid: 电影id :param userid: 用户id :return: None """ score = self.svd.predict(itemid, userid) print "推荐的分数为:%f" % score return score def recommend_to_user(self, userid): """ 推荐给用户 :param userid: 用户id :return: None """ recommend_list = self.svd.recommend(userid, is_row=False) # 读取文件里的电影名称 movie_list = [] for line in open(moviefile, "r"): movie_list.append(' '.join(line.split("::")[1:2])) # 推荐具体电影名字和分数 for itemid, rate in recommend_list: print "给您推荐了%s,我们预测分数为%s" % (movie_list[itemid], rate) return None def evaluation(self, test): """ 模型的评估 :param test: 测试集 :return: None """ # 如果模型不是直接加载 if not self.is_load: # 循环取出测试集里面的元组数据<评分,电影,用户> for value, itemid, userid in test.get(): try: predict = self.rs_predict(itemid, userid) self.rmse.add(value, predict) except KeyError: continue # 计算返回误差(均方误差) error = self.rmse.compute() print "模型误差为%s:" % error return None