Example #1
0
class Algorithm(object):
    """Minimal algorithm base: wraps a Data container with accessors."""

    def __init__(self):
        # Backing dataset of <value, row, col> tuples.
        self._data = Data()

    def __repr__(self):
        rows = self.get_data()
        summary = '%d rows.' % len(rows)
        if len(rows):
            summary += '\nE.g: %s' % str(rows[0])
        return summary

    def __len__(self):
        return len(self.get_data())

    def get_data(self):
        """Return the raw dataset."""
        return self._data

    def set_data(self, data):
        """Replace the raw dataset wholesale."""
        self._data = data

    def add_tuple(self, tuple):
        """Append one <value, row, col> tuple to the dataset."""
        self.get_data().add_tuple(tuple)

    def load_data(self, filename, sep='\t', format={'value':0, 'row':1, 'col':2}):
        """Load a dataset file; see Data.load_file() for parameter meaning."""
        self._data.load_file(filename, sep, format)

    def compute(self):
        """Validate that data is present; subclasses do the real work."""
        if not self._data.get():
            raise ValueError('No data set. Matrix is empty!')
def get_preference(user_List):
    """Fetch owned games for each Steam id and accumulate play-time ratings.

    For every user, queries the Steam GetOwnedGames API and, for each game
    with non-zero playtime, records log(playtime) as that user's preference.
    All ratings are finally dumped to 'rating.dat'.

    NOTE(review): user_Pref stores natural log while the Data tuple stores
    log base 10 -- looks inconsistent; confirm which base is intended.
    NOTE(review): the bare except skips the `i` increment, so a failing
    user can share the same column index with the next one; also
    `preference_dict` is built but never returned.
    """
    #generate list of users
    
    preference_dict={}
    user_map={}
    data = Data() #saving rating data
    i=1  # 1-based column index assigned to each successfully processed user
    for user in user_List:
        user_id=(str(user))
        url = "http://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/?\
key=147CBF377C6B648EC3DC73499CE73D32&steamid="+user+"&format=json"
        response = urllib2.urlopen(url)
        # utf-8-sig strips the BOM some Steam responses carry.
        owned_gameData = json.loads(response.read().decode('utf-8-sig'))
        user_Pref={}
        #print (user)
        try: 
            if owned_gameData['response']['game_count']!=0:
                user_Pref={}
                for games in owned_gameData['response']['games']:
                    if games['playtime_forever']>0:
                        user_Pref[games['appid']]= math.log(games['playtime_forever'])
                        data.add_tuple((math.log(games['playtime_forever'], 10), games['appid'], i))
                        user_map[i]=user
        except:
            continue
        i=i+1
        preference_dict[user]=user_Pref
    data.save('rating.dat')
def ex1(dat_file='./ml-1m/ratings.dat',
        pct_train=0.5):
    """Train an SVD on the MovieLens 1M ratings and print RMSE/MAE.

    :param dat_file: '::'-separated ratings file (<col, row, value>)
    :param pct_train: passed to split_train_test(percent=...)
        NOTE(review): other examples in this codebase pass e.g. 80;
        0.5 here may mean 0.5%% rather than 50%% -- confirm.
    """
    data = Data()
    data.load(dat_file, sep='::', format={'col':0, 'row':1, 'value':2,'ids':int})
       

    # create train/test split
    train, test = data.split_train_test(percent=pct_train)

    # create svd
    K=100
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)

    # evaluate performance
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            # Unknown user/item in the training split: skip the pair.
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()
Example #4
0
def train_and_save(filename):
    """Train an SVD model for *filename* (value::row::col) and save it.

    The step suffix is taken from the file's extension; if a model
    archive for that step already loads successfully, training is skipped.
    """
    step = filename.split('.')[-1]

    data = Data()

    format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    data.load(filename, sep='::', format=format)

    train, test = data.split_train_test(percent=80)

    try:
        # Constructing SVD with a filename loads an existing model archive.
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Already exists: svdn_model_{step}.zip'.format(step=step))

    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; any load failure falls through to retraining.
        svd = SVD()
        svd.set_data(train)

        svd.compute(
            k=100,
            min_values=2,
            pre_normalize=False,
            mean_center=True,
            post_normalize=True,
            savefile='svdn_model_{step}'.format(step=step)
        )

        print('Saved svdn_model_{step}.zip'.format(step=step))
Example #5
0
 def _convert_hash(self, dataset):
     """Flatten a nested {row: {col: value}} mapping into a Data instance."""
     result = Data()
     for row_id in dataset:
         record = dataset[row_id]
         # One extend-set per row, tuples in <value, row, col> order.
         result.set([(record[col], row_id, col) for col in record], extend=True)
     return result
Example #6
0
	def build_model(self,uids,kn):
		"""Build an SVD model from {user_id: iterable_of_song_ids}.

		Each owned song becomes an implicit rating of 1 with the song as
		row and the user as column; the fitted model is stored in self.model.

		:param uids: mapping of user id -> songs
		:param kn: number of latent factors (k) for svd.compute()
		"""
		data = Data()
		for uid,songs in uids.items():
			for song in songs:
				# Tuple format is <value, row, col>; constant 1 = implicit feedback.
				data.add_tuple((1,song,uid))
		svd = SVD()
		svd.set_data(data)
		# min_values=1 keeps rows/cols that have only a single entry.
		svd.compute(k=kn,min_values=1)
		self.model = svd
def test_data_extend():
    """Data.set(extend=True) must append to, not replace, the dataset."""
    first_batch = [(1, 2, 3), (4, 5, 6)]
    second_batch = [(7, 8, 9), (10, 11, 12)]
    data = Data()
    data.set(first_batch)
    assert_equal(len(data), 2)

    data.set(second_batch, extend=True)
    assert_equal(len(data), 4)
Example #8
0
def load_ratings(filename):
    """Load a comma-separated ratings file (<col, row, value>) into Data."""
    ratings = Data()
    ratings.load(filename, sep=',',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': 'int'})
    return ratings
def getAverageRating(ITEMID):
    """Print the mean rating of ITEMID over the MovieLens ratings file.

    Scans every <rating, item, user> tuple and averages the ratings whose
    item matches ITEMID.  Prints 0 when the item has no ratings (the
    original code raised ZeroDivisionError there), and uses a float
    accumulator so Python 2 integer division does not truncate the mean.
    """
    averageRating = 0.0  # float so the final division is not truncated
    totalUsers = 0
    data = Data()
    data.load('./data/movielens/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids':int})
    for rating, item_id, user_id in data.get():
        if(item_id == ITEMID):
            totalUsers += 1
            averageRating += rating
    # Guard the unrated-item case instead of dividing by zero.
    print (averageRating/totalUsers if totalUsers else 0)
def get_data_model_matrix(data):
    """
    Convert raw nested ratings ({user: {movie: rating}}) into a recsys
    Data object of <value, row, column> = <rating, user, movie> tuples.

    :return: data object (recsys.datamodel.Data())
    """
    matrix = Data()
    for user_id, reviews in data.items():
        for movie_id, rating in reviews.items():
            matrix.add_tuple((rating, user_id, movie_id))
    return matrix
Example #11
0
def calculate_stats_features(pct_train):
    """Factorize feature_matrix.csv and return (svd, train, test).

    :param pct_train: percentage of tuples used for the training split
    """
    dat_file='feature_matrix.csv'
    data = Data()
    data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2,'ids':int})
    train, test = data.split_train_test(percent=pct_train)               
    K=100
    svd = SVD()
    svd.set_data(train)
    # No centering/normalization: raw feature values are factorized as-is.
    svd.compute(k=K, min_values=0, pre_normalize=None, mean_center=False,
    post_normalize=False)
    return svd,train,test
Example #12
0
    def setup_svd(self, vote_list):
        """Lazily build and cache an SVD model from a list of votes.

        :param vote_list: iterable of (user_object, item_id, value) where
            the user object carries an ``id`` attribute
        :returns: the cached SVD model

        NOTE(review): the model is stored under self.cache['svd'] but the
        return value is self.svd -- presumably a property backed by the
        cache; confirm against the class definition.
        """
        if self.svd is None:
            self.cache['svd'] = SVD()
            data = Data()

            for vote in vote_list:
                user_id = vote[0].id
                item_id = vote[1]
                value = float(vote[2])
                data.add_tuple((value, item_id, user_id))  # Tuple format is: <value, row, column>
            self.cache['svd'].set_data(data)
            self.cache['svd'].compute(k=self.k, min_values=1)
        return self.svd
Example #13
0
def get_friend_matrix(u_ids, raw_data):
    """Build a Data matrix from raw_data restricted to the users in u_ids.

    Users and their items are re-indexed with 1-based positional indices;
    stored tuples are <rating, user_index, item_index>.
    """
    matrix = Data()
    for u_pos, u_id in enumerate(u_ids, start=1):
        for i_pos, i_id in enumerate(raw_data[u_id].keys(), start=1):
            rate, _timestamp = raw_data[u_id][i_id]
            matrix.add_tuple((float(rate), u_pos, i_pos))

    return matrix
Example #14
0
def prepare_data(raw_data):
    """Re-index {user: {item: (rating, ts)}} into a Data matrix.

    Users and items get 1-based positional indices; stored tuples are
    <rating, user_index, item_index> (timestamps are discarded).
    """
    matrix = Data()
    for u_pos, u_id in enumerate(raw_data.keys(), start=1):
        per_user = raw_data[u_id]
        for i_pos, i_id in enumerate(per_user.keys(), start=1):
            rating, _timestamp = per_user[i_id]
            matrix.add_tuple((float(rating), u_pos, i_pos))

    return matrix
Example #15
0
def test_save_n_load(percent_train,
         modelKlass = SVD,
         dataFname ='/Users/jennyyuejin/recommender/Data/movieData/u.data',
         dataFormat = {'col':0, 'row':1, 'value':2, 'ids':int}):
    """Round-trip test: train, save, reload, and re-evaluate a model.

    Trains modelKlass on a deterministic (shuffle_data=False) split,
    evaluates it, saves it to ./model/svd.obj.zip, then reloads the file
    into a fresh model and evaluates on an identically re-created test
    split -- results should match.

    NOTE(review): dataFormat is a mutable default dict; safe only while
    no caller/callee mutates it.
    """
    data = Data()
    data.load(dataFname, sep='\t', format=dataFormat)

    print '------ evaluating original'
    train, test = data.split_train_test(percent=percent_train, shuffle_data=False)
    print len(train), 'training data points;', len(test), 'testing data points'

    #Create SVD
    K=100
    svd = modelKlass()
    svd.set_data(train)
    svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)
    evaluate(svd, test)

    # Persist the model together with the parameters it was trained with.
    svd.save_model('./model/svd.obj.zip',
                   {'k': K, 'min_values': 5,
                    'pre_normalize': None, 'mean_center': True, 'post_normalize': True})


    print '------ evaluating copy'
    data2 = Data()
    data2.load(dataFname, sep='\t', format=dataFormat)
    _, test2 = data2.split_train_test(percent=percent_train, shuffle_data=False)   # reload data
    print len(test2), 'testing data points'

    svd_pred = modelKlass()
    svd_pred.load_model('./model/svd.obj.zip')

    evaluate(svd_pred, test2)
Example #16
0
 def setUp(self):
     """Build the test fixtures: a ratings Data set and an {id: Item} map.

     Reads the module-level `ratings` and `movie_genres` sequences --
     presumably defined in the test module; confirm their shapes:
     ratings yields (stars, item_id, user_id), movie_genres yields
     (mid, name, genres).
     """
     data = Data()
     for stars, item_id, user_id in ratings:
         data.add_tuple((stars, item_id, user_id))
     
     movies = dict()
     for mid, name, genres in movie_genres:
         movie = Item(mid)
         movie.add_data({'name': name, 'genres': genres})
         movies[mid] = movie
     
     self.ratings = data
     self.movies = movies
Example #17
0
def read_user_data_from_ratings(data_file):
    """Read a '::'-separated ratings file and build a {user_id: User} dict.

    Each loaded tuple d is <rating, item, user> = (d[0], d[1], d[2]);
    every rating is attached to its user via User.add_item().

    :param data_file: path to the ratings file
    :returns: dict mapping user id -> User
    """
    data = Data()
    format = {'col':0, 'row':1, 'value':2, 'ids': 'int'}    
    # BUG FIX: the original loaded the undefined name `dat_file`
    # (NameError at runtime); the parameter is `data_file`.
    data.load(data_file, sep='::', format=format)
    
    userdict = {}
    for d in data.get():
        if d[2] in userdict:
            user = userdict[d[2]] 
        else:
            user = User(d[2]) 
        
        user.add_item(d[1],d[0])
        userdict[d[2]] = user
    return userdict
Example #18
0
    def update(self, USER_ID, baseline, path, pred_items):
        """Fold newly recommended items into the per-user occurrence counts.

        Loads the pickled tweet occurrences, per-user totals and
        (item, user) occurrence counts from *path*, renormalizes every
        known item's count for USER_ID against the enlarged total, adds
        each predicted item with count 1/total, then writes all three
        pickles (plus the sparse matrix) back to disk and refreshes the
        baseline model's data.

        NOTE(review): counts appear to be fractions of the user's total
        occurrences (count/upd_total_count) -- confirm against the code
        that builds these pickles.
        """
        print "Loading tweet occurrences pickle..."
        baseline.get_data()._load_pickle(path=path + "tweet_occurrences.p")
        tweet_occurrences = baseline.get_data().get()

        print "Loading count_dict pickle..."
        count_dict = cPickle.load(open(path + "count_dict.p"))

        print "Loading occurrences pickle..."
        occurrences = cPickle.load(open(path + "occurrences.p"))

        # New denominator: previous total plus one per predicted item.
        total_count = count_dict[USER_ID]
        upd_total_count = int(total_count) + len(pred_items)
        count_dict[USER_ID] = int(upd_total_count)

        print "Dumping count_dict pickle..."
        cPickle.dump(count_dict, open(path + "count_dict.p", "wb"), 2)

        print "Updating counts for known artists..."
        for index, (count, item_id, user_id) in enumerate(tweet_occurrences):
            if str(user_id).encode('utf-8') == USER_ID:
                item_id = str(item_id).encode('utf-8')
                count = occurrences[(item_id, USER_ID)]
                upd_count = float(count) / float(upd_total_count)

                # Keep all three views consistent: dict, matrix and tuple list.
                occurrences[(item_id, USER_ID)] = float(upd_count)
                baseline._matrix.set_value(item_id, USER_ID, float(upd_count))
                tweet_occurrences[index] = (float(upd_count), item_id, user_id)

        print "Updating counts for recommended artists..."
        for item_id, relevance in pred_items:
            # Each fresh recommendation contributes a single occurrence.
            count = (1.0 / float(upd_total_count))
            baseline._matrix.set_value(item_id, USER_ID, float(count))
            occurrences[(item_id, USER_ID)] = float(count)
            tweet_occurrences.append((float(count), item_id, USER_ID))

        print "Dumping tweet occurrences pickle..."
        data_tweet_occurrences = Data()
        data_tweet_occurrences.set(tweet_occurrences)

        baseline.set_data(data_tweet_occurrences)
        baseline.save_data(filename=path + "tweet_occurrences.p", pickle=True)

        print "Dumping occurrence pickle..."
        cPickle.dump(occurrences, open(path + "occurrences.p", "wb"), protocol=2)

        print "Dumping sparse matrix pickle..."
        cPickle.dump(baseline._matrix.get(), open(path + "sparse_matrix.p", "w"), protocol=2)
Example #19
0
def get_movie(movie_id):
	"""Render the detail page for one movie, with SVD-based similar titles.

	Pulls the movie row plus its directors/actors/writers/genres from
	SQLite, loads the current user's rating when a session cookie is
	present, fits an SVD over the full ratings table, and fetches the
	rows of the most similar movies.

	NOTE(review): svd.similar() is called without svd.compute(); this
	relies on the library computing lazily -- confirm.
	"""
	movie = {}
	rating = 0
	with sqlite3.connect('data/data100.db') as con:
		cur = con.cursor()
		cur.execute("SELECT * FROM movies WHERE movie_id = ?", (movie_id,))
		movie_result = cur.fetchone()
		cur.execute("SELECT director FROM movie_directors WHERE movie_id = ?", (movie_id,))
		directors = cur.fetchall()
		cur.execute("SELECT actor FROM movie_actors WHERE movie_id = ?", (movie_id,))
		actors = cur.fetchall()
		cur.execute("SELECT writer FROM movie_writers WHERE movie_id = ?", (movie_id,))
		writers = cur.fetchall()
		cur.execute("SELECT genre FROM movie_genres WHERE movie_id = ?", (movie_id,))
		genres = cur.fetchall()
		if 'session_user' in request.cookies:
			# Only logged-in users have a personal rating to show.
			cur.execute("SELECT * FROM ratings WHERE user_id = ? AND movie_id = ?", (request.get_cookie('session_user', secret='recsys')[0], movie_id,))
			rating = cur.fetchone()
		cur.execute("SELECT * FROM ratings")
		rating_results = cur.fetchall()
		d = Data()
		d.set(rating_results)
			# with open('data/tmp.dat', 'a') as f:
			# 	for l in rating_results:
			# 		f.write('%d,%d,%d\n' % (l[0], l[1], l[2]))
		svd = SVD()
			# svd.load_data(filename='data/tmp.dat', sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids':int})
		svd.set_data(d)
		similar_list = [str(s[0]) for s in svd.similar(int(movie_id))]
		cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % (', '.join(similar_list)))
		similar_movies = cur.fetchall()
		movie = {
			'mid': movie_result[0],
			'title': movie_result[1],
			'description': movie_result[2],
			'image': movie_result[3],
			'year': movie_result[4],
			'directors': [d[0] for d in directors],
			'writers': [w[0] for w in writers],
			'actors': [a[0] for a in actors],
			'genres': [g[0] for g in genres],
			'rating': rating,
			'similar_movies': similar_movies,
		}
	session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
	return template('static/movie.html', movie=movie, session_user=session_user)
def build_svd_item_based(user_op_item_cnt, item_op_users, user_idx, item_idx, min_nonzero):
    """Assemble an (unfactorized) SVD over user/item interaction counts.

    Filters out users with fewer than min_nonzero items, items seen by
    fewer than min_nonzero users, and counts below 1.  Returns the SVD
    plus the deduplicated list of retained item ids.
    """
    svd = SVD()
    data = Data()
    kept_items = set()
    for user in user_op_item_cnt:
        item_counts = user_op_item_cnt[user]
        if len(item_counts) < min_nonzero:
            continue
        for item in item_counts:
            if item_op_users[item] < min_nonzero:
                continue
            value = 1.0 * item_counts[item]
            if value < 1:
                continue
            kept_items.add(item)
            data.add_tuple((value, item_idx[item], user_idx[user]))
    svd.set_data(data)
    return svd, list(kept_items)
def test_utf8_data():
    """Saving and reloading UTF-8 item ids must preserve the row count."""
    data_in = Data()

    # The same artist spelled as a unicode literal and as a UTF-8 str.
    data_in.add_tuple([69, u'Bj\xf6rk', USERID1])
    data_in.add_tuple([34, 'Björk', USERID2])

    saved_path = os.path.join(MOVIELENS_DATA_PATH, 'ratings.matrix.saved.utf8')
    data_in.save(saved_path)

    data_saved = Data()
    data_saved.load(saved_path)

    assert_equal(len(data_in), len(data_saved))
Example #22
0
def get_mae_rmse(step):
    """Evaluate the saved SVD model for *step* on its test split.

    :returns: (mae, rmse) -- either may be numpy.nan when no prediction
        succeeded -- or None when the model archive cannot be loaded.
    """
    data = Data()

    format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}

    filename = 'second_train_test.dat.{step}'.format(step=step)

    data.load(filename, sep='::', format=format)

    train, test = data.split_train_test(percent=80)

    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
        print('Loading model... {step}'.format(step=step))

    except Exception:
        # Narrowed from a bare `except:` -- only a real load failure
        # aborts; KeyboardInterrupt/SystemExit still propagate.
        return

    mae_predicted, rmse_predicted = [], []
    for rating, item_id, user_id in test:
        try:
            predicted = svd.predict(item_id, user_id)

            mae_predicted.append((rating, predicted))
            rmse_predicted.append((rating, predicted))

        except Exception:
            # Unknown user/item for this model: skip the pair.
            pass

    mae_value, rmse_value = np.nan, np.nan

    if len(mae_predicted) > 0:
        mae = MAE(mae_predicted)
        mae_value = mae.compute()

    if len(rmse_predicted) > 0:
        rmse = RMSE(rmse_predicted)
        rmse_value = rmse.compute()

    return mae_value, rmse_value
def build_svd_cat_based(user_op_cat_cnt, cat_op_users, user_idx, cat_idx, min_nonzero):
    """Assemble an (unfactorized) SVD over user/category counts.

    Mirrors build_svd_item_based but over categories: skips users with
    fewer than min_nonzero categories, categories used by fewer than
    min_nonzero users, and counts below 1.  Returns the SVD plus the
    deduplicated list of retained category ids.
    """
    svd = SVD()
    data = Data()
    cat_lst = []
    for ui in user_op_cat_cnt:
        if len(user_op_cat_cnt[ui]) < min_nonzero:
            continue
        for ci in user_op_cat_cnt[ui]:
            if cat_op_users[ci] < min_nonzero:
                continue
            # 1.0* coerces to float; values below 1 are treated as noise.
            if 1.0*user_op_cat_cnt[ui][ci] < 1:
                continue
            cat_lst.append(ci)
            data.add_tuple(((1.0*user_op_cat_cnt[ui][ci]), cat_idx[ci], user_idx[ui]))
    cat_lst = list(set(cat_lst))
    print 'cat =', len(cat_lst)
    svd.set_data(data)
    return svd, cat_lst
Example #24
0
def similar_users(user):
    """Return the ids of the users most similar to *user*.

    On the first call for a user, their file list is appended to
    ./dc_recom.dat and the user is flagged as recommended in MongoDB;
    then an SVD is fitted over the whole user::file matrix.
    """
    if not type(user) is str:
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user':user})['recommended']==False:
        user_files = db.user_list.find({'user':user})
        # Context manager replaces the manual open/close: the file is
        # closed even if the DB cursor raises mid-iteration.
        with open('./dc_recom.dat','a') as f:
            for u in user_files:
                f.write(u['user'] + '::' + u['tth'])
                f.write('\n')
        db.done_users.update({'user': user}, {'user':user, 'recommended': True})

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col':1,'row':0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    return [i[0] for i in svd.similar(user)]
Example #25
0
def get_feeds():
	"""Render the feeds page: personalized SVD recommendations if the
	session user has rated anything, otherwise all movies keyed by their
	average rating.

	NOTE(review): in the fallback branch movielist is keyed by the AVG
	value, so two movies with the same average overwrite each other --
	confirm whether that is intended.
	"""
	movielist = {}
	with sqlite3.connect('data/data100.db') as con:
		cur = con.cursor()
		cur.execute("SELECT * FROM ratings WHERE user_id = ?", (request.get_cookie('session_user', secret='recsys')[0],))
		if cur.fetchone():
			# User has at least one rating: recommend via SVD.
			cur.execute("SELECT ratings, movie_id, user_id FROM ratings")
			rating_results = cur.fetchall()
			d = Data()
			d.set(rating_results)
			# with open('data/tmp.dat', 'a') as f:
			# 	for l in rating_results:
			# 		f.write('%d,%d,%d\n' % (l[0], l[1], l[2]))
			svd = SVD()
			# svd.load_data(filename='data/tmp.dat', sep=',', format={'col': 0, 'row': 1, 'value': 2, 'ids':int})
			svd.set_data(d)
			recommendations = [str(s[0]) for s in svd.recommend(request.get_cookie('session_user', secret='recsys')[0], is_row=False)]
			cur.execute("SELECT * FROM movies WHERE movie_id IN (%s)" % (', '.join(recommendations)))
			similar_movies = cur.fetchall()
			for m in similar_movies:
				movielist[m] = {
					'mid': m[0],
					'title': m[1],
					'description': m[2],
					'image': m[3],
					'year': m[4]
				}
		else:
			# Cold start: fall back to average-rating ordering.
			cur.execute("SELECT * FROM movies")
			movies = cur.fetchall()
			for m in movies:
				cur.execute("SELECT AVG(ratings) FROM ratings WHERE movie_id = ?", (m[0],))
				avg = cur.fetchone()[0]
				movielist[avg] = {
					'mid': m[0],
					'title': m[1],
					'description': m[2],
					'image': m[3],
					'year': m[4]
				}
	session_user = request.get_cookie('session_user', secret='recsys') if 'session_user' in request.cookies else None
	return template('static/feeds.html', movielist=movielist, session_user=session_user)
Example #26
0
    def load_data(self, filename, force=True, sep='\t', format={'value':0, 'row':1, 'col':2}, pickle=False):
        """
        Loads a dataset file

        :param filename: path of the dataset file
        :param force: discard any previously loaded data (and the cached
            self-similarity matrix) before loading
        :param sep: field separator
        :param format: mapping of tuple roles to column positions.
            NOTE(review): mutable default dict -- safe only while
            Data.load() does not mutate it; confirm.
        :param pickle: load from pickle format?

        See params definition in *datamodel.Data.load()*
        """
        if force:
            self._data = Data()
            # The similarity matrix is derived from the data: invalidate it.
            self._matrix_similarity = None

        self._data.load(filename, force, sep, format, pickle)
Example #27
0
def calculate_stats_users(pct_train):
    """Train an SVD on user_data_working.csv and print RMSE/MAE.

    :param pct_train: percentage of tuples used for the training split
    """
    dat_file = 'user_data_working.csv'
    data = Data()
    data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2,'ids':int})
    train, test = data.split_train_test(percent=pct_train)               
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100, min_values=2, pre_normalize=None, mean_center=True,
    post_normalize=False)
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():      
        try:
            pred_rating = svd.predict(item_id, user_id)
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        except KeyError:
            # Unknown user/item in the training split: skip the pair.
            continue

    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s\n' % mae.compute()
def parse_data():
	"""Load MovieLens 1M ratings, split 80/20, and pickle the full set.

	NOTE(review): the train/test split is computed but neither returned
	nor saved -- presumably consumed elsewhere; confirm before relying
	on this function's output.
	"""
	ratings_path = '../data/ml-1m/ratings.dat'
	data = Data()
	data.load(ratings_path, sep='::', format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
	train, test = data.split_train_test(percent=80)  # 80% train, 20% test
	data.save(os.path.join(utils.get_add_dir(), 'ratings'), pickle=True)
Example #29
0
File: svd.py Project: niminjie/iptv
def main():
    """Train an SVD on the IPTV ratings CSV and print top-5 recommendations.

    NOTE(review): min_values=0.5 is a float where other examples pass an
    int -- confirm the library accepts fractional squish thresholds.
    """
    svd = SVD()
    train = Data()
    test = Data()
    # Train and test currently load the same file.
    train.load('randUser/rate1.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int})
    test.load('randUser/rate1.csv', force=True, sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int})
    svd.set_data(train)
    svd.compute(k=100, min_values=0.5, pre_normalize=False, mean_center=True, post_normalize=True)

    # rmse = RMSE()
    # mae = MAE()
    # for rating, item_id, user_id in test.get():
    #     try:
    #         pred_rating = svd.predict(item_id, user_id)
    #         rmse.add(rating, pred_rating)
    #         mae.add(rating, pred_rating)
    #     except KeyError:
    #         continue
    # print 'RMSE=%s' % rmse.compute()
    # print 'MAE=%s' % mae.compute()

    # test = make_test()
    # print precision_and_recall(test, svd)
    # rec_list = svd.recommend(200, n=5, only_unknowns=False, is_row=False)
    print svd.recommend(1, n=5, only_unknowns=False, is_row=False)
Example #30
0
def _magnet_links(results):
    """Best-effort magnet URIs for (tth, score) results; falls back to a
    bare link when the tth has no name entry in the db."""
    links = []
    for item in results:
        try:
            j = 'magnet:?xt=urn:tree:tiger:'+item[0] + "&dn=" + unidecode.unidecode(db.tths.find_one({'tth': item[0]})['name'])
        except Exception:
            j = 'magnet:?xt=urn:tree:tiger:'+item[0]
        links.append(j)
    return links


def recommended_files(user):
    """Recommend up to 11 distinct files for *user* as magnet links.

    Pipeline: fit an SVD over the shared user::file matrix, take the 10
    most similar users, build a second matrix from their files, recommend
    files unknown to *user*, then drop near-duplicate file names.
    """
    if not type(user) is str:
        user = unidecode.unidecode(user)
    if db.done_users.find_one({'user':user})['recommended']==False:
        user_files = db.user_list.find({'user':user})
        # Context manager replaces the manual open/close (no leak on error).
        with open('./dc_recom.dat','a') as f:
            for u in user_files:
                f.write(u['user'] + '::' + u['tth'])
                f.write('\n')
        db.done_users.update({'user': user}, {'user':user, 'recommended': True})

    data = Data()
    data.load('./dc_recom.dat', sep='::', format={'col':1,'row':0})
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    similar_users = [i[0] for i in svd.similar(user,n=10)]

    # Second pass: binary ownership matrix over the similar users' files.
    newdata = Data()
    for i in range(0,len(similar_users),1):
        files = db.user_list.find({'user':similar_users[i]})
        for f in files:
            newdata.add_tuple((1.0,similar_users[i],f['tth']))
    svd.set_data(newdata)
    svd.compute(k=1000,min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)
    recoms = svd.recommend(user,is_row=True,only_unknowns=True,n=100)

    res = []
    c_res = 0
    for p in recoms:
        flag=0
        for r in res:
            # similar() flags near-identical file names (fuzzy match).
            if similar(db.tths.find_one({'tth':p[0]})['name'],db.tths.find_one({'tth':r[0]})['name']):
                flag = 1
                break
        if flag == 0:
            res.append(p)
            c_res += 1
            if c_res > 10:
                # Early exit once 11 distinct files are collected.
                return _magnet_links(res)
    # Fewer than 11 distinct results: return what we have.
    return _magnet_links(res)
Example #31
0
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

# Toy dataset of <value, row, col> = <rating, user, item> tuples.
data = [(4.0, 'user1', 'item1'), (2.0, 'user1', 'item3'),
        (1.0, 'user2', 'item1'), (5.0, 'user2', 'item4')]

d = Data()
d.set(data)
svd = SVD()
svd.set_data(d)
# Build the sparse input matrix from the data.
m = svd.get_matrix()
# Factorize with 2 latent dimensions (tiny dataset).
svd.compute(k=2)
print svd.similar('user1')
print svd.predict('user1', 'item1')
Example #32
0
class Collaborative_filtering(object):
    """SVD-neighbourhood collaborative filtering over a ratings CSV.

    NOTE(review): recommend_movies() and get_similar_movies() read a
    module-level `movies` DataFrame, not self.movies -- confirm which is
    intended.
    """
    def __init__(self, ratings_file,
                 movies):  #No need to pass as ,will be provided in views.py
        #self.users = users
        self.movies = movies
        self.K = 100
        self.PERCENT_TRAIN = 85
        #Need to provide a default file location for ratings.csv instead of loading everytime.run below 2lines only once
        #or just provide this file instead.
        #self.users.to_csv("/home/sourabhkondapaka/Desktop/ratingsss.csv",index= False)
        self.ratings_file = ratings_file  #Give your path to ratings.csv created from above 2 lines.
        self.data = None
        self.svd = None
        self.recommend_movies_list = None
        self.recommend_movies_ids = None
        self.similar_movies_list = None
        self.similar_movies_ids = None

        self.movie_id = None
        self.train = None
        self.test = None

    def compute_svd(self):
        """Load the ratings file, split 85/15 and fit an SVDNeighbourhood.

        NOTE(review): self.K / self.PERCENT_TRAIN exist but literals
        100 / 85 are used below -- keep them in sync.
        """
        self.data = Data()
        self.data.load(self.ratings_file,
                       sep=',',
                       format={
                           'col': 0,
                           'row': 1,
                           'value': 2,
                           'ids': float
                       })
        self.train, self.test = self.data.split_train_test(percent=85)
        self.svd = SVDNeighbourhood()
        self.svd.set_data(self.train)
        self.svd.compute(k=100,
                         min_values=1,
                         pre_normalize=None,
                         mean_center=False,
                         post_normalize=True)

    def similarity_measure(
            self, movie1,
            movie2):  #gives a similarity measure value between -1 to 1
        return round(self.svd.similarity(movie1, movie2), 4)

    def recommend_movies(self, user_id):
        """Return ([movie titles], [movie ids]) of the top-10 unknown movies."""
        l = self.svd.recommend(user_id, n=10, only_unknowns=True, is_row=False)
        self.recommend_movies_list = []
        self.recommend_movies_ids = []
        for p in l:
            #movie names
            # Parse the title out of the pandas Series string repr.
            bb = str(movies.ix[movies['movie_id'] == p[0]]['title']).split()
            q = bb.index('Name:')
            bb = ' '.join(bb[1:q])
            self.recommend_movies_list.append(bb)
            #movie ids
            gg = movies.ix[movies['movie_id'] == p[0]]
            gg = gg.reset_index()
            del gg['index']
            gg = gg.ix[:, 0:2].as_matrix(columns=None).tolist()
            self.recommend_movies_ids.append(gg[0][0])
        return self.recommend_movies_list, self.recommend_movies_ids

    def get_similar_movies(self,
                           movie1):  #Returns a PYTHON list for similar movies.
        """Return ([movie titles], [movie ids]) of movies similar to movie1."""
        movie1 = int(movie1)
        l = self.svd.similar(movie1)
        self.similar_movies_list = []
        self.similar_movies_ids = []
        # Drop the first result: a movie is always most similar to itself.
        l = l[1:]

        for p in l:
            #getting movie names
            bb = str(movies.ix[movies['movie_id'] == p[0]]['title']).split()
            q = bb.index('Name:')
            bb = ' '.join(bb[1:q])
            self.similar_movies_list.append(bb)
            #getting movie id's
            self.similar_movies_ids.append(p[0])

        return self.similar_movies_list, self.similar_movies_ids
Example #33
0
class Algorithm(object):
    """
    Base class Algorithm

    It has the basic methods to load a dataset, get the matrix and the raw input
    data, add more data (tuples), etc.

    Any other Algorithm derives from this base class
    """
    def __init__(self):
        self._data = Data()
        self._matrix = SparseMatrix()
        self._matrix_similarity = None #self-similarity matrix (only for the input Matrix rows)
        self._matrix_and_data_aligned = False #both Matrix and Data contain the same info?

    def __len__(self):
        return len(self.get_data())

    def __repr__(self):
        s = '%d rows.' % len(self.get_data())
        if len(self.get_data()):
            s += '\nE.g: %s' % str(self.get_data()[0])
        return s

    def get_matrix(self):
        """
        :returns: matrix *M*
        """
        # Lazily build the matrix from the raw data on first access.
        if not self._matrix.get():
            self.create_matrix()
        return self._matrix

    def get_matrix_similarity(self):
        """
        :returns: the self-similarity matrix
        """
        return self._matrix_similarity

    def set_data(self, data):
        """
        Sets the raw dataset (input for matrix *M*)

        :param data: a Dataset class (list of tuples <value, row, col>)
        :type data: Data
        """
        #self._data = Data()
        #self._data.set(data)
        self._data = data
        self._matrix_and_data_aligned = False

    def get_data(self):
        """
        :returns: An instance of Data class. The raw dataset (input for matrix *M*). 
        """
        return self._data

    def add_tuple(self, tuple):
        """
        Add a tuple in the dataset

        :param tuple: a tuple containing <rating, user, item> information. Or, more general: <value, row, col>
        """
        self.get_data().add_tuple(tuple)
        self._matrix_and_data_aligned = False

    def load_data(self, filename, force=True, sep='\t', format={'value':0, 'row':1, 'col':2}, pickle=False):
        """
        Loads a dataset file

        See params definition in *datamodel.Data.load()*
        """
        if force:
            self._data = Data()
            # Similarity matrix is derived from the data; invalidate it too.
            self._matrix_similarity = None

        self._data.load(filename, force, sep, format, pickle)
    
    def save_data(self, filename, pickle=False):
        """
        Saves the dataset in divisi2 matrix format (i.e: value <tab> row <tab> col)

        :param filename: file to store the data
        :type filename: string
        :param pickle: save in pickle format?
        :type filename: boolean
        """
        self._data.save(filename, pickle)

    def create_matrix(self):
        """Build the sparse matrix from the raw data tuples."""
        if VERBOSE:
            sys.stdout.write('Creating matrix (%s tuples)\n' % len(self._data))
        try:
            self._matrix.create(self._data.get())
        except AttributeError:
            # self._data is a plain list of tuples, not a Data instance.
            self._matrix.create(self._data)

        if VERBOSE:
            sys.stdout.write("Matrix density is: %s%%\n" % self._matrix.density())
        self._matrix_and_data_aligned = True

    def compute(self, min_values=None):
        """Ensure the matrix exists, optionally squishing sparse rows/cols.

        :param min_values: if set, drop rows/cols with fewer values
        """
        # Guard both data representations (Data instance or plain list).
        if self._matrix.empty() and (not isinstance(self._data, list) and not self._data.get()):
            raise ValueError('No data set. Matrix is empty!')
        if self._matrix.empty() and (isinstance(self._data, list) and not self._data):
            raise ValueError('No data set. Matrix is empty!')
        if not self._matrix.empty() or not self._matrix_and_data_aligned:
            self.create_matrix()

        if min_values:
            if VERBOSE:
                sys.stdout.write('Updating matrix: squish to at least %s values\n' % min_values)
            self._matrix.set(self._matrix.get().squish(min_values))

    def _get_row_similarity(self, i):
        # Compute the similarity matrix on demand.
        if not self.get_matrix_similarity() or self.get_matrix_similarity().get() is None:
            self.compute()
        try:
            return self.get_matrix_similarity().get_row(i)
        except KeyError:
            raise KeyError("%s not found!" % i)

    def similar(self, i, n=10):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param n: number of similar elements
        :type n: int
        :returns: the most similar elements of *i*
        """
        if not self.get_matrix_similarity() or self.get_matrix_similarity().get() is None:
            self.compute()
        return self._get_row_similarity(i).top_items(n)

    def similarity(self, i, j):
        """
        :param i: a row in *M*
        :type i: user or item id
        :param j: a row in *M*
        :type j: user or item id
        :returns: the similarity between the two elements *i* and *j*
        """
        if not self.get_matrix_similarity() or self.get_matrix_similarity().get() is None:
            self.compute()
        return self.get_matrix_similarity().value(i, j)

    def predict(self, i, j, MIN_VALUE=None, MAX_VALUE=None):
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def recommend(self, i, n=10):
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    ### OTHER METHODS ###
    def _cosine(self, v1, v2):
        # Cosine similarity between two divisi2 vectors.
        return float(divisi2.dot(v1,v2) / (norm(v1) * norm(v2)))

    def centroid(self, ids, are_rows=True):
        """Return the mean vector (centroid) of the given row or col ids."""
        if VERBOSE:
            sys.stdout.write('Computing centroid for ids=%s\n' % str(ids))
        points = []
        for id in ids:
            if are_rows:
                point = self.get_matrix().get_row(id)
            else:
                point = self.get_matrix().get_col(id)
            points.append(point)
        M = divisi2.SparseMatrix(points)
        return M.col_op(sum)/len(points) #TODO numpy.sum seems slower?

    def _kinit(self, X, k):
        #Init k seeds according to kmeans++
        n = X.shape[0]
        #Choose the 1st seed randomly, and store D(x)^2 in D[]
        centers = [X[randint(0, n-1)]]
        D = [norm(x-centers[0])**2 for x in X]

        for _ in range(k-1):
            bestDsum = bestIdx = -1
            for i in range(n):
                #Dsum = sum_{x in X} min(D(x)^2,||x-xi||^2)
                Dsum = reduce(lambda x,y:x+y,
                              (min(D[j], norm(X[j]-X[i])**2) for j in xrange(n)))
                if bestDsum < 0 or Dsum < bestDsum:
                    bestDsum, bestIdx = Dsum, i
            centers.append(X[bestIdx])
            D = [min(D[i], norm(X[i]-X[bestIdx])**2) for i in xrange(n)]
        return array(centers)

    def kmeans(self, id, k=5, is_row=True):
        """
        K-means clustering. http://en.wikipedia.org/wiki/K-means_clustering

        Clusterizes the (cols) values of a given row, or viceversa

        :param id: row (or col) id to cluster its values
        :param k: number of clusters
        :param is_row: is param *id* a row (or a col)?
        :type is_row: Boolean
        """
        # TODO: switch to Pycluster?
        # http://pypi.python.org/pypi/Pycluster
        if VERBOSE:
            sys.stdout.write('Computing k-means, k=%s, for id %s\n' % (k, id))
        point = None
        if is_row:
            point = self.get_matrix().get_row(id)
        else:
            point = self.get_matrix().get_col(id)
        points = []
        points_id = []
        # Cluster the orthogonal dimension: the non-zero entries of `point`.
        for i in point.nonzero_entries():
            label = point.label(i)
            points_id.append(label)
            if not is_row:
                points.append(self.get_matrix().get_row(label))
            else:
                points.append(self.get_matrix().get_col(label))
        #return kmeans(array(points), k)
        if VERBOSE:
            sys.stdout.write('id %s has %s points\n' % (id, len(points)))
        M = array(points)

        MAX_POINTS = 150
        # Only apply Matrix initialization if num. points is not that big!
        if len(points) <= MAX_POINTS:
            # kmeans++ seeding is O(n^2); only affordable for small n.
            centers = self._kinit(array(points), k)
            centroids, labels = kmeans2(M, centers, minit='matrix')
        else:
            centroids, labels = kmeans2(M, k, minit='random')
        i = 0
        clusters = dict()
        # Group point ids by their assigned cluster label (has_key is py2).
        for cluster in labels:
            if not clusters.has_key(cluster): 
                clusters[cluster] = dict()
                clusters[cluster]['centroid'] = centroids[cluster]
                clusters[cluster]['points'] = []
            clusters[cluster]['points'].append(points_id[i])
            i += 1
        return clusters
Example #34
0
import sqlite3
import recsys.algorithm
# Enable progress messages from the recsys library
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
from recsys.evaluation.prediction import RMSE, MAE

from recsys.datamodel.data import Data
from recsys.datamodel.item import Item
from recsys.datamodel.user import User

# Load the '|'-separated ratings file: matrix cols come from field 0,
# rows from field 1, values from field 2; ids are parsed as floats.
data = Data()
data.load("../data/ratings.tsv",
          sep='|',
          format={
              'col': 0,
              'row': 1,
              'value': 2,
              'ids': float
          })

# Factorize the ratings matrix into K latent factors
K = 100
svd = SVD()
svd.set_data(data)
svd.compute(k=K,
            min_values=0.1,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)

[(beers[b].get_data()['name'], b, val) for b, val in  svd.similar(1502, 50)\
Example #35
0
import sys

#To show some messages:
import recsys.algorithm
#recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data
from recsys.evaluation.prediction import RMSE, MAE
from recsys.evaluation.decision import PrecisionRecallF1
from recsys.evaluation.ranking import SpearmanRho, KendallTau

#Dataset: comma-separated file; matrix cols come from field 0, rows from
#field 1, values from field 2; ids are parsed as integers.
PERCENT_TRAIN = 70
data = Data()
data.load('./data/dataset-recsys.csv',
          sep=',',
          format={
              'col': 0,
              'row': 1,
              'value': 2,
              'ids': int
          })

#Train & Test data (70% train / 30% test)
train, test = data.split_train_test(percent=PERCENT_TRAIN)

#Create SVD and feed it the training split
K = 100
svd = SVD()
svd.set_data(train)
Example #36
0
    def to_sparse_matrix(self, sep='\t', format=None):
        """Convert the loaded data file into SVDLIBC sparse-text format.

        Writes the sparse matrix (see
        http://tedlab.mit.edu/~dr/SVDLIBC/SVD_F_ST.html) to
        self._matrix_file, and the row/col id mappings (one id per line,
        ordered by matrix index) to '<prefix>.ids.rows' / '<prefix>.ids.cols'.

        :param sep: field separator used when loading self._data_file
        :param format: format dict passed through to Data.load()
        """
        data = Data()
        data.load(self._data_file, sep=sep, format=format)

        f = open(self._matrix_file, 'w')
        f_row_ids = codecs.open('%s.ids.rows' % self._svd_prefix, 'w', 'utf8')
        f_col_ids = codecs.open('%s.ids.cols' % self._svd_prefix, 'w', 'utf8')

        # Header line: <num rows> <num cols> <num non-zero entries>
        num_rows = len(set(map(itemgetter(1), data)))
        num_cols = len(set(map(itemgetter(2), data)))
        non_zero = len(data)
        f.write('%s %s %s\n' % (num_rows, num_cols, non_zero))

        # The sparse-text format is column-major: sort the tuples by col id
        l = data.get()
        l.sort(key=itemgetter(2))

        rows = dict()    # row_id -> matrix row index
        cols = dict()    # col_id -> matrix col index
        prev_col_id = None
        col_values = []  # pending (row_id, value) pairs for the current column
        row, col = (0, 0)
        for value, row_id, col_id in l:
            if col_id != prev_col_id:
                # Flush the finished column: its entry count, then its values
                if col_values:
                    f.write('%s\n' % len(col_values))
                    for col_row_id, col_value in col_values:
                        _row = rows[col_row_id]
                        f.write('%s %s\n' % (_row, col_value))
                col_values = []
                cols[col_id] = col
                col += 1
            # was dict.has_key(), removed in Python 3; 'in' works in both
            if row_id not in rows:
                rows[row_id] = row
                row += 1
            col_values.append((row_id, value))
            prev_col_id = col_id
        # Flush the last column, which the loop above never reaches
        if col_values:
            f.write('%s\n' % len(col_values))
            for col_row_id, col_value in col_values:
                _row = rows[col_row_id]
                f.write('%s %s\n' % (_row, col_value))
            cols[col_id] = col
        f.close()

        # Now write f_row_ids and f_col_ids, ordered by matrix index.
        # sorted(d.items(), ...) replaces the Python-2-only pattern of
        # sorting the list returned by dict.items() in place.
        for row_id, _ in sorted(rows.items(), key=itemgetter(1)):
            if row_id == '':
                continue
            if isinstance(row_id, int):
                row_id = str(row_id)
            f_row_ids.write(row_id + '\n')
        f_row_ids.close()

        for col_id, _ in sorted(cols.items(), key=itemgetter(1)):
            if col_id == '':
                continue
            if isinstance(col_id, int):
                col_id = str(col_id)
            f_col_ids.write(col_id + '\n')
        f_col_ids.close()
def test_utf8_data():
    """Round-trip: saving and re-loading UTF-8 item ids keeps every tuple."""
    original = Data()

    # The same artist name, once as a unicode literal and once as a
    # utf-8 encoded str, owned by two different users.
    original.add_tuple([69, u'Bj\xf6rk', USERID1])
    original.add_tuple([34, 'Björk', USERID2])

    saved_path = os.path.join(MOVIELENS_DATA_PATH,
                              'ratings.matrix.saved.utf8')
    original.save(saved_path)

    reloaded = Data()
    reloaded.load(saved_path)

    assert_equal(len(original), len(reloaded))
Example #38
0
#Load a dataset directly into the SVD model
svd = SVD()
svd.load_data(filename='./data/ratings.dat',
              sep='::',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })

#Split the dataset into train/test partitions
filename = './data/ratings.dat'
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
data.load(filename, sep='::', format=format)
train_80, test_20 = data.split_train_test(percent=80)  # 80% train, 20% test
svd = SVD()
svd.set_data(train_80)

#Set the parameters used to compute the factorized matrix
k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)

k = 100
Example #39
0
#To show some messages:
import recsys.algorithm
recsys.algorithm.VERBOSE = True

import sys  # needed for sys.argv below (the import was missing)

from recsys.algorithm.factorize import SVD, SVDNeighbourhood
from recsys.datamodel.data import Data
from recsys.evaluation.prediction import RMSE, MAE

# Create the two models to compare: plain SVD vs. SVD with neighbourhood
K = 100
svd = SVD()
svd_neig = SVDNeighbourhood()

#Dataset. Usage: <script> <ratings file> <train percentage>
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(sys.argv[1],
          sep='::',
          format={
              'col': 0,
              'row': 1,
              'value': 2,
              'ids': int
          })

# Accumulators for the per-run evaluation results
rmse_svd_all = []
mae_svd_all = []
rmse_svd_neig_all = []
mae_svd_neig_all = []

RUNS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Example #40
0
    choice = input("Enter your choice: ")
    
    return choice


if __name__ == "__main__":
    
    # Load data from custom path
    # Load data from a custom path given on the command line,
    # defaulting to '/data' when no argument is supplied.
    try:
        data_path = sys.argv[1]
    except IndexError:
        data_path = '/data'


    #Load data: prefer a previously saved ratings matrix, else the raw CSV
    ratings = Data()
    if os.path.isfile(data_path + '/myratings.data'):
        ratings.load(data_path + '/myratings.data')
    else:
        try:
            ratings = load_ratings(data_path + '/ratings.csv')
        except IOError:
            # Bug fix: the original message had no %s placeholder, so the
            # '%' formatting itself raised a TypeError instead of this error.
            raise Exception('Data not found in %s. Please specify it.'
                            % data_path)
    movies = load_movies(data_path + '/movies.csv')
    tags = load_tags(data_path + '/tags.csv')

    os.system('clear')
    print """
#####################################################
####           COMMAND LINE RECOMMENDER          ####
Example #41
0
# 3.4:
def recommend(user_id, top_n):
    """Return the top_n (item, predicted_rating) pairs for a user,
    considering only items the user has not already reviewed."""
    seen = items_reviewed(int(user_id), userdict)
    scored = [(item, predict_rating(user_id, item))
              for item in itemdict.keys()
              if int(item) not in seen]  # only predict the unreviewed ones
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:top_n]


#3.3:
data = Data()
# Bug fix: 'ids' must be the int *type* itself (as the comment below says),
# not the string 'int'.
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
# About format parameter:
#   'row': 1 -> Rows in matrix come from column 1 in ratings.dat file
#   'value': 2 -> Values (Mij) in matrix come from column 2 in ratings.dat file
#   'ids': int -> Ids (row and col ids) are integers (not strings)
data.load(dat_file, sep='::', format=format)

similarity_matrix = SimilarityMatrix()
recommend(0, 10)
recommend(1, 10)
recommend(2, 10)

##################
#Now we do SVD
##################
    },
    "dino": {
        "women", "games", "xbox", "x-men", "assassin's creed", "pop", "rap",
        "opera", "need for speed", "jeans"
    },
    "priya": {
        "heart", "mountaineering", "sky diving", "sony", "apple", "pop",
        "perfumes", "luxury", "eminem", "lil wayne"
    },
    "brenda": {
        "cute guys", "xbox", "shower", "beach", "summer", "english", "french",
        "country music", "office", "birds"
    }
}

# Build a binary "likes" matrix: value 1.0 at every (username, liked thing)
data = Data()
VALUE = 1.0
for username in likes:
    for user_likes in likes[username]:
        data.add_tuple((VALUE, username,
                        user_likes))  # Tuple format is: <value, row, column>

# Factorize the likes matrix with a small k (toy dataset)
svd = SVD()
svd.set_data(data)
k = 5  # Usually, in a real dataset, you should set a higher number, e.g. 100
svd.compute(k=k,
            min_values=3,
            pre_normalize=None,
            mean_center=False,
            post_normalize=True)
Example #43
0
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

filename = "./data/ratings.dat"
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
# About format parameter:
#   'row': 1 -> Rows in matrix come from second column in ratings.dat file
#   'col': 0 -> Cols in matrix come from first column in ratings.dat file
#   'value': 2 -> Values (Mij) in matrix come from third column in ratings.dat file
#   'ids': int -> Ids (row and col ids) are integers (not strings)
data.load(filename, sep="::", format=format)
train, test = data.split_train_test(percent=80)  # 80% train, 20% test

svd = SVD()
svd.set_data(train)
# NOTE(review): svd.compute() is never called before predicting below --
# confirm the model is factorized somewhere, or these calls may fail.

print(svd.predict(22, 22, MIN_VALUE=0.0, MAX_VALUE=5.0))
# predicted rating for the (22, 22) pair, clamped to [0.0, 5.0]
print(svd.recommend(1, n=10, only_unknowns=True, is_row=False))
# top-10 items recommended for user 1, only items without a known rating
print(svd.recommend(1, n=10, only_unknowns=False, is_row=False))
# top-10 items recommended for user 1, including already-rated items
Example #44
0
class RecommendSystem(object):
    def __init__(self, filename, sep, **format):
        """Store the data-file location and parsing options, and create
        the SVD model, the data container and the RMSE evaluator.

        :param filename: path of the ratings data file
        :param sep: field separator used in the data file
        :param format: format options forwarded to Data.load()
        """
        self.filename = filename
        self.sep = sep
        self.format = format

        # Training parameters
        self.k = 100
        self.min_values = 10
        self.post_normalize = True

        self.svd = SVD()

        # Whether a previously saved model was loaded (set by get_data)
        self.is_load = False

        # Data container used for loading and splitting the ratings
        self.data = Data()

        # Model evaluation metric
        self.rmse = RMSE()

    def get_data(self):
        """
        Load the ratings and split them into train/test, unless a previously
        saved model exists, in which case load that model instead.

        :return: (train, test) Data split, or (None, None) when a saved
                 model was loaded
        """
        # If no saved model exists yet
        # (NOTE(review): `tmpfile` is a module-level name defined elsewhere)
        if not os.path.exists(tmpfile):
            # If the data file does not exist either, bail out
            if not os.path.exists(self.filename):
                sys.exit()
            # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format)
            # Load the ratings through Data()
            self.data.load(self.filename, sep=self.sep, format=self.format)
            train, test = self.data.split_train_test(percent=80)
            return train, test
        else:
            # Reuse the previously trained model and mark it as loaded,
            # so train() and evaluation() skip their work
            self.svd.load_model(tmpfile)
            self.is_load = True
            return None, None

    def train(self, train):
        """
        Factorize the SVD model on the training split, saving the result
        to `tmpfile`. Skipped when a saved model was already loaded.

        :param train: training Data split
        :return: None
        """
        # Guard clause: nothing to do when the model came from disk
        if self.is_load:
            return None
        self.svd.set_data(train)
        self.svd.compute(k=self.k,
                         min_values=self.min_values,
                         post_normalize=self.post_normalize,
                         savefile=tmpfile[:-4])
        return None

    def rs_predict(self, itemid, userid):
        """
        Predict the rating of user `userid` for movie `itemid`.

        :param itemid: movie id
        :param userid: user id
        :return: the predicted score
        """
        score = self.svd.predict(itemid, userid)
        print "推荐的分数为:%f" % score
        return score

    def recommend_to_user(self, userid):
        """
        推荐给用户
        :param userid: 用户id
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # 读取文件里的电影名称
        movie_list = []

        for line in open(moviefile, "r"):
            movie_list.append(' '.join(line.split("::")[1:2]))

        # 推荐具体电影名字和分数
        for itemid, rate in recommend_list:
            print "给您推荐了%s,我们预测分数为%s" % (movie_list[itemid], rate)
        return None

    def evaluation(self, test):
        """
        Evaluate the model on the test split with RMSE.

        :param test: test Data split
        :return: None
        """
        # Only evaluate when the model was trained in this run
        # (a model loaded from disk has no matching test split here)
        if not self.is_load:

            # Each tuple in the test set is <rating, movie id, user id>
            for value, itemid, userid in test.get():
                try:
                    predict = self.rs_predict(itemid, userid)
                    self.rmse.add(value, predict)
                except KeyError:
                    # Item/user unseen during training -- skip it
                    continue
            # Compute and print the accumulated (root-mean-square) error
            error = self.rmse.compute()

            print "模型误差为%s:" % error

        return None