def test_save_n_load(percent_train,
                     modelKlass=SVD,
                     dataFname='/Users/jennyyuejin/recommender/Data/movieData/u.data',
                     dataFormat={'col': 0, 'row': 1, 'value': 2, 'ids': int}):
    """Train a model, save it, reload it and evaluate both on the same split.

    The data file is loaded twice and split without shuffling each time, so
    the freshly trained model and the reloaded copy are evaluated on test
    sets produced by identical calls.

    :param percent_train: value forwarded to ``split_train_test(percent=...)``.
    :param modelKlass: model class; must provide ``set_data``, ``compute``,
        ``save_model`` and ``load_model``.
    :param dataFname: path of the tab-separated ratings file.
    :param dataFormat: column layout forwarded to ``Data.load``.
    """
    model_path = './model/svd.obj.zip'
    K = 100
    # Single source of truth: the options used to compute the model are the
    # same ones stored next to it, so the two can no longer drift apart.
    compute_options = {'k': K, 'min_values': 5,
                       'pre_normalize': None, 'mean_center': True, 'post_normalize': True}

    data = Data()
    # Pass a copy so the shared default dict can never be altered by the callee.
    data.load(dataFname, sep='\t', format=dict(dataFormat))

    print('------ evaluating original')
    train, test = data.split_train_test(percent=percent_train, shuffle_data=False)
    print('%s training data points; %s testing data points' % (len(train), len(test)))

    # Create SVD
    svd = modelKlass()
    svd.set_data(train)
    svd.compute(**compute_options)
    evaluate(svd, test)

    svd.save_model(model_path, dict(compute_options))

    print('------ evaluating copy')
    data2 = Data()
    data2.load(dataFname, sep='\t', format=dict(dataFormat))
    # Reload the data and redo the (unshuffled) split to get the test set again.
    _, test2 = data2.split_train_test(percent=percent_train, shuffle_data=False)
    print('%s testing data points' % len(test2))

    svd_pred = modelKlass()
    svd_pred.load_model(model_path)

    evaluate(svd_pred, test2)
Example #2
0
def ex1(dat_file='./ml-1m/ratings.dat',
        pct_train=0.5):
    """Fit an SVD on a '::'-separated ratings file and print RMSE and MAE.

    :param dat_file: path of the ratings file.
    :param pct_train: value forwarded to ``split_train_test(percent=...)``.
        NOTE(review): if that method expects a percentage (0-100), the
        default of 0.5 trains on 0.5% of the data -- confirm.
    """
    ratings_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    data = Data()
    data.load(dat_file, sep='::', format=ratings_layout)

    # Train/test split
    train, test = data.split_train_test(percent=pct_train)

    # Factorize the training matrix
    model = SVD()
    model.set_data(train)
    model.compute(k=100,
                  min_values=5,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=True)

    # Accumulate prediction errors over the test triples; triples whose
    # prediction raises KeyError are skipped.
    rmse, mae = RMSE(), MAE()
    for rating, item_id, user_id in test.get():
        try:
            estimate = model.predict(item_id, user_id)
            rmse.add(rating, estimate)
            mae.add(rating, estimate)
        except KeyError:
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def parse_data():
    """Load the ml-1m ratings, split them 80/20 and save the Data object.

    The object is saved with ``pickle=True`` under the name ``ratings``
    inside the directory returned by ``utils.get_add_dir()``.
    """
    ratings_path = '../data/ml-1m/ratings.dat'
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    ratings = Data()
    ratings.load(ratings_path, sep='::', format=layout)

    # 80% train, 20% test. The two halves are not used below; the call is
    # kept because its effect on ``ratings`` is not visible from here.
    train, test = ratings.split_train_test(percent=80)

    target = os.path.join(utils.get_add_dir(), 'ratings')
    ratings.save(target, pickle=True)
Example #4
0
def train_and_save(filename):
    """Compute and save an SVD model for *filename* unless one already exists.

    The text after the last '.' in *filename* is used as the "step" that
    names the model file (``svdn_model_<step>.zip``).

    :param filename: path of a '::'-separated ratings file.
    """
    step = filename.split('.')[-1]
    model_name = 'svdn_model_{step}'.format(step=step)
    model_zip = model_name + '.zip'

    data = Data()
    data.load(filename, sep='::',
              format={'col': 1, 'row': 0, 'value': 2, 'ids': 'str'})

    train, _ = data.split_train_test(percent=80)

    try:
        # Constructing the SVD from the zip is used purely as an
        # "is there a loadable model?" probe.
        SVD(model_zip)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are no longer mistaken for "model missing".
        svd = SVD()
        svd.set_data(train)
        svd.compute(
            k=100,
            min_values=2,
            pre_normalize=False,
            mean_center=True,
            post_normalize=True,
            savefile=model_name
        )
        print('Saved ' + model_zip)
    else:
        print('Already exists: ' + model_zip)
Example #5
0
def calculate_stats_users(pct_train):
    """Fit an SVD on ``user_data_working.csv`` and print RMSE and MAE.

    :param pct_train: value forwarded to ``split_train_test(percent=...)``.
    """
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    data = Data()
    data.load('user_data_working.csv', sep=',', format=layout)
    train, test = data.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train)
    model.compute(k=100, min_values=2, pre_normalize=None,
                  mean_center=True, post_normalize=False)

    rmse = RMSE()
    mae = MAE()
    # Triples whose prediction raises KeyError are left out of both metrics.
    for rating, item_id, user_id in test.get():
        try:
            estimate = model.predict(item_id, user_id)
            rmse.add(rating, estimate)
            mae.add(rating, estimate)
        except KeyError:
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
Example #6
0
def calculate_stats_features(pct_train):
    """Fit an SVD on ``feature_matrix.csv``.

    :param pct_train: value forwarded to ``split_train_test(percent=...)``.
    :returns: ``(svd, train, test)`` -- the computed model and both splits.
    """
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    data = Data()
    data.load('feature_matrix.csv', sep=',', format=layout)
    train, test = data.split_train_test(percent=pct_train)

    svd = SVD()
    svd.set_data(train)
    # No minimum-values filter, no centering and no normalisation.
    svd.compute(k=100,
                min_values=0,
                pre_normalize=None,
                mean_center=False,
                post_normalize=False)
    return svd, train, test
Example #7
0
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):
    """Fit an SVD on a '::'-separated ratings file and print RMSE and MAE.

    :param dat_file: path of the ratings file.
    :param pct_train: value forwarded to ``split_train_test(percent=...)``.
    """
    # Layout of the ratings file:
    #   'row': 1   -> matrix rows come from column 1 of the file
    #   'col': 0   -> matrix columns come from column 0 of the file
    #   'value': 2 -> matrix values come from column 2 of the file
    #   'ids': int -> row and column ids are integers, not strings
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    data = Data()
    data.load(dat_file, sep='::', format=layout)

    train, test = data.split_train_test(percent=pct_train)

    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100, min_values=5, pre_normalize=None,
                mean_center=True, post_normalize=True)

    # RMSE = root mean squared error, MAE = mean absolute error.
    # (The original author noted an MAE of about 1.09 on a 5-point scale.)
    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            guess = svd.predict(item_id, user_id)
            rmse.add(rating, guess)
            mae.add(rating, guess)
        except KeyError:
            # Triples whose prediction raises KeyError are skipped.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
Example #8
0
def get_mae_rmse(step):
    """Evaluate the saved model for *step* on the 20% test split of its data.

    Reads ``second_train_test.dat.<step>`` and ``svdn_model_<step>.zip``.

    :param step: suffix identifying both the data file and the model file.
    :returns: ``(mae, rmse)``; each is ``np.nan`` when no prediction could
        be made. Returns ``None`` (not a tuple) when the model cannot be
        loaded -- callers must check before unpacking.
    """
    data = Data()
    data.load('second_train_test.dat.{step}'.format(step=step),
              sep='::',
              format={'col': 1, 'row': 0, 'value': 2, 'ids': 'str'})

    _, test = data.split_train_test(percent=80)

    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
    except Exception:
        # No usable model for this step. ``Exception`` (not a bare except)
        # so that Ctrl-C / SystemExit still propagate.
        return None
    print('Loading model... {step}'.format(step=step))

    # One list of (actual, predicted) pairs feeds both metrics.
    pairs = []
    for rating, item_id, user_id in test:
        try:
            pairs.append((rating, svd.predict(item_id, user_id)))
        except Exception:
            # Best effort: skip triples that cannot be predicted.
            continue

    if not pairs:
        return np.nan, np.nan

    mae_value = MAE(pairs).compute()
    rmse_value = RMSE(list(pairs)).compute()
    return mae_value, rmse_value
Example #9
0
def calculate_stats_features(pct_train):
    """Build and compute an SVD from ``feature_matrix.csv``.

    :param pct_train: value forwarded to ``split_train_test(percent=...)``.
    :returns: tuple ``(svd, train, test)``.
    """
    data = Data()
    data.load('feature_matrix.csv', sep=',',
              format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = data.split_train_test(percent=pct_train)

    compute_options = {
        'k': 100,
        'min_values': 0,
        'pre_normalize': None,
        'mean_center': False,
        'post_normalize': False,
    }
    svd = SVD()
    svd.set_data(train)
    svd.compute(**compute_options)

    return svd, train, test
Example #10
0
def calculate_stats_users(pct_train):
    """Compute an SVD from ``user_data_working.csv`` and print its errors.

    Prints the RMSE and then the MAE of the model's predictions on the
    test split.

    :param pct_train: value forwarded to ``split_train_test(percent=...)``.
    """
    data = Data()
    data.load('user_data_working.csv', sep=',',
              format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = data.split_train_test(percent=pct_train)

    compute_options = {
        'k': 100,
        'min_values': 2,
        'pre_normalize': None,
        'mean_center': True,
        'post_normalize': False,
    }
    svd = SVD()
    svd.set_data(train)
    svd.compute(**compute_options)

    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            predicted = svd.predict(item_id, user_id)
            rmse.add(rating, predicted)
            mae.add(rating, predicted)
        except KeyError:
            # Skip triples whose prediction raises KeyError.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
Example #11
0
def ex1(dat_file='ml-1m/ratings.dat',
        pct_train=0.5):
    """Compute an SVD from a ratings file and print RMSE and MAE on the test split.

    :param dat_file: path of a '::'-separated ratings file.
    :param pct_train: value forwarded to ``split_train_test(percent=...)``.
    """
    # About the format parameter:
    #   'row': 1   -> rows of the matrix come from column 1 of the file
    #   'col': 0   -> columns of the matrix come from column 0 of the file
    #   'value': 2 -> values (Mij) come from column 2 of the file
    #   'ids': int -> row and column ids are integers (not strings)
    file_format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    data = Data()
    data.load(dat_file, sep='::', format=file_format)

    train, test = data.split_train_test(percent=pct_train)

    compute_options = {
        'k': 100,
        'min_values': 5,
        'pre_normalize': None,
        'mean_center': True,
        'post_normalize': True,
    }
    svd = SVD()
    svd.set_data(train)
    svd.compute(**compute_options)

    rmse = RMSE()
    mae = MAE()
    for rating, item_id, user_id in test.get():
        try:
            predicted = svd.predict(item_id, user_id)
            rmse.add(rating, predicted)
            mae.add(rating, predicted)
        except KeyError:
            # Skip triples whose prediction raises KeyError.
            pass

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
Example #12
0
File: day_07.py Project: lmlzk/ML
    def get_data(self):
        """Return a train/test split, or load the saved model if one exists.

        ``filename`` (a name resolved outside this method -- presumably the
        saved-model path; confirm) decides the branch:

        * it exists: the model is loaded into ``self.svd``,
          ``self.load_model`` is set to True and ``(None, None)`` is returned;
        * it does not exist: ``self.filename`` is loaded and split 80/20.

        :returns: ``(train, test)`` or ``(None, None)``.
        :raises SystemExit: when neither the model nor the data file exists.
        """
        # A saved model takes precedence over the raw data.
        if os.path.exists(filename):
            self.svd.load_model(filename)
            # Record that the model came from disk.
            self.load_model = True
            return None, None

        if not os.path.exists(self.filename):
            # Exit with a message (and therefore a non-zero status) instead
            # of the previous silent, "successful" exit.
            sys.exit('data file not found: %s' % self.filename)

        data = Data()
        data.load(self.filename, sep=self.sep, format=self.format)

        # Split the data set
        train, test = data.split_train_test(percent=80)
        return train, test
from recsys.evaluation.prediction import RMSE, MAE
from recsys.datamodel.data import Data

from baseline import Baseline #Import the test class we've just created
import time
# Script fragment: loads a ratings CSV, splits it 80/20 and starts building
# per-item and per-user rating dictionaries from the training split.
# NOTE(review): the excerpt stops at a dangling ``else:`` below; the rest of
# the script is not visible here.
start_time = time.time()
# Disabled k-fold loop scaffold:
#rmsem = []
#for k in range(1, 11):
#    print str(k)+" fold..."
# Dataset
dat_file='ratings_user.csv'

data = Data()
# No 'ids' entry is given in the format here.
data.load(dat_file, sep=',', format={'col':0, 'row':1, 'value':2})
train, test = data.split_train_test(percent=80)

print train
print test

################ kNN ################
# train_item: item_id -> {user_id: rating}
# train_user: user_id -> {item_id: rating}
train_item = {}
train_user = {}
for rating, item_id, user_id in train:
    if item_id in train_item:
        train_item[item_id][user_id] = rating
    else:
        train_item[item_id] = {user_id: rating}
    if user_id in train_user:
        train_user[user_id][item_id] = rating
    else:
Example #14
0
class Collaborative_filtering(object):
    """SVD-neighbourhood recommender over a ratings CSV and a movies table.

    ``movies`` is expected to be a pandas DataFrame with at least the
    columns ``movie_id`` and ``title`` (both are read by the lookup helpers).
    """

    def __init__(self, ratings_file, movies):
        """Store the inputs; nothing is loaded until ``compute_svd`` runs.

        :param ratings_file: path of the comma-separated ratings file.
        :param movies: DataFrame used to translate movie ids into titles.
        """
        self.movies = movies
        self.K = 100
        self.PERCENT_TRAIN = 85
        self.ratings_file = ratings_file
        self.data = None
        self.svd = None
        self.recommend_movies_list = None
        self.recommend_movies_ids = None
        self.similar_movies_list = None
        self.similar_movies_ids = None

        self.movie_id = None
        self.train = None
        self.test = None

    def compute_svd(self):
        """Load the ratings, split them and compute the neighbourhood SVD.

        Uses ``self.PERCENT_TRAIN`` for the split and ``self.K`` for the
        factorisation, so adjusting those attributes now takes effect.
        """
        self.data = Data()
        self.data.load(self.ratings_file,
                       sep=',',
                       format={
                           'col': 0,
                           'row': 1,
                           'value': 2,
                           'ids': float
                       })
        self.train, self.test = self.data.split_train_test(
            percent=self.PERCENT_TRAIN)
        self.svd = SVDNeighbourhood()
        self.svd.set_data(self.train)
        self.svd.compute(k=self.K,
                         min_values=1,
                         pre_normalize=None,
                         mean_center=False,
                         post_normalize=True)

    def similarity_measure(self, movie1, movie2):
        """Return the model's similarity of two movies, rounded to 4 decimals."""
        return round(self.svd.similarity(movie1, movie2), 4)

    def _rows_for(self, movie_id):
        """Return the rows of ``self.movies`` whose ``movie_id`` matches."""
        return self.movies.loc[self.movies['movie_id'] == movie_id]

    def _title_for(self, movie_id):
        """Return the title of the first matching movie, or '' if none matches.

        Reads the value directly rather than parsing the printed form of the
        Series, which abbreviates long titles and breaks on titles that
        contain the word 'Name:'.
        """
        titles = self._rows_for(movie_id)['title']
        if len(titles) == 0:
            return ''
        return titles.iloc[0]

    def recommend_movies(self, user_id):
        """Recommend up to 10 movies the user has not rated.

        :returns: ``(titles, ids)`` as two parallel lists; both are also
            stored on the instance.
        :raises IndexError: if a recommended id is missing from ``self.movies``.
        """
        recommendations = self.svd.recommend(
            user_id, n=10, only_unknowns=True, is_row=False)
        self.recommend_movies_list = []
        self.recommend_movies_ids = []
        for entry in recommendations:
            self.recommend_movies_list.append(self._title_for(entry[0]))
            # The id is taken from the first column of the first matching row.
            first_columns = self._rows_for(entry[0]).iloc[:, 0:2].values.tolist()
            self.recommend_movies_ids.append(first_columns[0][0])
        return self.recommend_movies_list, self.recommend_movies_ids

    def get_similar_movies(self, movie1):
        """Return titles and ids of the movies most similar to *movie1*.

        The first entry returned by the model is dropped (presumably the
        query movie itself -- confirm against the model's ``similar``).

        :param movie1: movie id; converted with ``int()``.
        :returns: ``(titles, ids)`` as two parallel Python lists; both are
            also stored on the instance.
        """
        movie1 = int(movie1)
        neighbours = self.svd.similar(movie1)[1:]

        self.similar_movies_list = [self._title_for(entry[0])
                                    for entry in neighbours]
        self.similar_movies_ids = [entry[0] for entry in neighbours]

        return self.similar_movies_list, self.similar_movies_ids
Example #15
0
#To show some messages:
import recsys.algorithm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data
from recsys.utils.svdlibc import SVDLIBC
from recsys.evaluation.prediction import RMSE, MAE

# Script fragment: usage is <script> <ratings file> <percent train>.
# NOTE(review): the excerpt stops right after ``try:`` below; the body of
# the evaluation loop is not visible here.
# Dataset
PERCENT_TRAIN = int(sys.argv[2])
data = Data()
data.load(sys.argv[1], sep='::', format={'col':0, 'row':1, 'value':2, 'ids':int})
# Train & Test data
train, test = data.split_train_test(percent=PERCENT_TRAIN)

# NOTE(review): the model below is built from a hard-coded path rather than
# from sys.argv[1] / ``train`` -- confirm the test split is really held out.
svdlibc = SVDLIBC('./ml-1m/ratings.dat')
svdlibc.to_sparse_matrix(sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int})
svdlibc.compute(k=100)
svd = svdlibc.export()
svd.save_model('/tmp/svd-model', options={'k': 100})
#svd.similar(ITEMID1) # results might be different than example 4. as there's no min_values=10 set here


# Evaluation using prediction-based metrics
print 'Evaluating...'
rmse = RMSE()
mae = MAE()
for rating, item_id, user_id in test.get():
    try:
Example #16
0
# Script fragment: loads the ratings, splits them 80/20 and computes an SVD
# on the training part.
# NOTE(review): the excerpt is cut off inside the last ``svd.compute(`` call.

# NOTE(review): this first SVD is replaced by a new ``SVD()`` further down
# without being used in between -- confirm whether this load is needed.
svd = SVD()
svd.load_data(filename='./data/ratings.dat',
              sep='::',
              format={
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': int
              })

# Splitting the dataset
filename = './data/ratings.dat'
data = Data()
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
data.load(filename, sep='::', format=format)
train_80, test_20 = data.split_train_test(percent=80)  # 80% train, 20% test
svd = SVD()
svd.set_data(train_80)

# Setting the parameters used to build the matrix
k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)

# NOTE(review): the visible arguments repeat the call above -- confirm
# whether the second compute is intentional.
k = 100
svd.compute(k=k,
            min_values=10,
            pre_normalize=None,
Example #17
0
class RecommendSystem(object):
    """SVD-based recommender that trains a model once and reuses the saved copy.

    ``tmpfile`` (path of the saved model) and ``moviefile`` (path of the
    '::'-separated movie list) are names resolved outside this class.
    """

    def __init__(self, filename, sep, **format):
        """Store the data location and create the model and helper objects.

        :param filename: path of the ratings file.
        :param sep: field separator of the ratings file.
        :param format: column layout, forwarded as a dict to ``Data.load``.
        """
        self.filename = filename
        self.sep = sep
        self.format = format

        # Training parameters
        self.k = 100
        self.min_values = 10
        self.post_normalize = True

        self.svd = SVD()

        # True once a previously saved model has been loaded
        self.is_load = False

        # Data handling
        self.data = Data()

        # Model evaluation
        self.rmse = RMSE()

    def get_data(self):
        """Return a train/test split, or load the saved model if one exists.

        :returns: ``(train, test)`` when the data was loaded and split 80/20,
            ``(None, None)`` when the saved model was loaded instead.
        :raises SystemExit: when neither the model nor the data file exists.
        """
        # A saved model takes precedence over the raw data.
        if os.path.exists(tmpfile):
            self.svd.load_model(tmpfile)
            self.is_load = True
            return None, None

        if not os.path.exists(self.filename):
            # Exit with a message (and therefore a non-zero status) instead
            # of the previous silent, "successful" exit.
            sys.exit('data file not found: %s' % self.filename)

        self.data.load(self.filename, sep=self.sep, format=self.format)
        train, test = self.data.split_train_test(percent=80)
        return train, test

    def train(self, train):
        """Compute the model from *train* unless a saved one was loaded.

        :param train: training data.
        :return: None
        """
        if not self.is_load:
            self.svd.set_data(train)
            # The save name is ``tmpfile`` without its last four characters
            # (presumably the ".zip" extension -- confirm).
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=tmpfile[:-4])
        return None

    def rs_predict(self, itemid, userid):
        """Predict, print and return the rating of *itemid* by *userid*.

        :param itemid: movie id.
        :param userid: user id.
        :return: the predicted score.
        """
        score = self.svd.predict(itemid, userid)
        print("推荐的分数为:%f" % score)
        return score

    def recommend_to_user(self, userid):
        """Print the recommended movies for *userid* with their predicted scores.

        Titles are looked up by the id stored in the first field of each
        ``moviefile`` line. The previous version indexed a list of titles by
        item id, which only lines up when ids are 0-based and contiguous in
        file order. When an id has no title the id itself is printed.

        :param userid: user id.
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Map movie id (as text) -> title; the file is closed when done.
        titles = {}
        with open(moviefile, "r") as handle:
            for line in handle:
                fields = line.split("::")
                titles[fields[0].strip()] = ' '.join(fields[1:2])

        # Print each recommended movie name with its predicted score.
        for itemid, rate in recommend_list:
            name = titles.get(str(itemid), itemid)
            print("给您推荐了%s,我们预测分数为%s" % (name, rate))
        return None

    def evaluation(self, test):
        """Print the RMSE over *test* when the model was trained in this run.

        Nothing is evaluated when the model was loaded from disk.

        :param test: test set yielding ``(rating, item id, user id)`` triples.
        :return: None
        """
        if not self.is_load:

            # Triples whose prediction raises KeyError are skipped.
            for value, itemid, userid in test.get():
                try:
                    predict = self.rs_predict(itemid, userid)
                    self.rmse.add(value, predict)
                except KeyError:
                    continue
            # Compute the error over everything accumulated so far.
            error = self.rmse.compute()

            print("模型误差为%s:" % error)

        return None
Example #18
0
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

# Load the ratings, keep 80% for training and query the model.
filename = "./data/ratings.dat"
# Layout of the ratings file:
#   'row': 1   -> matrix rows come from the second column of the file
#   'col': 0   -> matrix columns come from the first column of the file
#   'value': 2 -> matrix values (Mij) come from the third column of the file
#   'ids': int -> row and column ids are integers (not strings)
format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

data = Data()
data.load(filename, sep="::", format=format)
train, test = data.split_train_test(percent=80)  # 80% train, 20% test

svd = SVD()
svd.set_data(train)

# Predicted value for the pair (22, 22), with bounds 0.0 and 5.0 passed in.
print(svd.predict(22, 22, MIN_VALUE=0.0, MAX_VALUE=5.0))

# Top-10 recommendations for id 1: first with only_unknowns=True, then with
# only_unknowns=False.
for only_unknowns in (True, False):
    print(svd.recommend(1, n=10, only_unknowns=only_unknowns, is_row=False))
Example #19
0
    print 'RMSE=%s' % rmse.compute()
    print 'MAE=%s' % mae.compute()


if __name__ == '__main__':

    # Dataset: tab-separated file, everything assigned to the training split.
    PERCENT_TRAIN = 100
    data = Data()
    data.load('/Users/jennyyuejin/recommender/Data/test_0/userProd.data',
              sep='\t',
              format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

    # Train & test data
    train, test = data.split_train_test(percent=PERCENT_TRAIN,
                                        shuffle_data=True)
    print(' '.join(str(part) for part in (
        len(train), 'training data points;', len(test), 'testing data points')))

    # Two items with hand-written metadata.
    itemId = 0
    item = Item(itemId)
    item.add_data({
        'name': 'project0',
        'popularity': 0.5,
        'tags': [0, 0, 1],
    })

    itemId = 1
    item2 = Item(itemId)
    item2.add_data({
        'name': 'project1',
        'popularity': 0.9,
        'tags': [0, 0, 1],
    })
Example #20
0
              'col': 0,
              'row': 1,
              'value': 2,
              'ids': int
          })

# Result lists for the two models; nothing in the visible part of the loop
# appends to them yet.
# NOTE(review): the excerpt ends after the two compute() calls, so the rest
# of the loop body (if any) is not visible here.
rmse_svd_all = []
mae_svd_all = []
rmse_svd_neig_all = []
mae_svd_neig_all = []

RUNS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for run in RUNS:
    print 'RUN(%d)' % run
    # Train & Test data: a new split is requested on every run
    train, test = data.split_train_test(percent=PERCENT_TRAIN)

    # Both models are given the same training split
    svd.set_data(train)
    svd_neig.set_data(train)

    # Compute SVD (identical options for both models)
    svd.compute(k=K,
                min_values=None,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)
    svd_neig.compute(k=K,
                     min_values=None,
                     pre_normalize=None,
                     mean_center=True,
                     post_normalize=True)