Ejemplo n.º 1
0
def ex1(dat_file='./ml-1m/ratings.dat',
        pct_train=0.5):
    """Fit an SVD model on part of a ratings file and print its test errors.

    The file is read with ``::`` as separator: column 0 feeds the matrix
    columns, column 1 the rows, column 2 the values, and ids are converted
    with ``int``.

    :param dat_file: path of the ratings file.
    :param pct_train: forwarded as ``percent`` to ``Data.split_train_test``.
        NOTE(review): other snippets in this file pass ``percent=80``;
        confirm that 0.5 is the intended scale.
    """
    column_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    ratings = Data()
    ratings.load(dat_file, sep='::', format=column_layout)

    # hold out part of the ratings for evaluation
    train, test = ratings.split_train_test(percent=pct_train)

    # factorise the training ratings
    n_factors = 100
    model = SVD()
    model.set_data(train)
    model.compute(k=n_factors,
                  min_values=5,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=True)

    # accumulate (real, predicted) pairs in both metrics
    rmse, mae = RMSE(), MAE()
    for rating, item_id, user_id in test.get():
        try:
            predicted = model.predict(item_id, user_id)
            for metric in (rmse, mae):
                metric.add(rating, predicted)
        except KeyError:
            # skip ratings for which a KeyError is raised
            # (presumably ids missing from the trained model)
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
Ejemplo n.º 2
0
def evaluate(data, count=5, K=100):
    """Run ``count`` independent train/test evaluations of an SVD model.

    Every round re-splits ``data`` using the module-level ``PERCENT_TRAIN``,
    fits an SVD with ``K`` factors on the training part and scores the
    predictions for the test part with RMSE and MAE.

    :param data: dataset exposing ``split_train_test`` and ``get``.
    :param count: number of rounds to run.
    :param K: number of factors passed to ``SVD.compute``.
    :return: list of ``{"RMSE": ..., "MAE": ...}`` dicts, one per round whose
        metrics could be computed (failed rounds are reported and skipped).
    """
    results = []

    for _ in range(count):
        train, test = data.split_train_test(percent=PERCENT_TRAIN)
        # sizes of the full set, the training part and the test part
        print('%s %s %s' % (len(data.get()), len(train.get()), len(test.get())))

        svd = SVD()
        svd.set_data(train)
        svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)

        # Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in test.get():
            try:
                pred_rating = svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                # skip ratings for which a KeyError is raised
                continue

        try:
            rsu = {"RMSE": rmse.compute(), "MAE": mae.compute()}
        except Exception:
            # Best effort: a round whose metrics cannot be computed is
            # reported and skipped.  ``Exception`` (instead of a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            print("one error....++++++++++++++++++++++++++++++++++++++++++++++++++++")
            continue

        print(rsu)
        results.append(rsu)

    return results
Ejemplo n.º 3
0
def calculate_stats_users(pct_train):
    """Print RMSE and MAE of an SVD model fitted on ``user_data_working.csv``.

    :param pct_train: forwarded as ``percent`` to ``Data.split_train_test``.
    """
    ratings = Data()
    ratings.load('user_data_working.csv',
                 sep=',',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = ratings.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train)
    model.compute(k=100, min_values=2, pre_normalize=None,
                  mean_center=True, post_normalize=False)

    # both metrics receive exactly the same (real, predicted) pairs
    metrics = (RMSE(), MAE())
    for rating, item_id, user_id in test.get():
        try:
            estimate = model.predict(item_id, user_id)
            for metric in metrics:
                metric.add(rating, estimate)
        except KeyError:
            # skip ratings for which a KeyError is raised
            continue

    rmse, mae = metrics
    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
Ejemplo n.º 4
0
def recommended_files(data,user):
    """Return up to 11 tths recommended to ``user``, best prediction first.

    Candidates are the files listed (in ``db.user_list``) for the users that
    the SVD model reports as similar to ``user`` and that ``user`` does not
    already have.  They are ranked by predicted value; a candidate is dropped
    when the external ``similar`` helper says its name matches the name of an
    already accepted recommendation.

    :param data: dataset handed to ``SVD.set_data``.
    :param user: id of the user to recommend for.
    :return: list of tths (at most 11, possibly empty).
    """
    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000, min_values=0, pre_normalize=None, mean_center=False, post_normalize=True)

    # The first entry is skipped, as in the original code (presumably it is
    # the queried user itself - confirm).
    neighbours = [entry[0] for entry in svd.similar(user)][1:]

    # tths the user already owns, plus every candidate seen so far
    seen_tths = set(doc['tth'] for doc in db.user_list.find({'user': user}))

    candidates = []
    for neighbour in neighbours:
        for doc in db.user_list.find({'user': neighbour}):
            tth = doc['tth']
            if tth in seen_tths:
                continue
            movie_name = db.tths.find_one({'tth': tth})['name']
            seen_tths.add(tth)
            candidates.append((movie_name, tth, svd.predict(user, tth)))

    candidates.sort(key=lambda c: c[2], reverse=True)

    max_results = 11  # the original loop returned once it held more than 10
    res = []
    accepted_names = []
    for movie_name, tth, _score in candidates:
        # Compare against the *names* already accepted.  The previous code
        # compared against ``r[0]`` of the stored tth, i.e. not a name.
        if any(similar(movie_name, kept) for kept in accepted_names):
            continue
        accepted_names.append(movie_name)
        res.append(tth)
        if len(res) >= max_results:
            break

    # Always return the list, also when fewer than ``max_results`` were found
    # (the previous code fell off the end and returned None in that case).
    return res
Ejemplo n.º 5
0
def quickstart():
    """Walk through the basic SVD calls on the MovieLens 1M ratings file.

    Loads ``DATA_DIR + 'ml-1m-ratings.dat'``, computes a 100-factor SVD,
    drops into the debugger, then prints similar items, one predicted and
    one stored rating, and the recommendation list for item 1.
    """
    svd = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    ratings_path = DATA_DIR + 'ml-1m-ratings.dat'
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd.load_data(filename=ratings_path, sep='::', format=layout)

    # compute svd
    n_factors = 100
    svd.compute(k=n_factors, min_values=10, pre_normalize=None,
                mean_center=True, post_normalize=True)

    # interactive breakpoint kept from the original walkthrough
    pdb.set_trace()

    # movie id's
    ITEMID1 = 1  # toy story
    ITEMID2 = 1221  # godfather II

    # get movies similar to toy story
    print(svd.similar(ITEMID1))

    # rating bounds and the (item, user) pair to predict
    MIN_RATING, MAX_RATING = 0.0, 5.0
    USERID, ITEMID = 1, 1

    # predicted rating for user1 and item1, mapped onto min max
    pred = svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
    actual = svd.get_matrix().value(ITEMID, USERID)
    print('predicted rating = {0}'.format(pred))
    print('actual rating = {0}'.format(actual))

    print('which users should see Toy Story?:')
    print(svd.recommend(ITEMID))
Ejemplo n.º 6
0
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):
    """Fit an SVD model on part of a ratings file and print RMSE and MAE.

    About the ``format`` mapping used to read the file:
      * ``'row': 1``   -> matrix rows come from column 1 of the file
      * ``'col': 0``   -> matrix columns come from column 0 of the file
      * ``'value': 2`` -> matrix values come from column 2 of the file
      * ``'ids': int`` -> row and column ids are integers (not strings)

    :param dat_file: path of the ``::``-separated ratings file.
    :param pct_train: forwarded as ``percent`` to ``Data.split_train_test``.
    """
    file_layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    dataset = Data()
    dataset.load(dat_file, sep='::', format=file_layout)

    # create train/test split
    train, test = dataset.split_train_test(percent=pct_train)

    # create svd
    factors = 100
    model = SVD()
    model.set_data(train)
    model.compute(k=factors, min_values=5, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    # evaluate performance; MAE is the mean ABSOLUTE error, i.e. the
    # average distance in rating points between real and predicted values
    rmse, mae = RMSE(), MAE()
    for rating, item_id, user_id in test.get():
        try:
            guess = model.predict(item_id, user_id)
            rmse.add(rating, guess)
            mae.add(rating, guess)
        except KeyError:
            # skip ratings for which a KeyError is raised
            continue

    for label, metric in (('RMSE', rmse), ('MAE', mae)):
        print('%s=%s' % (label, metric.compute()))
Ejemplo n.º 7
0
def get_mae_rmse(step):
    """Compute MAE and RMSE of a saved SVD model on a fresh test split.

    Loads ``second_train_test.dat.<step>``, splits it 80/20, loads the model
    stored in ``svdn_model_<step>.zip`` and scores its predictions for the
    test part.

    :param step: suffix identifying both the data file and the model file.
    :return: ``(mae, rmse)``; each is ``np.nan`` when no prediction could be
        made.  Returns ``None`` when the saved model cannot be loaded.
    """
    data = Data()

    # renamed from ``format`` so the builtin is not shadowed
    id_format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}

    filename = 'second_train_test.dat.{step}'.format(step=step)

    data.load(filename, sep='::', format=id_format)

    # only the test part is needed; the model is loaded from disk
    _, test = data.split_train_test(percent=80)

    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
    except Exception:
        # Best effort: without a loadable model there is nothing to score.
        # ``Exception`` keeps KeyboardInterrupt/SystemExit propagating.
        return None
    print('Loading model... {step}'.format(step=step))

    # (real, predicted) pairs shared by both metrics
    pairs = []
    for rating, item_id, user_id in test:
        try:
            predicted = svd.predict(item_id, user_id)
        except Exception:
            # skip ratings the model cannot predict
            continue
        pairs.append((rating, predicted))

    mae_value, rmse_value = np.nan, np.nan

    if pairs:
        mae_value = MAE(pairs).compute()
        rmse_value = RMSE(pairs).compute()

    return mae_value, rmse_value
Ejemplo n.º 8
0
def evaulte(train_set, test_set):
    """Fit an SVD on ``train_set`` and return half the MAE on ``test_set``.

    Uses the module-level ``KKK`` (number of factors) and ``MIN_ITEM``
    (``min_values``) settings.  Ratings whose prediction raises ``KeyError``
    are counted and left out of the metric.

    :param train_set: dataset handed to ``SVD.set_data``.
    :param test_set: dataset whose ``get()`` yields
        ``(rating, item_id, user_id)`` tuples.
    :return: ``MAE.compute() / 2.0``.
    """
    model = SVD()
    model.set_data(train_set)
    model.compute(k=KKK,
                  min_values=MIN_ITEM,
                  pre_normalize=None,
                  mean_center=True,
                  post_normalize=True)

    mae = MAE()
    missing = 0
    for rating, item_id, user_id in test_set.get():
        try:
            mae.add(rating, model.predict(item_id, user_id))
        except KeyError:
            missing += 1

    # same text the original multi-item print statement produced
    print('k_err %s  --  test-len:  %s train-len:  %s'
          % (missing, len(test_set.get()), len(train_set.get())))
    return mae.compute() / 2.0
Ejemplo n.º 9
0
def calculate_stats_users(pct_train):
    """Fit an SVD on ``user_data_working.csv`` and print its RMSE and MAE.

    :param pct_train: forwarded as ``percent`` to ``Data.split_train_test``.
    """
    source_path = 'user_data_working.csv'
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

    dataset = Data()
    dataset.load(source_path, sep=',', format=layout)
    train, test = dataset.split_train_test(percent=pct_train)

    factorizer = SVD()
    factorizer.set_data(train)
    factorizer.compute(k=100,
                       min_values=2,
                       pre_normalize=None,
                       mean_center=True,
                       post_normalize=False)

    rmse, mae = RMSE(), MAE()
    for rating, item_id, user_id in test.get():
        try:
            predicted = factorizer.predict(item_id, user_id)
            rmse.add(rating, predicted)
            mae.add(rating, predicted)
        except KeyError:
            # skip ratings for which a KeyError is raised
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
Ejemplo n.º 10
0
def ex1(dat_file='ml-1m/ratings.dat',
        pct_train=0.5):
    """Fit an SVD model on part of a ratings file and print RMSE and MAE.

    About the ``format`` mapping used to read the file:
      * ``'row': 1``   -> matrix rows come from column 1 of the file
      * ``'col': 0``   -> matrix columns come from column 0 of the file
      * ``'value': 2`` -> matrix values come from column 2 of the file
      * ``'ids': int`` -> row and column ids are integers (not strings)

    :param dat_file: path of the ``::``-separated ratings file.
    :param pct_train: forwarded as ``percent`` to ``Data.split_train_test``.
    """
    ratings = Data()
    ratings.load(dat_file,
                 sep='::',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

    # create train/test split
    train, test = ratings.split_train_test(percent=pct_train)

    # create svd
    svd = SVD()
    svd.set_data(train)
    svd.compute(k=100, min_values=5, pre_normalize=None,
                mean_center=True, post_normalize=True)

    # evaluate performance
    scorers = (RMSE(), MAE())
    for rating, item_id, user_id in test.get():
        try:
            predicted = svd.predict(item_id, user_id)
            for scorer in scorers:
                scorer.add(rating, predicted)
        except KeyError:
            # skip ratings for which a KeyError is raised
            continue

    rmse, mae = scorers
    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
Ejemplo n.º 11
0
def quickstart():
    """Walk through the basic SVD calls on ``ml-1m/ratings.dat``.

    Loads the ratings, computes a 100-factor SVD, drops into the debugger,
    queries similar items and recommendations (results are not printed) and
    prints one predicted rating next to the stored one.
    """
    svd = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    svd.load_data(filename='ml-1m/ratings.dat', sep='::', format=layout)

    # compute svd
    svd.compute(k=100,
                min_values=10,
                pre_normalize=None,
                mean_center=True,
                post_normalize=True)

    # interactive breakpoint kept from the original walkthrough
    pdb.set_trace()

    # movie id's
    ITEMID1 = 1      # toy story
    ITEMID2 = 1221   # godfather II

    # get movies similar to toy story (return value is discarded)
    svd.similar(ITEMID1)

    # rating bounds and the (item, user) pair to predict
    MIN_RATING, MAX_RATING = 0.0, 5.0
    USERID, ITEMID = 1, 1

    # get predicted rating and the stored one
    pred = svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
    actual = svd.get_matrix().value(ITEMID, USERID)
    print('predicted rating = {0}'.format(pred))
    print('actual rating = {0}'.format(actual))

    # which users should see Toy Story? (return value is discarded)
    svd.recommend(ITEMID)
Ejemplo n.º 12
0
def evaulte(train_set, test_set):
    """Return half of the MAE that an SVD fitted on ``train_set`` gets on ``test_set``.

    The factor count and ``min_values`` come from the module-level ``KKK``
    and ``MIN_ITEM``.  Test ratings whose prediction raises ``KeyError`` are
    counted (and reported) but not scored.

    :param train_set: dataset handed to ``SVD.set_data``.
    :param test_set: dataset whose ``get()`` yields
        ``(rating, item_id, user_id)`` tuples.
    :return: ``MAE.compute() / 2.0``.
    """
    svd = SVD()
    svd.set_data(train_set)
    svd.compute(k=KKK, min_values=MIN_ITEM, pre_normalize=None,
                mean_center=True, post_normalize=True)

    error_metric = MAE()
    key_errors = 0
    for rating, item_id, user_id in test_set.get():
        try:
            predicted = svd.predict(item_id, user_id)
            error_metric.add(rating, predicted)
        except KeyError:
            key_errors += 1

    # same text the original multi-item print statement produced
    summary = 'k_err %s  --  test-len:  %s train-len:  %s' % (
        key_errors, len(test_set.get()), len(train_set.get()))
    print(summary)

    return error_metric.compute() / 2.0
Ejemplo n.º 13
0
                  'ids': str
              })
k = 200
svd.compute(k=k, savefile='../tmp/weight')

svd2 = SVD(filename='../tmp/weight')  # Loading already computed SVD model

output_path = "./output.txt"

# Both files are managed by ``with`` so they are flushed and closed even when
# a line cannot be processed (the previous code never closed either file and
# relied on the Python 2-only ``file()`` builtin).
with open(output_path, 'w') as output_file, \
        open("../validate_nolabel.txt") as validate_file:
    validate_file.readline()  # the first line is skipped, as before

    for raw_line in validate_file:
        line = raw_line.strip("\r\n")
        if not line:
            # a blank line is skipped instead of silently ending the run
            continue

        fields = line.split(',')
        question_id = fields[0]
        user_id = fields[1]
        try:
            predict = svd2.predict(user_id, question_id, 0.0, 1.0)
        except Exception:
            # best effort: pairs that cannot be predicted are written as 0
            predict = 0
            print(question_id + "," + user_id + "      Exception")

        # clamp into [0, 1]; values below 0.0001 are written as 0.0
        if predict > 1.0:
            predict = 1.0
        if predict < 0.0001:
            predict = 0.0

        result = question_id + "," + user_id + "," + str(predict)
        output_file.write(result + "\n")
Ejemplo n.º 14
0
    print(json.dumps(similaries, ensure_ascii=False))

# import pdb;pdb.set_trace()
# NOTE(review): sys.exit(0) terminates the script right here, so every
# statement below it is currently unreachable.
import sys
sys.exit(0)

# Items most similar to ITEMID1.
print(svd.similar(ITEMID1))

# Returns: <ITEMID, Cosine Similarity Value>

# Rating bounds passed to predict() and the (item, user) pair to query.
MIN_RATING = 0.0
MAX_RATING = 1.0
ITEMID = 109
USERID = 3837663637323963363639393565373833613237396534393132376338386362

print('testing..')
print(svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING))
# Predicted value 5.0
# NOTE(review): 5.0 is above the MAX_RATING of 1.0 passed above; this
# comment is presumably stale - confirm.

# Value stored in the matrix for the same (item, user) pair.
print(svd.get_matrix().value(ITEMID, USERID))

# Real value 5.0

# Recommend (non-rated) movies to a user:
print('recommend to user')
print(svd.recommend(USERID, is_row=False)) #cols are users and rows are items, thus we set is_row=False

# Recommendation list for the item (row) itself.
print(svd.recommend(ITEMID))

# Drop into the debugger at the end of the script.
import pdb;pdb.set_trace()
Ejemplo n.º 15
0
#svd.set_data(train)

# Assume the number of singular values (factors) is 100.
k = 100
svd.compute(k=k,
            min_values=1,
            pre_normalize=None,
            mean_center=False,
            post_normalize=True)
#svd.compute(k=k,min_values=10,pre_normalize=None,mean_center=True,post_normalize=True,savefile='/tmp/movielens')

# You can compute the similarity between two movies.
ITEMID1 = 3
ITEMID2 = 3
#svd.similarity(ITEMID1,ITEMID2)

# NOTE(review): this calls similar(), not similarity(); ITEMID2 is passed as
# the second positional argument, which presumably is a result count rather
# than a second item id - confirm the intent.
print svd.similar(ITEMID1, ITEMID2)

# Or get the movies similar to one movie.
print svd.similar(ITEMID1)

# Next, predict the rating a user would give to a movie.
MIN_RATING = 1.0
MAX_RATING = 5.0
USERID1 = 30
print svd.predict(ITEMID1, USERID1, MIN_RATING, MAX_RATING)

# The main event: recommend movies to a user!
print svd.recommend(USERID1, is_row=False)
# Who should watch this movie?
print svd.recommend(ITEMID1)
Ejemplo n.º 16
0
    # Compute SVD
    # Both models are factorised with the same settings.
    svd.compute(k=K, min_values=None, pre_normalize=None, mean_center=True, post_normalize=True)
    svd_neig.compute(k=K, min_values=None, pre_normalize=None, mean_center=True, post_normalize=True)

    # Evaluate
    # One RMSE/MAE accumulator pair per model.
    rmse_svd = RMSE()
    mae_svd = MAE()
    rmse_svd_neig = RMSE()
    mae_svd_neig = MAE()

    i = 1  # 1-based counter shown in the progress line
    total = len(test.get())
    print "Total Test ratings: %s" % total
    for rating, item_id, user_id in test:
        try:
            pred_rating_svd = svd.predict(item_id, user_id)
            rmse_svd.add(rating, pred_rating_svd)
            mae_svd.add(rating, pred_rating_svd)

            pred_rating_svd_neig = svd_neig.predict(item_id, user_id)  # Koren & co.
            # NOTE(review): `is not nan` is an identity test, so it only
            # filters the very same `nan` object; a NaN produced elsewhere
            # would still be added to the metrics - consider an isnan() check.
            if pred_rating_svd_neig is not nan:
                rmse_svd_neig.add(rating, pred_rating_svd_neig)
                mae_svd_neig.add(rating, pred_rating_svd_neig)

            # "\r" plus the trailing comma rewrites the same console line
            print "\rProcessed test rating %d" % i,
            sys.stdout.flush()

            i += 1
        except KeyError:
            # A KeyError anywhere in the body skips the rest of this rating,
            # including the progress counter.
            continue
Ejemplo n.º 17
0
            pre_normalize=None,
            mean_center=True,
            post_normalize=True)
# predicted_rating = svd.predict(int(5), 'A1', 1, 10)
# predicted_rating2 = svd.predict(int(1), 'A1', 1, 10)

# print('Predicted rating', predicted_rating)
# print('Predicted rating', predicted_rating2)

# Load the '|'-separated records; each record is indexed below by the keys
# 'user' and 'item'.
records = ETLUtils.load_csv_file(file_name_header, '|')
errors = []  # absolute error of every record that could be predicted

for record in records:
    try:
        # print(record['user'], record['item'], record['rating'])
        user = record['user']
        item = int(record['item'])  # item ids are converted to int, user ids are not
        # prediction requested with bounds 1 and 5
        predicted_rating = svd.predict(item, user, 1, 5)
        print(record['user'], record['item'], predicted_rating)
        # predicted_rating = round(predicted_rating)
        # The reference value is read back from the model's own matrix.
        # NOTE(review): presumably this is the data the model was fitted on,
        # so the figures below would be training error - confirm.
        actual_rating = svd.get_matrix().value(item, user)
        error = abs(predicted_rating - actual_rating)
        errors.append(error)
    except KeyError:
        # records for which a KeyError is raised are skipped
        continue

# Aggregate the collected absolute errors with the project helpers.
mean_absolute_error = MeanAbsoluteError.compute_list(errors)
root_mean_square_error = RootMeanSquareError.compute_list(errors)
print('Mean Absolute error: %f' % mean_absolute_error)
print('Root mean square error: %f' % root_mean_square_error)
Ejemplo n.º 18
0
            post_normalize=True)
#svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True)
#svd.compute(k=K, pre_normalize=None, mean_center=True, post_normalize=True)

# --- similarity queries -----------------------------------------------------
print ''
print 'COMPUTING SIMILARITY'
print svd.similarity(1, 2)  # similarity between items
print svd.similar(1, 5)  # show 5 similar items

# --- single prediction ------------------------------------------------------
print ''
print 'GENERATING PREDICTION'
# Rating bounds passed to predict() and the (item, user) pair to query.
MIN_RATING = 0.0
MAX_RATING = 5.0
ITEMID = 1
USERID = 1
print svd.predict(ITEMID, USERID, MIN_RATING,
                  MAX_RATING)  # predicted rating value
print svd.get_matrix().value(ITEMID, USERID)  # real rating value

# --- recommendation ---------------------------------------------------------
print ''
print 'GENERATING RECOMMENDATION'
# five entries, restricted to unknown ones, for a column id (is_row=False)
print svd.recommend(USERID, n=5, only_unknowns=True, is_row=False)

#Evaluation using prediction-based metrics
# One accumulator per metric.
rmse = RMSE()
mae = MAE()
spearman = SpearmanRho()
kendall = KendallTau()
#decision = PrecisionRecallF1()
for rating, item_id, user_id in test.get():
    try:
        pred_rating = svd.predict(item_id, user_id)
Ejemplo n.º 19
0
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

# Tiny in-memory data set of (rating, user, item) tuples.
data = [(4.0, 'user1', 'item1'), (2.0, 'user1', 'item3'),
        (1.0, 'user2', 'item1'), (5.0, 'user2', 'item4')]

# Wrap the tuples in a Data object and hand it to the SVD.
d = Data()
d.set(data)
svd = SVD()
svd.set_data(d)
m = svd.get_matrix()  # matrix handle; not used again in this snippet
svd.compute(k=2)  # factorise with two factors
print svd.similar('user1')  # entries similar to 'user1'
print svd.predict('user1', 'item1')  # predicted value for ('user1', 'item1')
Ejemplo n.º 20
0
 (3114, 0.87060391051018071), # Toy Story 2
 (2355, 0.67706936677315799), # A bug's life
 (588,  0.5807351496754426),  # Aladdin
 (595,  0.46031829709743477), # Beauty and the Beast
 (1907, 0.44589398718134365), # Mulan
 (364,  0.42908159895574161), # The Lion King
 (2081, 0.42566581277820803), # The Little Mermaid
 (3396, 0.42474056361935913), # The Muppet Movie
 (2761, 0.40439361857585354)] # The Iron Giant

 MIN_RATING = 0.0
MAX_RATING = 5.0
ITEMID = 1
USERID = 1

svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)
# Predicted value 5.0

svd.get_matrix().value(ITEMID, USERID)
# Real value 5.0

svd.recommend(USERID, is_row=False) #cols are users and rows are items, thus we set is_row=False

# Returns: <ITEMID, Predicted Rating>
[(2905, 5.2133848204673416), # Shaggy D.A., The
 (318,  5.2052108435956033), # Shawshank Redemption, The
 (2019, 5.1037438278755474), # Seven Samurai (The Magnificent Seven)
 (1178, 5.0962756861447023), # Paths of Glory (1957)
 (904,  5.0771405690055724), # Rear Window (1954)
 (1250, 5.0744156653222436), # Bridge on the River Kwai, The
 (858,  5.0650911066862907), # Godfather, The
Ejemplo n.º 21
0
Archivo: day_07.py Proyecto: lmlzk/ML
class RecommendSystem(object):
    """SVD-based movie recommender that can reuse a previously saved model.

    NOTE(review): ``get_data`` and ``train`` read a module-level name
    ``filename`` (distinct from ``self.filename``); it is presumably the path
    of the saved model archive - confirm where it is defined.
    """

    def __init__(self, filename, sep, **format):
        """Store the data-file settings and create the model and metric.

        :param filename: path of the ratings data file.
        :param sep: field separator used in that file.
        :param format: keyword arguments describing the file layout; passed
            on unchanged as ``format`` to ``Data.load``.
        """
        # data file information
        self.filename = filename
        self.sep = sep
        self.format = format

        # matrix factorisation object
        self.svd = SVD()

        # factorisation settings
        self.k = 100  # number of latent factors
        self.min_values = 10  # drop movies rated by fewer than 10 users
        self.post_normalize = False

        # set to True once a saved model has been loaded
        self.load_model = False

        # root mean square error accumulator
        self.rmse = RMSE()

    def get_data(self):
        """Load a saved model if present, otherwise load and split the data.

        :return: ``(train, test)`` from an 80 percent split when no saved
            model exists, or ``(None, None)`` after loading the saved model.
            The process exits when neither the model nor the data file exists.
        """
        # when no saved model exists the raw data has to be loaded
        if not os.path.exists(filename):
            if not os.path.exists(self.filename):
                sys.exit()
            # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format)
            data = Data()

            data.load(self.filename, sep=self.sep, format=self.format)

            # split the data set
            train, test = data.split_train_test(percent=80)

            return train, test

        else:
            # load the saved model directly
            self.svd.load_model(filename)

            # remember that a model was loaded so training is skipped
            self.load_model = True

            return None, None

    def train(self, train):
        """Fit the SVD on ``train`` unless a saved model was loaded.

        :param train: training set returned by ``get_data``.
        :return: None
        """
        if not self.load_model:
            # hand the training data to the SVD
            self.svd.set_data(train)
            # note: the save file name is passed without its extension
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=filename[:-4])
        return None

    def recommend_to_user(self, userid):
        """Print the recommended movies and predicted ratings for a user.

        :param userid: id of the user to recommend for.
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Map movie id -> title.  The previous code indexed a 0-based list of
        # file lines with the movie id, which picks the wrong title as soon
        # as ids do not equal line positions.  Keys are kept as strings so
        # the lookup works for int and str ids alike.
        movie_titles = {}
        with open("./data/ml-1m/movies.dat", "r") as movies_file:
            for line in movies_file:
                fields = line.split("::")
                if len(fields) > 1:
                    movie_titles[fields[0]] = fields[1]

        # print the title and predicted rating of every recommended id
        for itemid, rating in recommend_list:
            # fall back to the id itself when the title is unknown
            title = movie_titles.get(str(itemid), str(itemid))
            print("给你推荐的电影叫%s, 预测你对它的评分是%f" % (title, rating))

        return None

    def rs_predict(self, userid, itemid):
        """Return the predicted score of ``userid`` for ``itemid``.

        :param userid: user id.
        :param itemid: item id.
        :return: value returned by ``SVD.predict(itemid, userid)``.
        """
        score = self.svd.predict(itemid, userid)

        return score

    def evaluation(self, test):
        """Print the RMSE of the model on ``test`` (skipped for loaded models).

        Pairs are added to ``self.rmse``, which is created once in
        ``__init__`` and therefore keeps accumulating across calls.

        :param test: test data whose ``get()`` yields
            ``(rating, itemid, userid)`` tuples.
        :return: None
        """
        if not self.load_model:
            # test tuples are <rating, row (itemid), col (userid)>
            for rating, itemid, userid in test.get():
                try:
                    # ``rating`` is the true value
                    score = self.rs_predict(userid, itemid)

                    # accumulate every test pair
                    self.rmse.add(rating, score)
                except KeyError:
                    continue

            error = self.rmse.compute()

            print("均方误差为:%s" % error)

        return None
Ejemplo n.º 22
0
class Recommender:
    """Film-rating predictor wrapping an SVD model and a rating matrix.

    The rating matrix is managed through ``rm.MatrixCreator``; the SVD is
    (re)computed from the file at ``self.datafile_path``.
    """

    def __init__(self, datafile_path=None):
        """Create the SVD wrapper and immediately load ``datafile_path``.

        :param datafile_path: rating-matrix file, loaded through
            ``load_local_data`` with K=100 and min_values=0.
        """
        self.svd = SVD()
        self.matrix = None
        self.datafile_path = datafile_path
        self.predict_matrix = None
        self.load_local_data(self.datafile_path, 100, 0)

    def load_web_data(self, filename, film_names_with_rate_list, K, min_values,
                  MAX_COUNT_USER_FILMS=None, MAX_COUNT_FILM_USERS=None):
        """Build a rating matrix from film titles, save it and recompute the SVD.

        :param filename: file the new rating matrix is saved to; becomes
            ``self.datafile_path``.
        :param film_names_with_rate_list: passed to
            ``MatrixCreator.create_matrix_by_film_titles``.
        :param K: number of factors for the SVD.
        :param min_values: ``min_values`` for the SVD computation.
        :param MAX_COUNT_USER_FILMS: first ``MatrixCreator`` argument.
        :param MAX_COUNT_FILM_USERS: second ``MatrixCreator`` argument.
        """
        self.matrix = rm.MatrixCreator(MAX_COUNT_USER_FILMS, MAX_COUNT_FILM_USERS).\
            create_matrix_by_film_titles(film_names_with_rate_list)
        self.matrix.save_rating_matrix_as_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def load_local_data(self, filename, K, min_values):
        """Restore a rating matrix from ``filename`` and recompute the SVD.

        :param filename: saved rating-matrix file; becomes
            ``self.datafile_path``.
        :param K: number of factors for the SVD.
        :param min_values: ``min_values`` for the SVD computation.
        """
        self.matrix = rm.MatrixCreator().restore_from_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def get_predictions_for_all_users(self, min_rate=1, max_rate=10, top = None, K=None, min_values=0):
        """Predict a rating for every (user, film) index pair.

        :param min_rate: lower bound passed to ``SVD.predict``.
        :param max_rate: upper bound passed to ``SVD.predict``.
        :param top: currently unused.
        :param K: when truthy, the SVD is recomputed with this many factors.
        :param min_values: currently unused.
        :return: the filled ``self.predict_matrix`` (users x films array).
        """
        if K:
            self.__compute_matrix(K)

        self.predict_matrix = np.zeros((len(self.matrix.users_indexes_map), len(self.matrix.films_indexes_map)))
        for user in self.matrix.users_indexes_map.keys():
            for film in self.matrix.films_indexes_map.keys():
                user_index = self.matrix.users_indexes_map[user]
                film_index = self.matrix.films_indexes_map[film]
                self.predict_matrix[user_index][film_index] = self.svd.predict(user_index, film_index, MIN_VALUE=min_rate, MAX_VALUE=max_rate)
        return self.predict_matrix


    def predict_for_user(self, user_index, min_rate=1, max_rate=10, top = None, repeat=False, K=None, min_values=None):
        """Predict the rating of every film for one user.

        :param user_index: index of the user to predict for.
        :param min_rate: lower bound passed to ``SVD.predict``.
        :param max_rate: upper bound passed to ``SVD.predict``.
        :param top: when set, only the ``top`` best predictions are returned,
            best first.
        :param repeat: when false, films already present in the preferences
            of the first fake user are removed from the result.
        :param K: to change the number of properties
        :param min_values: currently unused.
        :return: ``{Film: rate, ...}`` when ``repeat`` is true and ``top`` is
            not set, otherwise ``[(Film, rate), ...]``.
        """
        if K:
            self.__compute_matrix(K)

        prediction = {}
        np_matrix = self.matrix.get_rating_matrix()
        for index in xrange(np_matrix.shape[1]):
            rate = self.svd.predict(user_index, index,
                                    MIN_VALUE=min_rate,
                                    MAX_VALUE=max_rate)
            film = self.matrix.indexes_films_map[index]
            prediction[film] = rate

        if not repeat:
            # NOTE(review): the films to hide always come from the first fake
            # user, not from ``user_index`` - confirm this is intended.
            fake_user_index = self.matrix.indexes_with_fake_user_ids.keys()[0]
            user = self.matrix.indexes_users_map[fake_user_index]
            films = set(user.get_preferences().keys())

            prediction = [(x, prediction[x]) for x in prediction if x not in films]

        if top:
            # ``prediction`` is still a dict when ``repeat`` is true but is
            # already a list of pairs otherwise; the previous code called
            # ``.items()`` unconditionally and failed on the list.
            pairs = prediction.items() if isinstance(prediction, dict) else prediction
            prediction = sorted(pairs, key=operator.itemgetter(1))
            prediction = list(reversed(prediction[-top:]))

        return prediction

    def predict_for_all_fake_users(self, min_rate=1, max_rate=10, top = None, K=None, min_values=0):
        """Run ``predict_for_user`` for every fake user.

        :param min_rate: lower bound passed on to ``predict_for_user``.
        :param max_rate: upper bound passed on to ``predict_for_user``.
        :param top: passed on to ``predict_for_user``.
        :param K: to change the number of properties
        :param min_values: currently unused.
        :return: list with one ``predict_for_user`` result per fake user.
        """
        if K:
            self.__compute_matrix(K)

        predictions = []

        for user_index in self.matrix.indexes_with_fake_user_ids.keys():
            prediction = self.predict_for_user(user_index, min_rate, max_rate, top)
            predictions.append(prediction)

        return predictions

    def predicted_rating_submatrix(self, user_indexes):
        """Return predicted ratings (bounds 1..10) for the given users.

        The SVD is recomputed with 100 factors first.

        :param user_indexes: iterable of user indexes.
        :return: array with one row per user index and one column per film.
        """
        self.__compute_matrix(100)
        # start with one placeholder row so rows can be appended; it is
        # sliced off again before returning
        predicted = np.empty((1, self.matrix.rating_matrix.shape[1]), int)
        for index in user_indexes:
            row = []
            for film_index in xrange(self.matrix.rating_matrix.shape[1]):
                row.append(self.svd.predict(index, film_index,
                                    MIN_VALUE=1,
                                    MAX_VALUE=10))

            predicted = np.append(predicted, [row], axis=0)
        return predicted[1:]

    def predicted_rating_submatrix_for_fake(self):
        """Return ``predicted_rating_submatrix`` for all fake users."""
        return self.predicted_rating_submatrix(self.matrix.indexes_with_fake_user_ids.keys())

    def __compute_matrix(self, K,
                         min_values=0,
                         pre_normalize=None,
                         mean_center=True,
                         post_normalize=True):
        """Reload ``self.datafile_path`` into the SVD and factorise it.

        The file is read as space-separated ``row col value`` lines with
        integer ids; the remaining arguments are forwarded to ``SVD.compute``.
        """
        self.svd.load_data(self.datafile_path, sep=' ', format={'col': 1, 'row': 0, 'value': 2, 'ids': int})
        self.svd.compute(K, min_values, pre_normalize, mean_center, post_normalize, savefile=None)

    def filter_films_data(self, min_user_votes):
        """Write copies of the data files without rarely rated films.

        Films with fewer than ``min_user_votes`` ratings are dropped and the
        remaining films are renumbered consecutively from 0.  Three files are
        written next to ``self.datafile_path``, using ``N`` for
        ``str(min_user_votes)``: ``<path>_N`` (ratings), ``<path>_N_film_map``
        and ``<path>_N_user_map`` (an unchanged copy of the user map).

        :param min_user_votes: minimum number of ratings a film needs to be
            kept.
        """
        counter = collections.Counter()
        with open(self.datafile_path, 'rb') as my_file:
            for row in csv.reader(my_file):
                _user_index, film_index, _rate = row[0].split(' ')
                counter[int(film_index)] += 1

        # indexes of the films to drop; a set keeps the lookups below O(1)
        film_indexes = set(k for k, v in counter.iteritems() if v < min_user_votes)

        copyfile(self.datafile_path+'_user_map', self.datafile_path+'_'+str(min_user_votes)+'_user_map')

        # old film index (as read from the file, a string) -> new index
        new_indexes = {}
        with open(self.datafile_path+'_film_map', 'rb') as read_file:
            r = csv.reader(read_file)
            with open(self.datafile_path+'_'+str(min_user_votes)+'_film_map', 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                index = 0
                for row in r:
                    film_index, film_id = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    new_indexes[film_index] = index
                    wr.writerow([index, film_id])
                    index += 1

        with open(self.datafile_path, 'rb') as read_file:
            r = csv.reader(read_file)
            with open(self.datafile_path+'_'+str(min_user_votes), 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                for row in r:
                    user_index, film_index, rate = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    wr.writerow([user_index, new_indexes[film_index], rate])
Ejemplo n.º 23
0
# Factorise with 100 factors.
k = 100
svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True)
# predicted_rating = svd.predict(int(5), 'A1', 1, 10)
# predicted_rating2 = svd.predict(int(1), 'A1', 1, 10)

# print('Predicted rating', predicted_rating)
# print('Predicted rating', predicted_rating2)

# Load the '|'-separated records; each record is indexed below by the keys
# 'user' and 'item'.
records = ETLUtils.load_csv_file(file_name_header, '|')
errors = []  # absolute error of every record that could be predicted

for record in records:
    try:
        # print(record['user'], record['item'], record['rating'])
        user = record['user']
        item = int(record['item'])  # item ids are converted to int, user ids are not
        # prediction requested with bounds 1 and 5
        predicted_rating = svd.predict(item, user, 1, 5)
        print(record['user'], record['item'], predicted_rating)
        # predicted_rating = round(predicted_rating)
        # The reference value is read back from the model's own matrix.
        # NOTE(review): presumably this is the data the model was fitted on,
        # so the figures below would be training error - confirm.
        actual_rating = svd.get_matrix().value(item, user)
        error = abs(predicted_rating - actual_rating)
        errors.append(error)
    except KeyError:
        # records for which a KeyError is raised are skipped
        continue

# Aggregate the collected absolute errors with the project helpers.
mean_absolute_error = MeanAbsoluteError.compute_list(errors)
root_mean_square_error = RootMeanSquareError.compute_list(errors)
print('Mean Absolute error: %f' % mean_absolute_error)
print('Root mean square error: %f' % root_mean_square_error)
Ejemplo n.º 24
0
from recsys.algorithm.factorize import SVD
from recsys.evaluation.prediction import RMSE, MAE
import sys

#Dataset
#PERCENT_TRAIN = 80
# NOTE(review): `Data` is not among the imports shown directly above this
# snippet (SVD, RMSE, MAE, sys) - confirm it is imported elsewhere.
data = Data()
data.load('./ml-1m/ratings.dat', sep='::', format={'col':0, 'row':1, 'value':2, 'ids':int})

#Load SVD from /tmp
svd2 = SVD(filename='/tmp/movielens') # Loading already computed SVD model

#Predict User rating for given user and movie:
USERID = 2   
ITEMID= 1 # Toy Story
# prediction requested with bounds 0.0 and 5.0
rating1=svd2.predict(ITEMID, USERID, 0.0, 5.0)
print 'Predicted rating=%f'% rating1

# flag becomes 1 once the (USERID, ITEMID) pair is found in the loaded data
flag=0
#Retrieve actual rating for given user and movie
for rating, item_id, user_id in data.get():
	if user_id == USERID and item_id == ITEMID:
		rat = rating
		#print 'Actual rating=%f' % rating
		flag=1
		break
		
# Print the stored rating, or exit with a message when the pair is absent.
if flag == 1:
	print 'Actual rating=%f'% rat
else :
	sys.exit("No actual rating available")
Ejemplo n.º 25
0
class Recommender:
    """Film recommender that factorizes a stored rating matrix with SVD.

    Ratings are read from the space-separated file ``datafile_path`` (fields
    0, 1, 2 are loaded as row id, column id and value, see
    ``__compute_matrix``). ``self.matrix`` is whatever ``rm.MatrixCreator``
    returns; only the attributes read in the methods below are relied upon.
    """

    def __init__(self, datafile_path=None):
        self.svd = SVD()
        self.matrix = None
        self.datafile_path = datafile_path
        self.predict_matrix = None
        # Restore the matrix and factorize immediately with K=100, min_values=0.
        self.load_local_data(self.datafile_path, 100, 0)

    def load_web_data(self,
                      filename,
                      film_names_with_rate_list,
                      K,
                      min_values,
                      MAX_COUNT_USER_FILMS=None,
                      MAX_COUNT_FILM_USERS=None):
        """Build the matrix from film titles, save it to *filename*, refactorize."""
        self.matrix = rm.MatrixCreator(MAX_COUNT_USER_FILMS, MAX_COUNT_FILM_USERS).\
            create_matrix_by_film_titles(film_names_with_rate_list)
        self.matrix.save_rating_matrix_as_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def load_local_data(self, filename, K, min_values):
        """Restore the matrix from *filename* and refactorize it."""
        self.matrix = rm.MatrixCreator().restore_from_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def get_predictions_for_all_users(self,
                                      min_rate=1,
                                      max_rate=10,
                                      top=None,
                                      K=None,
                                      min_values=0):
        """Fill and return ``self.predict_matrix`` with a prediction per cell.

        :param min_rate: lower bound passed to ``svd.predict`` as MIN_VALUE
        :param max_rate: upper bound passed to ``svd.predict`` as MAX_VALUE
        :param top: accepted for signature symmetry; not used here
        :param K: when truthy, recompute the SVD with K factors first
        :param min_values: accepted but not forwarded to the recomputation
        :return: array of shape (number of users, number of films)
        """
        if K:
            self.__compute_matrix(K)

        users_map = self.matrix.users_indexes_map
        films_map = self.matrix.films_indexes_map
        self.predict_matrix = np.zeros((len(users_map), len(films_map)))
        for user in users_map.keys():
            user_index = users_map[user]
            for film in films_map.keys():
                film_index = films_map[film]
                self.predict_matrix[user_index][film_index] = self.svd.predict(
                    user_index,
                    film_index,
                    MIN_VALUE=min_rate,
                    MAX_VALUE=max_rate)
        return self.predict_matrix

    def predict_for_user(self,
                         user_index,
                         min_rate=1,
                         max_rate=10,
                         top=None,
                         repeat=False,
                         K=None,
                         min_values=None):
        """Predict a rate for every film column for *user_index*.

        :param user_index: row index handed to ``svd.predict``
        :param min_rate: lower bound passed as MIN_VALUE
        :param max_rate: upper bound passed as MAX_VALUE
        :param top: when truthy, keep only the *top* best-rated films
        :param repeat: when false, drop films already in the preferences of
            the first fake user
        :param K: to change the number of properties (recomputes the SVD)
        :param min_values: accepted but not forwarded to the recomputation
        :return: ``{Film: rate, ...}`` when ``repeat`` is true and ``top`` is
            unset; ``[(Film, rate), ...]`` otherwise (best first when ``top``
            is given)
        """
        if K:
            self.__compute_matrix(K)

        prediction = {}
        np_matrix = self.matrix.get_rating_matrix()
        for index in range(np_matrix.shape[1]):
            rate = self.svd.predict(user_index,
                                    index,
                                    MIN_VALUE=min_rate,
                                    MAX_VALUE=max_rate)
            film = self.matrix.indexes_films_map[index]
            prediction[film] = rate

        if not repeat:
            # NOTE(review): the filter always uses the FIRST fake user, not
            # ``user_index`` -- kept as in the original; confirm intent.
            fake_user_index = list(
                self.matrix.indexes_with_fake_user_ids.keys())[0]
            user = self.matrix.indexes_users_map[fake_user_index]
            seen_films = set(user.get_preferences().keys())

            prediction = [(film, rate) for film, rate in prediction.items()
                          if film not in seen_films]

        if top:
            # ``prediction`` is a dict or already a list of pairs at this
            # point; calling .items() unconditionally crashed on the list.
            if isinstance(prediction, dict):
                pairs = prediction.items()
            else:
                pairs = prediction
            ranked = sorted(pairs, key=operator.itemgetter(1))
            prediction = list(reversed(ranked[-top:]))

        return prediction

    def predict_for_all_fake_users(self,
                                   min_rate=1,
                                   max_rate=10,
                                   top=None,
                                   K=None,
                                   min_values=0):
        """Run ``predict_for_user`` for every fake user index.

        :param K: to change the number of properties (recomputes the SVD once)
        :param min_values: accepted but not forwarded to the recomputation
        :return: list with one ``predict_for_user`` result per fake user
        """
        if K:
            self.__compute_matrix(K)

        return [
            self.predict_for_user(user_index, min_rate, max_rate, top)
            for user_index in self.matrix.indexes_with_fake_user_ids.keys()
        ]

    def predicted_rating_submatrix(self, user_indexes):
        """Return one row of predictions (bounds 1..10) per index given.

        The SVD is always recomputed with K=100 first.
        """
        self.__compute_matrix(100)
        film_count = self.matrix.rating_matrix.shape[1]
        # Seed row gives np.append a 2-D shape to extend; dropped on return.
        predicted = np.empty((1, film_count), int)
        for index in user_indexes:
            row = [
                self.svd.predict(index, film_index, MIN_VALUE=1, MAX_VALUE=10)
                for film_index in range(film_count)
            ]
            predicted = np.append(predicted, [row], axis=0)
        return predicted[1:]

    def predicted_rating_submatrix_for_fake(self):
        """Return ``predicted_rating_submatrix`` for all fake user indexes."""
        return self.predicted_rating_submatrix(
            self.matrix.indexes_with_fake_user_ids.keys())

    def __compute_matrix(self,
                         K,
                         min_values=0,
                         pre_normalize=None,
                         mean_center=True,
                         post_normalize=True):
        """Reload ``datafile_path`` into the SVD and factorize with K factors."""
        self.svd.load_data(self.datafile_path,
                           sep=' ',
                           format={
                               'col': 1,
                               'row': 0,
                               'value': 2,
                               'ids': int
                           })
        self.svd.compute(K,
                         min_values,
                         pre_normalize,
                         mean_center,
                         post_normalize,
                         savefile=None)

    def filter_films_data(self, min_user_votes):
        """Write copies of the data files without rarely-rated films.

        Films with fewer than *min_user_votes* rating lines are dropped and
        the remaining films are renumbered from 0. Three files are written
        next to ``datafile_path`` with a ``_<min_user_votes>`` infix: the
        copied user map, the renumbered film map and the filtered ratings.
        """
        # Binary modes with csv are the Python 2 convention used by this file.
        counter = collections.Counter()
        with open(self.datafile_path, 'rb') as my_file:
            for row in csv.reader(my_file):
                # The default comma dialect leaves the whole space-separated
                # line in row[0].
                _, film_index, _ = row[0].split(' ')
                counter[int(film_index)] += 1

        # A set gives O(1) membership for the two filtering passes below.
        film_indexes = set(
            k for k, v in counter.items() if v < min_user_votes)

        suffix = '_' + str(min_user_votes)
        copyfile(self.datafile_path + '_user_map',
                 self.datafile_path + suffix + '_user_map')

        new_indexes = {}
        with open(self.datafile_path + '_film_map', 'rb') as read_file:
            r = csv.reader(read_file)
            with open(self.datafile_path + suffix + '_film_map',
                      'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                index = 0
                for row in r:
                    film_index, film_id = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    # Keyed by the textual index, matching the lookup below.
                    new_indexes[film_index] = index
                    wr.writerow([index, film_id])
                    index += 1

        with open(self.datafile_path, 'rb') as read_file:
            r = csv.reader(read_file)
            with open(self.datafile_path + suffix, 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                for row in r:
                    user_index, film_index, rate = row[0].split(' ')
                    if int(film_index) in film_indexes:
                        continue
                    wr.writerow([user_index, new_indexes[film_index], rate])
Ejemplo n.º 26
0
import csv

# VERBOSE is switched on before the factorize module is imported, exactly as
# in the original ordering.
import recsys.algorithm
recsys.algorithm.VERBOSE = True

from recsys.algorithm.factorize import SVD

# Train on the comma-separated file: column 0 -> matrix column,
# column 1 -> matrix row, column 2 -> value.
svd = SVD()
svd.load_data(
    filename='train.csv',
    sep=',',
    format={'col': 0, 'row': 1, 'value': 2},
)

k = 100
svd.compute(k=k, pre_normalize=None, mean_center=True, post_normalize=True)

# Bounds handed to every prediction.
MIN_RATING = 0.0
MAX_RATING = 5000.0

test_file = 'test.csv'
soln_file = 'recsys.csv'

with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    # Skip the header row of the test file.
    next(test_csv, None)

    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(
            soln_fh,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
        )
        soln_csv.writerow(['Id', 'plays'])

        # One output row per test row: its id and the predicted value.
        for row in test_csv:
            row_id, user, artist = row[0], row[1], row[2]
            soln_csv.writerow(
                [row_id, svd.predict(artist, user, MIN_RATING, MAX_RATING)])
Ejemplo n.º 27
0
                  'col': 0,
                  'row': 1,
                  'value': 2,
                  'ids': float
              })

# Factorize with 30 factors and persist the model under /tmp/movielens.
k = 30
svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True,
            post_normalize=True, savefile='/tmp/movielens')

# Disabled similarity example kept from the original:
# ITEMID1 = 1    # Toy Story (1995)
# ITEMID2 = 2355 # A bug's life (1998)
# print svd.similarity(ITEMID1, ITEMID2)

# Bounds handed to every prediction below.
MIN_RATING = 1.0
MAX_RATING = 5.0

USERID = 1
ITEMID = 1129

print(svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING))

# Predicted vs. stored value for item 1953 / user 1; the original author
# noted 5.0 for both.
print(svd.predict(1953, 1, MIN_RATING, MAX_RATING))
print(svd.get_matrix().value(1953, 1))
Ejemplo n.º 28
0
#3.10
# Bare expressions below are notebook-style cells whose value is displayed.
[items_full[str(x[0])].get_data() for x in films]

#3.11
get_name_item_reviewed(10,user_full,items_full)

#3.12
items_full[str(2628)].get_data()
users_for_star_wars = svd.recommend(2628,only_unknowns=True)
users_for_star_wars

#3.13
# Flatten the per-user review lists, aggregate by category and sort by the
# second element of each aggregate, highest first.
movies_reviewed_by_sw_rec  =[get_name_item_reviewed(x[0],user_full,items_full) for x in users_for_star_wars]
movies_flatten = [movie for movie_list in movies_reviewed_by_sw_rec for movie in movie_list]
movie_aggregate = movies_by_category(movies_flatten, 3)
movies_sort = sorted(movie_aggregate,key=lambda x: x[1], reverse=True)
movies_sort

#3.14
# Accumulate RMSE over every stored rating.
from recsys.evaluation.prediction import RMSE
err = RMSE()
for rating, item_id, user_id in data.get():
    try:
        prediction = svd.predict(item_id, user_id)
        err.add(rating, prediction)
    except KeyError:
        # The old `except KeyError, k` form is Python-2-only and bound an
        # unused name; pairs that raise KeyError are simply skipped.
        continue

print('RMSE is ' + str(err.compute()))
Ejemplo n.º 29
0
class Recommender():
    """Thin wrapper around an SVD model with a train and a test dataset.

    Both datasets are expected to expose ``get()`` returning
    ``(rating, item_id, user_id)`` tuples, which is how every method below
    unpacks them.
    """

    def __init__(self, train, test):
        recsys.algorithm.VERBOSE = True
        self.train = train
        self.test = test
        self.svd = SVD()
        self.svd.set_data(train)

    def set_train(self, train):
        """Replace the stored train dataset (the SVD data is not reloaded)."""
        self.train = train

    def set_test(self, test):
        """Replace the stored test dataset."""
        self.test = test

    def get_train(self):
        """Return the stored train dataset."""
        return self.train

    def get_test(self):
        """Return the stored test dataset."""
        return self.test

    def get_alluserid(self, dataset):
        """Return the distinct user ids of *dataset* in first-seen order."""
        # A companion set keeps the membership test O(1); ids are assumed
        # hashable (ints or strings) -- TODO confirm.
        seen = set()
        userid_list = []
        for rating, item_id, user_id in dataset.get():
            if user_id not in seen:
                seen.add(user_id)
                userid_list.append(user_id)
        return userid_list

    def get_allitemid(self, dataset):
        """Return the distinct item ids of *dataset* in first-seen order."""
        seen = set()
        itemid_list = []
        for rating, item_id, user_id in dataset.get():
            if item_id not in seen:
                seen.add(item_id)
                itemid_list.append(item_id)
        return itemid_list

    def eval_rmse(self):
        """Print RMSE and MAE of the model over the test dataset."""
        # Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in self.test.get():
            try:
                pred_rating = self.svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                # Pairs the model cannot resolve are left out of the metrics.
                continue
        print('RMSE=%s' % rmse.compute())
        print('MAE=%s' % mae.compute())

    def recommend(self, N=10, only_unknowns=False, is_row=True):
        """Return ``{user_id: recommendations}`` for test users seen in train.

        :param N: number of recommendations requested per user
        :param only_unknowns: currently ignored; the call always passes False
        :param is_row: currently ignored; the call always passes False
        :return: dict mapping each qualifying user id to ``svd.recommend``'s
            result; each user's list is computed and printed once
        """
        # Built once instead of once per test rating.
        train_users = set(self.get_alluserid(self.train))
        rec_list = {}
        for rating, item_id, user_id in self.test.get():
            if user_id in train_users and user_id not in rec_list:
                rec_list[user_id] = self.svd.recommend(
                    user_id, n=N, only_unknowns=False, is_row=False)
                print(rec_list[user_id])
        return rec_list

    def precisionRecall(self, rec_list2, test_dict):
        """Compute recall and precision of top-30 recommendations.

        :param rec_list2: unused; recommendations are recomputed here
        :param test_dict: ``{user: {item: ...}}`` ground truth per user
        :return: ``[recall, precision]``; both are 0.0 when no user of
            *test_dict* is present in the train dataset
        """
        print("Start calculate precision and recall...")
        top_n = 30
        train_users = set(self.get_alluserid(self.train))
        hit = 0
        n_recall = 0
        n_precision = 0
        for user, items in test_dict.items():
            if user not in train_users:
                continue
            rec_list = self.svd.recommend(
                user, n=top_n, only_unknowns=False, is_row=False)
            r = [i[0] for i in rec_list]
            print('rec_list %s' % (r,))
            hit += len(set(r) & set(items.keys()))
            n_recall += len(items)
            n_precision += top_n
        # Guard the divisions: with no qualifying user both totals stay 0.
        recall = hit / (1.0 * n_recall) if n_recall else 0.0
        precision = hit / (1.0 * n_precision) if n_precision else 0.0
        return [recall, precision]
Ejemplo n.º 30
0
                     pre_normalize=None,
                     mean_center=True,
                     post_normalize=True)

    # Evaluate
    # One RMSE/MAE accumulator pair per model: plain SVD and the
    # neighbourhood variant.
    rmse_svd = RMSE()
    mae_svd = MAE()
    rmse_svd_neig = RMSE()
    mae_svd_neig = MAE()

    # Local import so the NaN check below does not depend on file-level names.
    import math

    i = 1
    total = len(test.get())
    print 'Total Test ratings: %s' % total
    for rating, item_id, user_id in test:
        try:
            pred_rating_svd = svd.predict(item_id, user_id)
            rmse_svd.add(rating, pred_rating_svd)
            mae_svd.add(rating, pred_rating_svd)

            pred_rating_svd_neig = svd_neig.predict(item_id,
                                                    user_id)  #Koren & co.
            # `is not nan` only recognised the one imported NaN object; a
            # NaN produced by arithmetic is a different object and slipped
            # through into the metrics. isnan covers both.
            if not math.isnan(pred_rating_svd_neig):
                rmse_svd_neig.add(rating, pred_rating_svd_neig)
                mae_svd_neig.add(rating, pred_rating_svd_neig)

            # Trailing comma keeps the progress counter on one line.
            print "\rProcessed test rating %d" % i,
            sys.stdout.flush()

            i += 1
        except KeyError:
            # Pairs that raise KeyError are skipped and not counted.
            continue
Ejemplo n.º 31
0
# Bare expressions below are notebook-style cells whose value is displayed.
get_name_item_reviewed(10, user_full, items_full)

#3.12
items_full[str(2628)].get_data()
users_for_star_wars = svd.recommend(2628, only_unknowns=True)
users_for_star_wars

#3.13
# Flatten the per-user review lists, aggregate by category and sort by the
# second element of each aggregate, highest first.
movies_reviewed_by_sw_rec = [
    get_name_item_reviewed(x[0], user_full, items_full)
    for x in users_for_star_wars
]
movies_flatten = [
    movie for movie_list in movies_reviewed_by_sw_rec for movie in movie_list
]
movie_aggregate = movies_by_category(movies_flatten, 3)
movies_sort = sorted(movie_aggregate, key=lambda x: x[1], reverse=True)
movies_sort

#3.14
# Accumulate RMSE over every stored rating.
from recsys.evaluation.prediction import RMSE
err = RMSE()
for rating, item_id, user_id in data.get():
    try:
        prediction = svd.predict(item_id, user_id)
        err.add(rating, prediction)
    except KeyError:
        # The old `except KeyError, k` form is Python-2-only and bound an
        # unused name; pairs that raise KeyError are simply skipped.
        continue

print('RMSE is ' + str(err.compute()))
Ejemplo n.º 32
0
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

# Four toy ratings as (value, id, id) tuples.
data = [
    (4.0, 'user1', 'item1'),
    (2.0, 'user1', 'item3'),
    (1.0, 'user2', 'item1'),
    (5.0, 'user2', 'item4'),
]

# Wrap the tuples in a Data object and hand it to the model.
d = Data()
d.set(data)

svd = SVD()
svd.set_data(d)
m = svd.get_matrix()

# Two factors are plenty for a matrix this small.
svd.compute(k=2)

print(svd.similar('user1'))
print(svd.predict('user1', 'item1'))
Ejemplo n.º 33
0
def svd(filepath):
    """Train an SVD per fold and write the first fold's test predictions.

    For every fold the train file is factorized with ``_K`` factors and the
    test file is read into a sparse ``_N`` x ``_M`` matrix. For fold 1 a
    prediction is written, tab-separated, for every non-zero test cell.

    NOTE(review): the loop calls ``exit()`` at the end of the first
    iteration, so later folds and the evaluation file at the bottom are
    never reached. This looks like a debugging leftover but is kept so the
    observable behaviour does not change -- confirm before removing.

    :param filepath: path used to derive the source folder and the base
        file name of the fold files
    """
    src_folder = parseOutputFolderPath(filepath)
    base_file_name = parseFileName(filepath)

    avg_rmse = 0.0
    avg_mae = 0.0

    out_file_base = base_file_name + "_pred_svd"

    # `with` guarantees the predictions file is flushed and closed even when
    # exit() raises SystemExit inside the loop.
    with open(src_folder + "output/" + out_file_base + EXT, "w") as out_file:

        # for each fold
        for fold_index in range(1, NUM_FOLDS + 1):

            print("*** \t FOLD {0} \t ***".format(fold_index))

            M_test = lil_matrix((_N, _M))

            train_path = src_folder + base_file_name + TRAIN_PREFIX + str(
                fold_index) + EXT
            test_path = src_folder + base_file_name + TEST_PREFIX + str(
                fold_index) + EXT

            print(train_path)
            print(test_path)

            # Named `model` so it does not shadow this function's own name.
            model = SVD()
            model.load_data(filename=train_path,
                            sep=',',
                            format={
                                'col': 0,
                                'row': 1,
                                'value': 2,
                                'ids': float
                            })

            model.compute(k=_K,
                          min_values=1,
                          pre_normalize=None,
                          mean_center=True,
                          post_normalize=True)

            # Test file rows are read as: user id, movie id, score.
            with open(test_path, "r") as infile:
                reader = csv.reader(infile, delimiter=",")
                for line in reader:
                    userid = int(line[0], 10)
                    movieid = int(line[1], 10)
                    score = float(line[2])
                    M_test[userid, movieid] = score

            # write predictions only for first test (fold)
            if fold_index == 1:
                rows, cols = M_test.nonzero()
                for row, col in zip(rows, cols):
                    try:
                        r_xi = model.predict(col, row, MIN_RATING, MAX_RATING)
                    except Exception:
                        # Report the failing pair and skip it. Falling
                        # through used to write the previous pair's value
                        # (or hit a NameError on the very first failure).
                        print('%s %s' % (row, col))
                        continue
                    out_file.write(
                        str(row) + '\t' + str(col) + '\t' + str(r_xi) + '\n')

            print("..done")
            print("")

            exit()

    # average rmse and mae on validation folds
    # (nothing accumulates into avg_rmse/avg_mae, so these stay 0.0)
    eval_out_path = src_folder + "output/" + out_file_base + "_eval" + EXT

    with open(eval_out_path, "w") as eval_file:
        eval_file.write("RMSE" + "\t" + "MAE" + "\n")
        avg_rmse /= float(NUM_FOLDS)
        avg_mae /= float(NUM_FOLDS)
        eval_file.write(str(avg_rmse) + "\t" + str(avg_mae))
Ejemplo n.º 34
0
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

filename = "./data/ratings.dat"

# How the columns of ratings.dat map onto the matrix:
#   row=1    -> matrix rows come from the second file column
#   col=0    -> matrix columns come from the first file column
#   value=2  -> cell values come from the third file column
#   ids=int  -> row and column ids are integers, not strings
ratings_format = dict(col=0, row=1, value=2, ids=int)

data = Data()
data.load(filename, sep="::", format=ratings_format)

# 80% of the ratings go to train, the remaining 20% to test.
train, test = data.split_train_test(percent=80)

svd = SVD()
svd.set_data(train)

# Predicted value for the (22, 22) cell, bounded to [0.0, 5.0].
print(svd.predict(22, 22, MIN_VALUE=0.0, MAX_VALUE=5.0))

# Top-10 recommendations for column id 1; only_unknowns=True presumably
# restricts them to entries with no stored rating -- confirm in the docs.
print(svd.recommend(1, n=10, only_unknowns=True, is_row=False))

# Same request without that restriction.
print(svd.recommend(1, n=10, only_unknowns=False, is_row=False))
Ejemplo n.º 35
0
recsys.algorithm.VERBOSE = True

# Load the tab-separated training data; the value is taken from column 6.
print("loading data")
data = Data()
data.load(
    '../item_recom/train_info.tsv',
    sep='\t',
    format={'col': 0, 'row': 1, 'value': 6, 'ids': int},
)

# Factorize with `topic` factors.
topic = 48
print("compute svd")
svd = SVD()
svd.set_data(data)
svd.compute(
    k=topic,
    min_values=0.0,
    pre_normalize=None,
    mean_center=True,
    post_normalize=True,
)

print("loading test data")
test = loadTest('../item_recom/test_info.tsv')

print(svd.predict(0, 0))

# One CSV row per test entry: "<user>#<item>" and its predicted value.
print("creating submission")
with open('../submissions/recsys_3.csv', 'w') as csvfile:
    fieldnames = ['uid#iid', 'pred']
    writer = csv.DictWriter(csvfile, fieldnames)
    writer.writeheader()
    for ind in xrange(len(test)):
        entry = test[ind]
        user_id = entry["1_user_id"]
        item_id = entry["2_item_id"]
        writer.writerow({
            'uid#iid': "%d#%d" % (user_id, item_id),
            'pred': svd.predict(item_id, user_id),
        })
Ejemplo n.º 36
0
class RecommendSystem(object):
    """SVD recommender that trains from a ratings file or loads a saved model.

    Relies on the module-level names ``tmpfile`` (saved model path) and
    ``moviefile`` ('::'-separated movie list), which are defined elsewhere
    in the file.
    """

    def __init__(self, filename, sep, **format):
        self.filename = filename
        self.sep = sep
        self.format = format

        # Training parameters
        self.k = 100
        self.min_values = 10
        self.post_normalize = True

        self.svd = SVD()

        # Whether a saved model was loaded instead of training
        self.is_load = False

        # Data handling
        self.data = Data()

        # Model evaluation
        self.rmse = RMSE()

    def get_data(self):
        """
        Load the ratings, or the saved model when one exists.

        :return: ``(train, test)`` from an 80% split when no saved model
            exists; ``(None, None)`` after loading the saved model
        """
        # A saved model takes precedence over the raw data.
        if os.path.exists(tmpfile):
            self.svd.load_model(tmpfile)
            self.is_load = True
            return None, None

        # No model and no data file: nothing can be done. Exit with a
        # message (non-zero status) instead of the former silent exit.
        if not os.path.exists(self.filename):
            sys.exit("data file not found: %s" % self.filename)

        # Use Data() to read the ratings
        self.data.load(self.filename, sep=self.sep, format=self.format)
        train, test = self.data.split_train_test(percent=80)
        return train, test

    def train(self, train):
        """
        Train the model unless a saved one was loaded.

        :param train: training data
        :return: None
        """
        if not self.is_load:
            self.svd.set_data(train)
            # The last four characters of tmpfile are stripped for savefile
            # (presumably the extension -- confirm against tmpfile).
            self.svd.compute(k=self.k,
                             min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=tmpfile[:-4])
        return None

    def rs_predict(self, itemid, userid):
        """
        Predict a rating and print it.

        :param itemid: movie id
        :param userid: user id
        :return: the predicted score
        """
        score = self.svd.predict(itemid, userid)
        print("推荐的分数为:%f" % score)
        return score

    def recommend_to_user(self, userid):
        """
        Print the recommended movies for a user with their predicted scores.

        :param userid: user id
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Map movie id -> title. Assumes each line of moviefile is
        # "id::title::..." -- TODO confirm. Titles used to be looked up by
        # list position, which only matches ids that are contiguous and
        # 0-based. Keys are the textual id so int and str ids both resolve.
        titles = {}
        with open(moviefile, "r") as movie_fh:
            for line in movie_fh:
                fields = line.split("::")
                titles[fields[0].strip()] = ' '.join(fields[1:2])

        # Print each recommended title (or the raw id when the title is
        # unknown) together with its predicted score.
        for itemid, rate in recommend_list:
            title = titles.get(str(itemid), str(itemid))
            print("给您推荐了%s,我们预测分数为%s" % (title, rate))
        return None

    def evaluation(self, test):
        """
        Evaluate the model on the test set and print the error.

        Nothing happens when the model was loaded from disk.

        :param test: test set
        :return: None
        """
        # Only evaluate a model trained in this run
        if not self.is_load:

            # Walk the (rating, movie, user) tuples of the test set
            for value, itemid, userid in test.get():
                try:
                    predict = self.rs_predict(itemid, userid)
                    self.rmse.add(value, predict)
                except KeyError:
                    continue
            # Error as computed by the RMSE accumulator
            error = self.rmse.compute()

            print("模型误差为%s:" % error)

        return None