def ex1(dat_file='./ml-1m/ratings.dat', pct_train=0.5):
    """Train an SVD model on a ratings file and print its RMSE and MAE.

    The file is read with '::' as separator (file column 0 -> matrix
    columns, column 1 -> matrix rows, column 2 -> values, integer ids).
    The data is split with ``pct_train`` passed as ``percent``, a rank-100
    SVD is computed on the training part, and every test tuple is scored.
    Tuples for which scoring raises ``KeyError`` are skipped.

    Args:
        dat_file: path of the ratings file.
        pct_train: value forwarded to ``Data.split_train_test(percent=...)``.
    """
    ratings = Data()
    ratings.load(dat_file, sep='::',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = ratings.split_train_test(percent=pct_train)

    n_factors = 100
    model = SVD()
    model.set_data(train)
    model.compute(k=n_factors, min_values=5, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    rmse = RMSE()
    mae = MAE()
    for actual, item_id, user_id in test.get():
        try:
            estimate = model.predict(item_id, user_id)
            for metric in (rmse, mae):
                metric.add(actual, estimate)
        except KeyError:
            # presumably raised for ids missing from the trained model
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def evaluate(data, count=5, K=100):
    """Evaluate a rank-``K`` SVD on ``count`` independent train/test splits.

    Args:
        data: dataset exposing ``split_train_test`` and ``get``.
        count: number of splits to evaluate.
        K: number of factors passed to ``SVD.compute``.

    Returns:
        A list with one ``{"RMSE": ..., "MAE": ...}`` dict for every split
        whose metrics could be computed. Splits whose metric computation
        fails are reported on stdout and left out of the list.
    """
    results = []
    for _ in range(count):
        # PERCENT_TRAIN is a module-level constant defined outside this block.
        train, test = data.split_train_test(percent=PERCENT_TRAIN)
        print('%s %s %s' % (len(data.get()), len(train.get()),
                            len(test.get())))

        svd = SVD()
        svd.set_data(train)
        svd.compute(k=K, min_values=5, pre_normalize=None,
                    mean_center=True, post_normalize=True)

        # Evaluation using prediction-based metrics
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in test.get():
            try:
                pred_rating = svd.predict(item_id, user_id)
                rmse.add(rating, pred_rating)
                mae.add(rating, pred_rating)
            except KeyError:
                continue

        try:
            rsu = {"RMSE": rmse.compute(), "MAE": mae.compute()}
        except Exception:
            # This used to be a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit and made a long evaluation run
            # impossible to interrupt at this point.
            print("one error....++++++++++++++++++++++++++++++++++++++++++++++++++++")
        else:
            print(rsu)
            results.append(rsu)
    return results
def calculate_stats_users(pct_train):
    """Print RMSE and MAE of a rank-100 SVD trained on 'user_data_working.csv'.

    The CSV is loaded with ',' as separator (file column 0 -> matrix
    columns, column 1 -> matrix rows, column 2 -> values, integer ids) and
    split with ``pct_train`` passed as ``percent``. Test tuples whose
    scoring raises ``KeyError`` are skipped.
    """
    dataset = Data()
    dataset.load('user_data_working.csv', sep=',',
                 format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
    train, test = dataset.split_train_test(percent=pct_train)

    model = SVD()
    model.set_data(train)
    model.compute(k=100, min_values=2, pre_normalize=None,
                  mean_center=True, post_normalize=False)

    rmse = RMSE()
    mae = MAE()
    for actual, item_id, user_id in test.get():
        try:
            estimate = model.predict(item_id, user_id)
            rmse.add(actual, estimate)
            mae.add(actual, estimate)
        except KeyError:
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
def recommended_files(data, user):
    """Return tth ids recommended to ``user``, best predicted score first.

    An SVD (k=1000) is computed on ``data``. Files listed for the users
    returned by ``svd.similar(user)`` and not already listed for ``user``
    are scored with ``svd.predict(user, tth)``. The ranked candidates are
    then filtered so that no accepted entry has a name that ``similar()``
    (a helper defined outside this block) matches against an earlier
    accepted name.

    Args:
        data: dataset handed to ``SVD.set_data``.
        user: user id as stored in ``db.user_list`` and in the matrix.

    Returns:
        A list of at most 11 tth values. The original only returned once
        11 entries had been collected and fell through to ``None``
        otherwise; a (possibly shorter or empty) list is now always
        returned.
    """
    max_results = 11  # original cut-off: returned as soon as c_res > 10

    svd = SVD()
    svd.set_data(data)
    svd.compute(k=1000, min_values=0, pre_normalize=None,
                mean_center=False, post_normalize=True)
    similar_users = [entry[0] for entry in svd.similar(user)]

    # tths already owned by the user or already queued as candidates
    known_tths = set(doc['tth'] for doc in db.user_list.find({'user': user}))

    candidates = []
    # The first entry is skipped - presumably the queried user itself.
    for other_user in similar_users[1:]:
        for doc in db.user_list.find({'user': other_user}):
            tth = doc['tth']
            if tth in known_tths:
                continue
            movie_name = db.tths.find_one({'tth': tth})['name']
            known_tths.add(tth)
            candidates.append((movie_name, tth, svd.predict(user, tth)))
    candidates.sort(key=lambda cand: cand[2], reverse=True)

    res = []
    accepted_names = []
    for movie_name, tth, _score in candidates:
        # The original compared the name with r[0], i.e. the first
        # character of an already accepted tth; compare names instead.
        if any(similar(movie_name, seen) for seen in accepted_names):
            continue
        accepted_names.append(movie_name)
        res.append(tth)
        if len(res) >= max_results:
            break
    return res
def quickstart():
    """Walk through the basic SVD workflow on the MovieLens 1M ratings.

    Loads ``DATA_DIR + 'ml-1m-ratings.dat'``, computes a rank-100 SVD,
    drops into the debugger, then prints the items similar to item 1, the
    predicted and stored rating for (item 1, user 1), and the
    recommendation list for item 1.
    """
    model = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    ratings_path = DATA_DIR + 'ml-1m-ratings.dat'
    model.load_data(filename=ratings_path, sep='::',
                    format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

    # compute svd
    n_factors = 100
    model.compute(k=n_factors, min_values=10, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    # Debugger breakpoint: execution pauses here until it is resumed.
    pdb.set_trace()

    toy_story_id = 1
    godfather_2_id = 1221  # assigned but not used below

    # movies similar to toy story
    print(model.similar(toy_story_id))

    # predicted rating for user 1 and item 1, mapped onto [0.0, 5.0]
    rating_floor, rating_ceiling = 0.0, 5.0
    user_id = 1
    item_id = 1
    predicted = model.predict(item_id, user_id, rating_floor, rating_ceiling)
    stored = model.get_matrix().value(item_id, user_id)
    print('predicted rating = {0}'.format(predicted))
    print('actual rating = {0}'.format(stored))

    print('which users should see Toy Story?:')
    print(model.recommend(item_id))
def ex1(dat_file=DATA_DIR + 'ml-1m-ratings.dat', pct_train=0.5):
    """Train an SVD model on a ratings file and print its RMSE and MAE.

    Args:
        dat_file: path of the '::'-separated ratings file.
        pct_train: value forwarded to ``Data.split_train_test(percent=...)``.
    """
    # Meaning of the ``format`` mapping:
    #   'row': 1   -> matrix rows come from column 1 of the ratings file
    #   'col': 0   -> matrix columns come from column 0 of the ratings file
    #   'value': 2 -> matrix values (Mij) come from column 2 of the file
    #   'ids': int -> row and column ids are integers (not strings)
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    ratings = Data()
    ratings.load(dat_file, sep='::', format=layout)

    # train/test split
    train, test = ratings.split_train_test(percent=pct_train)

    # factorization
    n_factors = 100
    model = SVD()
    model.set_data(train)
    model.compute(k=n_factors, min_values=5, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    # RMSE is the root mean squared error, MAE the mean ABSOLUTE error
    rmse = RMSE()
    mae = MAE()
    for actual, item_id, user_id in test.get():
        try:
            estimate = model.predict(item_id, user_id)
            rmse.add(actual, estimate)
            mae.add(actual, estimate)
        except KeyError:
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def get_mae_rmse(step):
    """Compute MAE and RMSE of a saved SVD model on a fresh test split.

    Loads 'second_train_test.dat.<step>' ('::'-separated; file column 0 ->
    matrix rows, column 1 -> matrix columns, column 2 -> values), splits
    it with ``percent=80`` and scores the test part with the model stored
    in 'svdn_model_<step>.zip'.

    Args:
        step: value interpolated into the data and model file names.

    Returns:
        ``(mae, rmse)``; each is ``np.nan`` when no test tuple could be
        predicted. ``None`` is returned when the model cannot be loaded
        (unchanged from the original behaviour).
    """
    data = Data()
    # renamed from ``format`` so the builtin is not shadowed
    load_format = {'col': 1, 'row': 0, 'value': 2, 'ids': 'str'}
    filename = 'second_train_test.dat.{step}'.format(step=step)
    data.load(filename, sep='::', format=load_format)
    _train, test = data.split_train_test(percent=80)

    try:
        svd = SVD('svdn_model_{step}.zip'.format(step=step))
    except Exception:
        # Was a bare ``except:``; KeyboardInterrupt/SystemExit now propagate.
        return None
    print('Loading model... {step}'.format(step=step))

    # (actual, predicted) pairs; the original kept two identical lists.
    pairs = []
    for rating, item_id, user_id in test:
        try:
            pairs.append((rating, svd.predict(item_id, user_id)))
        except Exception:
            # Best effort: tuples that cannot be predicted are skipped.
            continue

    mae_value, rmse_value = np.nan, np.nan
    if pairs:
        mae_value = MAE(pairs).compute()
        rmse_value = RMSE(pairs).compute()
    return mae_value, rmse_value
def evaulte(train_set, test_set):
    """Return half of the MAE of an SVD trained on ``train_set``.

    The SVD is computed with the module-level settings ``KKK`` (factors)
    and ``MIN_ITEM`` (``min_values``). Test tuples whose scoring raises
    ``KeyError`` are counted and skipped; that count and both dataset
    sizes are printed before returning.
    """
    model = SVD()
    model.set_data(train_set)
    model.compute(k=KKK, min_values=MIN_ITEM, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    mae = MAE()
    missing = 0
    for actual, item_id, user_id in test_set.get():
        try:
            mae.add(actual, model.predict(item_id, user_id))
        except KeyError:
            missing += 1

    report = ("k_err", missing, " -- ", "test-len: ", len(test_set.get()),
              "train-len: ", len(train_set.get()))
    # space-joined, exactly like a comma-separated print statement
    print(' '.join(str(part) for part in report))
    return mae.compute() / 2.0
def calculate_stats_users(pct_train):
    """Print RMSE and MAE of a rank-100 SVD trained on 'user_data_working.csv'.

    Args:
        pct_train: value forwarded to ``Data.split_train_test(percent=...)``.
    """
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    source = Data()
    source.load('user_data_working.csv', sep=',', format=layout)
    train, test = source.split_train_test(percent=pct_train)

    factorizer = SVD()
    factorizer.set_data(train)
    factorizer.compute(k=100, min_values=2, pre_normalize=None,
                       mean_center=True, post_normalize=False)

    rmse = RMSE()
    mae = MAE()
    for truth, item_id, user_id in test.get():
        try:
            guess = factorizer.predict(item_id, user_id)
            rmse.add(truth, guess)
            mae.add(truth, guess)
        except KeyError:
            # tuples that cannot be scored are left out of both metrics
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s\n' % mae.compute())
def ex1(dat_file='ml-1m/ratings.dat', pct_train=0.5):
    """Train an SVD model on a ratings file and print its RMSE and MAE.

    Args:
        dat_file: path of the '::'-separated ratings file.
        pct_train: value forwarded to ``Data.split_train_test(percent=...)``.
    """
    # Meaning of the ``format`` mapping:
    #   'row': 1   -> matrix rows come from column 1 of the ratings file
    #   'col': 0   -> matrix columns come from column 0 of the ratings file
    #   'value': 2 -> matrix values (Mij) come from column 2 of the file
    #   'ids': int -> row and column ids are integers (not strings)
    layout = {'col': 0, 'row': 1, 'value': 2, 'ids': int}
    source = Data()
    source.load(dat_file, sep='::', format=layout)

    train, test = source.split_train_test(percent=pct_train)

    n_factors = 100
    factorizer = SVD()
    factorizer.set_data(train)
    factorizer.compute(k=n_factors, min_values=5, pre_normalize=None,
                       mean_center=True, post_normalize=True)

    rmse = RMSE()
    mae = MAE()
    for truth, item_id, user_id in test.get():
        try:
            guess = factorizer.predict(item_id, user_id)
            rmse.add(truth, guess)
            mae.add(truth, guess)
        except KeyError:
            continue

    print('RMSE=%s' % rmse.compute())
    print('MAE=%s' % mae.compute())
def quickstart():
    """Walk through the basic SVD workflow on 'ml-1m/ratings.dat'.

    Computes a rank-100 SVD, drops into the debugger, queries similar
    items and recommendations for item 1 (results discarded), and prints
    the predicted and stored rating for (item 1, user 1).
    """
    model = SVD()
    recsys.algorithm.VERBOSE = True

    # load movielens data
    ratings_path = 'ml-1m/ratings.dat'
    model.load_data(filename=ratings_path, sep='::',
                    format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

    # compute svd
    n_factors = 100
    model.compute(k=n_factors, min_values=10, pre_normalize=None,
                  mean_center=True, post_normalize=True)

    # Debugger breakpoint: execution pauses here until it is resumed.
    pdb.set_trace()

    toy_story_id = 1
    godfather_2_id = 1221  # assigned but not used below

    # movies similar to toy story (return value is not used)
    model.similar(toy_story_id)

    # predicted vs. stored rating for user 1 and item 1
    rating_floor, rating_ceiling = 0.0, 5.0
    user_id = 1
    item_id = 1
    predicted = model.predict(item_id, user_id, rating_floor, rating_ceiling)
    stored = model.get_matrix().value(item_id, user_id)
    print('predicted rating = {0}'.format(predicted))
    print('actual rating = {0}'.format(stored))

    # which users should see Toy Story? (return value is not used)
    model.recommend(item_id)
def evaulte(train_set, test_set):
    """Return half of the MAE of an SVD trained on ``train_set``.

    Uses the module-level settings ``KKK`` and ``MIN_ITEM`` for the
    factorization. Prints how many test tuples raised ``KeyError`` while
    being scored, together with the sizes of both datasets.
    """
    factorizer = SVD()
    factorizer.set_data(train_set)
    factorizer.compute(k=KKK, min_values=MIN_ITEM, pre_normalize=None,
                       mean_center=True, post_normalize=True)

    mae = MAE()
    skipped = 0
    for truth, item_id, user_id in test_set.get():
        try:
            guess = factorizer.predict(item_id, user_id)
            mae.add(truth, guess)
        except KeyError:
            skipped += 1

    fields = ["k_err", skipped, " -- ", "test-len: ", len(test_set.get()),
              "train-len: ", len(train_set.get())]
    # space-joined, exactly like a comma-separated print statement
    print(' '.join(map(str, fields)))
    halved_mae = mae.compute() / 2.0
    return halved_mae
'ids': str }) k = 200 svd.compute(k=k, savefile='../tmp/weight') svd2 = SVD(filename='../tmp/weight') # Loading already computed SVD model output_path = "./output.txt" output_file = open(output_path, 'w') validate_file = file("../validate_nolabel.txt") line = validate_file.readline() line = validate_file.readline().strip("\r\n") while line: question_id = line.split(',')[0] user_id = line.split(',')[1] try: predict = svd2.predict(user_id, question_id, 0.0, 1.0) except: predict = 0 print question_id + "," + user_id + " Exception" if predict > 1.0: predict = 1.0 if predict < 0.0001: predict = 0.0 result = question_id + "," + user_id + "," + str(predict) #print result output_file.write(result) output_file.write("\n") line = validate_file.readline().strip("\r\n")
print(json.dumps(similaries, ensure_ascii=False)) # import pdb;pdb.set_trace() import sys sys.exit(0) print(svd.similar(ITEMID1)) # Returns: <ITEMID, Cosine Similarity Value> MIN_RATING = 0.0 MAX_RATING = 1.0 ITEMID = 109 USERID = 3837663637323963363639393565373833613237396534393132376338386362 print('testing..') print(svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING)) # Predicted value 5.0 print(svd.get_matrix().value(ITEMID, USERID)) # Real value 5.0 # Recommend (non-rated) movies to a user: print('recommend to user') print(svd.recommend(USERID, is_row=False)) #cols are users and rows are items, thus we set is_row=False print(svd.recommend(ITEMID)) import pdb;pdb.set_trace()
#svd.set_data(train) #假设奇异值的个数为100 k = 100 svd.compute(k=k, min_values=1, pre_normalize=None, mean_center=False, post_normalize=True) #svd.compute(k=k,min_values=10,pre_normalize=None,mean_center=True,post_normalize=True,savefile='/tmp/movielens') #你可以计算两个电影的相似度 ITEMID1 = 3 ITEMID2 = 3 #svd.similarity(ITEMID1,ITEMID2) print svd.similar(ITEMID1, ITEMID2) #或者得到类似的电影 print svd.similar(ITEMID1) #再预测一下用户对电影的评分 MIN_RATING = 1.0 MAX_RATING = 5.0 USERID1 = 30 print svd.predict(ITEMID1, USERID1, MIN_RATING, MAX_RATING) #重头戏,推荐电影给用户! print svd.recommend(USERID1, is_row=False) #谁应该看这部电影 print svd.recommend(ITEMID1)
import math

# Compute both models with identical settings.
svd.compute(k=K, min_values=None, pre_normalize=None, mean_center=True,
            post_normalize=True)
svd_neig.compute(k=K, min_values=None, pre_normalize=None, mean_center=True,
                 post_normalize=True)

# Evaluate: one RMSE/MAE accumulator pair per model.
rmse_svd = RMSE()
mae_svd = MAE()
rmse_svd_neig = RMSE()
mae_svd_neig = MAE()

i = 1
total = len(test.get())
print("Total Test ratings: %s" % total)
for rating, item_id, user_id in test:
    try:
        pred_rating_svd = svd.predict(item_id, user_id)
        rmse_svd.add(rating, pred_rating_svd)
        mae_svd.add(rating, pred_rating_svd)

        pred_rating_svd_neig = svd_neig.predict(item_id, user_id)  # Koren & co.
        # ``x is not nan`` only detects the one ``nan`` object it is compared
        # with; a NaN produced by arithmetic would slip through and poison
        # both metrics. math.isnan() detects every NaN.
        if not math.isnan(pred_rating_svd_neig):
            rmse_svd_neig.add(rating, pred_rating_svd_neig)
            mae_svd_neig.add(rating, pred_rating_svd_neig)

        # Progress counter rewritten in place on a single console line.
        sys.stdout.write("\rProcessed test rating %d" % i)
        sys.stdout.flush()
        i += 1
    except KeyError:
        # tuples that cannot be scored are skipped
        continue
pre_normalize=None, mean_center=True, post_normalize=True) # predicted_rating = svd.predict(int(5), 'A1', 1, 10) # predicted_rating2 = svd.predict(int(1), 'A1', 1, 10) # print('Predicted rating', predicted_rating) # print('Predicted rating', predicted_rating2) records = ETLUtils.load_csv_file(file_name_header, '|') errors = [] for record in records: try: # print(record['user'], record['item'], record['rating']) user = record['user'] item = int(record['item']) predicted_rating = svd.predict(item, user, 1, 5) print(record['user'], record['item'], predicted_rating) # predicted_rating = round(predicted_rating) actual_rating = svd.get_matrix().value(item, user) error = abs(predicted_rating - actual_rating) errors.append(error) except KeyError: continue mean_absolute_error = MeanAbsoluteError.compute_list(errors) root_mean_square_error = RootMeanSquareError.compute_list(errors) print('Mean Absolute error: %f' % mean_absolute_error) print('Root mean square error: %f' % root_mean_square_error)
post_normalize=True) #svd.compute(k=K, min_values=5, pre_normalize=None, mean_center=True, post_normalize=True) #svd.compute(k=K, pre_normalize=None, mean_center=True, post_normalize=True) print '' print 'COMPUTING SIMILARITY' print svd.similarity(1, 2) # similarity between items print svd.similar(1, 5) # show 5 similar items print '' print 'GENERATING PREDICTION' MIN_RATING = 0.0 MAX_RATING = 5.0 ITEMID = 1 USERID = 1 print svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING) # predicted rating value print svd.get_matrix().value(ITEMID, USERID) # real rating value print '' print 'GENERATING RECOMMENDATION' print svd.recommend(USERID, n=5, only_unknowns=True, is_row=False) #Evaluation using prediction-based metrics rmse = RMSE() mae = MAE() spearman = SpearmanRho() kendall = KendallTau() #decision = PrecisionRecallF1() for rating, item_id, user_id in test.get(): try: pred_rating = svd.predict(item_id, user_id)
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

# Four toy ratings as (value, user id, item id) tuples.
data = [
    (4.0, 'user1', 'item1'),
    (2.0, 'user1', 'item3'),
    (1.0, 'user2', 'item1'),
    (5.0, 'user2', 'item4'),
]

d = Data()
d.set(data)

# Factorize the toy dataset with two factors.
svd = SVD()
svd.set_data(d)
m = svd.get_matrix()  # fetched before compute(); not used below
svd.compute(k=2)

print(svd.similar('user1'))
print(svd.predict('user1', 'item1'))
(3114, 0.87060391051018071), # Toy Story 2 (2355, 0.67706936677315799), # A bug's life (588, 0.5807351496754426), # Aladdin (595, 0.46031829709743477), # Beauty and the Beast (1907, 0.44589398718134365), # Mulan (364, 0.42908159895574161), # The Lion King (2081, 0.42566581277820803), # The Little Mermaid (3396, 0.42474056361935913), # The Muppet Movie (2761, 0.40439361857585354)] # The Iron Giant MIN_RATING = 0.0 MAX_RATING = 5.0 ITEMID = 1 USERID = 1 svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING) # Predicted value 5.0 svd.get_matrix().value(ITEMID, USERID) # Real value 5.0 svd.recommend(USERID, is_row=False) #cols are users and rows are items, thus we set is_row=False # Returns: <ITEMID, Predicted Rating> [(2905, 5.2133848204673416), # Shaggy D.A., The (318, 5.2052108435956033), # Shawshank Redemption, The (2019, 5.1037438278755474), # Seven Samurai (The Magnificent Seven) (1178, 5.0962756861447023), # Paths of Glory (1957) (904, 5.0771405690055724), # Rear Window (1954) (1250, 5.0744156653222436), # Bridge on the River Kwai, The (858, 5.0650911066862907), # Godfather, The
class RecommendSystem(object):
    """SVD-based movie recommender with optional reuse of a saved model.

    NOTE(review): ``get_data`` and ``train`` read a bare ``filename`` that
    is not an attribute of this class - presumably a module-level path of
    the saved model; verify against the surrounding script.
    """

    def __init__(self, filename, sep, **format):
        # Data file information
        self.filename = filename
        self.sep = sep
        self.format = format
        # Matrix factorization model
        self.svd = SVD()
        # Factorization settings
        self.k = 100  # number of latent factors
        self.min_values = 10  # drop movies rated by fewer than 10 users
        self.post_normalize = False
        # Flag: has a previously saved model been loaded?
        self.load_model = False
        # Root mean squared error accumulator
        self.rmse = RMSE()

    def get_data(self):
        """Load and split the ratings, or load a saved model if one exists.

        :return: ``(train, test)`` when the data file was loaded, or
            ``(None, None)`` when a saved model was loaded instead.
        """
        # If the model does not exist yet, the raw data has to be loaded.
        if not os.path.exists(filename):
            if not os.path.exists(self.filename):
                sys.exit()
            # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format)
            data = Data()
            data.load(self.filename, sep=self.sep, format=self.format)
            # Split into training and test sets
            train, test = data.split_train_test(percent=80)
            return train, test
        else:
            # Load the saved model directly and remember that we did so.
            self.svd.load_model(filename)
            self.load_model = True
            return None, None

    def train(self, train):
        """Compute the SVD on the training set unless a model was loaded.

        :param train: training set
        :return: None
        """
        if not self.load_model:
            self.svd.set_data(train)
            # The save path is passed without its 4-character suffix.
            self.svd.compute(k=self.k, min_values=self.min_values,
                             post_normalize=self.post_normalize,
                             savefile=filename[:-4])
        return None

    def recommend_to_user(self, userid):
        """Print the recommended movies for a user with predicted ratings.

        :param userid: user id
        :return: None
        """
        recommend_list = self.svd.recommend(userid, is_row=False)

        # Map movie id -> title from the '::'-separated movies file. The
        # original built a positional list and indexed it by movie id,
        # which returns the wrong title (or raises IndexError) whenever
        # ids are not exactly the 0-based line numbers. The file is also
        # closed deterministically now.
        titles = {}
        with open("./data/ml-1m/movies.dat", "r") as movies_file:
            for line in movies_file:
                fields = line.split("::")
                titles[fields[0].strip()] = ' '.join(fields[1:2])

        for itemid, rating in recommend_list:
            # Fall back to the raw id when the movie is not in the file.
            title = titles.get(str(itemid), itemid)
            print("给你推荐的电影叫%s, 预测你对它的评分是%f" % (title, rating))
        return None

    def rs_predict(self, userid, itemid):
        """Return the predicted rating of ``userid`` for ``itemid``.

        :param userid: user id
        :param itemid: item id
        :return: predicted rating
        """
        score = self.svd.predict(itemid, userid)
        return score

    def evaluation(self, test):
        """Evaluate the model with RMSE; skipped when a model was loaded.

        :param test: test data yielding ``(rating, itemid, userid)`` tuples
        :return: None
        """
        if not self.load_model:
            for rating, itemid, userid in test.get():
                try:
                    # ``rating`` is the true value
                    score = self.rs_predict(userid, itemid)
                    self.rmse.add(rating, score)
                except KeyError:
                    continue
            error = self.rmse.compute()
            print("均方误差为:%s" % error)
        return None
class Recommender:
    """SVD recommender built on a rating matrix restored from a data file."""

    def __init__(self, datafile_path=None):
        self.svd = SVD()
        self.matrix = None
        self.datafile_path = datafile_path
        self.predict_matrix = None
        # NOTE(review): runs even when datafile_path is None; whether
        # restore_from_file accepts None is not visible here.
        self.load_local_data(self.datafile_path, 100, 0)

    def load_web_data(self, filename, film_names_with_rate_list, K, min_values,
                      MAX_COUNT_USER_FILMS=None, MAX_COUNT_FILM_USERS=None):
        """Build the matrix from film titles, save it to ``filename``, refit."""
        creator = rm.MatrixCreator(MAX_COUNT_USER_FILMS, MAX_COUNT_FILM_USERS)
        self.matrix = creator.create_matrix_by_film_titles(
            film_names_with_rate_list)
        self.matrix.save_rating_matrix_as_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def load_local_data(self, filename, K, min_values):
        """Restore the matrix from ``filename`` and compute the SVD."""
        self.matrix = rm.MatrixCreator().restore_from_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def get_predictions_for_all_users(self, min_rate=1, max_rate=10, top=None,
                                      K=None, min_values=0):
        """Fill and return ``self.predict_matrix`` (users x films).

        :param K: when truthy, recompute the SVD with this many factors
        :param min_values: forwarded to the recomputation (was ignored)
        """
        if K:
            self.__compute_matrix(K, min_values)
        users_map = self.matrix.users_indexes_map
        films_map = self.matrix.films_indexes_map
        self.predict_matrix = np.zeros((len(users_map), len(films_map)))
        for user in users_map.keys():
            user_index = users_map[user]
            for film in films_map.keys():
                film_index = films_map[film]
                self.predict_matrix[user_index][film_index] = self.svd.predict(
                    user_index, film_index,
                    MIN_VALUE=min_rate, MAX_VALUE=max_rate)
        return self.predict_matrix

    def predict_for_user(self, user_index, min_rate=1, max_rate=10, top=None,
                         repeat=False, K=None, min_values=None):
        """Predict ratings of all films for one user.

        :param K: to change the number of properties
        :param top: when truthy, keep only the ``top`` best-rated entries,
            best first
        :param repeat: when False, films already rated are filtered out
        :return: ``{Film: rate}`` when ``repeat`` is true and ``top`` is
            not set, otherwise ``[(Film, rate), ...]``
        """
        if K:
            self.__compute_matrix(K, min_values or 0)
        prediction = {}
        np_matrix = self.matrix.get_rating_matrix()
        for index in range(np_matrix.shape[1]):
            rate = self.svd.predict(user_index, index,
                                    MIN_VALUE=min_rate, MAX_VALUE=max_rate)
            prediction[self.matrix.indexes_films_map[index]] = rate
        if not repeat:
            # NOTE(review): the filter uses the FIRST fake user's
            # preferences, not those of ``user_index`` - kept as in the
            # original; confirm this is intended.
            fake_user_index = list(
                self.matrix.indexes_with_fake_user_ids.keys())[0]
            user = self.matrix.indexes_users_map[fake_user_index]
            seen = set(user.get_preferences().keys())
            prediction = [(film, prediction[film]) for film in prediction
                          if film not in seen]
        if top:
            # ``prediction`` is a list after the filter above; the original
            # called .items() on it unconditionally and raised
            # AttributeError whenever repeat was False.
            pairs = (prediction.items() if isinstance(prediction, dict)
                     else prediction)
            ranked = sorted(pairs, key=operator.itemgetter(1))
            prediction = list(reversed(ranked[-top:]))
        return prediction

    def predict_for_all_fake_users(self, min_rate=1, max_rate=10, top=None,
                                   K=None, min_values=0):
        """Run ``predict_for_user`` for every fake user.

        :param K: to change the number of properties
        :return: one ``predict_for_user`` result per fake user
        """
        if K:
            self.__compute_matrix(K, min_values)
        return [self.predict_for_user(user_index, min_rate, max_rate, top)
                for user_index in self.matrix.indexes_with_fake_user_ids.keys()]

    def predicted_rating_submatrix(self, user_indexes):
        """Recompute the SVD (K=100); return predictions for the given users."""
        self.__compute_matrix(100)
        film_count = self.matrix.rating_matrix.shape[1]
        # A placeholder first row fixes the column count; it is stripped
        # again in the return statement.
        predicted = np.empty((1, film_count), int)
        for index in user_indexes:
            row = [self.svd.predict(index, film_index,
                                    MIN_VALUE=1, MAX_VALUE=10)
                   for film_index in range(film_count)]
            predicted = np.append(predicted, [row], axis=0)
        return predicted[1:]

    def predicted_rating_submatrix_for_fake(self):
        """Return ``predicted_rating_submatrix`` for all fake users."""
        return self.predicted_rating_submatrix(
            self.matrix.indexes_with_fake_user_ids.keys())

    def __compute_matrix(self, K, min_values=0, pre_normalize=None,
                         mean_center=True, post_normalize=True):
        """Reload the space-separated data file and compute the SVD."""
        self.svd.load_data(self.datafile_path, sep=' ',
                           format={'col': 1, 'row': 0, 'value': 2, 'ids': int})
        self.svd.compute(K, min_values, pre_normalize, mean_center,
                         post_normalize, savefile=None)

    def filter_films_data(self, min_user_votes):
        """Write copies of the data/map files without rarely rated films.

        Films with fewer than ``min_user_votes`` rows in the data file are
        dropped and the remaining films renumbered from 0. Output files
        carry the suffix '_<min_user_votes>'.
        """
        votes = collections.Counter()
        with open(self.datafile_path, 'rb') as my_file:
            for row in csv.reader(my_file):
                user_index, film_index, rate = row[0].split(' ')
                votes[int(film_index)] += 1
        # set instead of list: O(1) membership tests in the loops below
        dropped = set(film for film, count in votes.items()
                      if count < min_user_votes)

        suffix = '_' + str(min_user_votes)
        copyfile(self.datafile_path + '_user_map',
                 self.datafile_path + suffix + '_user_map')

        new_indexes = {}
        with open(self.datafile_path + '_film_map', 'rb') as read_file:
            with open(self.datafile_path + suffix + '_film_map',
                      'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                index = 0
                for row in csv.reader(read_file):
                    film_index, film_id = row[0].split(' ')
                    if int(film_index) in dropped:
                        continue
                    new_indexes[film_index] = index
                    wr.writerow([index, film_id])
                    index += 1

        with open(self.datafile_path, 'rb') as read_file:
            with open(self.datafile_path + suffix, 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                for row in csv.reader(read_file):
                    user_index, film_index, rate = row[0].split(' ')
                    if int(film_index) in dropped:
                        continue
                    wr.writerow([user_index, new_indexes[film_index], rate])
# Factorize with 100 factors, ignoring rows/columns with under 10 values.
k = 100
svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True,
            post_normalize=True)

# Score every record of the '|'-separated file against the stored matrix.
records = ETLUtils.load_csv_file(file_name_header, '|')
errors = []
for record in records:
    try:
        user = record['user']
        item = int(record['item'])
        # prediction for (item, user) with rating bounds 1 and 5
        predicted_rating = svd.predict(item, user, 1, 5)
        print(record['user'], record['item'], predicted_rating)
        actual_rating = svd.get_matrix().value(item, user)
        error = abs(predicted_rating - actual_rating)
        errors.append(error)
    except KeyError:
        # records that raise KeyError anywhere above are skipped
        continue

mean_absolute_error = MeanAbsoluteError.compute_list(errors)
root_mean_square_error = RootMeanSquareError.compute_list(errors)
print('Mean Absolute error: %f' % mean_absolute_error)
print('Root mean square error: %f' % root_mean_square_error)
from recsys.algorithm.factorize import SVD
from recsys.evaluation.prediction import RMSE, MAE
import sys

# Dataset ('::'-separated; file column 0 -> matrix columns, column 1 ->
# matrix rows, column 2 -> values, integer ids)
data = Data()
data.load('./ml-1m/ratings.dat', sep='::',
          format={'col': 0, 'row': 1, 'value': 2, 'ids': int})

# Load the already computed SVD model from /tmp
svd2 = SVD(filename='/tmp/movielens')

# Predict the rating of one user for one movie
USERID = 2
ITEMID = 1  # Toy Story
rating1 = svd2.predict(ITEMID, USERID, 0.0, 5.0)
print('Predicted rating=%f' % rating1)

# Find the stored rating of the same pair; exit with a message if absent.
for rating, item_id, user_id in data.get():
    if user_id == USERID and item_id == ITEMID:
        rat = rating
        print('Actual rating=%f' % rat)
        break
else:
    sys.exit("No actual rating available")
class Recommender:
    """SVD recommender built on a rating matrix restored from a data file."""

    def __init__(self, datafile_path=None):
        self.svd = SVD()
        self.matrix = None
        self.datafile_path = datafile_path
        self.predict_matrix = None
        # NOTE(review): runs even when datafile_path is None; whether
        # restore_from_file accepts None is not visible here.
        self.load_local_data(self.datafile_path, 100, 0)

    def load_web_data(self, filename, film_names_with_rate_list, K, min_values,
                      MAX_COUNT_USER_FILMS=None, MAX_COUNT_FILM_USERS=None):
        """Build the matrix from film titles, save it to ``filename``, refit."""
        creator = rm.MatrixCreator(MAX_COUNT_USER_FILMS, MAX_COUNT_FILM_USERS)
        self.matrix = creator.create_matrix_by_film_titles(
            film_names_with_rate_list)
        self.matrix.save_rating_matrix_as_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def load_local_data(self, filename, K, min_values):
        """Restore the matrix from ``filename`` and compute the SVD."""
        self.matrix = rm.MatrixCreator().restore_from_file(filename)
        self.datafile_path = filename
        self.__compute_matrix(K, min_values)

    def get_predictions_for_all_users(self, min_rate=1, max_rate=10, top=None,
                                      K=None, min_values=0):
        """Fill and return ``self.predict_matrix`` (users x films).

        :param K: when truthy, recompute the SVD with this many factors
        :param min_values: forwarded to the recomputation (was ignored)
        """
        if K:
            self.__compute_matrix(K, min_values)
        users_map = self.matrix.users_indexes_map
        films_map = self.matrix.films_indexes_map
        self.predict_matrix = np.zeros((len(users_map), len(films_map)))
        for user in users_map.keys():
            user_index = users_map[user]
            for film in films_map.keys():
                film_index = films_map[film]
                self.predict_matrix[user_index][film_index] = self.svd.predict(
                    user_index, film_index,
                    MIN_VALUE=min_rate, MAX_VALUE=max_rate)
        return self.predict_matrix

    def predict_for_user(self, user_index, min_rate=1, max_rate=10, top=None,
                         repeat=False, K=None, min_values=None):
        """Predict ratings of all films for one user.

        :param K: to change the number of properties
        :param top: when truthy, keep only the ``top`` best-rated entries,
            best first
        :param repeat: when False, films already rated are filtered out
        :return: ``{Film: rate}`` when ``repeat`` is true and ``top`` is
            not set, otherwise ``[(Film, rate), ...]``
        """
        if K:
            self.__compute_matrix(K, min_values or 0)
        prediction = {}
        np_matrix = self.matrix.get_rating_matrix()
        for index in range(np_matrix.shape[1]):
            rate = self.svd.predict(user_index, index,
                                    MIN_VALUE=min_rate, MAX_VALUE=max_rate)
            prediction[self.matrix.indexes_films_map[index]] = rate
        if not repeat:
            # NOTE(review): the filter uses the FIRST fake user's
            # preferences, not those of ``user_index`` - kept as in the
            # original; confirm this is intended.
            fake_user_index = list(
                self.matrix.indexes_with_fake_user_ids.keys())[0]
            user = self.matrix.indexes_users_map[fake_user_index]
            seen = set(user.get_preferences().keys())
            prediction = [(film, prediction[film]) for film in prediction
                          if film not in seen]
        if top:
            # ``prediction`` is a list after the filter above; the original
            # called .items() on it unconditionally and raised
            # AttributeError whenever repeat was False.
            pairs = (prediction.items() if isinstance(prediction, dict)
                     else prediction)
            ranked = sorted(pairs, key=operator.itemgetter(1))
            prediction = list(reversed(ranked[-top:]))
        return prediction

    def predict_for_all_fake_users(self, min_rate=1, max_rate=10, top=None,
                                   K=None, min_values=0):
        """Run ``predict_for_user`` for every fake user.

        :param K: to change the number of properties
        :return: one ``predict_for_user`` result per fake user
        """
        if K:
            self.__compute_matrix(K, min_values)
        return [self.predict_for_user(user_index, min_rate, max_rate, top)
                for user_index in self.matrix.indexes_with_fake_user_ids.keys()]

    def predicted_rating_submatrix(self, user_indexes):
        """Recompute the SVD (K=100); return predictions for the given users."""
        self.__compute_matrix(100)
        film_count = self.matrix.rating_matrix.shape[1]
        # A placeholder first row fixes the column count; it is stripped
        # again in the return statement.
        predicted = np.empty((1, film_count), int)
        for index in user_indexes:
            row = [self.svd.predict(index, film_index,
                                    MIN_VALUE=1, MAX_VALUE=10)
                   for film_index in range(film_count)]
            predicted = np.append(predicted, [row], axis=0)
        return predicted[1:]

    def predicted_rating_submatrix_for_fake(self):
        """Return ``predicted_rating_submatrix`` for all fake users."""
        return self.predicted_rating_submatrix(
            self.matrix.indexes_with_fake_user_ids.keys())

    def __compute_matrix(self, K, min_values=0, pre_normalize=None,
                         mean_center=True, post_normalize=True):
        """Reload the space-separated data file and compute the SVD."""
        self.svd.load_data(self.datafile_path, sep=' ',
                           format={'col': 1, 'row': 0, 'value': 2, 'ids': int})
        self.svd.compute(K, min_values, pre_normalize, mean_center,
                         post_normalize, savefile=None)

    def filter_films_data(self, min_user_votes):
        """Write copies of the data/map files without rarely rated films.

        Films with fewer than ``min_user_votes`` rows in the data file are
        dropped and the remaining films renumbered from 0. Output files
        carry the suffix '_<min_user_votes>'.
        """
        votes = collections.Counter()
        with open(self.datafile_path, 'rb') as my_file:
            for row in csv.reader(my_file):
                user_index, film_index, rate = row[0].split(' ')
                votes[int(film_index)] += 1
        # set instead of list: O(1) membership tests in the loops below
        dropped = set(film for film, count in votes.items()
                      if count < min_user_votes)

        suffix = '_' + str(min_user_votes)
        copyfile(self.datafile_path + '_user_map',
                 self.datafile_path + suffix + '_user_map')

        new_indexes = {}
        with open(self.datafile_path + '_film_map', 'rb') as read_file:
            with open(self.datafile_path + suffix + '_film_map',
                      'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                index = 0
                for row in csv.reader(read_file):
                    film_index, film_id = row[0].split(' ')
                    if int(film_index) in dropped:
                        continue
                    new_indexes[film_index] = index
                    wr.writerow([index, film_id])
                    index += 1

        with open(self.datafile_path, 'rb') as read_file:
            with open(self.datafile_path + suffix, 'wb') as write_file:
                wr = csv.writer(write_file, delimiter=' ')
                for row in csv.reader(read_file):
                    user_index, film_index, rate = row[0].split(' ')
                    if int(film_index) in dropped:
                        continue
                    wr.writerow([user_index, new_indexes[film_index], rate])
import csv

import recsys.algorithm
recsys.algorithm.VERBOSE = True
from recsys.algorithm.factorize import SVD

# Fit a 100-factor SVD on the comma-separated training file
# (file column 0 -> matrix columns, column 1 -> rows, column 2 -> values).
svd = SVD()
svd.load_data(filename='train.csv', sep=',',
              format={'col': 0, 'row': 1, 'value': 2})
k = 100
svd.compute(k=k, pre_normalize=None, mean_center=True, post_normalize=True)

# Bounds passed to every prediction.
MIN_RATING = 0.0
MAX_RATING = 5000.0

test_file = 'test.csv'
soln_file = 'recsys.csv'

# Write one (Id, plays) row per test row; the test header row is skipped.
with open(test_file, 'r') as test_fh, open(soln_file, 'w') as soln_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)
    soln_csv = csv.writer(soln_fh, delimiter=',', quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])
    for row in test_csv:
        row_id = row[0]
        user = row[1]
        artist = row[2]
        res = svd.predict(artist, user, MIN_RATING, MAX_RATING)
        soln_csv.writerow([row_id, res])
'col': 0, 'row': 1, 'value': 2, 'ids': float }) k = 30 svd.compute(k=k, min_values=10, pre_normalize=None, mean_center=True, post_normalize=True, savefile='/tmp/movielens') # ITEMID1 = 1 # Toy Story (1995) # ITEMID2 = 2355 # A bug's life (1998) # print svd.similarity(ITEMID1, ITEMID2) MIN_RATING = 1.0 MAX_RATING = 5.0 USERID = 1 ITEMID = 1129 print svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING) print svd.predict(1953, 1, MIN_RATING, MAX_RATING) # Predicted value 5.0 print svd.get_matrix().value(1953, 1) # Real value 5.0
#3.10 [items_full[str(x[0])].get_data() for x in films] #3.11 get_name_item_reviewed(10,user_full,items_full) #3.12 items_full[str(2628)].get_data() users_for_star_wars = svd.recommend(2628,only_unknowns=True) users_for_star_wars #3.13 movies_reviewed_by_sw_rec =[get_name_item_reviewed(x[0],user_full,items_full) for x in users_for_star_wars] movies_flatten = [movie for movie_list in movies_reviewed_by_sw_rec for movie in movie_list] movie_aggregate = movies_by_category(movies_flatten, 3) movies_sort = sorted(movie_aggregate,key=lambda x: x[1], reverse=True) movies_sort #3.14 from recsys.evaluation.prediction import RMSE err = RMSE() for rating, item_id, user_id in data.get(): try: prediction = svd.predict(item_id, user_id) err.add(rating, prediction) except KeyError, k: continue print 'RMSE is ' + str(err.compute())
class Recommender(object):
    """SVD-backed recommender holding a train and a test dataset.

    Both datasets are expected to expose ``get()`` yielding
    ``(rating, item_id, user_id)`` tuples, which is how every method here
    consumes them.
    """

    def __init__(self, train, test):
        """Store the datasets and hand ``train`` to a fresh SVD model.

        Note that this also switches ``recsys.algorithm.VERBOSE`` on globally.
        """
        recsys.algorithm.VERBOSE = True
        self.train = train
        self.test = test
        self.svd = SVD()
        self.svd.set_data(train)

    def set_train(self, train):
        """Replace the stored training dataset.

        NOTE(review): the SVD model keeps the data passed to ``__init__``;
        it is not re-fed here. Confirm whether that is intended.
        """
        self.train = train

    def set_test(self, test):
        """Replace the stored test dataset."""
        self.test = test

    def get_train(self):
        """Return the stored training dataset."""
        return self.train

    def get_test(self):
        """Return the stored test dataset."""
        return self.test

    def get_alluserid(self, dataset):
        """Return the distinct user ids of ``dataset`` in first-seen order."""
        seen = set()
        userid_list = []
        for _rating, _item_id, user_id in dataset.get():
            if user_id not in seen:
                seen.add(user_id)
                userid_list.append(user_id)
        return userid_list

    def get_allitemid(self, dataset):
        """Return the distinct item ids of ``dataset`` in first-seen order."""
        seen = set()
        itemid_list = []
        for _rating, item_id, _user_id in dataset.get():
            if item_id not in seen:
                seen.add(item_id)
                itemid_list.append(item_id)
        return itemid_list

    def eval_rmse(self):
        """Print RMSE and MAE of the model over the test dataset.

        Test tuples for which ``predict`` raises ``KeyError`` are skipped.
        """
        rmse = RMSE()
        mae = MAE()
        for rating, item_id, user_id in self.test.get():
            try:
                pred_rating = self.svd.predict(item_id, user_id)
            except KeyError:
                continue
            rmse.add(rating, pred_rating)
            mae.add(rating, pred_rating)
        print('RMSE=%s' % rmse.compute())
        print('MAE=%s' % mae.compute())

    def recommend(self, N=10, only_unknowns=False, is_row=True):
        """Return ``{user_id: recommendations}`` for test users seen in train.

        :param N: number of recommendations requested per user.
        :param only_unknowns: forwarded to ``SVD.recommend`` (it used to be
            silently replaced by ``False``).
        :param is_row: kept for interface compatibility only; the lookup is
            always done with ``is_row=False``, as before.
        :return: dict mapping each qualifying user id to the list returned by
            ``SVD.recommend``.

        Each user is computed and printed once, even when the test dataset
        holds several ratings for that user.
        """
        # Build the membership set once instead of once per test rating.
        train_users = set(self.get_alluserid(self.train))
        rec_list = {}
        for _rating, _item_id, user_id in self.test.get():
            if user_id not in train_users or user_id in rec_list:
                continue
            rec_list[user_id] = self.svd.recommend(
                user_id, n=N, only_unknowns=only_unknowns, is_row=False)
            print(rec_list[user_id])
        return rec_list

    def precisionRecall(self, rec_list2, test_dict, n=30):
        """Compute recall and precision of top-``n`` recommendations.

        :param rec_list2: unused; kept so existing callers keep working.
        :param test_dict: mapping ``user -> {item: ...}`` of held-out items.
        :param n: number of recommendations requested per user (default 30,
            the value that was previously hard-coded).
        :return: ``[recall, precision]`` in that order, i.e.
            ``[hits / held-out items, hits / recommendations made]``. Either
            value is ``0.0`` when its denominator is zero (no user of
            ``test_dict`` is present in the training data).
        """
        print("Start calculate precision and recall...")
        train_users = set(self.get_alluserid(self.train))
        hit = 0
        n_recall = 0
        n_precision = 0
        for user, items in test_dict.items():
            if user not in train_users:
                continue
            rec_list = self.svd.recommend(
                user, n=n, only_unknowns=False, is_row=False)
            r = [entry[0] for entry in rec_list]
            print('rec_list %s' % (r,))
            hit += len(set(r) & set(items))
            n_recall += len(items)
            n_precision += n
        recall = hit / (1.0 * n_recall) if n_recall else 0.0
        precision = hit / (1.0 * n_precision) if n_precision else 0.0
        return [recall, precision]
pre_normalize=None, mean_center=True, post_normalize=True) # Evaluate rmse_svd = RMSE() mae_svd = MAE() rmse_svd_neig = RMSE() mae_svd_neig = MAE() i = 1 total = len(test.get()) print 'Total Test ratings: %s' % total for rating, item_id, user_id in test: try: pred_rating_svd = svd.predict(item_id, user_id) rmse_svd.add(rating, pred_rating_svd) mae_svd.add(rating, pred_rating_svd) pred_rating_svd_neig = svd_neig.predict(item_id, user_id) #Koren & co. if pred_rating_svd_neig is not nan: rmse_svd_neig.add(rating, pred_rating_svd_neig) mae_svd_neig.add(rating, pred_rating_svd_neig) print "\rProcessed test rating %d" % i, sys.stdout.flush() i += 1 except KeyError: continue
# The "#3.xx" markers look like notebook/tutorial step numbers. Bare
# expressions below are evaluated only for their displayed value (presumably
# in an interactive session) and bind nothing.
get_name_item_reviewed(10, user_full, items_full)

#3.12
items_full[str(2628)].get_data()
users_for_star_wars = svd.recommend(2628, only_unknowns=True)
users_for_star_wars

#3.13
# For every recommended entry, collect what it reviewed, flatten the lists,
# aggregate with movies_by_category(..., 3) and sort by the second field,
# largest first.
movies_reviewed_by_sw_rec = [
    get_name_item_reviewed(x[0], user_full, items_full)
    for x in users_for_star_wars
]
movies_flatten = [
    movie for movie_list in movies_reviewed_by_sw_rec for movie in movie_list
]
movie_aggregate = movies_by_category(movies_flatten, 3)
movies_sort = sorted(movie_aggregate, key=lambda x: x[1], reverse=True)
movies_sort

#3.14
from recsys.evaluation.prediction import RMSE

# RMSE of the model over every (rating, item_id, user_id) tuple in `data`.
err = RMSE()
for rating, item_id, user_id in data.get():
    try:
        prediction = svd.predict(item_id, user_id)
    except KeyError:
        # Pairs that predict() cannot resolve are left out of the error.
        continue
    err.add(rating, prediction)

print('RMSE is ' + str(err.compute()))
def svd(filepath):
    """Train an SVD model per fold and write fold-1 predictions to disk.

    For each fold the train file is factorized, the test file is read into a
    sparse matrix indexed ``[userid, movieid]``, and (for fold 1 only) a
    bounded prediction is written for every non-zero test entry as
    ``row<TAB>col<TAB>prediction`` lines.

    :param filepath: path from which the source folder and base file name are
        derived via ``parseOutputFolderPath`` / ``parseFileName``.

    NOTE(review): the flattened source does not show how deeply ``exit()`` was
    nested. It is placed at the end of the fold loop body here; in that
    reading the process stops after the first fold and the evaluation-file
    code below the loop is never reached. Confirm against the original layout.
    """
    src_folder = parseOutputFolderPath(filepath)
    base_file_name = parseFileName(filepath)

    avg_rmse = 0.0
    avg_mae = 0.0

    out_file_base = base_file_name + "_pred_svd"
    out_path = src_folder + "output/" + out_file_base + EXT

    # `with` guarantees the prediction file is flushed and closed even when
    # exit() raises SystemExit inside the loop.
    with open(out_path, "w") as out_file:
        # for each fold
        for fold_index in range(1, NUM_FOLDS + 1):
            print("*** \t FOLD {0} \t ***".format(fold_index))

            M_test = lil_matrix((_N, _M))

            train_path = (src_folder + base_file_name + TRAIN_PREFIX +
                          str(fold_index) + EXT)
            test_path = (src_folder + base_file_name + TEST_PREFIX +
                         str(fold_index) + EXT)
            print(train_path)
            print(test_path)

            model = SVD()
            model.load_data(filename=train_path, sep=',',
                            format={'col': 0, 'row': 1, 'value': 2,
                                    'ids': float})
            model.compute(k=_K, min_values=1, pre_normalize=None,
                          mean_center=True, post_normalize=True)

            with open(test_path, "r") as infile:
                reader = csv.reader(infile, delimiter=",")
                for line in reader:
                    userid = int(line[0], 10)
                    movieid = int(line[1], 10)
                    score = float(line[2])
                    M_test[userid, movieid] = score

            # TODO: per-fold RMSE/MAE are not computed yet, so avg_rmse and
            # avg_mae stay at 0.0.

            # write predictions only for first test (fold)
            if fold_index == 1:
                rows, cols = M_test.nonzero()
                for row, col in zip(rows, cols):
                    try:
                        r_xi = model.predict(col, row, MIN_RATING, MAX_RATING)
                    except Exception:
                        # Report the failing pair and skip it. Previously the
                        # write below still ran with an undefined or stale
                        # r_xi.
                        print("%s %s" % (row, col))
                        continue
                    out_file.write(
                        str(row) + '\t' + str(col) + '\t' + str(r_xi) + '\n')

            print("..done")
            print("")
            exit()

    # average rmse and mae on validation folds
    eval_out_path = src_folder + "output/" + out_file_base + "_eval" + EXT
    with open(eval_out_path, "w") as eval_file:
        eval_file.write("RMSE" + "\t" + "MAE" + "\n")
        avg_rmse /= float(NUM_FOLDS)
        avg_mae /= float(NUM_FOLDS)
        eval_file.write(str(avg_rmse) + "\t" + str(avg_mae))
# Load a ratings file, split it 80/20 and query an SVD model built on the
# training part.
from recsys.algorithm.factorize import SVD
from recsys.datamodel.data import Data

filename = "./data/ratings.dat"

# Column layout handed to Data.load():
#   'row': 1    -> rows in the matrix come from the second column of the file
#   'col': 0    -> cols in the matrix come from the first column of the file
#   'value': 2  -> values (Mij) in the matrix come from the third column
#   'ids': int  -> row and col ids are integers (not strings)
# Named `data_format` so the builtin format() is not shadowed.
data_format = {'col': 0, 'row': 1, 'value': 2, 'ids': int}

data = Data()
data.load(filename, sep="::", format=data_format)
train, test = data.split_train_test(percent=80)  # 80% train, 20% test

svd = SVD()
svd.set_data(train)
# NOTE(review): compute() is never called explicitly; presumably predict()
# triggers the factorization with the library's default settings - confirm.

# Predicted value for row id 22 / column id 22, bounded to [0.0, 5.0].
print(svd.predict(22, 22, MIN_VALUE=0.0, MAX_VALUE=5.0))

# Top 10 recommendations for id 1 looked up as a column (is_row=False).
# only_unknowns=True presumably restricts the result to entries that have no
# value in the training matrix - confirm against the library docs.
print(svd.recommend(1, n=10, only_unknowns=True, is_row=False))

# Same query without that restriction.
print(svd.recommend(1, n=10, only_unknowns=False, is_row=False))
# Fit an SVD model on the training file and write one prediction per test
# record into a submission CSV.
recsys.algorithm.VERBOSE = True

print("loading data")
load_format = {'col': 0, 'row': 1, 'value': 6, 'ids': int}
data = Data()
data.load('../item_recom/train_info.tsv', sep='\t', format=load_format)

topic = 48  # handed to compute() as k

print("compute svd")
svd = SVD()
svd.set_data(data)
svd.compute(
    k=topic,
    min_values=0.0,
    pre_normalize=None,
    mean_center=True,
    post_normalize=True,
)

print("loading test data")
test = loadTest('../item_recom/test_info.tsv')
print(svd.predict(0, 0))

print("creating submission")
fieldnames = ['uid#iid', 'pred']
with open('../submissions/recsys_3.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames)
    writer.writeheader()
    for ind in xrange(len(test)):
        record = test[ind]
        user_id = record["1_user_id"]
        item_id = record["2_item_id"]
        # Key column is "<user>#<item>"; predict() takes the item id first.
        writer.writerow({
            'uid#iid': "%d#%d" % (user_id, item_id),
            'pred': svd.predict(item_id, user_id),
        })
class RecommendSystem(object): def __init__(self, filename, sep, **format): self.filename = filename self.sep = sep self.format = format # 训练参数 self.k = 100 self.min_values = 10 self.post_normalize = True self.svd = SVD() # 判断是否加载 self.is_load = False # 添加数据处理 self.data = Data() # 添加模型评估 self.rmse = RMSE() def get_data(self): """ 获取数据 :return: None """ # 如果模型不存在 if not os.path.exists(tmpfile): # 如果数据文件不存在 if not os.path.exists(self.filename): sys.exit() # self.svd.load_data(filename=self.filename, sep=self.sep, format=self.format) # 使用Data()来获取数据 self.data.load(self.filename, sep=self.sep, format=self.format) train, test = self.data.split_train_test(percent=80) return train, test else: self.svd.load_model(tmpfile) self.is_load = True return None, None def train(self, train): """ 训练模型 :param train: 训练数据 :return: None """ if not self.is_load: self.svd.set_data(train) self.svd.compute(k=self.k, min_values=self.min_values, post_normalize=self.post_normalize, savefile=tmpfile[:-4]) return None def rs_predict(self, itemid, userid): """ 评分预测 :param itemid: 电影id :param userid: 用户id :return: None """ score = self.svd.predict(itemid, userid) print "推荐的分数为:%f" % score return score def recommend_to_user(self, userid): """ 推荐给用户 :param userid: 用户id :return: None """ recommend_list = self.svd.recommend(userid, is_row=False) # 读取文件里的电影名称 movie_list = [] for line in open(moviefile, "r"): movie_list.append(' '.join(line.split("::")[1:2])) # 推荐具体电影名字和分数 for itemid, rate in recommend_list: print "给您推荐了%s,我们预测分数为%s" % (movie_list[itemid], rate) return None def evaluation(self, test): """ 模型的评估 :param test: 测试集 :return: None """ # 如果模型不是直接加载 if not self.is_load: # 循环取出测试集里面的元组数据<评分,电影,用户> for value, itemid, userid in test.get(): try: predict = self.rs_predict(itemid, userid) self.rmse.add(value, predict) except KeyError: continue # 计算返回误差(均方误差) error = self.rmse.compute() print "模型误差为%s:" % error return None