def get_med_recs_(user_id):
    """Return up to 5 movies predicted to be "mediocre" for a user.

    Trains a KNNBaseline model on the full built-in ml-100k dataset, then
    keeps every movie whose predicted rating falls in the band [2.3, 2.7].

    Args:
        user_id: raw user id (surprise built-in data uses string ids).

    Returns:
        List of up to 5 ``(movie_id, predicted_rating)`` tuples, best first.
    """
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    # Create and fit the KNN model on every call (no caching here).
    algo = KNNBaseline()
    algo.fit(trainset)

    # ml-100k item ids run 1..1682 inclusive; the original range(1, 1682)
    # skipped the last movie (off-by-one).
    scores = {}
    for movie_id in range(1, 1683):
        prediction = algo.predict(user_id, str(movie_id), verbose=False)
        if 2.3 <= prediction.est <= 2.7:
            scores[movie_id] = prediction.est

    # Highest predicted ratings first; slicing is safe when < 5 matched.
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return ranked[:5]
def get_surprise_recs_(user_id,
                       file_path='C:/Users/frank/MovieSurprise/MovieSurprise/u_surpriseTest.tsv'):
    """Return up to 5 well-rated movies from a custom ratings file.

    Trains a KNNBaseline model on the TSV ratings file and keeps every
    movie whose predicted rating is at least 3.

    Args:
        user_id: raw user id as it appears in the ratings file.
        file_path: path to a tab-separated ``user item rating timestamp``
            file. Defaults to the original hard-coded location so existing
            callers are unaffected.

    Returns:
        List of up to 5 ``(movie_id, predicted_rating)`` tuples, best first.
    """
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    data = Dataset.load_from_file(file_path, reader)
    trainset = data.build_full_trainset()

    algo = KNNBaseline()
    algo.fit(trainset)

    # Item ids 1..1682 inclusive; the original range(1, 1682) skipped the
    # last movie (off-by-one).
    scores = {}
    for movie_id in range(1, 1683):
        prediction = algo.predict(user_id, str(movie_id), verbose=False)
        if prediction.est >= 3:
            scores[movie_id] = prediction.est

    # Highest predicted ratings first; slicing is safe when < 5 matched.
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return ranked[:5]
def get_best_recs_(user_id):
    """Return up to 5 movies predicted to be excellent (>= 4.5) for a user.

    Trains a KNNBaseline model on the full built-in ml-100k dataset.

    Args:
        user_id: raw user id (surprise built-in data uses string ids).

    Returns:
        List of up to 5 ``(movie_id, predicted_rating)`` tuples, best first.
        Fewer than 5 entries are returned when fewer movies qualify.
    """
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    # Create and fit the KNN model.
    algo = KNNBaseline()
    algo.fit(trainset)

    # ml-100k item ids run 1..1682 inclusive; the original range(1, 1682)
    # skipped the last movie (off-by-one).
    scores = {}
    for movie_id in range(1, 1683):
        prediction = algo.predict(user_id, str(movie_id), verbose=False)
        if prediction.est >= 4.5:
            scores[movie_id] = prediction.est

    # Highest predicted ratings first; a short list is handled naturally
    # by the slice (answers the original "what if list is less than 5?").
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return ranked[:5]
def knn_baseline_movie(train, test, ids, Xtest, Xids):
    """Nearest-neighbour approach using the movie (item) baseline.

    Fits a KNNBaseline model with item-based pearson-baseline similarity,
    reports train/test RMSE, and records predictions for later blending.

    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('kNN Baseline Movie')

    baseline_cfg = {'method': 'als', 'n_epochs': 100, 'reg_u': 15, 'reg_i': 0.01}
    similarity_cfg = {
        'name': 'pearson_baseline',
        'min_support': 1,
        'user_based': False,
    }
    model = KNNBaseline(k=100,
                        bsl_options=baseline_cfg,
                        sim_options=similarity_cfg,
                        verbose=False)

    # Fit on the training set.
    model.fit(train)

    # RMSE on the training data itself.
    train_preds = model.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(train_preds, verbose=False))

    # RMSE on the held-out test set.
    test_preds = model.test(test)
    rmse = accuracy.rmse(test_preds, verbose=False)
    print('   Test RMSE: ', rmse)

    # Collect test-set estimates as a numpy vector for blending.
    preds_test = np.array([p.est for p in test_preds])

    # Estimate the unknown (user, item) pairs; raw ids must be strings.
    preds_ids = [model.predict(str(u), str(i)).est
                 for u, i in zip(ids[0], ids[1])]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def eval(user_id):
    """Leave-one-out evaluation of KNNBaseline for a single user.

    NOTE(review): the name shadows the builtin ``eval``; kept unchanged so
    existing callers keep working.

    For every item the user actually rated, the model is retrained with that
    rating held out and asked to predict it; the mean squared error over all
    held-out predictions is returned.

    Args:
        user_id: id of the user to evaluate.

    Returns:
        Mean squared error between estimates and true (0/1) ratings.
    """
    # Step 1: Define variables
    ratings = pps.get_all_ratings_as_df()  # read ratings from database
    # Convert the boolean "liked" flag into a numeric 0/1 rating column.
    ratings[RATING] = None
    ratings.loc[ratings[LIKED] == True, RATING] = 1
    ratings.loc[ratings[LIKED] == False, RATING] = 0
    reader = Reader(rating_scale=(0.0, 1.0))
    all_items = ratings.poi_id.unique()  # find all items
    # Resulting frame that accumulates (estimate, true) pairs.
    user_rmse = pd.DataFrame(columns=['est', 'true'])

    # Step 2: Iterate over all items, leaving out the current item x.
    for x in np.nditer(all_items):
        # Step 2a: Test row -> rating of user_id for the left-out item.
        testset = ratings[(ratings.user_id == user_id)]
        testset = testset[(testset.poi_id == x)]

        # Step 2b: Skip items the user never rated - nothing to compare to.
        if testset.rating.size == 0:
            continue

        # Step 2c: Training data = everything except the left-out row.
        trainset = ratings[~ratings.isin(testset).all(1)]
        trainset = Dataset.load_from_df(trainset[[USER_ID, POI_ID, RATING]], reader)
        trainset = trainset.build_full_trainset()

        # Step 2d: Train and predict the held-out item.
        algo = KNNBaseline(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
        algo.fit(trainset)
        # np.asscalar() was removed in NumPy 1.23; .item() is the
        # documented replacement.
        pred = algo.predict(user_id, x.item(), r_ui=4, verbose=False)

        # Step 2e: Store estimate and true value.
        user_rmse.loc[len(user_rmse)] = [pred.est, testset.rating.item()]

    # Step 3: MSE over all leave-one-out estimations.
    confidence = np.mean((user_rmse.est - user_rmse.true) ** 2)
    return confidence
def predict(rating_dic):
    """Predict nootropic ratings for a brand-new user.

    Appends the new user's ratings to the cleaned dataset, fits a
    user-based KNNBaseline model, and returns predicted ratings for the
    nootropics the user has not rated, best first.

    Args:
        rating_dic: mapping nootropic -> rating (or None for "not rated").

    Returns:
        DataFrame with columns nootropic / predicted_rating /
        baseline_rating, sorted by predicted_rating descending.
    """
    df_clean = pd.read_csv("dataset_clean.csv")

    #######################
    # Fit surprise model
    #######################
    final_model = KNNBaseline(k=60, min_k=2,
                              sim_options={'name': 'pearson_baseline', 'user_based': True})

    new_user_id = max(df_clean["userID"]) + 1
    ratings = np.array(list(rating_dic.values()))
    # Elementwise comparison against None: the object array marks unrated
    # entries as None, so `!= None` yields a boolean mask here.
    rated_mask = ratings != None
    ratings = ratings[rated_mask]
    items = np.array(list(rating_dic.keys()))[rated_mask]
    user = np.ones(len(items), dtype="int") * new_user_id
    new_user_df = pd.DataFrame({"userID": user, "itemID": items, "rating": ratings})

    # DataFrame.append() was removed in pandas 2.0; pd.concat is the
    # supported replacement.
    total_df = pd.concat([df_clean, new_user_df])

    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(0, 10))
    # The columns must correspond to user id, item id and ratings (in that order).
    new_trainset = Dataset.load_from_df(total_df, reader).build_full_trainset()

    ## Fit the best model
    final_model.fit(new_trainset)

    predicted_ratings = [final_model.predict(new_user_id, nootropic).est
                         for nootropic in nootropics_list]

    # mean rating + per-item baseline term.
    item_baselines = final_model.default_prediction() + final_model.compute_baselines()[1]

    result_df = pd.DataFrame(
        {"nootropic": nootropics_list,
         "predicted_rating": predicted_ratings,
         "baseline_rating": item_baselines})

    # Only recommend nootropics the user has not already rated.
    nootropics_without_ratings = [nootropic for nootropic in nootropics_list
                                  if (nootropic not in rating_dic.keys())]
    new_result_df = result_df[result_df["nootropic"].isin(nootropics_without_ratings)]
    return new_result_df.sort_values("predicted_rating", ascending=False, ignore_index=True)
def KNN(data, kwargs, n_users=10000, n_items=1000):
    """Fit KNNBaseline and predict a dense user x item rating matrix.

    Args:
        data: surprise trainset to fit on.
        kwargs: dict with keys 'n_neigbor', 'min_neigbor' and 'similarity'
            (spellings kept for backward compatibility with callers).
        n_users: number of users (rows); raw ids are 1..n_users as strings.
        n_items: number of items (columns); raw ids are 1..n_items.

    Returns:
        numpy array of shape (n_users, n_items) with predicted ratings.
    """
    # Set algorithm from the caller-supplied hyper-parameters.
    k_neigbor = kwargs.get('n_neigbor')
    min_neighb = kwargs.get('min_neigbor')
    similarity = kwargs.get('similarity')
    options = {'name': similarity}
    algo = KNNBaseline(k=k_neigbor, min_k=min_neighb, sim_options=options)

    # Train the algorithm, then predict every (user, item) cell.
    algo.fit(data)

    # The original hard-coded 10000x1000; dimensions are now parameters
    # with the same defaults, so existing callers see no change.
    prediction = np.zeros([n_users, n_items])
    for row in range(n_users):
        for col in range(n_items):
            # surprise expects raw ids as strings; ids are 1-based.
            prediction[row, col] = algo.predict(str(row + 1), str(col + 1)).est
    return prediction
# Train two more KNN variants (z-score and mean-centered) on the full data.
algo2 = KNNWithZScore(k=40, min_k=1, sim_options=sim_options1, verbose=True)
algo2.fit(data_train.build_full_trainset())
algo3 = KNNWithMeans(k=40, min_k=1, sim_options=sim_options2, verbose=True)
algo3.fit(data_train.build_full_trainset())

# Rounded and raw (float) predictions for each of the three models.
pred1, pred_f1 = [], []
pred2, pred_f2 = [], []
pred3, pred_f3 = [], []

with open("./data/testing.dat", "r", encoding='utf-8') as f:
    for line in f.readlines():
        line_data = line.strip().split(",")
        uid, iid = str(line_data[0]), str(line_data[1])
        # predict(uid, iid, r_ui=None, clip=True, verbose=True); index 3 is est.
        for rounded, raw, model in ((pred1, pred_f1, algo1),
                                    (pred2, pred_f2, algo2),
                                    (pred3, pred_f3, algo3)):
            est = model.predict(uid, iid, None, True, True)[3]
            rounded.append(int(round(est)))
            raw.append(est)

# Write the rounded KNNBaseline predictions, one per line.
with open("./雷雨轩_PB18111791_4.txt", "w") as f:
    f.writelines(str(ratings) + "\n" for ratings in pred1)
def _ranked_city_hotels(hotel_ratings, df_hotels_names, city_to_search):
    """Build a DataFrame of (hotel name, estimated rating) for hotels in
    `city_to_search`, walking hotels in descending estimated-rating order."""
    rows = []
    for key in sorted(hotel_ratings, key=hotel_ratings.get, reverse=True):
        hotcurr = df_hotels_names[df_hotels_names['hotel_id'] == key]
        if hotcurr['city'].to_string(index=False) == city_to_search:
            rows.append(pd.DataFrame({
                'Hotel name': [hotcurr['name'].to_string(index=False)],
                'Estimated rating': round(hotel_ratings[key], 2)
            }))
    # DataFrame.append() was removed in pandas 2.0 -> concatenate instead.
    return pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()


def Search(request):
    """Django view: recommend hotels in the searched city for the current
    user, via both a user-based and an item-based KNNBaseline model."""
    if request.method == 'POST':
        # Create a form instance and populate it with data from the request (binding):
        form = SearchForm(request.POST)
        city_to_search = form.data['city']
        # NOTE(review): form validity is deliberately not checked (as before).

        # 1st: retrain the model from the database ratings.
        df = pd.DataFrame(list(Rating.objects.all().values()))
        # Raw user ids must be strings for surprise lookups.
        df['user_id'] = df['user_id'].astype(str)
        reader = Reader(rating_scale=(0, 6))
        data = Dataset.load_from_df(
            df[['user_id', 'hotel_id', 'rating_OVERALL']], reader)

        # Train kNN-Baseline on the whole collection (both user and item-wise).
        trainset = data.build_full_trainset()
        algo = KNNBaseline()
        algo.fit(trainset)
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        algo_items = KNNBaseline(sim_options=sim_options)
        algo_items.fit(trainset)

        # 2nd: user-based top hotels for the logged-in user.
        hotels = df['hotel_id'].unique().tolist()
        user1 = request.user.username
        # Debug output (kept from original).
        print(df[df['user_id'] == user1])
        print(df.info())
        print(type(user1))

        hot_ratings_user = {}
        for hot in hotels:
            hot_ratings_user[hot] = algo.predict(user1, hot).est

        # Filter hotels within the searched city.
        # df_hotels_names has: city hotel_id name price_approx star_class state zipcode
        df_hotels_names = pd.DataFrame(list(Hotel.objects.all().values()))
        context1 = _ranked_city_hotels(hot_ratings_user, df_hotels_names,
                                       city_to_search)

        # 3rd: item-based predictions, same procedure.
        hot_ratings_i = {}
        for hot in hotels:
            hot_ratings_i[hot] = algo_items.predict(user1, hot).est
        context2 = _ranked_city_hotels(hot_ratings_i, df_hotels_names,
                                       city_to_search)

        # Keep the 10 best of each ranking.
        context1 = context1.sort_values(by='Estimated rating', ascending=False)[:10]
        context2 = context2.sort_values(by='Estimated rating', ascending=False)[:10]

        # Flatten to parallel lists for the template.
        Hotel_name_user = context1['Hotel name'].values.tolist()
        Estimated_rating_user = context1['Estimated rating'].values.tolist()
        Hotel_name_item = context2['Hotel name'].values.tolist()
        Estimated_rating_item = context2['Estimated rating'].values.tolist()

        context_rendering = {
            'city': city_to_search,
            'user': zip(Hotel_name_user, Estimated_rating_user),
            'item__': zip(Hotel_name_item, Estimated_rating_item)
        }
        return render(request, 'search_results.html', context_rendering)

    # If this is a GET (or any other method) create the default form.
    else:
        form = SearchForm()
        context = {
            'form': form,
        }
        return render(request, 'search_form.html', context)
    iid: the raw id of the item

    returns: the number of users that have rated the item.
    """
    # NOTE(review): this is the tail of a helper whose `def` line lies
    # outside this chunk; `trainset` is presumably a surprise Trainset.
    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        # to_inner_iid raises ValueError for items unseen during training.
        return 0


# We can now query for specific predictions.
uid = str(1)   # raw user id
iid = str(20)  # raw item id

# Get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
#print(pred)

# Get a list of all animes.
animeid = Rating['anime_id'].unique()
# Get a list of animes that uid 50 has rated.
animeid50 = Rating.loc[Rating['user_id'] == 50, 'anime_id']
# Remove the animes that uid 50 has rated.
anime_to_predict = np.setdiff1d(animeid, animeid50)

# Build a synthetic testset for user 50 (the 4. is a dummy true rating)
# and batch-predict it.
testing = [[50, anime_id, 4.] for anime_id in anime_to_predict]
predictions = algo.test(testing)
predictions[0]

pred_ratings = np.array([pred.est for pred in predictions])
print(pred_ratings)
print()
print('针对歌单进行预测:')

# Look up a playlist by index, then map its raw id to the model's inner id.
current_playlist_name = convertor.get_name_by_index(39)
print('歌单名称', current_playlist_name)
playlist_rid = convertor.get_rid_by_name(current_playlist_name)
print('歌单rid', playlist_rid)
playlist_inner_id = algo.trainset.to_inner_uid(playlist_rid)
print('歌曲inid', playlist_inner_id)

# The 10 nearest-neighbour playlists, mapped back inner id -> raw id -> name.
playlist_neighbors_inner_ids = algo.get_neighbors(playlist_inner_id, k=10)
playlist_neighbors_rids = (algo.trainset.to_raw_uid(inner_id)
                           for inner_id in playlist_neighbors_inner_ids)
playlist_neighbors_names = (convertor.get_name_by_rid(rid)
                            for rid in playlist_neighbors_rids)

print()
print('歌单 《', current_playlist_name, '》 最接近的10个歌单为: \n')
for playlist_name in playlist_neighbors_names:
    print(playlist_name,
          algo.trainset.to_inner_uid(convertor.get_rid_by_name(playlist_name)))

print()
print('针对用户(单个歌单表示一个用户)进行预测:')

# Predict for one "user" (each playlist acts as a user here).
user_inner_id = 4
print('用户内部id', user_inner_id)
user_rating = trainset.ur[user_inner_id]
print('用户评价过的歌曲数量', len(user_rating))
items = map(lambda x: x[0], user_rating)
for song in items:
    # NOTE(review): `song` is an inner item id but predict() expects raw
    # ids; this mirrors the sibling snippets in this file — confirm intent.
    print(algo.predict(user_inner_id, song, r_ui=1),
          convertor.get_song_name_by_iid(algo.trainset.to_raw_iid(song)))

# Persist the trained model to disk.
surprise.dump.dump('./knn_baseline.model', algo=algo)
import pandas as pd
import numpy as np
from tqdm import tqdm
from surprise import KNNWithMeans, KNNBasic, KNNWithZScore, KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from scipy.spatial.distance import cityblock, cosine, euclidean, hamming, jaccard, rogerstanimoto

# Train a user-based KNNBaseline on ml-1m and report held-out RMSE.
data = Dataset.load_builtin('ml-1m')
trainset, testset = train_test_split(data, test_size=.15)

algo = KNNBaseline(k=50, min_k=1, sim_options={
    'name': 'pearson_baseline',
    'user_based': True
})
algo.fit(trainset)

test_pred = algo.test(testset)
print('accuracy', accuracy.rmse(test_pred, verbose=True))

# BUG FIX: surprise raw ids from built-in datasets are *strings*; passing
# the int 2 made the user "unknown" and silently returned the default
# prediction. NOTE(review): ml-1m raw item ids are numeric strings too, so
# a movie *title* as iid still falls back to the baseline — confirm the
# intended item id.
print('predict', algo.predict(uid=str(2), iid='Fight Club (1999)').est)
iid1 = str(306) # raw item id (as in the ratings file). They are **strings**! iid2 = str(514) iid3 = str(977) iid4 = str(370) r_ui1 = 4 r_ui2 = 4 r_ui3 = 1 r_ui4 = 3 verboseFlag = True # get a prediction for specific users and items. print("KNNBaseLine:") predBaseLine1 = algoBaseLine.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag) predBaseLine2 = algoBaseLine.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag) predBaseLine3 = algoBaseLine.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag) predBaseLine4 = algoBaseLine.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag) print("\nKNNBasic:") predBasic1 = algoBasic.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag) predBasic2 = algoBasic.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag) predBasic3 = algoBasic.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag) predBasic4 = algoBasic.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag) print("\nKNNWithMeans:") predWithMeans1 = algoWithMeans.predict(uid1, iid1, r_ui = r_ui1, verbose = verboseFlag) predWithMeans2 = algoWithMeans.predict(uid2, iid2, r_ui = r_ui2, verbose = verboseFlag) predWithMeans3 = algoWithMeans.predict(uid3, iid3, r_ui = r_ui3, verbose = verboseFlag) predWithMeans4 = algoWithMeans.predict(uid4, iid4, r_ui = r_ui4, verbose = verboseFlag)
# cPickle is Python 2 only; on Python 3 the stdlib pickle already uses the
# C implementation.
import pickle

# Rebuild the song-id -> song-name mapping dictionary; `with` guarantees
# the file handle is closed (the original leaked it).
with open("popular_song.pkl", "rb") as pkl_file:
    song_id_name_dic = pickle.load(pkl_file)
print("加载歌曲id到歌曲名的映射字典完成...")

# Rebuild the reverse (song-name -> song-id) mapping dictionary.
song_name_id_dic = {}
for song_id in song_id_name_dic:
    song_name_id_dic[song_id_name_dic[song_id]] = song_id
print("加载歌曲名到歌曲id的映射字典完成...")

# Print predictions for every song user 4 (inner id) has rated.
user_inner_id = 4
user_rating = trainset.ur[user_inner_id]
items = map(lambda x: x[0], user_rating)
for song in items:
    print(algo.predict(user_inner_id, song, r_ui=1),
          song_id_name_dic[algo.trainset.to_raw_iid(song)])

### Use NMF
# NOTE: `surprise.evaluate` was removed in surprise 1.1, so it is no
# longer imported here.
from surprise import NMF
from surprise import Dataset

file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# Specify the file format.
reader = Reader(line_format='user item rating timestamp', sep=',')
# Load the data from file.
music_data = Dataset.load_from_file(file_path, reader=reader)
# Build the full trainset and fit the model.
algo = NMF()
trainset = music_data.build_full_trainset()
# AlgoBase.train() was removed in surprise 1.1; fit() is the replacement.
algo.fit(trainset)
class Recommender():

    def __init__(self, dataset, new_products):
        '''
        Class which returns recommendations to a new customer.
        Initializes training data based on a full dataset.
        Initializes an item-item and a user-user recommender.

        Item-Item Recommender:
            - algorithm : KNNBaseline
            - K : 21
            - sim : pearson correlation

        User-User Recommender:
            - algorithm : KNNBaseline
            - K : 99
            - sim : msd
            NOTE(review): the original docstring said KNNWithMeans / K=12 /
            pearson, but the code below constructs KNNBaseline(k=99, msd)
            (the KNNWithMeans variant is left commented out) — confirm
            which configuration is intended.

        (for more information, see Surprise_CF.ipynb)
        '''
        self.new_products = new_products
        # Append the new customer (id 1) to the data, rating every chosen
        # product with the maximum score of 5.
        new_data = pd.DataFrame({'customer_id': [1] * len(self.new_products),
                                 'product_id': self.new_products,
                                 'star_rating': [5] * len(self.new_products)})
        full_data = pd.concat([new_data, dataset]).reset_index(drop=True)
        data = Dataset.load_from_df(full_data[['customer_id', 'product_id', 'star_rating']],
                                    Reader(rating_scale=(1, 5)))
        self.unique_products = dataset['product_id'].unique()
        self.trainset = data.build_full_trainset()
        # Item-item model: pearson similarity over items.
        self.ii_algo = KNNBaseline(k=21, sim_options={'name': 'pearson', 'user_based': False})
        # User-user model: msd similarity over users.
        self.uu_algo = KNNBaseline(k=99, sim_options={'name': 'msd', 'user_based': True})#KNNWithMeans(k=12, sim_options={'name': 'pearson', 'user_based': True})

    def new_recommendations(self):
        '''
        Function that takes in a list of new products and returns recommendations.

        Arguments:
            - new_products : list of products chosen by new user
            - orig_data : original dataframe of users, items and ratings
            - algo : algorithm for predicting ratings

        Returns:
            - recs_df : dataframe of recommendations
        '''
        # Train both recommender systems on the full trainset.
        self.ii_algo.fit(self.trainset)
        self.uu_algo.fit(self.trainset)

        # Predict a rating (index 3 of the Prediction tuple is .est) for
        # every product the new user has not already chosen.
        recommendations = {'items': [], 'ii_rating': [], 'uu_rating': []}
        for item in self.unique_products:
            if item not in self.new_products:
                ii_rating = self.ii_algo.predict(1, item, verbose=False)[3]
                uu_rating = self.uu_algo.predict(1, item, verbose=False)[3]
                recommendations['items'].append(item)
                recommendations['ii_rating'].append(ii_rating)
                recommendations['uu_rating'].append(uu_rating)

        # Top 10 items per model, by predicted rating.
        recs_df = pd.DataFrame(recommendations)
        ii_recs = recs_df.sort_values(by='ii_rating', ascending=False).head(10)['items']
        uu_recs = recs_df.sort_values(by='uu_rating', ascending=False).head(10)['items']
        return ii_recs, uu_recs
pred1 = algo1.predict(uid, iid, verbose=True) #KNNWithMeans algo2 = KNNWithMeans(k=30, sim_options={ 'name': 'cosine', 'user_based': False }, verbose=True) algo2.fit(trainset) pred2 = algo2.predict(uid, iid, verbose=True) #KNNWithZScore f algo3 = KNNWithZScore(k=30, sim_options={ 'name': 'MSD', 'user_based': True }, verbose=True) algo3.fit(trainset) pred3 = algo3.predict(uid, iid, verbose=True) #KNNBaseline algo4 = KNNBaseline(k=30, sim_options={ 'name': 'MSD', 'user_based': True }, verbose=True) algo4.fit(trainset) pred4 = algo4.predict(uid, iid, verbose=True)
for inner_id in playlist_neighbors) playlist_neighbors = (id_name_dic[playlist_id] for playlist_id in playlist_neighbors) print("之前的啥:", playlist_neighbors) print("和歌单《", current_palylist, "》最接近的10首歌单为:\n") for playlist in playlist_neighbors: print(playlist, algo.trainset.to_inner_uid(name_id_dic[current_palylist])) # 针对用户进行预测 song_id_name_dic = pickle.load(open("popular_song.pkl", "rb"), encoding='utf-8') print("加载歌曲id到歌曲名的映射字典完成...") song_name_id_dic = {} for song_id in song_id_name_dic: song_name_id_dic[song_id_name_dic[song_id]] = song_id print("加载歌曲名到歌曲id的映射字典完成...") # 内部编码的4号用户 user_inner_id = 4 user_rating = trainset.ur[user_inner_id] items = map(lambda x: x[0], user_rating) for song in items: print(algo.predict(user_inner_id, song, r_ui=1), song_id_name_dic[algo.trainset.to_raw_iid(song)]) print("完成...") # 模型存储 surprise.dump.dump('./recommendation.model', algo=algo) # 可以用以下方式载入 algo = surprise.dump.load('./recommendation.model')
bsl_options = {
    'method': 'sgd',      # solver, one of: als, sgd
    'n_epochs': 10,       # number of iterations
    'reg': 0.02,          # regularization strength for the baselines
    'learning_rate': 0.1  # learning rate for the parameter updates
}
"""
k=40: number of neighbours used when predicting
min_k=1: minimum number of neighbours required to produce an estimate
sim_options={}: how the similarity matrix is computed
"""
sim_options = {
    'name': 'pearson',   # similarity measure: pearson / msd / cosine / pearson_baseline
    'user_based': True   # user-based (True) vs item-based (False) CF
}
# BUG FIX: bsl_options was built above but never passed to KNNBaseline,
# so the sgd configuration was silently ignored.
algo = KNNBaseline(k=40, min_k=1, bsl_options=bsl_options, sim_options=sim_options)

# 4. Train the model.
algo.fit(trainset)

# 5. Model evaluation
# TODO: the surprise framework requires separate evaluation code here.

# 6. Persistence / prediction
# Predictions must go through predict(), which calls estimate() internally;
# the user id and item id passed to predict() must be strings.
uid = "196"
iid = "242"
pred = algo.predict(uid, iid)
print("用户{}对于物品{}的评分为:{}".format(uid, iid, pred.est))
iid1 = str(50) # raw item id (as in the ratings file). They are **strings**! iid2 = str(1223) iid3 = str(131) iid4 = str(395) r_ui1 = 1 r_ui2 = 1 r_ui3 = 0 r_ui4 = 0 verboseFlag = True # get a prediction for specific users and items. print("KNNBaseLine:") predBaseLine1 = algoBaseLine.predict(uid1, iid1, r_ui=r_ui1, verbose=verboseFlag) predBaseLine2 = algoBaseLine.predict(uid2, iid2, r_ui=r_ui2, verbose=verboseFlag) predBaseLine3 = algoBaseLine.predict(uid3, iid3, r_ui=r_ui3, verbose=verboseFlag) predBaseLine4 = algoBaseLine.predict(uid4, iid4, r_ui=r_ui4, verbose=verboseFlag) print("\nKNNBasic:")
print(playlist, algo.trainset.to_inner_uid(name_id_dic[playlist])) # 针对用户进行预测 song_id_name_dic = pickle.load(open("popular_song.pk1", "rb")) print("加载歌曲id到歌曲名的映射字典完成......") # 重建歌曲名到歌曲id的映射字典 song_name_id_dic = {} for song_id in song_id_name_dic: song_name_id_dic[song_id_name_dic[song_id]] = song_id print("加载歌曲名到歌曲id的映射字典完成......") user_inner_id = 4 user_rating = trainset.ur[user_inner_id] items = map(lambda x: x[0], user_rating) for song in items: print(algo.predict(user_inner_id, song_id, r_ui=1), song_id_name_dic[algo.trainset]) from collections import defaultdict from surprise import SVD from surprise import Dataset def get_top_n(predictions, n=10): top_n = defaultdict(list) for uid, iid, true_r, est, _ in predictions: top_n[uid].append(iid, est) for uid, user_ratings in top_n.items(): user_ratings.sort(key=lambda x: x[1], reverse=True) top_n[uid] = user_ratings[:n] return top_n
def run_knn_baseline(sparse_data):
    """Train KNNBaseline on a predefined train/test split and write
    per-user top-10 recommendation metrics (precision, recall, F, hit,
    nDCG) to a result file.

    Args:
        sparse_data: raw sparse ratings passed to preprocess().
    """
    #filename = "test.json"
    prefix = "knn_baseline_"
    trainFile = prefix + "train.txt"
    testFile = prefix + "test.txt"
    raw_data, userPurchasedSet, userTrueTestSet = preprocess(
        sparse_data, trainFile, testFile)

    folds_files = [(trainFile, testFile)]
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()

    bsl_options = {
        'method': 'sgd',
        'n_epochs': 20,
        'learning_rate': 0.005,
    }
    ### sim name: cosine msd pearson pearson_baseline
    ### user_based : True  ---- similarity will be computed based on users
    ###            : False ---- similarity will be computed based on items.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}

    predictions = {}
    top_n = {}
    testsSet = None
    total_precisions = 0.0
    total_recalls = 0.0
    total_hit = 0.0
    total_nDCG = 0.0
    total_ffeature = 0.0

    result_file = prefix + "result.txt"
    # `with` guarantees the result file is closed (the original leaked it
    # on any exception).
    with open(result_file, "w") as result_f:
        for trainset, testset in pkf.split(data):
            testsSet = testset
            #algo = SVD(n_factors = 5)
            algo = KNNBaseline(bsl_options=bsl_options, sim_options=sim_options)
            algo.fit(trainset)
            pre = algo.test(testset)
            accuracy.rmse(pre)
            accuracy.mae(pre)
            #calculate_rmse(predictions)

            ### test
            rowNum = raw_data.get_row_size()
            colNum = raw_data.get_col_size()
            cur_time = time.time()
            time_cost = 0
            for i in range(rowNum):
                user = raw_data.get_userID(i)
                predictions[user] = set()
                # Min-heap of (estimate, item) keeping the 10 best items.
                pq = []
                for j in range(colNum):
                    item = raw_data.get_itemID(j)
                    # Only score items the user has not purchased yet.
                    if user not in userPurchasedSet or item in userPurchasedSet[
                            user]:
                        continue
                    predict = algo.predict(user, item, r_ui=0, verbose=False)[3]
                    # BUG FIX: the original popped the current minimum
                    # *before* comparing, so a worse new item could evict a
                    # better one. heappushpop keeps the 10 largest.
                    if len(pq) < 10:
                        heapq.heappush(pq, (predict, item))
                    else:
                        heapq.heappushpop(pq, (predict, item))
                top_n[user] = set()
                for items in pq:
                    top_n[user].add(items[1])

                if user in userTrueTestSet:
                    curPrecisions = calculate_precision(top_n[user],
                                                        userTrueTestSet[user])
                    curRecalls = calculate_recall(top_n[user],
                                                  userTrueTestSet[user])
                    ffeature = calculate_f_feature(curPrecisions, curRecalls)
                    curHit = isHit(top_n[user], userTrueTestSet[user])
                    cur_nDCG = calculate_NDCG(top_n[user], userTrueTestSet[user])
                    total_precisions += curPrecisions
                    total_recalls += curRecalls
                    total_hit += curHit
                    total_nDCG += cur_nDCG
                    total_ffeature += ffeature
                    result_f.write(user + "\t" + str(curPrecisions) + "\t" +
                                   str(curRecalls) + "\t" + str(ffeature) +
                                   "\t" + str(curHit) + '\t' + str(cur_nDCG) +
                                   "\n")

                # Progress report every 1000 users.
                if i != 0 and i % 1000 == 0:
                    duration = (time.time() - cur_time) / 60
                    time_cost += duration
                    remaining_time = ((rowNum - i) / 1000) * duration
                    cur_time = time.time()
                    # Python-2 print statements converted to print() calls
                    # (they were syntax errors under Python 3).
                    print('i:', i, "/", rowNum, 'remaining time:',
                          remaining_time, 'min')
                    print('precicions', total_precisions, ' recalls',
                          total_recalls, ' hit', total_hit, 'nDCG:',
                          total_nDCG)

        rowNum = raw_data.get_row_size()
        print('avg_precisions:', total_precisions / rowNum, 'avg_recalls:',
              total_recalls / rowNum, 'avg_ffeature',
              str(total_ffeature / rowNum), 'avg_hit:', total_hit / rowNum,
              'avg_nDCG:', total_nDCG / rowNum)
        result_f.write("avg:\t" + str(total_precisions / rowNum) + "\t" +
                       str(total_recalls / rowNum) + "\t" +
                       str(total_ffeature / rowNum) + "\t" +
                       str(total_hit / rowNum) + '\t' +
                       str(total_nDCG / rowNum) + "\n")
class Recommender:
    """Movie recommender backed by a user-based KNNBaseline model."""

    def __init__(self):
        self.__load_rating_data_set()
        self.__load_movies_set()
        self.train_model()

    def train_model(self):
        """(Re)build the trainset and fit the KNN model."""
        self.__load_training_set()
        # Using KNN with user-based pearson-baseline similarity.
        sim_options = {'name': 'pearson_baseline', 'user_based': True}
        self.algo = KNNBaseline(k=25, sim_options=sim_options)
        # Training the model.
        self.algo.fit(self.training_set)

    def __get_not_rated_movies(self, user_id):
        """Return ids of movies this user has not rated yet."""
        m_ratings = self.ratings_set.loc[self.ratings_set.user_id == user_id]
        rated_movies = set(m_ratings.item_id)
        # BUG FIX: the original kept a None placeholder for every rated
        # movie, and those None "ids" were then fed to algo.predict().
        return [mid for mid in self.ratings_set.item_id.unique()
                if mid not in rated_movies]

    def __filter_by_movies_saved_in_kitso(self, preditions):
        """Keep only (movie_id, rating) predictions for movies known to kitso.

        BUG FIX: the original returned bare movie ids, after which the
        caller sorted them with itemgetter(1) as if they were tuples.
        """
        kitso_movies_ids = set(self.movies_set.id)
        return [pred for pred in preditions if pred[0] in kitso_movies_ids]

    def __search_in_list_of_tuples(self, elem, list_tuples):
        """Return elem if it appears inside any tuple of the list, else False."""
        tuples_with_elem = list(filter(lambda tup: elem in tup, list_tuples))
        return elem if len(tuples_with_elem) > 0 else False

    def __predict_rating(self, user_id, movies_ids):
        """Predict a rating for each movie id, deduplicating movie ids."""
        predicted_rating = []
        for mid in movies_ids:
            predition = self.algo.predict(user_id, mid)
            predition_tuple = (mid, float(predition.est))
            if not self.__search_in_list_of_tuples(mid, predicted_rating):
                predicted_rating.append(predition_tuple)
        return predicted_rating

    def get_top_n_recommended_movies(self, user_id, n=5):
        """Return the top-n recommended movies for a user as JSON records."""
        not_rated = self.__get_not_rated_movies(user_id)
        rating_preditions = self.__predict_rating(user_id, not_rated)
        rating_preditions = self.__filter_by_movies_saved_in_kitso(
            rating_preditions)
        sorted_preditions = sorted(rating_preditions,
                                   key=itemgetter(1),
                                   reverse=True)[:n]
        # BUG FIX: isin() must receive movie ids; the original passed the
        # (id, rating) tuples, so the mask matched nothing.
        top_ids = [mid for mid, _ in sorted_preditions]
        response = self.movies_set[self.movies_set.id.isin(top_ids)]
        return jsonify(response.to_dict('records'))

    def __load_training_set(self):
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(
            self.ratings_set[['user_id', 'item_id', 'rating']], reader)
        self.training_set = data.build_full_trainset()

    def __load_movies_set(self):
        self.movies_set = pd.read_csv(FILE_PATH_MOVIES,
                                      delimiter=';',
                                      encoding='latin-1')

    def __load_rating_data_set(self):
        self.ratings_set = pd.read_csv(FILE_PATH_RATINGS, delimiter=';')