def ComputeCollaborativeFiltering_Item_Item(recipe_df, train_rating_df, pd,
                                            benchmark, knnmeans=False):
    print("\n###### Compute CollaborativeFiltering_Item_Item ######")
    df = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'recipe_id', 'rating']], reader)
    trainSet, testSet = train_test_split(data, test_size=.2, random_state=0)

    # Compute similarities between items rather than users.
    sim_options = {'name': 'cosine', 'user_based': False}
    if knnmeans:
        algo = KNNWithMeans(sim_options=sim_options, verbose=False)
    else:
        algo = KNNBasic(sim_options=sim_options, verbose=False)
    algo.fit(trainSet)
    predictions = algo.test(testSet)
    Evaluators.RunAllEvals(predictions, benchmark)
def user_collaborative_filtering(trainset, testset):
    # Set user_based to True/False to switch between user-based and
    # item-based collaborative filtering.
    algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline',
                                           'user_based': True})
    algo.fit(trainset)

    # We can now query for specific predictions.
    uid = str(196)  # raw user id
    iid = str(302)  # raw item id
    # Get a prediction for a specific user and item.
    pred = algo.predict(uid, iid, r_ui=4, verbose=True)

    # Run the trained model against the test set.
    test_pred = algo.test(testset)
    # Report RMSE.
    print("User-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)
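# A minimal usage sketch for the function above (not part of the original
# snippet): it assumes the built-in MovieLens-100k dataset purely for
# illustration, using Surprise's own train_test_split.
from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2, random_state=0)
user_collaborative_filtering(trainset, testset)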
class KNNMean:
    def __init__(self, data, rating_scale, k=50, min_k=1, sim_options=None):
        self.data = data
        self.rating_scale = rating_scale
        self.k = k
        self.min_k = min_k
        self.reader = Reader(rating_scale=self.rating_scale)
        if not sim_options:
            # Compute similarities between items by default.
            sim_options = {"name": "cosine", "min_support": 3,
                           "user_based": False}
        self.model_data = Dataset.load_from_df(
            data.loc[:, ["userId", "movieId", "rating"]], self.reader)
        self.trainset = self.model_data.build_full_trainset()
        self.model = KNNWithMeans(k=self.k, min_k=self.min_k,
                                  sim_options=sim_options)
        print('fitting KNNWithMeans model...')
        self.model.fit(self.trainset)
        self.grid_search_ = None

    def set_model_params(self, model_params):
        print('updating model parameters...')
        # model_params is a dict of keyword arguments, so it must be unpacked.
        self.model = KNNWithMeans(**model_params)
        print('fitting KNNWithMeans model...')
        self.model.fit(self.trainset)

    def update_grid_search(self, gs):
        self.grid_search_ = gs

    def fit(self, data):
        self.data = data
        self.model_data = Dataset.load_from_df(
            data.loc[:, ["userId", "movieId", "rating"]], self.reader)
        self.trainset = self.model_data.build_full_trainset()
        self.model.fit(self.trainset)

    def grid_search(self):
        print('grid search...')
        sim_options = {"name": ["msd", "cosine"], "min_support": [3, 4],
                       "user_based": [False]}
        param_grid = {"sim_options": sim_options, "k": [50, 100, 200],
                      "min_k": [1]}
        gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"],
                          cv=3)
        gs.fit(self.model_data)
        best_params, best_score = gs.best_params["rmse"], gs.best_score["rmse"]
        print(f'Best score (RMSE): {best_score}')
        print(f'Best params (RMSE): {best_params}')
        print(f'Best score (MAE): {gs.best_score["mae"]}')
        print(f'Best params (MAE): {gs.best_params["mae"]}')
        self.set_model_params(best_params)
        return best_params

    def predict(self, test_data):
        ratings = test_data.apply(
            lambda x: self.model.predict(x['userId'], x['movieId']).est,
            axis=1)
        return ratings
evaluator.AddAlgorithm(UserKNN1, "User KNNBasic")

# Item-based KNN
ItemKNN1 = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN1, "Item KNNBasic")

###############
###### KNNWithZScore
# User-based KNN
UserKNN2 = KNNWithZScore(sim_options={'name': 'cosine', 'user_based': True})
evaluator.AddAlgorithm(UserKNN2, "User KNNWithZScore")
# Item-based KNN
ItemKNN2 = KNNWithZScore(sim_options={'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN2, "Item KNNWithZScore")

###############
###### KNNWithMeans
# User-based KNN
UserKNN3 = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True})
evaluator.AddAlgorithm(UserKNN3, "User KNNWithMeans")
# Item-based KNN
ItemKNN3 = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN3, "Item KNNWithMeans")

###############
###### KNNBaseline
# User-based KNN
UserKNN4 = KNNBaseline(sim_options={'name': 'cosine', 'user_based': True})
evaluator.AddAlgorithm(UserKNN4, "User KNNBaseline")
# Item-based KNN
ItemKNN4 = KNNBaseline(sim_options={'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN4, "Item KNNBaseline")

###############
# Just make random recommendations
from surprise import accuracy
from collections import defaultdict
import pprint

# Read the data.
path = './movielens_sample.txt'
df = pd.read_csv(path, usecols=[0, 1, 2], skiprows=1)
df.columns = ['user', 'item', 'rating']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df, reader=reader)
trainset = data.build_full_trainset()

# ItemCF scoring: when computing from the most similar items, only the
# k most similar are used. Note that verbose is a constructor argument,
# not a sim_options key.
kf = KFold(n_splits=5)
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    print(rmse, rmse * rmse)

predictions = []
for row in df.itertuples():
    user, item = getattr(row, 'user'), getattr(row, 'item')
    predictions.append([user, item, algo.predict(user, item).est])

print("*" * 100)
print("user\titem\tpredict\n")
pprint.pprint(predictions)
class KNNWithMeansRecommender(SurpriseRecommender):
    """Generates recommendations via KNNWithMeans, see
    https://surprise.readthedocs.io/en/stable/knn_inspired.html
    """
    algo = KNNWithMeans()
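# A hedged sketch of how such a subclass might expose an item-based variant.
# The SurpriseRecommender base class is project-specific and its interface
# is not shown here, so treat this as an assumption, not the project's API;
# only the KNNWithMeans sim_options are standard Surprise.
class ItemKNNWithMeansRecommender(SurpriseRecommender):
    """Item-based variant: cosine similarities between items."""
    algo = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': False})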
# The flag must not be named `cross_validate`, or it would shadow the
# surprise function of the same name called below.
use_cross_validation = False

list_reviews = read_datafile(data_file)
df = pd.DataFrame(list_reviews, columns=['UserId', 'ItemId', 'Playtime'])
# filter_dataset(df)
# normalize_playtime(df)
reader = Reader(rating_scale=(0, max(df.Playtime)))
sim_options = {
    "name": "cosine",
    "user_based": False,  # compute similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

if use_cross_validation:
    data = Dataset.load_from_df(df, reader)
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
else:
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_data = Dataset.load_from_df(train_df, reader)
    training_set = train_data.build_full_trainset()
    algo.fit(training_set)
    for index, row in test_df.iterrows():
        user = row['UserId']
        item = row['ItemId']
        playtime = row['Playtime']
algo_svd.fit(trainset)
predictions = algo_svd.test(trainset.build_anti_testset())
predictions_svd = algo_svd.test(testset)
pred_svd = pd.DataFrame(predictions_svd)
r.loc[(r['user_id'] == 27523) & (r['book_id'] == 2203)]

SVD().fit(trainset)
SVDpp().fit(trainset)
KNNBasic(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
KNNWithZScore(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
KNNBasic(sim_options={'name': 'cosine', 'user_based': False}).fit(trainset)
KNNWithMeans(sim_options={'name': 'cosine', 'user_based': False}).fit(trainset)
KNNWithZScore(sim_options={'name': 'cosine',
                           'user_based': False}).fit(trainset)
SlopeOne().fit(trainset)
BaselineOnly().fit(trainset)
NormalPredictor().fit(trainset)
SVD().fit(trainset)
SVDpp().fit(trainset)
KNNBasic(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
def selfmade_approach():
    # Import the reduced dataset.
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    df = df[['user_key', 'game_key', 'rating']]

    # Drop duplicates.
    df = df.drop_duplicates(subset=['game_key', 'user_key'])

    ### Modelling part with Surprise:
    # Get the data into a format surprise can work with.
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build a trainset from the whole dataset.
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    # Parameters:
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 10
    min_k = 5
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    # Run fit:
    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    # 1st approach: calculate for a single user contained in the dataset.
    target_user_key = 286189
    target_user_info = df[df['user_key'] == target_user_key]

    # Estimate a single game:
    target_game_key = 100098

    # Data structures:
    #   sim_matrix = ndarray(312, 312)
    #   xr = defaultdict: 312
    #   yr = defaultdict: 8787
    # Later on, replace these by self-written structures.
    xr = algo.xr
    yr = algo.yr
    sim_matrix = algo.sim
    item_means = algo.means

    inner_target_uid = algo.trainset.to_inner_uid(target_user_key)
    inner_target_iid = algo.trainset.to_inner_iid(target_game_key)

    # Switch: uid and iid.
    x = inner_target_uid
    y = inner_target_iid

    # pred2:
    inner_2_raw_item_ids = algo.trainset._raw2inner_id_items
    # Swap keys and values.
    inner_2_raw_item_ids = dict(
        (v, k) for k, v in inner_2_raw_item_ids.items())

    # Similarity matrix with raw ids instead of inner surprise ids.
    sim_matrix_df = pd.DataFrame(sim_matrix)
    sim_matrix_df = sim_matrix_df.rename(
        columns=lambda x: inner_2_raw_item_ids[x])
    sim_matrix_df = sim_matrix_df.rename(
        index=lambda x: inner_2_raw_item_ids[x])

    target_user_ratings = yr[x]

    # Convert from inner to raw ids.
    target_user_ratings2 = []
    for (inner_iid, rating) in target_user_ratings:
        target_user_ratings2.append((inner_2_raw_item_ids[inner_iid], rating))

    # Convert item means from inner to raw ids.
    item_means2 = {}
    for i, mean in enumerate(item_means):
        item_means2[inner_2_raw_item_ids[i]] = mean

    myKNN = MyKnnWithMeans(sim_matrix=sim_matrix_df,
                           target_user_ratings=target_user_ratings2,
                           item_means=item_means2,
                           k=k,
                           min_k=min_k)
    pred = myKNN.predict_single_game(user_key=target_user_key,
                                     game_key=target_game_key)
    # Note: predict() expects raw ids and converts them internally, so pass
    # the raw keys rather than the inner ids.
    pred_surprise = algo.predict(uid=target_user_key, iid=target_game_key)

    estimate = pred
    print("Estimate for user %s for game %s is %s" %
          (target_user_key, target_game_key, estimate))

    # Estimate for a user not contained in the dataset:
    target_user_key = 123456789
    target_game_key = 100098
    user_ratings = [
        (100284, 7),
        (100311, 8),
        (105154, 2),
        (100020, 4),
        (100001, 9),
        (100277, 7),
    ]
    myKNN2 = MyKnnWithMeans(sim_matrix_df, user_ratings, item_means2, k, min_k)
    prediction = myKNN2.predict_single_game(target_user_key, target_game_key)

    # Export the similarity matrix:
    sim_matrix_df.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise.csv')

    # Export the item means:
    export_path = '../Data/Recommender/item-means.json'
    with open(export_path, 'w') as fp:
        json.dump(item_means2, fp, sort_keys=False, indent=4)

    test = sim_matrix_df.loc[100516, 100284]
    predictions = algo.test(testset)
    Prec, Reca = metrics(predictions, t)
    pr = pr + Prec
    re = re + Reca
    return pr / 10.0, re / 10.0


if __name__ == '__main__':
    data = retrieve_data()
    G_max = ret_mod_user_dict(data)
    algo_NMF = NMF(NMF_no_of_LF, verbose=False)
    algo_SVD = SVD(n_factors=MF_no_of_LF)
    algo_KNN = KNNWithMeans(k=KNN_no_of_LF, sim_options=sim_options,
                            verbose=False)

    # Q36
    Pr1 = []
    Re1 = []
    t = list(range(1, 26))
    for l in t:
        Precision, Recall = cross_val_(data, G_max, l, algo_KNN)
        Pr1.append(Precision)
        Re1.append(Recall)
    plotgraphs(t, Pr1, "Number of Suggestions", "Precision",
               "Precision Curve for KNN")
    plotgraphs(t, Re1, "Number of Suggestions", "Recall",
               "Recall Curve for KNN")
def browse(uid):
    # Step 1: data import and prep.
    # Establish the connection.
    cnx = sql.connect(user='******', password='******', host='localhost',
                      database='moviesurprise')
    cursor = cnx.cursor()

    # Execute the queries.
    query = ("SELECT User_ID, Movie_ID, rating FROM ratings")
    query_sur = ("SELECT User_ID, Movie_ID, surpriseRating FROM ratings")
    query_baseline = ("SELECT Movie_ID from movies where Movie_ID < 6")
    query_baseline_2 = (
        "SELECT * from movies where Movie_ID > 6 AND Movie_ID < 12")
    query_baseline_3 = (
        "SELECT * from movies where Movie_ID > 12 AND Movie_ID < 18")

    cursor.execute(query)
    # Convert the cursor data to lists.
    l = list(cursor)
    cursor.execute(query_sur)
    l_sur = list(cursor)
    cursor.execute(query_baseline)
    l_baseline = cursor.fetchall()
    cursor.execute(query_baseline_2)
    l_baseline_2 = cursor.fetchall()
    cursor.execute(query_baseline_3)
    l_baseline_3 = cursor.fetchall()

    with open('baseline_recs_pickle.pkl', 'wb') as pickle_file:
        pickle.dump(l_baseline, pickle_file)
    with open('baseline_recs2_pickle.pkl', 'wb') as pickle_file:
        pickle.dump(l_baseline_2, pickle_file)
    with open('baseline_recs3_pickle.pkl', 'wb') as pickle_file:
        pickle.dump(l_baseline_3, pickle_file)

    # Close the connection.
    cursor.close()
    cnx.close()

    df = pd.DataFrame(l, columns=["User_ID", "Movie_ID", "rating"])
    df_sur = pd.DataFrame(l_sur,
                          columns=["User_ID", "Movie_ID", "surpriseRating"])
    reader = Reader(rating_scale=(1.0, 5.0))
    reader_sur = Reader(rating_scale=(-2.0, 2.0))
    data = Dataset.load_from_df(df, reader=reader)
    data_sur = Dataset.load_from_df(df_sur, reader_sur)
    trainsetfull = data.build_full_trainset()
    trainsetfull_sur = data_sur.build_full_trainset()
    # print("Number of users: ", trainsetfull.n_users, '\n')
    # print("Number of items: ", trainsetfull.n_items, '\n')

    # Step 2: cross-validate.
    my_k = 300
    my_min_k = 5
    my_sim_options = {'name': 'pearson', 'user_based': False}
    algo = KNNWithMeans(k=my_k, min_k=my_min_k, sim_options=my_sim_options,
                        verbose=False)
    algo_sur = KNNWithMeans(k=my_k, min_k=my_min_k,
                            sim_options=my_sim_options, verbose=False)
    # results = cross_validate(algo=algo, data=data, measures=['RMSE'], cv=5,
    #                          return_train_measures=True)
    # print(results['test_rmse'].mean())

    # Step 3: fit the models.
    algo.fit(trainsetfull)
    algo_sur.fit(trainsetfull_sur)

    # Step 4: prediction.
    m_ids = get_movie_ids()
    # Dicts where the key is the movie id and the value is the predicted
    # rating.
    d_med = dict()
    d_best = dict()
    d_sur = dict()
    for m_id in m_ids:
        get_med_recs(algo, uid, m_id, d_med)
        get_best_recs(algo, uid, m_id, d_best)
        get_sur_recs(algo, uid, m_id, d_sur)

    # Convert each dict to a sorted 2d list (ascending by rating estimate)
    # where sort[x][0] is the predicted rating and sort[x][1] is the movie id.
    sort = sorted(d_best.items(), key=lambda x: x[1])
    top_recs = sort
    with open('top_recs_pickle' + str(uid) + '.pkl', 'wb') as pickle_file:
        pickle.dump(top_recs, pickle_file)
    # Returns the top 5 movies.
    # print("Movies you will definitely like: ")
    # for item in top_recs:
    #     print(item)
    # print("")

    sort = sorted(d_med.items(), key=lambda x: x[1])
    med_recs = sort
    with open('med_recs_pickle' + str(uid) + '.pkl', 'wb') as pickle_file:
        pickle.dump(med_recs, pickle_file)
    # print("Movies you might like: ")
    # for item in med_recs:
    #     print(item)
    # print("")

    # Step 5: repeat steps 3 and 4 for the surprise data.
    sort = sorted(d_sur.items(), key=lambda x: x[1])
    sur_recs = sort
    with open('sur_recs_pickle' + str(uid) + '.pkl', 'wb') as pickle_file:
        pickle.dump(sur_recs, pickle_file)
    # print("Movies you might be surprised to like: ")
    # for item in sur_recs:
    #     print(item)

    # Step 6: grab the movieId info.
    definintely_like = [[0 for x in range(5)]
                        for y in range(5)]  # change range when DB has links
    somewhat_like = [[0 for x in range(5)]
                     for y in range(5)]  # change range when DB has links
    surprisingly_like = [[0 for x in range(5)]
                         for y in range(5)]  # change range when DB has links
    with open('definintely_like_pickle' + str(uid) + '.pkl',
              'wb') as pickle_file:
        pickle.dump(definintely_like, pickle_file)
    with open('somewhat_like_pickle' + str(uid) + '.pkl',
              'wb') as pickle_file:
        pickle.dump(somewhat_like, pickle_file)
    with open('surprisingly_like_pickle' + str(uid) + '.pkl',
              'wb') as pickle_file:
        pickle.dump(surprisingly_like, pickle_file)

    for i in range(len(definintely_like)):
        try:
            movie = random.randint(0, len(top_recs) - 1)
            definintely_like[i][0] = top_recs[movie][0]  # id
            definintely_like[i][1] = top_recs[movie][1]  # est
            definintely_like[i][2] = id_to_title(top_recs[movie][0])  # title
            definintely_like[i][3] = id_to_avg_rating(
                top_recs[movie][0])  # avg rating
            definintely_like[i][4] = id_to_posterlink(
                top_recs[movie][0])  # poster link
        except ValueError:
            print("Unable to pull recs for this user. Using baseline recs.")
            definintely_like[i][0] = l_baseline[i][0]  # id
            definintely_like[i][1] = l_baseline[i][0]  # est
            definintely_like[i][2] = id_to_title(l_baseline[i][0])  # title
            definintely_like[i][3] = id_to_avg_rating(
                l_baseline[i][0])  # avg rating
            definintely_like[i][4] = id_to_posterlink(
                l_baseline[i][0])  # poster link

    for i in range(len(somewhat_like)):
        try:
            movie = random.randint(0, len(med_recs) - 1)
            somewhat_like[i][0] = med_recs[movie][0]  # id
            somewhat_like[i][1] = med_recs[movie][1]  # est
            somewhat_like[i][2] = id_to_title(med_recs[movie][0])  # title
            somewhat_like[i][3] = id_to_avg_rating(
                med_recs[movie][0])  # avg rating
            somewhat_like[i][4] = id_to_posterlink(
                med_recs[movie][0])  # poster link
        except ValueError:
            # print("Unable to pull recs for this user. Using baseline recs.")
            somewhat_like[i][0] = l_baseline_2[i][0]  # id
            somewhat_like[i][1] = l_baseline_2[i][0]  # est
            somewhat_like[i][2] = id_to_title(l_baseline_2[i][0])  # title
            somewhat_like[i][3] = id_to_avg_rating(
                l_baseline_2[i][0])  # avg rating
            somewhat_like[i][4] = id_to_posterlink(
                l_baseline_2[i][0])  # poster link

    for i in range(len(surprisingly_like)):
        try:
            movie = random.randint(0, len(sur_recs) - 1)
            surprisingly_like[i][0] = sur_recs[movie][0]  # id
            surprisingly_like[i][1] = sur_recs[movie][1]  # est
            surprisingly_like[i][2] = id_to_title(sur_recs[movie][0])  # title
            surprisingly_like[i][3] = id_to_avg_rating(
                sur_recs[movie][0])  # avg rating
            surprisingly_like[i][4] = id_to_posterlink(
                sur_recs[movie][0])  # poster link
        except ValueError:
            # print("Unable to pull recs for this user. Using baseline recs.")
            surprisingly_like[i][0] = l_baseline_3[i][0]  # id
            surprisingly_like[i][1] = l_baseline_3[i][0]  # est
            surprisingly_like[i][2] = id_to_title(l_baseline_3[i][0])  # title
            surprisingly_like[i][3] = id_to_avg_rating(
                l_baseline_3[i][0])  # avg rating
            surprisingly_like[i][4] = id_to_posterlink(
                l_baseline_3[i][0])  # poster link

    # print("Movies you will definitely like:\n")
    # for x in definintely_like:
    #     print(x)
    # print("Movies you might like:\n")
    # for x in somewhat_like:
    #     print(x)
    # print("Movies you may be surprised by:\n")
    # for x in surprisingly_like:
    #     print(x)

    # Each list has the following format:
    # id, estimated rating, title, average rating, link.
    print("Recommendation generation for user " + str(uid) + " complete\n")
    return definintely_like, somewhat_like, surprisingly_like
    movieID = info[1]
    # Keys are stored as int(userID), so the membership test must use the
    # same type.
    if int(userID) in movies_watched:
        movies_watched[int(userID)].append(int(movieID))
    else:
        movies_watched[int(userID)] = [int(movieID)]

indications = dict()
moviesIds = set(movies.keys())
for i in movies_watched:
    moviesUserWatched = set(movies_watched[i])
    indications[i] = moviesIds.difference(moviesUserWatched)

data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.2)
algo = KNNWithMeans(k=4, sim_options={'name': 'cosine', 'user_based': True})
algo.fit(trainset)
predictions = algo.test(testset)
rmse_knn = accuracy.rmse(predictions, verbose=False)


def top5Movies(userId):
    indicationsByRating = dict()
    print(userId)
    indicationsByUser = list(indications[userId])
    for i in indicationsByUser:
        indicationsByRating[i] = algo.predict(uid=str(userId), iid=str(i)).est
    indicationsByRating = sorted(indicationsByRating.items(),
                                 key=lambda x: x[1],
                                 reverse=True)
anime_info = pd.read_csv(anime_info_path, sep="\t")
# print(anime_info.head())
anime_ratings = pd.read_csv(anime_ratings_path, sep='\t')

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(
    anime_ratings[['User_ID', "Anime_ID", "Feedback"]], reader)

# Use item-based cosine similarity.
sim_options = {
    "name": "cosine",
    "user_based": False,  # compute similarities between items
}
model = KNNWithMeans(sim_options=sim_options)
trainingSet = data.build_full_trainset()
model.fit(trainingSet)
# print("ok")


def top_5():
    top_movies = anime_info.sort_values(by='rating', ascending=False)
    return top_movies[:5]


def top_5_recommendations(uid):
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

# In[ ]:

trainset, testset = train_test_split(data, test_size=.15)

# In[ ]:

algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline',
                                       'user_based': True})
algo.fit(trainset)

# In[ ]:

test_pred = algo.test(testset)

# In[ ]:

accuracy.rmse(test_pred, verbose=True)
from surprise import KNNWithMeans
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
import os

# Path of the data file.
file_path = os.path.expanduser('mydata.csv')
# Tell the reader what the text format looks like.
reader = Reader(line_format='user item rating', sep=',')
# Load the data.
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

# Set user_based to True/False to switch between user-based and item-based
# collaborative filtering; only the k most similar neighbors are used.
algo = KNNWithMeans(k=50, sim_options={'user_based': False})
algo.fit(trainset)

# We can now query for specific predictions.
uid = str(5)  # raw user id
iid = str(1)  # raw item id
# Get a prediction for a specific user and item.
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)

# ----------------------------
uid = str(5)  # raw user id
iid = str(5)  # raw item id
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)
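# A possible follow-up sketch (not in the original): listing the nearest
# neighbors of an item via get_neighbors, which works on inner ids, so the
# raw id must be converted first. Assumes item '1' appears in mydata.csv.
inner_iid = trainset.to_inner_iid(str(1))
neighbors = algo.get_neighbors(inner_iid, k=5)
print('5 nearest items of item-1:',
      [trainset.to_raw_iid(inner) for inner in neighbors])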
def benchmark_different_algorithms():
    # Import the reduced dataset.
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')

    # Check for duplicates.
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))

    # Drop duplicates.
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    ## Surprise:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    results = []
    algorithms = [
        'SVD\t\t\t\t\t\t', 'SlopeOne\t\t\t\t', 'CoClustering\t\t\t',
        'NMF\t\t\t\t\t\t', 'KNN_Basic Item-Item\t\t',
        'KNN_WithMeans Item-Item\t', 'KNN_WithZScore Item-Item',
        'KNN_Basic User-User\t\t', 'KNN_WithMeans User-User\t',
        'KNN_WithZScore User-User'
    ]

    # 1) SVD
    algo = SVD()
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    # 2) SlopeOne
    algo = SlopeOne()
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    # 3) CoClustering
    algo = CoClustering()
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    # 4) NMF
    algo = NMF()
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    ## K-Nearest Neighbors - Item-Item
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 40
    min_k = 5

    # 5) KNNBasic
    algo = KNNBasic(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    # 6) KNNWithMeans
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    # 7) KNNWithZScore
    algo = KNNWithZScore(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    ## K-Nearest Neighbors - User-User
    sim_option = {'name': 'cosine', 'user_based': True}
    k = 100
    min_k = 2

    # 8) KNNBasic
    algo = KNNBasic(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    # 9) KNNWithMeans
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    # 10) KNNWithZScore
    algo = KNNWithZScore(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo, data, measures=['RMSE'], cv=3,
                       return_train_measures=True, n_jobs=-3, verbose=True))

    for algorithm, result in zip(algorithms, results):
        print(algorithm + '\t \t RMSE Score: \t' +
              str(result['test_rmse'].mean()) + '\t\t Fit-Time: ' +
              str(result['fit_time']) + '\t\t Test-Time: ' +
              str(result['test_time']))
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_rmse = rmse
        best_pred = predictions

print("ok")
print(f"best RMSE {best_rmse}")

print("KNNWithMeans")
kf = KFold(n_splits=5)
sim_options = {'name': 'cosine'}
algo = KNNWithMeans(sim_options=sim_options)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    # Train and test the algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print the root mean squared error.
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_rmse = rmse
        best_pred = predictions
def create_similarity_matrix():
    start_time = time.time()

    # Import the reviews.
    import_path = '../Data/Joined/Results/Reviews_Reduced.csv'
    df = pd.read_csv(import_path)

    # Keep only the important columns.
    df = df[['game_key', 'user_key', 'rating']]

    # Create the surprise algorithm object.
    sim_option = {'name': 'pearson', 'user_based': False}
    algo = KNNWithMeans(sim_options=sim_option)

    # Get the data into a format surprise can work with.
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build a trainset from the whole dataset.
    trainset_full = data.build_full_trainset()
    print('Number of users: ', trainset_full.n_users, '\n')
    print('Number of items: ', trainset_full.n_items, '\n')

    # Fit the similarity matrix and calculate the item means.
    algo.fit(trainset_full)
    print("--- %s seconds ---" % (time.time() - start_time))

    # Save the similarity matrix and means from the algo object.
    sim_matrix = algo.sim
    item_means = algo.means

    # Convert the numpy array to a pandas DataFrame.
    sim_matrix = pd.DataFrame(sim_matrix)

    # Replace inner ids with raw ids.
    raw_2_inner_ids = trainset_full._raw2inner_id_items
    # Swap keys and values.
    inner_2_raw_item_ids = dict((v, k) for k, v in raw_2_inner_ids.items())

    # Replace the inner ids in the sim_matrix index and columns by game_keys.
    sim_matrix = sim_matrix.rename(index=inner_2_raw_item_ids)
    sim_matrix = sim_matrix.rename(columns=inner_2_raw_item_ids)

    # Export sim_matrix.
    sim_matrix.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise-Reduced_dataset.csv'
    )

    # Convert the item means from inner to raw ids.
    item_means_raw_ids = {}
    for i, mean in enumerate(item_means):
        item_means_raw_ids[inner_2_raw_item_ids[i]] = mean

    # Export the item means.
    export_path = '../Data/Recommender/item-means-Reduced_dataset.json'
    with open(export_path, 'w') as fp:
        json.dump(item_means_raw_ids, fp, sort_keys=False, indent=4)

    ## Create the sim matrix in long format:
    # Get the index as a column.
    column_names = list(sim_matrix.columns.values)
    sim_matrix.reset_index(level=0, inplace=True)

    # Convert the df from wide to long.
    sim_matrix_long = pd.melt(sim_matrix, id_vars='index',
                              value_vars=column_names, var_name='game_key_2')
    # rename returns a new DataFrame, so assign the result back.
    sim_matrix_long = sim_matrix_long.rename(columns={'index': 'game_key'})

    # Export the long sim matrix.
    sim_matrix_long.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise-Reduced_dataset-LONG_FORMAT.csv'
    )
    print("--- %s seconds ---" % (time.time() - start_time))
    print('function end reached')
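# Sketch (not from the original source): reloading the artifacts exported
# above, using only pandas/json. The paths mirror the export paths used in
# create_similarity_matrix.
import json

import pandas as pd

sim_matrix = pd.read_csv(
    '../Data/Recommender/item-item-sim-matrix-surprise-Reduced_dataset.csv',
    index_col=0)
# CSV round-trips turn the game_key column labels into strings.
sim_matrix.columns = sim_matrix.columns.astype(int)
with open('../Data/Recommender/item-means-Reduced_dataset.json') as fp:
    item_means = json.load(fp)  # JSON keys also come back as strings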
from surprise import KNNWithMeans
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import KFold
import time

startTime = time.time()

# Read the data.
reader = Reader(line_format='user item rating timestamp', sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
trainset = data.build_full_trainset()

# ItemCF scoring: only the k most similar neighbors are used. Note that
# verbose is a constructor argument, not a sim_options key.
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)

# Define a K-fold cross-validation iterator with K=3.
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # Train and predict.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute RMSE.
    accuracy.rmse(predictions, verbose=True)
    # Compute MAE.
    accuracy.mae(predictions, verbose=True)

# algo.fit(trainset)
uid = str(196)
iid = str(302)
# Gridsearch KNNBasic
param_grid = {'k': [22, 24, 26, 28, 30]}
print(surprise_gridsearch(param_grid, KNNBasic, data))

# Cross-Validate KNNBasic
sim_options = {'name': 'MSD', 'user_based': False}
algo = KNNBasic(k=26, sim_options=sim_options)
surprise_cross_validate(algo, data, sim_options)

# Gridsearch KNNWithMeans
param_grid = {'k': [37, 38, 39, 40, 41, 42, 43]}
print(surprise_gridsearch(param_grid, KNNWithMeans, data))

# Cross-Validate KNNWithMeans
sim_options = {'name': 'MSD', 'user_based': False}
algo = KNNWithMeans(k=42, sim_options=sim_options)
surprise_cross_validate(algo, data, sim_options)

# Gridsearch KNNBaseline
param_grid = {'k': [18, 19, 20, 21, 22]}
print(surprise_gridsearch(param_grid, KNNBaseline, data))

# Cross-Validate KNNBaseline
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(k=19, sim_options=sim_options)
surprise_cross_validate(algo, data, sim_options)

# Predictions
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(k=19, sim_options=sim_options)
def make_alg_and_test(trainset, testset):
    """Create the algorithm and run it on the test dataset.

    Args:
        trainset, testset

    Tries several configs in sim_options:
        name: the similarity metric to use. Options are cosine, msd,
            pearson, or pearson_baseline. The default is msd.
        user_based: a boolean that tells whether the approach will be
            user-based or item-based. The default is True, which means
            the user-based approach will be used.
        min_support: the minimum number of common items needed between
            users to consider them for similarity. For the item-based
            approach, this corresponds to the minimum number of common
            users for two items.
    """
    cfg = []
    sim_options0 = {'name': 'pearson_baseline', 'user_based': False}
    cfg.append(sim_options0)
    # Item-based cosine similarity.
    sim_options1 = {
        "name": "cosine",
        "user_based": False,  # compute similarities between items
        "min_support": 3,
    }
    cfg.append(sim_options1)
    sim_options2 = {
        "name": "msd",
        "user_based": False,
    }
    cfg.append(sim_options2)
    sim_options3 = {
        "name": "cosine",
        "user_based": False,
        "min_support": 4,
    }
    cfg.append(sim_options3)
    sim_options4 = {
        "name": "msd",
        "user_based": False,
        "min_support": 5,
    }
    cfg.append(sim_options4)
    sim_options5 = {
        "name": "cosine",
        "user_based": False,
        "min_support": 5,
    }
    cfg.append(sim_options5)

    for index in range(len(cfg)):
        algo = KNNWithMeans(k=5, sim_options=cfg[index])
        algo.fit(trainset)
        # Run the trained model against the testset.
        test_pred = algo.test(testset)
        logging.info(test_pred[20])
        # Log the RMSE.
        logging.info(
            f"With index config : {index} , rmse on Test Set = "
            f"{accuracy.rmse(test_pred, verbose=True)}")
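# Instead of looping over hand-written configs, the same search can be
# expressed with Surprise's GridSearchCV. A sketch (not from the original):
# the nested sim_options grid is the documented Surprise usage; the cv and
# measure choices here are assumptions.
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV

param_grid = {
    'k': [5, 20, 40],
    'sim_options': {
        'name': ['msd', 'cosine', 'pearson_baseline'],
        'user_based': [False],
        'min_support': [3, 4, 5],
    },
}
gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=3)
# gs.fit(data)  # requires a surprise Dataset, not a Trainset
# print(gs.best_score['rmse'], gs.best_params['rmse'])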
'''
table = []
for klass in classes:
    start = time.time()
    if klass == 'SVD':
        algo = SVD()
    elif klass == 'SVDpp':
        algo = SVDpp()
    elif klass == 'NMF':
        algo = NMF()
    elif klass == 'SlopeOne':
        algo = SlopeOne()
    elif klass == 'KNNBasic':
        algo = KNNBasic()
    elif klass == 'KNNWithMeans':
        algo = KNNWithMeans()
    elif klass == 'KNNBaseline':
        algo = KNNBaseline()
    elif klass == 'CoClustering':
        algo = CoClustering()
    elif klass == 'BaselineOnly':
        algo = BaselineOnly()
    else:
        algo = NormalPredictor()
    # cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    # Precision and recall can then be averaged over all users.
    prec = sum(p for p in precisions.values()) / len(precisions)
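# precision_recall_at_k is not defined in this excerpt. A minimal sketch
# following the well-known pattern from the Surprise FAQ (an assumption,
# not necessarily the author's exact helper):
from collections import defaultdict


def precision_recall_at_k(predictions, k=5, threshold=4):
    # Map each user to a list of (estimated rating, true rating) pairs.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    precisions, recalls = dict(), dict()
    for uid, ratings in user_est_true.items():
        # Sort by estimated rating, best first.
        ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for (_, true_r) in ratings)
        n_rec_k = sum(est >= threshold for (est, _) in ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for (est, true_r) in ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls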
# Ratings
rcols = ['userId', 'movieId', 'rating']
ml_ratings_training = pd.read_csv('../data/final_py_data_training.csv',
                                  usecols=rcols)

# Convert to Surprise ratings.
reader = Reader(rating_scale=(0.5, 5))
surprise_training = Dataset.load_from_df(
    ml_ratings_training, reader=reader).build_full_trainset()

# Train the algorithm.
i_min_k = 5
i_max_k = 100
sim_options_item = {'name': 'pearson', 'user_based': False}
algo_item = KNNWithMeans(k=i_max_k, min_k=i_min_k,
                         sim_options=sim_options_item)
algo_item.fit(surprise_training)


class item_CF_model(ccobra.CCobraModel):
    def __init__(self, name='Item_CF'):
        super(item_CF_model, self).__init__(name, ["recommendation"],
                                            ["single-choice"])

    def predict(self, item, **kwargs):
        user_id = item.identifier
        movie_id = int(eval(item.task[0][0]))
        # Prediction form
        predict_form = [[user_id, movie_id, 1]]
    unrated_df = unrated_df.sort_values('Rating', ascending=False)
    unrated_df = unrated_df.head()
    rated_df = rated_df.sort_values('Rating', ascending=False)
    rated_df = rated_df.head()

    # Stores the top 5 movies predicted.
    finalu = []
    # Stores the top 5 movies watched already.
    finalr = []
    for i in range(0, 5):
        finalu.append(movies.iloc[int(unrated_df.iloc[i][0])][1])
        finalr.append(movies.iloc[int(rated_df.iloc[i][0])][1])

    table = {
        'Test User Id': userInput,
        ('Predicted movies', 'Movies'): finalu,
        ('Predicted movies', 'Ratings'): unrated_df['Rating'].tolist(),
        ('Movies seen in past', 'Movies'): finalr,
        ('Movies seen in past', 'Ratings'): rated_df['Rating'].tolist()
    }
    return table


# Load the dataset.
data = Dataset.load_builtin('ml-1m')
algo = KNNWithMeans(k=10)
# Run 5-fold cross-validation and print the results.
cross_validate(algo, data, measures=['MAE'], cv=5, verbose=True)
file_path = os.path.expanduser('./data/163_music_suprise_format.txt')
# Specify the file format.
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the data from the file.
music_data = Dataset.load_from_file(file_path, reader=reader)

# Compute song-to-song similarities.
print("Building the dataset...")
trainset = music_data.build_full_trainset()
# sim_options = {'name': 'pearson_baseline', 'user_based': False}  # find the nearest users
print("Training the model...")
# sim_options = {'user_based': False}
# algo = KNNBaseline(sim_options=sim_options)
algo = KNNWithMeans()
# algo.train() was removed in newer Surprise versions; fit() replaces it.
algo.fit(trainset)

current_playlist = list(name_id_dic.keys())[39]
print("Playlist name:", current_playlist)

# Retrieve the nearest neighbors.
# Map the name to an id.
playlist_id = name_id_dic[current_playlist]
print("Playlist id:", playlist_id)
# Get the corresponding inner user id => to_inner_uid.
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print("Inner id:", playlist_inner_id)

playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)
def compare_model_algorithms(data, Nrep=2, Nfolds=5):
    """
    Prints out model performances and run times for standard algorithms
    in Surprise.

    Input:
        data = surprise data object
        Nrep = number of iterations with different folds
        Nfolds = number of cross validation folds
    Output:
        performance_list = list of performance matrices with
            rows: Algorithm (SVD, SVDpp, NMF, SlopeOne, KNNBasic,
                  KNNWithMeans, KNNBaseline, CoClustering, BaselineOnly,
                  NormalPredictor); and
            cols: RMSE, MAE, time(min)
        performance = average over lists in performance_list
    """
    # Set the RNG.
    np.random.seed(0)
    random.seed(0)

    # Set the KNN algorithm options.
    user_opt_cos = {"name": "cosine", "user_based": True}
    item_opt_cos = {"name": "cosine", "user_based": False}

    # The algorithms to cross-validate.
    s_SVD = SVD()
    s_SVDpp = SVDpp()
    s_NMF = NMF()
    s_SlopeOne = SlopeOne()
    u_KNNBasic = KNNBasic(sim_options=user_opt_cos)
    u_KNNWithMeans = KNNWithMeans(sim_options=user_opt_cos)
    u_KNNBaseline = KNNBaseline(sim_options=user_opt_cos)
    i_KNNBasic = KNNBasic(sim_options=item_opt_cos)
    i_KNNWithMeans = KNNWithMeans(sim_options=item_opt_cos)
    i_KNNBaseline = KNNBaseline(sim_options=item_opt_cos)
    s_CoClustering = CoClustering()
    s_BaselineOnly = BaselineOnly()
    s_NormalPredictor = NormalPredictor()

    classes = [
        s_SVD, s_SVDpp, s_NMF, s_SlopeOne, u_KNNBasic, u_KNNWithMeans,
        u_KNNBaseline, i_KNNBasic, i_KNNWithMeans, i_KNNBaseline,
        s_CoClustering, s_BaselineOnly, s_NormalPredictor
    ]
    class_names = [
        "SVD", "SVDpp", "NMF", "SlopeOne", "user-KNNBasic",
        "user-KNNWithMeans", "user-KNNBaseline", "item-KNNBasic",
        "item-KNNWithMeans", "item-KNNBaseline", "CoClustering",
        "BaselineOnly", "NormalPredictor"
    ]

    # Repeat cross validation for different kfold splits for higher
    # reliability.
    performance_list = []
    headers = ['RMSE', 'MAE', 'Time (min)']
    for irep in range(0, Nrep):
        # The cross validation folds will be the same for all algorithms.
        kf = KFold(n_splits=Nfolds, random_state=0)

        # Cross validate each algorithm.
        table = np.zeros((len(classes), len(headers)))
        for ik, klass in enumerate(classes):
            start = time.time()
            out = cross_validate(klass, data, ['rmse', 'mae'], kf)
            cv_time = (time.time() - start) / 60
            mean_rmse = np.mean(out['test_rmse'])
            mean_mae = np.mean(out['test_mae'])
            table[ik, :] = np.array([mean_rmse, mean_mae, cv_time])

        # Accumulate the results of each cross-validation.
        performance_list.append(table)

    # Show the results averaged over the cross validation iterations.
    performance = sum(performance_list) / len(performance_list)
    print(
        tabulate(performance.tolist(), headers=headers,
                 showindex=class_names))
    return performance_list, performance
reader = Reader(rating_scale=(1, 5))

from surprise import KNNWithMeans
from surprise import KNNBasic
import heapq
from collections import defaultdict
from operator import itemgetter

# Use user-based cosine similarity.
sim_options = {
    "name": "cosine",
    "user_based": True,  # compute similarities between users
}
model = KNNWithMeans(sim_options=sim_options)

# profiledata = pd.read_csv("C:\\Users\\juyee\\Envs\\sih2020\\candidate_recommender\\Test Profiles\\profile_data.csv")
df = pd.read_csv(r"JVR_CandidatesInfo2.csv")
df = df.replace(np.nan, '', regex=True)
df = df.rename(columns={'Unnamed: 0': 'ind'})
# pdf_files = glob.glob("C:\\Users\\juyee\\Desktop\\Web scraping\\Test Profiles\\*.csv")

app = Flask(__name__)


def getCandidateName(candidateid):
    name = df[df["ind"] == candidateid]["Name"]
def make_prediction(test_data_imdb):
    train_data = pd.read_csv('../data/modeling/train/ratings_clean_std_0.csv',
                             sep=',').drop(columns={'Unnamed: 0'})
    omdb = pd.read_csv('../data/modeling/train/omdb_cleaned.csv')

    # Build a reader and define the rating scale (minimum and maximum value).
    reader = Reader(rating_scale=(0.5, 5))
    # Convert the data to surprise format.
    train_surprise = Dataset.load_from_df(train_data,
                                          reader).build_full_trainset()

    # Collaborative filtering models.
    knn_collaborative = KNNWithMeans(k=115, min_k=5,
                                     sim_options={'name': 'msd',
                                                  'user_based': False})
    knn_collaborative.fit(train_surprise)
    svd = SVD(lr_all=0.01, reg_all=0.05, n_epochs=23)
    svd.fit(train_surprise)

    preds = [[
        knn_collaborative.predict(test[1], test[3]).est
        for test in test_data_imdb.itertuples()
    ], [
        svd.predict(test[1], test[3]).est
        for test in test_data_imdb.itertuples()
    ]]

    # Content-based models.
    # Define the features for the content-based models.
    params_features = {
        'threshold_actors': 0,
        'ts_languages': 0,
        'year': True,
        'runtime': True,
        'imdbvotes': True,
        'series': False,
        'awards': False,
        'genres': True,
        'imdb_rating': True,
        'roto_rating': True,
        'pg_rating': True,
        'threshold_newkeywords': 0,
        'threshold_plots': 0,
        'threshold_directors': 0
    }
    # Load the features.
    features, names = preprocessing.features(**params_features)
    # Add imdbID and set it as the index.
    features = omdb[['imdbID']].join(
        pd.DataFrame(features)).set_index('imdbID')

    # Predict the ratings.
    pred_content = []
    no_of_ratings = []
    train_data = train_data[train_data['imdbID'] != 'tt0720339']
    for row in test_data_imdb.itertuples():
        # Select the user and the movie.
        imdbID = row.imdbID
        userID = row.user_id
        # Compute the predictions.
        if imdbID == 'tt0720339':
            # Exclude the outlier movie without information.
            pred_content.append(svd.predict(userID, imdbID).est)
        else:
            # Select the ratings of the user.
            ratings_user = train_data.loc[train_data['user_id'] == userID]
            ratings_user.reset_index(inplace=True, drop=True)
            # Select the features of the corresponding movies and convert
            # them to arrays.
            features_user = np.array(features.loc[ratings_user['imdbID']])
            features_movie = np.array(features.loc[imdbID])
            pred_content.append(
                predict_movie_rating(ratings_user, features_user,
                                     features_movie))
            # Store the number of predictions of the user.
            no_of_ratings.append(ratings_user.shape[0])

    # Combine the predictions of the models.
    predictions = weighted_prediction(preds[0], preds[1], pred_content,
                                      no_of_ratings)
    test_data_with_rating = test_data_imdb.join(predictions)
    return test_data_with_rating[['user_id', 'movieID', 'rating']]
reader = Reader(rating_scale=(0, 5))
data1 = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
data2 = Dataset.load_from_df(
    ratingsByUser[['timestamp', 'movieId', 'rating']], reader)
data3 = Dataset.load_from_df(
    ratingsByMovie[['timestamp', 'userId', 'rating']], reader)

PMF = SVD()
kval = 5
knn = KNNWithMeans(k=kval, min_k=kval, verbose=False)

cross_validate(PMF, data1, measures=['MAE', 'RMSE'], cv=5, verbose=True)
cross_validate(knn, data2, measures=['MAE', 'RMSE'], cv=3, verbose=True)
cross_validate(knn, data3, measures=['MAE', 'RMSE'], cv=5, verbose=True)

# knn1 = KNNWithMeans(k=2, min_k=1, verbose=False)
# knn2 = KNNWithMeans(k=5, min_k=5, verbose=False)
# knn3 = KNNWithMeans(k=9, min_k=9, verbose=False)
# knn4 = KNNWithMeans(k=15, min_k=15, verbose=False)
def collaborative_filtering_using_surprise():
    """
    https://towardsdatascience.com/how-to-build-a-memory-based-recommendation-system-using-python-surprise-55f3257b2cf4
    Predict games for the user with user_key = 93681.
    """
    target_user_key = 93681

    # Import the reduced dataset.
    df = import_reduced_reviews()

    # Check for duplicates.
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))

    # Drop duplicates.
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    # Check out our user.
    df_target_user = df[df['user_key'] == target_user_key]

    # Build the utility matrix:
    # data_pivot = df.pivot(index='user_key', columns='game_key', values='rating')
    # Calculate the sparsity:
    # sparsity = data_pivot.isnull().sum().sum() / data_pivot.size
    # print('Sparsity of utility matrix: ' + str(sparsity))

    ### Modelling part with Surprise:
    # Get the data into a format surprise can work with.
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Split into a trainset and a testset.
    trainset, testset = train_test_split(data, test_size=0.2)
    print('Number of users: ', trainset.n_users, '\n')
    print('Number of items: ', trainset.n_items, '\n')

    # When surprise creates a Trainset or Testset object, it takes the
    # raw_id's (the ones that you used in the file you imported) and converts
    # them to so-called inner_id's (basically a series of integers, starting
    # from 0). You might need to trace back to the original names. Using the
    # items as an example (you can do the same approach with users, just swap
    # iid's with uid's in the code), to get the list of inner_iids, you can
    # use the all_items method. To convert from raw to inner id you can use
    # the to_inner_iid method, and to_raw_iid to convert back. An example of
    # how to save a list of inner and raw item id's:
    trainset_iids = list(trainset.all_items())
    iid_converter = lambda x: trainset.to_raw_iid(x)
    trainset_raw_iids = list(map(iid_converter, trainset_iids))

    ## Model parameters of kNN:
    # Two hyperparameters we can tune:
    #   1. the k parameter
    #   2. the similarity options
    #      a) user-user vs. item-item
    #      b) similarity function (cosine, pearson, msd)
    sim_option = {'name': 'pearson', 'user_based': False}

    # 3 different KNN models: KNNBasic, KNNWithMeans, KNNWithZScore.
    k = 40
    min_k = 5
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
    algo.fit(trainset)

    ## Testing:
    predictions = algo.test(testset)
    accuracy.rmse(predictions)

    # Own similarity matrix:
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    algo.sim = sim_matrix_imported
    predictions = algo.test(testset)
    accuracy.rmse(predictions)

    # Cross validation:
    skip = True
    if not skip:
        results = cross_validate(algo=algo, data=data, measures=['RMSE'],
                                 cv=5, return_train_measures=True)
        results_mean = results['test_rmse'].mean()

    ## Predictions
    # Let's assume we are happy with the method and now want to apply it to
    # the entire data set.
    # Estimate a specific item for a specific user:
    single_item_single_user_prediction = algo.predict(uid=target_user_key,
                                                      iid=100010,
                                                      verbose=True)

    # Estimate all items for a specific user:
    list_of_all_items = trainset_raw_iids
    target_predictions = []
    for item in list_of_all_items:
        single_prediction = algo.predict(uid=target_user_key, iid=item)
        target_predictions.append((single_prediction.uid,
                                   single_prediction.iid,
                                   single_prediction.est))

    # Then sort the predictions for each user and retrieve the n highest ones:
    target_predictions.sort(key=lambda x: x[2], reverse=True)
    n = 20
    top_n = target_predictions[:n]
    top_n = [row[1] for row in top_n]

    print('end')
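# Generalising the block above to every user: a sketch (not from the
# original) following the top-N pattern from the Surprise FAQ, which keeps
# the n best estimates per user from a list of predictions.
from collections import defaultdict


def get_top_n(predictions, n=20):
    # Map each user to their predicted (item, estimate) pairs.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Sort each user's items by estimate and keep the n best.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

# Usage sketch with the trainset/algo from the function above:
# anti_testset = trainset.build_anti_testset()
# top_n_all_users = get_top_n(algo.test(anti_testset), n=20)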