def knn_compute_prec_rec(t):
    """Return the 10-fold mean precision and recall of the k-NN filter.

    Uses the module-level names ``R`` (dataset), ``knn_best_k`` and
    ``threshold``. For every fold a fresh Pearson ``KNNWithMeans`` model is
    fitted, the test set is passed through ``trim_unpopular_user`` and the
    per-user precision/recall values returned by
    ``calculate_precision_recall`` are averaged.

    Args:
        t: value forwarded to ``trim_unpopular_user`` and
            ``calculate_precision_recall``.

    Returns:
        Tuple ``(mean_precision, mean_recall)`` averaged over the folds.
    """
    fold_precisions = []
    fold_recalls = []
    splitter = KFold(n_splits=10, random_state=42)
    for trainset, testset in splitter.split(R):
        model = KNNWithMeans(
            k=knn_best_k, sim_options={'name': 'pearson'}, verbose=False)
        model.fit(trainset)
        kept_testset = trim_unpopular_user(testset, t, threshold)
        predictions = model.test(kept_testset)
        per_user_precision, per_user_recall = calculate_precision_recall(
            predictions, t, threshold)
        fold_precisions.append(np.mean(list(per_user_precision.values())))
        fold_recalls.append(np.mean(list(per_user_recall.values())))
    return np.mean(fold_precisions), np.mean(fold_recalls)
def Q10():
    """Sweep k for KNNWithMeans and plot 10-fold mean RMSE and MAE.

    k runs over 2, 4, ..., 100. The total cross-validation time is printed
    and both error curves are passed to ``make_plot``.

    Returns:
        Tuple ``(meanRMSE, meanMAE)`` of lists, one entry per k.
    """
    data = load_data()
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0,  # no shrinkage
    }
    neighbor_counts = list(range(2, 102, 2))
    meanRMSE = []
    meanMAE = []

    start = time.time()
    for n_neighbors in neighbor_counts:
        model = KNNWithMeans(n_neighbors, sim_options=sim_options)
        scores = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=10)
        meanRMSE.append(np.mean(scores['test_rmse']))
        meanMAE.append(np.mean(scores['test_mae']))
    elapsed_seconds = int(time.time() - start)
    cv_time = str(datetime.timedelta(seconds=elapsed_seconds))
    print("Total time used for cross validation: " + cv_time)

    curves = [[meanRMSE, 'mean RMSE'], [meanMAE, 'mean MAE']]
    make_plot(neighbor_counts, curves, 'Number of Neighbors', 'Error')
    return meanRMSE, meanMAE
def slot_select_algo_combobox(self):
    """Instantiate the algorithm currently selected in the combo box.

    Marks the algorithm as changed and not yet trained, reads the current
    text of ``select_algo_comboBox`` and, when it names a known algorithm,
    stores a new instance in ``self.algo`` and appends a status message to
    ``display_process_label``. Unknown names change nothing but the flags.
    """
    self.algo_change_flag = True
    self.algo_trained_flag = False
    selected = self.select_algo_comboBox.currentText()

    # Combo-box text -> (factory, status message). Each message reads
    # "Loading <name> model...". Factories are deferred with lambdas so only
    # the chosen algorithm is ever constructed.
    choices = {
        'SVD': (lambda: SVD(), '加载SVD模型...'),
        'SVD++': (lambda: SVDpp(), '加载SVD++模型...'),
        'NMF': (lambda: NMF(), '加载NMF模型...'),
        'Slope One': (lambda: SlopeOne(), '加载Slope One模型...'),
        'k-NN': (lambda: KNNBasic(), '加载k-NN模型...'),
        'Centered k-NN': (lambda: KNNWithMeans(), '加载Centered k-NN模型...'),
        'k-NN Baseline': (lambda: KNNBaseline(), '加载k-NN Baseline模型...'),
        'Co-Clustering': (lambda: CoClustering(), '加载Co-Clustering模型...'),
        'Baseline': (lambda: BaselineOnly(), '加载Baseline模型...'),
        'Random': (lambda: NormalPredictor(), '加载Random模型...'),
    }

    entry = choices.get(selected)
    if entry is None:
        return
    build_model, status_message = entry
    self.algo = build_model()
    self.display_process_label.append(status_message)
def main(args=None):
    """Cross-validate SVD and KNNWithMeans on ml-1m and save the scores.

    The output location comes from ``process_args(args)``. The directory is
    created when missing; otherwise the user is asked whether to overwrite
    it, and an answer of N or NO (any case) exits.

    NOTE(review): the output path is not passed to ``write_results_to_file``
    here; confirm how that helper locates the directory.
    """
    out_path = os.path.expanduser(process_args(args))

    print('Checking output directory...')
    if os.path.exists(out_path):
        ans = input("Overwrite output directory?: ").upper()
        if ans in ('N', 'NO'):
            print('Exiting...')
            exit()
    else:
        os.makedirs(out_path)

    print("Loading dataset...")
    data = Dataset.load_builtin('ml-1m')

    def evaluate_and_save(algo, file_name):
        # 10-fold CV, then persist the per-fold RMSE and MAE.
        result = cross_validate(algo, data, measures=['RMSE', 'MAE'],
                                cv=10, verbose=True)
        write_results_to_file(result['test_rmse'], result['test_mae'],
                              file_name)

    svd = SVD()
    print("Running SVD...")
    evaluate_and_save(svd, 'svd_out.json')

    print("Running KNN...")
    evaluate_and_save(KNNWithMeans(), 'knn_out.json')

    print("Done.")
def knn_compute_cross_validation_error(k, random_state):
    """Run 10-fold CV of a Pearson KNNWithMeans on the module-level ``R``.

    Args:
        k: number of neighbours given to ``KNNWithMeans``.
        random_state: seed forwarded to ``KFold``.

    Returns:
        Tuple ``(mean_rmse, mean_mae)`` over the folds; the same values are
        printed together with ``k``.
    """
    model = KNNWithMeans(k=k, sim_options={'name': 'pearson'}, verbose=False)
    folds = KFold(n_splits=10, random_state=random_state)
    scores = cross_validate(model, R, cv=folds)
    mean_rmse = np.mean(scores['test_rmse'])
    mean_mae = np.mean(scores['test_mae'])
    print('k: %s | RMSE: %f | MAE: %f' % (k, mean_rmse, mean_mae))
    return mean_rmse, mean_mae
def Q15and22and29(qNum, bestK, thres=(2.5, 3, 3.5, 4)):
    """Plot one ROC curve per like/dislike threshold for a single model.

    The data is split 90/10, the model is fitted on the training part and
    its predictions on the test part are turned into binary labels (true
    rating >= threshold) and scores (estimate divided by 5.0).

    Args:
        qNum: 15 selects KNNWithMeans, 22 selects NMF, anything else SVD.
        bestK: number of neighbours (k-NN) or latent factors (NMF/SVD).
        thres: iterable of rating thresholds; one plot is drawn for each.
    """
    rating_range = 5.0  # divisor applied to the estimates before plotting
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0,  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)

    if qNum == 15:
        model = KNNWithMeans(bestK, sim_options=sim_options)
    elif qNum == 22:
        model = NMF(n_factors=bestK)
    else:
        model = SVD(n_factors=bestK)
    model.fit(trainset)
    pred = model.test(testset)

    # Each prediction unpacks as (user, item, true rating, estimate, details).
    # Build the arrays once instead of growing them with np.append, which
    # copies the whole array on every call.
    true_ratings = np.array([t for _, _, t, _, _ in pred], dtype=float)
    np_score = np.array([p for _, _, _, p, _ in pred], dtype=float)
    np_score = np_score / rating_range

    for thrs in thres:
        np_true = (true_ratings >= thrs).astype(float)
        title = 'Threshold ' + str(thrs)
        plot_ROC(np_true, np_score, title=title)
def user_based(data, db):
    """User-based collaborative filtering: store top-10 items per user.

    A grid search over ``k`` picks the best KNNWithMeans configuration by
    RMSE (4-fold CV). The winner is refitted on the full training set, every
    (user, item) pair from the anti-testset is predicted, and the top 10
    item ids per user are written to ``db.userRecs``, which is dropped first.

    Args:
        data: surprise dataset to search on and train from.
        db: database handle exposing a ``userRecs`` collection.
    """
    param_grid = {
        'k': [20, 25, 30, 35, 40],
        'min_k': [1],
        'sim_options': {
            'name': ['msd'],
            'user_based': [True],
            'min_support': [1],
        },
    }
    search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=4)
    search.fit(data)

    best_params = search.best_params['rmse']
    print(search.best_score['rmse'])
    print(best_params)

    # Rebuild the similarity options from the winning configuration.
    best_sim = best_params['sim_options']
    sim_options = {
        'name': best_sim['name'],
        'user_based': best_sim['user_based'],
        'min_support': best_sim['min_support'],
    }

    full_trainset = data.build_full_trainset()
    model = KNNWithMeans(k=best_params['k'], min_k=best_params['min_k'],
                         sim_options=sim_options)
    model.fit(full_trainset)
    predictions = model.test(full_trainset.build_anti_testset())

    # get top n predictions, in order
    top_n = get_top_n(predictions, n=10)

    # replace the stored recommendations
    db.userRecs.drop()
    for uid, user_ratings in top_n.items():
        db.userRecs.insert_one({
            'user_id': uid,
            'recs': [iid for iid, _ in user_ratings],
            'timestamp': datetime.utcnow(),
        })
    print('done')
def knn_evaluate_trim_performance(trimming, k, random_state):
    """Return the 10-fold mean RMSE of k-NN on a trimmed test set.

    Uses the module-level names ``R``, ``frequency`` and ``variance``.

    Args:
        trimming: one of ``'popular'``, ``'unpopular'`` or
            ``'high variance'``; selects the trimming function applied to
            each fold's test set.
        k: number of neighbours given to ``KNNWithMeans``.
        random_state: seed forwarded to ``KFold``.

    Returns:
        Mean RMSE over the folds (also printed together with ``k``).

    Raises:
        ValueError: if ``trimming`` is not one of the supported modes.
    """
    # Validate first: an unknown mode used to leave the trimmed test set
    # unbound and crash only after a fold had already been fitted.
    if trimming not in ('popular', 'unpopular', 'high variance'):
        raise ValueError(
            "unknown trimming mode %r; expected 'popular', 'unpopular' or "
            "'high variance'" % (trimming,))

    knn = KNNWithMeans(k=k, min_k=1, sim_options={'name': 'pearson'},
                       verbose=False)
    rmse = []
    folds = KFold(n_splits=10, random_state=random_state)
    for trainset, testset in folds.split(R):
        knn.fit(trainset)
        if trimming == 'popular':
            trimmed_testset = popular_trimming(testset, frequency)
        elif trimming == 'unpopular':
            trimmed_testset = unpopular_trimming(testset, frequency)
        else:  # 'high variance'
            trimmed_testset = high_variance_trimming(testset, frequency,
                                                     variance)
        pred = knn.test(trimmed_testset)
        rmse.append(accuracy.rmse(pred, verbose=False))

    mean_rmse = np.mean(rmse)
    print('k: %s | RMSE: %f' % (k, mean_rmse))
    return mean_rmse
def train_knn(data):
    """Sweep k = 2, 4, ..., 100 for a Pearson KNNWithMeans with 10-fold CV.

    Args:
        data: surprise dataset passed to ``cross_validate``.

    Returns:
        Tuple ``(rmse, mae)`` of lists holding the fold-averaged RMSE and
        MAE for each k, in sweep order.
    """
    sim_options = {'name': 'pearson'}
    rmse, mae = [], []
    for k in range(2, 102, 2):
        print("using k = %d" % k)
        model = KNNWithMeans(k=k, sim_options=sim_options)
        scores = cross_validate(model, data, measures=['RMSE', 'MAE'], cv=10)
        rmse.append(np.mean(scores['test_rmse']))
        mae.append(np.mean(scores['test_mae']))
    print("k-fold validation finished!")
    return rmse, mae
def train_trim_knn(data, R):
    """Sweep k for KNNWithMeans and score three trimmed test sets per fold.

    For every k in 2, 4, ..., 100 the model is fitted on each of 10 folds;
    ``trim(testset, R)`` yields three test subsets (named popular, unpopular
    and high-variance here) that are scored separately.

    Args:
        data: surprise dataset to split.
        R: second argument forwarded unchanged to ``trim``.

    Returns:
        List of three lists: the fold-averaged RMSE per k for the first,
        second and third trimmed subset respectively.
    """
    splitter = KFold(n_splits=10)
    sim_options = {'name': 'pearson'}
    rmse_list = [[], [], []]

    for k in range(2, 102, 2):
        print("using k = %d" % k)
        model = KNNWithMeans(k=k, sim_options=sim_options)
        fold_scores = ([], [], [])
        for trainset, testset in splitter.split(data):
            model.fit(trainset)
            p_testset, u_testset, hv_testset = trim(testset, R)
            # Predict all three subsets before scoring any of them.
            fold_predictions = [
                model.test(subset)
                for subset in (p_testset, u_testset, hv_testset)
            ]
            for scores, predictions in zip(fold_scores, fold_predictions):
                scores.append(accuracy.rmse(predictions))
        for means, scores in zip(rmse_list, fold_scores):
            means.append(np.mean(scores))

    print("KNN with trim is finished!!")
    return rmse_list
def Q34():
    """Plot the ROC curves of the k-NN, NMF and SVD filters on one figure.

    The data is split 90/10; each model is fitted on the training part and
    scored on the test part. A true rating >= 3 is the positive class and
    the score is the estimate divided by 5.0. The AUC of every curve is
    shown in the legend.
    """
    rang = 5.0  # divisor applied to the estimates before computing the ROC
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0,  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)

    models = [
        ('KNN', KNNWithMeans(22, sim_options=sim_options)),
        ('NNMF', NMF(n_factors=18)),
        ('SVD', SVD(n_factors=8)),
    ]

    # One (label, fpr, tpr, auc) record per model keeps each curve together,
    # rather than relying on the key order of parallel dicts.
    curves = []
    for name, model in models:
        model.fit(trainset)
        pred = model.test(testset)
        # Predictions unpack as (user, item, true rating, estimate, details).
        # Build the arrays in one go; np.append copies on every call.
        true_ratings = np.array([t for _, _, t, _, _ in pred], dtype=float)
        np_true = (true_ratings >= 3).astype(float)
        np_score = np.array([p for _, _, _, p, _ in pred], dtype=float) / rang
        fpr, tpr, _ = roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        curves.append((name, fpr, tpr, auc(fpr, tpr)))

    plt.figure()
    lw = 2
    for name, fpr, tpr, roc_auc in curves:
        plt.plot(fpr, tpr, lw=lw,
                 label='%s ROC curve (area = %0.2f)' % (name, roc_auc))
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    plt.show()
    plt.close()
def item_based(data, db):
    """Item-based collaborative filtering: print the top-10 item ids.

    A grid search over ``k`` and ``min_k`` (item-item MSD similarity, 4-fold
    CV) picks the best KNNWithMeans configuration by RMSE. The winner is
    refitted on the full training set, the anti-testset is predicted and the
    top 10 item ids for each key of ``get_top_n``'s result are printed.

    Args:
        data: surprise dataset to search on and train from.
        db: not used by this function; nothing is written to it.
    """
    param_grid = {
        'k': [20, 30, 40, 50],
        'min_k': [1, 5, 10],
        'sim_options': {
            'name': ['msd'],
            'user_based': [False],
            'min_support': [1],
        },
    }
    search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=4)
    search.fit(data)

    best_params = search.best_params['rmse']
    print(search.best_score['rmse'])
    print(best_params)

    # Rebuild the similarity options from the winning configuration.
    best_sim = best_params['sim_options']
    sim_options = {
        'name': best_sim['name'],
        'user_based': best_sim['user_based'],
        'min_support': best_sim['min_support'],
    }

    full_trainset = data.build_full_trainset()
    model = KNNWithMeans(k=best_params['k'], min_k=best_params['min_k'],
                         sim_options=sim_options)
    model.fit(full_trainset)
    predictions = model.test(full_trainset.build_anti_testset())

    # get top n predictions, in order
    top_n = get_top_n(predictions, n=10)

    # report the recommendations on stdout
    for uid, user_ratings in top_n.items():
        print(uid, [iid for iid, _ in user_ratings])
def knn_cv(data):
    '''
    Evaluate a cosine KNNWithMeans with 10-fold cross-validation for
    k = 2, 4, ..., 50.

    Returns a tuple (rmse, k_list): the fold-averaged test RMSE for each k
    and the range of k values that was swept.
    '''
    k_list = range(2, 51, 2)
    rmse = []
    print('Performing knn...')
    for n_neighbors in k_list:
        print('k =', n_neighbors)
        model = KNNWithMeans(k=n_neighbors, sim_options={'name': 'cosine'},
                             verbose=False)
        scores = cross_validate(model, data, measures=['RMSE'], cv=10,
                                verbose=False)
        rmse.append(np.mean(scores['test_rmse']))
    print('Completed!')
    return rmse, k_list
def Q12To14And19To21And26To28(qNum, maxk=None):
    """Sweep k for one model on a trimmed test set and plot the mean RMSE.

    The question number selects both the model and the movie subset:
    12-14 use KNNWithMeans, 19-21 NMF and 26-28 SVD; within each group the
    three numbers pick the popular, unpopular and high-variance subsets
    returned by ``classifyMovies``. For every even k from 2 to ``maxk`` the
    model is fitted on each of 10 folds and scored only on test ratings
    whose item is in the chosen subset.

    Args:
        qNum: question number, one of 12-14, 19-21 or 26-28.
        maxk: largest k to try (inclusive). Defaults to 100 for the k-NN
            questions and 50 for the NMF/SVD questions.

    Returns:
        List with the fold-averaged RMSE for each k.

    Raises:
        ValueError: if ``qNum`` is not a supported question number.
    """
    model_by_question = {
        12: 'KNNWithMeans', 13: 'KNNWithMeans', 14: 'KNNWithMeans',
        19: 'NMF', 20: 'NMF', 21: 'NMF',
        26: 'SVD', 27: 'SVD', 28: 'SVD',
    }
    # Fail fast, before any data is loaded.
    if qNum not in model_by_question:
        raise ValueError(
            'unsupported question number %r; expected one of %s'
            % (qNum, sorted(model_by_question)))

    data = load_data()
    kf = KFold(n_splits=10)
    if maxk is None:
        maxk = 100 if qNum <= 14 else 50

    pop, unpop, highVar = classifyMovies()
    trim_by_question = {
        12: pop, 13: unpop, 14: highVar,
        19: pop, 20: unpop, 21: highVar,
        26: pop, 27: unpop, 28: highVar,
    }
    trimSet = trim_by_question[qNum]
    modelName = model_by_question[qNum]

    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0,  # no shrinkage
    }

    ks = list(range(2, maxk + 1, 2))  # inclusive of maxk
    RMSE = []  # mean RMSE for each k
    for k in ks:
        print('-' * 20 + ' k = ' + str(k) + ' ' + '-' * 20)
        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k, sim_options=sim_options)
        elif modelName == 'NMF':
            model = NMF(n_factors=k)
        else:
            model = SVD(n_factors=k)

        subRMSE = []  # RMSE of each train-test split for this k
        for split_no, (trainSet, testSet) in enumerate(kf.split(data), 1):
            # Keep only ratings whose item (second field) is in the subset.
            testSet = [row for row in testSet if row[1] in trimSet]
            nTest = len(testSet)
            print("Split %d: test set size after trimming: %d"
                  % (split_no, nTest))
            if nTest == 0:
                # Nothing to score; dividing by nTest would raise.
                continue
            model.fit(trainSet)
            predictions = model.test(testSet)
            squared_error = sum((p.est - p.r_ui) ** 2 for p in predictions)
            subRMSE.append(np.sqrt(squared_error / nTest))

        # Average over the splits that had at least one test rating.
        RMSE.append(np.mean(subRMSE) if subRMSE else float('nan'))

    # plotting
    ys = [[RMSE, 'RMSE']]
    if qNum <= 14:
        xTitle = 'Number of Neighbors'
    else:
        xTitle = 'Number of latent factors'
    make_plot(ks, ys, xTitle, 'Error')
    return RMSE
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.knns import KNNWithMeans

# Load ratings.csv: comma-separated "user item rating timestamp" rows, with
# the first line skipped.
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('ratings.csv', reader=reader)

# Calculate root mean square error using k nearest neighbor method
# with k starting from 2 to 50 in step sizes of 2
rmse_train = []  # fold-averaged training RMSE, one entry per k
rmse_test = []  # fold-averaged test RMSE, one entry per k
for k in range(2, 51, 2):
    print('k =', k)
    sim_options = {'name': 'cosine'}
    algo = KNNWithMeans(k=k, sim_options=sim_options, verbose=False)
    # return_train_measures=True also reports the error on the training folds.
    result = cross_validate(algo, data, measures=['RMSE'], cv=10, return_train_measures=True, verbose=False)
    rmse_train.append(np.mean(result['train_rmse']))
    rmse_test.append(np.mean(result['test_rmse']))

# Plot both curves against k on figure 1.
plt.figure(1)
plt.plot(range(2, 51, 2), rmse_train)
plt.plot(range(2, 51, 2), rmse_test)
plt.xlabel('k')
plt.ylabel('Root Mean Square Error')
plt.title('kNN: The Result of Average RMSE versus k')
from surprise import AlgoBase from surprise.model_selection import cross_validate from surprise.model_selection.split import train_test_split import matplotlib.pyplot as plt from surprise.prediction_algorithms.knns import KNNWithMeans from surprise.prediction_algorithms.matrix_factorization import NMF from surprise.prediction_algorithms.matrix_factorization import SVD plt.close('all') reader = Reader(sep=',') data = Dataset.load_from_file('./ml-latest-small/ratings_new.csv', reader=reader) data.split(n_folds=10) sim_options = {'name': 'pearson', 'user_based': True} algo1 = KNNWithMeans(k=48, sim_options=sim_options) algo2 = NMF(n_factors=16) algo3 = SVD(n_factors=14) def RankSweep(algo, tit, num): t_all = range(1, 26) pre_all = np.zeros(25) rec_all = np.zeros(25) for trainset, testset in data.folds(): algo.fit(trainset) pred = algo.test(testset) G_all = dict() S_all = dict() for elem in pred: if elem.r_ui >= 3:
# Build a surprise dataset from the userId / movieId / rating columns of the
# ``ratings`` frame, on a 1-5 rating scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# 51 integer neighbourhood sizes evenly spaced from 1 to 101.
neighbors = np.linspace(1,101,num=51,dtype=int)

# Plain k-NN: 5-fold mean RMSE for Pearson and cosine similarity.
basic_pearson, basic_cosine = [], []
for i in neighbors:
    print(i)
    cv_pearson = cross_validate(KNNBasic(k=i,sim_options={'name':'pearson'},verbose=False), data, cv=5)
    basic_pearson.append(np.mean(cv_pearson['test_rmse']))
    cv_cosine = cross_validate(KNNBasic(k=i,sim_options={'name':'cosine'},verbose=False), data, cv=5)
    basic_cosine.append(np.mean(cv_cosine['test_rmse']))

# Mean-centered k-NN: the same sweep with KNNWithMeans.
means_pearson, means_cosine = [], []
for i in neighbors:
    print(i)
    cv_pearson = cross_validate(KNNWithMeans(k=i,sim_options={'name':'pearson'},verbose=False), data, cv=5)
    means_pearson.append(np.mean(cv_pearson['test_rmse']))
    cv_cosine = cross_validate(KNNWithMeans(k=i,sim_options={'name':'cosine'},verbose=False), data, cv=5)
    means_cosine.append(np.mean(cv_cosine['test_rmse']))

# Figure 1: plain k-NN, cosine (red) against Pearson (blue).
fig, ax = plt.subplots()
ax.plot(neighbors,basic_cosine, 'r', label='Cosine')
ax.plot(neighbors, basic_pearson, 'b', label='Pearson')
ax.legend(loc='best')
plt.xlabel("k"); plt.ylabel("5-fold average RMSE"); plt.title("k-NN with 5-fold CV")

# Figure 2: mean-centered k-NN, cosine (red) against Pearson (blue).
fig, ax = plt.subplots()
ax.plot(neighbors,means_cosine, 'r', label='Cosine')
ax.plot(neighbors, means_pearson, 'b', label='Pearson')
ax.legend(loc='best')
plt.xlabel("k"); plt.ylabel("5-fold average RMSE"); plt.title("Mean-centered k-NN with 5-fold CV")
# Load the ratings file: comma-separated "user item rating timestamp" rows,
# with the first line skipped.
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('../dataset/ratings.csv', reader=reader)

# 10-fold cross validation
rmse, k_list = knn_cv(data)

# get optimal k
# (the k whose cross-validated RMSE is smallest; first one wins on ties)
min_idx = rmse.index(min(rmse))
k_hat = k_list[min_idx]

# Training
# Fit a cosine KNNWithMeans with the selected k on a 90/10 split (no
# random_state is passed) and report the RMSE on the held-out 10%.
trainset, testset = train_test_split(data, test_size=0.1)
sim_options = {'name': 'cosine'}
algo = KNNWithMeans(k=k_hat, sim_options=sim_options, verbose=False)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

# Plot Testing rmse
# (the cross-validated RMSE from knn_cv against k)
plt.figure(1)
plt.plot(k_list, rmse)
plt.xlabel('k')
plt.ylabel('Testing Root Mean Square Error')
plt.title('kNN: The Result of Average RMSE versus k')
plt.show()

# Plot ROC curve
test_target = []
test_score = []
import pandas as pd from scipy import stats from surprise.prediction_algorithms.knns import KNNWithMeans from surprise import Dataset from surprise.model_selection import KFold from surprise import accuracy import matplotlib.pyplot as plt # Load the movielens-100k dataset data = Dataset.load_builtin('ml-100k') sim_itembase = { 'name': 'cosine', 'user_based': False } # compute similarities between items algo_itembase = KNNWithMeans(sim_options=sim_itembase) sim_userbase = { 'name': 'pearson_baseline' } # compute similarities between users algo_userbase = KNNWithMeans(sim_options=sim_userbase) # Run 5-fold cross-validation and save results. kf = KFold(n_splits=5) rmse_df = pd.DataFrame(columns=['Item-based', 'User-based']) for trainset, testset in kf.split(data): # train and test algorithm. algo_itembase.fit(trainset) pred_itembase = algo_itembase.test(testset)
scaled_data = convert_df_to_data(scaled_df, scaled_reader) scaled_data.split(n_folds=5) data = convert_df_to_data(df, reader) data.split(n_folds=5) # plot some EDA figures: plot_average_rating_hist(df) # Cross Valdiation Tests for different Classification Models: models = [] models.append(('GM', GlobalMean())) models.append(('MoM', MeanofMeans())) models.append(('BLO', BaselineOnly())) models.append(('KNNb', KNNBasic())) models.append(('KNNwm', KNNWithMeans())) models.append(('KNNbl', KNNBaseline())) models.append(('SVD', SVD())) models.append(('NMF', NMF())) models.append(('SO', SlopeOne())) models.append(('CoC', CoClustering())) # plotting box plot of cross validation scores for array of recommendation models on scaled ratings data: model_names, rmses, maes = crossval_scores(scaled_data, models[:-1]) # Now to find out which recommendation model has the lowest amount of false positives (recommending a movie that a user wounldn't like) and false negatives (failing to recommend a movie that a user would like). We'll choose a model based on the f1 score. model_names, fps, fns, tps, tns, precisions, recalls, f1s = get_fpfns(scaled_data, models, thresh=0.5) # Highest F1 score was the SVD model. We'll go with this model build a recommender system. '''To make a business case we'll have to make some assuptions about the costs and benefits that Movies-Legit service experiences when giving users recommendations they like (True Positive) and giving users recommendations they don't like (False Positives).
return pre, rec #read data path = '/users/ht/desktop/EE219/proj_3/' reader = Reader(line_format='user item rating timestamp', sep=',') data_raw = Dataset.load_from_file(path + 'data/ratings_1.csv', reader=reader) #define K-fold num_fold = 10 kf = split.KFold(n_splits=num_fold) #define model for training k_min = 24 sim_options = {'name': 'pearson', 'user_based': True} knn = KNNWithMeans(k=k_min, sim_options=sim_options) #train, test and rank top_t_list = range(1, 26) pre_list_knn = [] rec_list_knn = [] for top_t in top_t_list: pre = 0 rec = 0 for trainset, testset in kf.split(data_raw): knn.fit(trainset) prediction = knn.test(testset) G = create_dict(testset) G_s = create_dict(prediction, if_pred=1) R, R_s = threshold_rank_filter(G, G_s, thre=3, top_t=top_t) #precision and recall for each fold
temp_prec.append(fold_mean_prec) temp_recall.append(fold_mean_recall) t_mean_prec = sum(prec for prec in temp_prec) / len(temp_prec) t_mean_recall = sum(rec for rec in temp_recall) / len(temp_recall) precision.append(t_mean_prec) recall.append(t_mean_recall) return ts, precision, recall # read in data file_path = os.path.expanduser('ratings.csv') reader = Reader(line_format='user item rating', sep=',',skip_lines=1, rating_scale=(0.5, 5)) data = Dataset.load_from_file(file_path, reader=reader) sim_options = {'name': 'pearson'} knn = KNNWithMeans(k=24, sim_options=sim_options) nmf = NMF(n_factors=4) nmfBiased = NMF(n_factors=2, biased=True) algs = [] algs.append(knn) algs.append(nmf) algs.append(nmfBiased) names = {} names[knn] = "KNN" names[nmf] = "NNMF" names[nmfBiased] = "NMF(biased)" res_t_p_r = {} for alg in algs:
plt.title('Distribution of ratings among users')
plt.ylabel('Number of ratings')
plt.xlabel('Users')

#Question 6
# Histogram of the per-movie rating variance; movies whose variance is NaN
# are counted as 0.
var = ratings.groupby('movieId')['rating'].var().fillna(0).tolist()
plt.hist(var, bins=np.arange(0, 11, 0.5))
plt.xlabel('Variance of ratings')
plt.ylabel('Number of movies')
plt.title('Distribution of variance of ratings')

#Question 10
# Sweep k for a Pearson KNNWithMeans with 10-fold cross-validation.
# NOTE(review): range(2, 100, 2) stops at 98 - confirm whether k = 100 was
# meant to be included.
k_range = range(2, 100, 2)
avg_rmse, avg_mae = [], []
for k in k_range:
    algo = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
    cv_results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=False)
    avg_rmse.append(np.mean(cv_results['test_rmse']))
    avg_mae.append(np.mean(cv_results['test_mae']))

# Plot both error curves against k.
plt.plot(k_range, avg_rmse, label="Average RMSE")
plt.plot(k_range, avg_mae, label="Average MAE")
plt.xlabel('Number of neighbors')
plt.ylabel('Error')
plt.legend()
plt.show()
def Q36To38(qNum):
    """Plot precision and recall against the recommendation list size t.

    Question 36 uses KNNWithMeans, 37 NMF and 38 SVD, each with a fixed
    hyper-parameter. For every t from 1 to 25 the model is evaluated with
    10-fold cross-validation and the per-user precision and recall from
    ``precision_recall`` are averaged within each fold and then across the
    folds. Three plots are drawn: precision vs t, recall vs t and precision
    vs recall.

    Args:
        qNum: question number, one of 36, 37 or 38.

    Returns:
        Tuple ``(precision_arr, recall_arr)`` with one mean value per t.

    Raises:
        KeyError: if ``qNum`` is not 36, 37 or 38.
    """
    print("problem ", qNum)
    data = load_data()
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0,  # no shrinkage
    }
    model_by_question = {
        36: 'KNNWithMeans',
        37: 'NMF',
        38: 'SVD',
    }
    k_KNNWithMeans = 30  # from Q11
    k_NMF = 18  # from Q18
    k_SVD = 8  # from Q25

    modelName = model_by_question[qNum]
    if modelName == 'KNNWithMeans':
        model = KNNWithMeans(k_KNNWithMeans, sim_options=sim_options)
    elif modelName == 'NMF':
        model = NMF(n_factors=k_NMF)
    else:
        model = SVD(n_factors=k_SVD)

    # sweep t from 1 to 25
    t_list = list(range(1, 26))
    precision_arr = []
    recall_arr = []
    for t in t_list:
        kf = KFold(n_splits=10)
        # Collect one value per fold. The accumulators used to be reset
        # inside the fold loop, so only the last fold was ever recorded.
        fold_precisions = []
        fold_recalls = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            fold_precision = sum(precisions.values()) / len(precisions)
            fold_recall = sum(recalls.values()) / len(recalls)
            print(fold_precision)
            print(fold_recall)
            fold_precisions.append(fold_precision)
            fold_recalls.append(fold_recall)
        precision_arr.append(np.mean(fold_precisions))
        recall_arr.append(np.mean(fold_recalls))

    print("model name: ", modelName)

    # precision vs t
    title_ = "precision vs t for: " + modelName
    make_plot(t_list, [[precision_arr, 'mean precisions']],
              'recommended item size t', 'Precision', title=title_)

    # recall vs t
    title_ = "recall vs t for: " + modelName
    make_plot(t_list, [[recall_arr, 'mean recalls']],
              'recommended item size t', 'Recall', title=title_)

    # precision vs recall
    title_ = "precision vs recall for: " + modelName
    plt.plot(recall_arr, precision_arr, label=modelName)
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend()
    plt.grid()
    plt.title(title_)
    plt.show()
    return precision_arr, recall_arr
# Q10 # In order to fit surprise file_path = os.path.expanduser('ratings.csv') reader = Reader(line_format='user item rating', sep=',', skip_lines=1, rating_scale=(0.5, 5)) data = Dataset.load_from_file(file_path, reader=reader) acc_cv = np.zeros((2, 50)) sim_options = {'name': 'pearson'} i = 0 for k in range(2, 101, 2): algo = KNNWithMeans(k=k, sim_options=sim_options) cv1 = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=False) acc_cv[0, i] = np.mean(cv1['test_rmse']) acc_cv[1, i] = np.mean(cv1['test_mae']) print('test_rmse = %f, test_mae = %f' % (acc_cv[0, i], acc_cv[1, i])) i = i + 1 pass ks = np.arange(2, 101, 2) plt.xlabel('k') plt.ylabel('Error value') plt.title('Test RMSE and MAE vs k in KNN with 10 Validation')
label='Threshold: %.1f, AUC: %.4f' % (threshold, auc_score), linewidth=2) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.legend(loc='lower right') plt.title('ROC Curves for {}-based Collaborative Filter'.format(method), fontweight="bold") plt.show() # In[29]: trainset, testset = train_test_split(R, test_size=0.1, random_state=42) knn_best = KNNWithMeans(k=knn_best_k, sim_options={'name': 'pearson'}, verbose=False) knn_best.fit(trainset) knn_best_pred = knn_best.test(testset) plot_roc_curves(testset, knn_best_pred, 'KNN') # # PART 2 - Model-based Collaborative Filtering # ## Non-Negative Matrix Factorization # <font size=4>**Question 17:** Design a NNMF-based collaborative filter to predict the ratings of the movies in the MovieLens dataset and evaluate it’s performance using 10-fold cross-validation. Sweep k (number of latent factors) from 2 to 50 in step sizes of 2, and for each k compute the average RMSE and average MAE obtained by averaging the RMSE and MAE across all 10 folds. Plot the average RMSE (Y-axis) against k (X-axis) and the average MAE (Y-axis) against k (X-axis). For solving this question, use the default value for the regularization parameter.</font> # In[30]: import numpy as np
# Load the trimmed ratings file (comma-separated) into a surprise dataset.
file_path = os.path.expanduser('ml-latest-small/ratings_unpopular.csv')
reader = Reader(sep=',')
data = Dataset.load_from_file(file_path, reader=reader)
# Alternative data source: Dataset.load_builtin('ml-100k')

sim_options = {'name': 'pearson', 'user_based': True}

# Sweep k = 2, 4, ..., 100 and record the 10-fold mean RMSE and MAE.
avg_rmse = []
avg_mae = []
all_k = []
for i in range(2, 102, 2):
    print('k = ', i)
    all_k.append(i)
    algo = KNNWithMeans(k=i, sim_options=sim_options)
    output = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10,
                            verbose=True, n_jobs=1)
    avg_rmse.append(np.mean(output['test_rmse']))
    avg_mae.append(np.mean(output['test_mae']))

# Report the k (not the list position) at which each error is smallest.
print("min rmse k:", all_k[avg_rmse.index(min(avg_rmse))])
print("min rmse:", min(avg_rmse))
print("min mae k:", all_k[avg_mae.index(min(avg_mae))])
print("min mae:", min(avg_mae))

plt.plot(all_k, avg_rmse)
RS_ratings = ratings.drop(columns='timestamp') RS_reader = Reader(name=None, line_format='user item rating', sep=',', rating_scale=(1, 5), skip_lines=0) RS_data = Dataset.load_from_df(RS_ratings, RS_reader) # Benchmark_Algorithm_Metric benchmark = [] for algorithm in [ BaselineOnly(), CoClustering(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), NMF(), NormalPredictor(), SlopeOne(), SVD(), SVDpp() ]: # Perform cross validation results = cross_validate(algorithm, RS_data, measures=['rmse', 'mae', 'mse', 'fcp'], cv=5, verbose=True) # Results To Serie List tmp = pd.DataFrame.from_dict(results).mean(axis=0)
plt.savefig('plot/q15_knn_roc_' + str(threshold) + '.png') plt.clf() if __name__ == "__main__": threshold = [2.5, 3, 3.5, 4] file_path = os.path.expanduser("ml-latest-small/ratings_new.csv") reader = Reader(sep=',') data = Dataset.load_from_file(file_path, reader=reader) sim_options = {'name': 'pearson', 'user_based': True} trainset, testset = train_test_split(data, test_size=0.1) for th in threshold: algo = KNNWithMeans(k=34, sim_options=sim_options) algo.fit(trainset) predictions = algo.test(testset) y_true = [] y_estimate = [] for row in predictions: if row[2] >= th: y_true.append(1) else: y_true.append(0) y_estimate.append(row[3]) plot_roc(y_true, y_estimate, th)
def get_top_t(predictions, t=10): # First map the predictions to each user. top_t = defaultdict(list) for uid, iid, true_r, est, _ in predictions: top_t[uid].append((iid, est, true_r)) # Then sort the predictions for each user and retrieve the k highest ones. for uid, user_ratings in top_t.items(): user_ratings.sort(key=lambda x: x[1], reverse=True) top_t[uid] = user_ratings[:t] return top_t train_set, test_set = train_test_split(data, test_size=0.1, random_state=0) algo = KNNWithMeans(k=20, sim_options={'name': 'pearson'}) algo.fit(train_set) predictions = algo.test(test_set) top_recos = get_top_t(predictions) def precision_recall_at_k(predictions, k=10, threshold=3.5): user_est_true = defaultdict(list) for uid, _, true_r, est, _ in predictions: user_est_true[uid].append((est, true_r)) precisions = dict() recalls = dict() for uid, user_ratings in user_est_true.items(): user_ratings.sort(key=lambda x: x[0], reverse=True) n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings) n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])