def get_results(setNum, reg_term):
    """Cross-validate plain SVD, biased SVD and a dumb baseline on one split.

    Args:
        setNum: identifier of the training split to load; also selects the
            factor count for the biased model (10 -> 200, 15 -> 400, 30 -> 600).
        reg_term: regularization strength (reg_all) for both SVD variants.

    Returns:
        (val_res, train_res, val_err, train_err, algs): mean/std RMSE lists
        plus matching algorithm labels, ready for plotting.

    Raises:
        ValueError: if setNum has no configured factor count.  Previously an
            unknown setNum crashed later with a NameError on num_factors_b.
    """
    reader = Reader(rating_scale=(0, 10))
    train = pd.read_csv('../data/train_' + str(setNum) + '.csv', sep=';')
    train_set = Dataset.load_from_df(
        train[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)
    num_factors = 50
    # factor count for the biased model depends on which split is used
    factors_by_set = {10: 200, 15: 400, 30: 600}
    try:
        num_factors_b = factors_by_set[setNum]
    except KeyError:
        raise ValueError('No n_factors configured for setNum=%r' % setNum)
    svd = SVD(n_factors=num_factors, reg_all=reg_term)
    svd_bias = SVD(n_factors=num_factors_b, biased=True, reg_all=reg_term)
    baseline = DumbBaseline()
    cv_svd = cross_validate(svd, train_set, n_jobs=-2,
                            return_train_measures=True)
    cv_svd_bias = cross_validate(svd_bias, train_set, n_jobs=-2,
                                 return_train_measures=True)
    cv_baseline = cross_validate(baseline, train_set, n_jobs=-2,
                                 return_train_measures=True)
    # getting the results ready to plot
    val_res = [np.mean(cv_svd['test_rmse']),
               np.mean(cv_svd_bias['test_rmse']),
               np.mean(cv_baseline['test_rmse'])]
    train_res = [np.mean(cv_svd['train_rmse']),
                 np.mean(cv_svd_bias['train_rmse']),
                 np.mean(cv_baseline['train_rmse'])]
    val_err = [np.std(cv_svd['test_rmse']),
               np.std(cv_svd_bias['test_rmse']),
               np.std(cv_baseline['test_rmse'])]
    train_err = [np.std(cv_svd['train_rmse']),
                 np.std(cv_svd_bias['train_rmse']),
                 np.std(cv_baseline['train_rmse'])]
    algs = ['MF (k=' + str(num_factors) + ')',
            'MF With Bias (k=' + str(num_factors_b) + ')', 'Baseline']
    return val_res, train_res, val_err, train_err, algs
def svd_ratings_predicate(observed_ratings_df, truth_ratings_df, fold='0', phase='eval'):
    """pmf_ratings Predicates"""
    print("SVD predicates")
    svd_model = SVD()
    reader = Reader(rating_scale=(0.2, 1))
    # train on the observed (userId, movieId, rating) triples
    observed = observed_ratings_df.reset_index().loc[:, ['userId', 'movieId', 'rating']]
    train_dataset = Dataset.load_from_df(df=observed, reader=reader)
    svd_model.fit(train_dataset.build_full_trainset())
    # make predictions: one estimated rating per (user, item) pair in the
    # truth frame's (presumably MultiIndex) index
    predictions = pd.DataFrame(index=truth_ratings_df.index, columns=['rating'])
    for (uid, iid), _ in truth_ratings_df.loc[:, ['rating']].iterrows():
        predictions.loc[(uid, iid), 'rating'] = svd_model.predict(uid, iid).est
    write(predictions, 'svd_rating_obs', fold, phase)
def mfb_compute_high_var_trim_rmse(k):
    """Mean 10-fold RMSE of a k-factor SVD on the high-variance-trimmed testset."""
    model = SVD(n_factors=k, random_state=42)
    fold_rmse = []
    splitter = KFold(n_splits=10, random_state=42)
    for trainset, testset in splitter.split(R):
        model.fit(trainset)
        trimmed = high_variance_trimming(testset, frequency, variance)
        fold_rmse.append(accuracy.rmse(model.test(trimmed), verbose=False))
    mean_rmse = np.mean(fold_rmse)
    print('k: %s | RMSE: %f' % (k, mean_rmse))
    return mean_rmse
def svd_algorithm() -> SVD:
    """Interactively build an SVD model.

    Returns a default SVD when the user answers 'Y' (case-insensitive);
    otherwise prompts for hyper-parameters.

    Bug fix: the custom parameters are now passed by keyword.  The old
    positional call SVD(n_factors, n_epochs, lr_all) bound the learning rate
    to SVD's *third* positional parameter (`biased`), so the entered learning
    rate was silently ignored and bias handling was corrupted.
    """
    user_input = input(
        'Do you want to continue with the default parameters? Y/N')
    if user_input.lower() == 'y':
        return SVD()
    n_factors = int(input('Enter total number of factors: '))
    n_epochs = int(input('Enter number of epochs: '))
    lr_all = float(
        input('Enter the learning rate for all the paramaters: '))
    return SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all)
def mfb_compute_prec_rec(t):
    """Mean precision and recall over 10 folds for recommendation size t."""
    prec_per_fold, rec_per_fold = [], []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        model = SVD(n_factors=mfb_best_k, random_state=42)
        model.fit(trainset)
        trimmed = trim_unpopular_user(testset, t, threshold)
        predictions = model.test(trimmed)
        prec_by_user, rec_by_user = calculate_precision_recall(
            predictions, t, threshold)
        prec_per_fold.append(np.mean(list(prec_by_user.values())))
        rec_per_fold.append(np.mean(list(rec_by_user.values())))
    return np.mean(prec_per_fold), np.mean(rec_per_fold)
def MF_trim_filter(ratings, dims, func, mv_dict):
    """10-fold CV of SVD over several factor counts on a trimmed test set.

    Args:
        ratings: DataFrame with userId/movieId/rating columns.
        dims: sequence of latent-factor counts to evaluate.
        func: trimming callable invoked as func(mv_dict, testset); it is
            expected to mutate testset in place before evaluation —
            TODO confirm against the trimming helpers.
        mv_dict: movie metadata passed through to func.

    Prints per-k RMSE/MAE, plots both curves, and reports the optima.
    Cleanup: removed an unused `full_data` local that concatenated the
    train and test sets every fold for no purpose.
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    # False doubles as a "no minimum yet" sentinel for the first iteration
    min_RMSE = False
    min_MAE = False
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)
    for k in range(len(dims)):
        svd = SVD(n_factors=dims[k], random_state=42)
        test_rmse = np.array([])
        test_mae = np.array([])
        for trainset, testset in kf.split(data):
            svd.fit(trainset)
            func(mv_dict, testset)  # trim the test set in place
            pred = svd.test(testset)
            test_rmse = np.append(test_rmse,
                                  accuracy.rmse(pred, verbose=False))
            test_mae = np.append(test_mae,
                                 accuracy.mae(pred, verbose=False))
        RMSE[k] = np.mean(test_rmse)
        if ((not min_RMSE) or RMSE[k] < min_RMSE):
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]
        MAE[k] = np.mean(test_mae)
        if ((not min_MAE) or MAE[k] < min_MAE):
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
        print('For k = %i :' % dims[k])
        print('RMSE: ', RMSE[k])
        print('MAE: ', MAE[k])
    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minumun Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minumun Average MAE is ', min_MAE)
def Q15and22and29(qNum, bestK, thres=(2.5, 3, 3.5, 4)):
    """Plot ROC curves at several thresholds for one tuned model.

    Args:
        qNum: 15 -> KNNWithMeans, 22 -> NMF, anything else -> SVD.
        bestK: the tuned k / n_factors for the selected model.
        thres: thresholds at which true ratings are binarized.

    Fixes: no longer shadows the builtin `range` (renamed rating_range),
    and the mutable default argument list is now an immutable tuple.
    """
    rating_range = 5.0  # ratings span 0..5; used to normalize scores
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    if qNum == 15:
        model = KNNWithMeans(bestK, sim_options=sim_options)
    elif qNum == 22:
        model = NMF(n_factors=bestK)
    else:
        model = SVD(n_factors=bestK)
    model.fit(trainset)
    pred = model.test(testset)
    for thrs in thres:
        np_true = np.array([])
        np_score = np.array([])
        for u, i, t, p, d in pred:
            # binarize the true rating at the current threshold
            np_true = np.append(np_true, 1 if t >= thrs else 0)
            np_score = np.append(np_score, p / rating_range)
        title = 'Threshold ' + str(thrs)
        plot_ROC(np_true, np_score, title=title)
def slot_select_algo_combobox(self):
    """Instantiate the recommender picked in the combo box and log the load."""
    self.algo_change_flag = True
    self.algo_trained_flag = False
    algo_name = self.select_algo_comboBox.currentText()
    # dispatch table: combo-box text -> (algorithm factory, log message)
    registry = {
        'SVD': (SVD, '加载SVD模型...'),
        'SVD++': (SVDpp, '加载SVD++模型...'),
        'NMF': (NMF, '加载NMF模型...'),
        'Slope One': (SlopeOne, '加载Slope One模型...'),
        'k-NN': (KNNBasic, '加载k-NN模型...'),
        'Centered k-NN': (KNNWithMeans, '加载Centered k-NN模型...'),
        'k-NN Baseline': (KNNBaseline, '加载k-NN Baseline模型...'),
        'Co-Clustering': (CoClustering, '加载Co-Clustering模型...'),
        'Baseline': (BaselineOnly, '加载Baseline模型...'),
        'Random': (NormalPredictor, '加载Random模型...'),
    }
    entry = registry.get(algo_name)
    # unknown names fall through silently, exactly like the original chain
    if entry is not None:
        factory, message = entry
        self.algo = factory()
        self.display_process_label.append(message)
def recommand(self):
    """Plot precision/recall vs. t for the best KNN, NMF and SVD models."""
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    knn_model = knns.KNNWithMeans(k=20, sim_options=sim_options)
    t_values, precisions_knn, recall_knn = self.test_with_t_and_k(
        knn_model, msg='KNN')
    nmf_model = matrix_factorization.NMF(n_factors=20, biased=False)
    t_values, precisions_nmf, recall_nmf = self.test_with_t_and_k(
        nmf_model, msg='NMF')
    svd_model = SVD(20)
    t_values, precisions_svd, recall_svd = self.test_with_t_and_k(
        svd_model, msg='SVD')
    # draw all six curves on one labeled figure
    curves = [
        (precisions_knn, 'precisions_knn'),
        (precisions_nmf, 'precisions_nmf'),
        (precisions_svd, 'precisions_svd'),
        (recall_knn, 'recall_knn'),
        (recall_nmf, 'recall_nmf'),
        (recall_svd, 'recall_svd'),
    ]
    for series, label in curves:
        plt.plot(t_values, series, label=label)
    plt.xlabel('t_value')
    plt.ylabel('percent')
    plt.legend(loc="best")
    plt.show()
def Question24(data):
    """Sweep SVD latent factors k = 2..50 with 10-fold CV; plot and report."""
    ks = range(2, 51, 2)
    RMSE, MAE = [], []
    for k in ks:
        scores = cross_validate(SVD(n_factors=k), data, cv=10)
        RMSE.append(np.mean(scores['test_rmse']))
        MAE.append(np.mean(scores['test_mae']))
    # Plot
    plt.plot(ks, RMSE)
    plt.xlabel('k')
    plt.ylabel('Average RMSE')
    plt.savefig('Q24_RMSE.png')
    plt.figure()
    plt.plot(ks, MAE)
    plt.xlabel('k')
    plt.ylabel('Average MAE')
    plt.savefig('Q24_MAE.png')
    best = np.argmin(RMSE)
    print("Best k: %i" % ks[best])
    print("Lowest RMSE: %f" % RMSE[best])
    print("Lowest MAE: %f" % np.min(MAE))
def build_model(train, method='svd'):
    """Builds model and makes predictions for user-book rating.

    Args:
        train (surprise trainset): training set for the model to train on.
        method (string): Method to use. Either 'knn' or 'svd'.
            Default is 'svd'.

    Returns:
        list of Prediction objects over the anti-testset (all unrated pairs).
    """
    if method == 'knn':
        surprise_sim_opt = {'name': 'cosine', 'user_based': False}
        model = KNNBasic(k=100, min_k=20, sim_options=surprise_sim_opt)
    else:
        model = SVD(n_epochs=50)
    model.fit(train)
    # Bug fix: the anti-testset was stored in `test` but the undefined name
    # `testset` was passed to model.test(), raising NameError at runtime.
    testset = train.build_anti_testset()
    pred = model.test(testset)
    return pred
def run_and_test_all_models(self):
    """Run KNN, NMF and SVD sweeps, plot their ROC curves, then naive filters.

    Fix: all three ROC curves were drawn with color='blue' (copy-paste), so
    the labeled comparison plot was unreadable; each model now has its own
    color (KNN blue, NMF green, SVD red).
    """
    step_size = 2
    # KNN
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    algo = knns.KNNWithMeans
    args = {'sim_options': sim_options}
    best_model = knns.KNNWithMeans(k=20, sim_options=sim_options)
    roc_auc_KNN = self.run_and_test_model(algo, args, best_model,
                                          (2, 101, step_size), 'KNN')
    # NMF
    algo = matrix_factorization.NMF
    args = {'biased': False}
    best_model = matrix_factorization.NMF(n_factors=20, biased=False)
    roc_auc_NMF = self.run_and_test_model(algo, args, best_model,
                                          (2, 51, step_size), 'NMF')
    # SVD
    algo = matrix_factorization.SVD
    args = {}
    best_model = SVD(20)
    roc_auc_SVD = self.run_and_test_model(algo, args, best_model,
                                          (2, 51, step_size), 'SVD')
    # overlay all three ROC curves per sweep entry
    for i in range(len(roc_auc_KNN)):
        plt.plot(roc_auc_KNN[i][0], roc_auc_KNN[i][1], color='blue',
                 linewidth=2.0, label='KNN')
        plt.plot(roc_auc_NMF[i][0], roc_auc_NMF[i][1], color='green',
                 linewidth=2.0, label='NMF')
        plt.plot(roc_auc_SVD[i][0], roc_auc_SVD[i][1], color='red',
                 linewidth=2.0, label='SVD')
        plt.plot([0, 1], [0, 1], color='yellow', linewidth=2.0)
        plt.xlabel('FPR')
        plt.ylabel('TPR')
        plt.legend(loc="lower right")
        plt.show()
    # NaiveFilter
    self.run_naive_filter(msg='normal')
    self.run_naive_filter(test_filter=trimPopular, msg='trimPopular')
    self.run_naive_filter(test_filter=trimUnpopular, msg='trimUnpopular')
    self.run_naive_filter(test_filter=trimHighVariance, msg='trimHighVariance')
def rank_predictions(model_name):
    """Plot average precision/recall vs. recommendation-list size t (1..25).

    Args:
        model_name: 'KNN' -> KNNWithMeans(k=22), 'NNMF' -> NMF(k=20),
            anything else -> SVD(k=26).

    Returns:
        (precision_arr, recall_arr): per-t means over 10 folds.

    Fix: the recall plot title wrongly read "... plot using MF <model_name>";
    the stray "MF " is removed so all three titles are consistent.
    """
    k_KNN = 22
    k_NNMF = 20
    k_MF = 26
    if model_name == 'KNN':
        sim_options = {
            'name': 'pearson_baseline',
            'shrinkage': 0
        }
        model = KNNWithMeans(k_KNN, sim_options=sim_options)
    elif model_name == 'NNMF':
        model = NMF(n_factors=k_NNMF)
    else:
        model = SVD(n_factors=k_MF)
    precision_arr = []
    recall_arr = []
    for t in range(1, 26):
        kf = KFold(n_splits=10)
        print(t)
        p = []
        r = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            p.append(sum(prec for prec in precisions.values()) / len(precisions))
            r.append(sum(rec for rec in recalls.values()) / len(recalls))
        precision_arr.append(np.mean(np.array(p)))
        recall_arr.append(np.mean(np.array(r)))
    # precision vs t
    plt.plot(list(range(1, 26)), precision_arr)
    plt.xlabel("Size")
    plt.ylabel("Precision")
    plt.title("The average precision plot using " + model_name)
    plt.show()
    # recall vs t
    plt.plot(list(range(1, 26)), recall_arr)
    plt.xlabel("Size")
    plt.ylabel("Recall")
    plt.title("The average recall plot using " + model_name)
    plt.show()
    # precision vs recall
    plt.plot(recall_arr, precision_arr)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("The average precision and recall plot using " + model_name)
    plt.show()
    return precision_arr, recall_arr
def train_svd(data):
    """10-fold CV of SVD for k = 2..50 factors.

    Args:
        data: a surprise Dataset.

    Returns:
        (rmse, mae): lists of mean test RMSE/MAE, one entry per k.

    Cleanup: removed an unused `sim_options` dict (SVD takes none) and
    renamed the misleading local `nmf` — the model is an SVD — to `svd`.
    """
    rmse = []
    mae = []
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        svd = SVD(n_factors=k)
        scores = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10)
        rmse.append(np.mean(scores['test_rmse']))
        mae.append(np.mean(scores['test_mae']))
    print("k-fold validation finished!")
    return (rmse, mae)
def train_trim_svd(data, R):
    """RMSE of SVD (k = 2..50) on popular / unpopular / high-variance trims.

    Returns a list of three per-k RMSE lists, in trim order (p, u, hv).
    """
    kfold = KFold(n_splits=10)
    rmse_list = [[], [], []]
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        model = SVD(n_factors=k)
        fold_scores = ([], [], [])  # one list per trim variant
        for trainset, testset in kfold.split(data):
            model.fit(trainset)
            trimmed_sets = trim(testset, R)
            for scores, subset in zip(fold_scores, trimmed_sets):
                scores.append(accuracy.rmse(model.test(subset)))
        for per_k, scores in zip(rmse_list, fold_scores):
            per_k.append(np.mean(scores))
    print("SVD with trim is finished!!")
    return rmse_list
def Q34():
    """Overlay ROC curves (binarized at rating 3) for KNN, NMF and SVD."""
    rang = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    # tuned models, keyed by display name (insertion order matters below)
    models = {
        'KNN': KNNWithMeans(22, sim_options=sim_options),
        'NNMF': NMF(n_factors=18),
        'SVD': SVD(n_factors=8),
    }
    fp = {}
    tp = {}
    area = np.array([])
    for key, model in models.items():
        model.fit(trainset)
        pred = model.test(testset)
        np_true = np.array([])
        np_score = np.array([])
        for _, _, t, p, _ in pred:
            np_true = np.append(np_true, 1 if t >= 3 else 0)
            np_score = np.append(np_score, p / rang)
        fpr, tpr, thresholds = roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        roc_auc = auc(fpr, tpr)
        fp[key] = fpr
        tp[key] = tpr
        area = np.append(area, roc_auc)
    plt.figure()
    lw = 2
    for mod, f, t, roc_auc in zip(['KNN', 'NNMF', 'SVD'], fp, tp, area):
        plt.plot(fp[f], tp[t], lw=lw,
                 label='%s ROC curve (area = %0.2f)' % (mod, roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    plt.show()
    plt.close()
def MF_bin_pre(ratings, ts, nmf_fac, thrd):
    """Binarize held-out true ratings at thrd; return them with SVD scores.

    Returns:
        (bi_rating, pred_rating): 0/1 ground-truth array and the raw SVD
        estimates for the same test pairs.
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    trainset, testset = train_test_split(data, test_size=ts)
    algo = SVD(n_factors=nmf_fac, random_state=42)
    algo.fit(trainset)
    pre = algo.test(testset)
    # prediction tuples are (uid, iid, true_rating, estimate, details)
    true_rating = np.array([p[2] for p in pre], dtype=float)
    pred_rating = np.array([p[3] for p in pre], dtype=float)
    bi_rating = np.where(true_rating >= thrd, 1.0, 0.0)
    return bi_rating, pred_rating
def trimmed_test_MF(data, choice=0):
    """Average 10-fold RMSE of SVD on a trimmed test set, for k = 2..50."""
    ks = range(2, 51, 2)
    avg_RMSEs = []
    for k in ks:
        kf = KFold(n_splits=10)
        rmse_sum = 0
        for trainset, testset in kf.split(data):
            trimmed = trim(data, testset, choice)
            model = SVD(n_factors=k).fit(trainset)
            rmse_sum += rmse(model.test(trimmed), verbose=False)
        avg_RMSEs.append(rmse_sum / 10.0)
    # Plot
    plt.plot(ks, avg_RMSEs)
    plt.xlabel('k')
    plt.ylabel('Average RMSE')
    plt.savefig('RMSE_' + str(choice) + '.png')
    best = np.argmin(avg_RMSEs)
    print("Best k: %i" % ks[best])
    print("Lowest RMSE: %f" % avg_RMSEs[best])
def vary_factors(setNum, n_factors):
    """Cross-validate SVD on one training split for each factor count.

    Args:
        setNum: suffix of the ../data/train_<setNum>.csv file to load.
        n_factors: iterable of latent-factor counts to evaluate.

    Returns:
        (train_errors, val_errors): mean train/test RMSE per factor count.

    Cleanup: removed an unused build_full_trainset() local — cross_validate
    performs its own splitting on the Dataset.
    """
    reader = Reader(rating_scale=(0, 10))
    train = pd.read_csv('../data/train_' + str(setNum) + '.csv', sep=';')
    train_set = Dataset.load_from_df(
        train[['User-ID', 'ISBN', 'Book-Rating']], reader=reader)
    train_errors = []
    val_errors = []
    for f in n_factors:
        svd = SVD(n_factors=f)
        cv = cross_validate(svd, train_set, return_train_measures=True,
                            n_jobs=-2, verbose=True)
        train_errors.append(np.mean(cv['train_rmse']))
        val_errors.append(np.mean(cv['test_rmse']))
    return train_errors, val_errors
def __init__(self):
    """Set up the main window: UI wiring, default paths, and the SVD model."""
    super(Window,self).__init__()
    self.setupUi(self)  # must run before any widget attribute is touched
    self.connect_slot_function()
    self.current_path = os.getcwd()
    # default file locations, relative to the working directory
    self.dataset_path='./dataset/data.csv'
    self.result_path = './result/pre_result.txt'
    self.help_file_path='./help/help.txt'
    self.max_totalnum=10000  # cap on records handled from the dataset
    self.cut_num=0
    self.algo = SVD()  # default recommender until the user selects another
    self.display_process_label.append('初始化加载SVD模型.')
    # flags tracked by the algorithm-selection / training slots
    self.algo_change_flag = False
    self.algo_trained_flag=False
    self.init_dir()  # presumably creates the dataset/result dirs — verify
def Q26To28(qNum, n_splits=10):
    """Average RMSE of SVD (k = 2..50) on a trimmed test set.

    Args:
        qNum: trim selector — 26 popular, 27 unpopular, 28 high-variance.
        n_splits: number of CV folds (default 10).

    Returns:
        list of mean RMSE values, one per k.

    Fixes: the n_splits parameter is now honored (KFold was hard-coded to
    10), and the debug print actually interpolates the test-set size — it
    previously printed the raw "%d" format string and the number as two
    separate print arguments.
    """
    data = load_data()
    kf = KFold(n_splits=n_splits)
    trimFun = {26: popularTrim, 27: unpopularTrim, 28: highVarTrim}
    RMSE = []
    for k in range(2, 52, 2):
        MF_svd = SVD(n_factors=k)
        subRMSE = []
        for trainSet, testSet in kf.split(data):
            subsubRMSE = 0
            MF_svd.fit(trainSet)
            testSet = trimFun[qNum](testSet)
            nTest = len(testSet)
            print("test set size after trimming: %d" % nTest)
            for (r, c, rating) in testSet:
                predictedRating = MF_svd.predict(str(r), str(c))
                subsubRMSE += pow(rating - predictedRating.est, 2)
            # RMSE of this train-test split
            subRMSE.append(np.sqrt(subsubRMSE / nTest))
        # average over all train-test splits for this k
        RMSE.append(np.mean(subRMSE))
    return RMSE
def run_svd(data, params, svdpp=False):
    '''Returns trained SVD model based on matrix factorization'''
    def param(key):
        # shorthand for pulling one hyper-parameter out of params
        return utils.get_param(params, key)

    shared = {
        'n_factors': param('n_factors'),
        'n_epochs': param('n_epochs'),
        'lr_all': param('learning_rate'),
        'reg_all': param('reg'),
        'verbose': True,
    }
    if svdpp:
        alg = SVDpp(**shared)
    else:
        # plain SVD additionally takes the bias switch
        alg = SVD(biased=param('biased'), **shared)
    alg.fit(data)
    return alg
def plot_all_ROC():
    """Draw ROC curves (binarized at rating 3) for KNN, NMF and SVD."""
    rang = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    trainset, testset = train_test_split(data, test_size=0.1)
    # tuned models, keyed by name (insertion order matters below)
    models = {
        'KNN': KNNWithMeans(22, sim_options=sim_options),
        'NNMF': NMF(n_factors=18),
        'SVD': SVD(n_factors=8),
    }
    fp = {}
    tp = {}
    area = np.array([])
    for key, model in models.items():
        model.fit(trainset)
        pred = model.test(testset)
        np_true = np.array([])
        np_score = np.array([])
        for _, _, t, p, _ in pred:
            np_true = np.append(np_true, 1 if t >= 3 else 0)
            np_score = np.append(np_score, p / rang)
        fpr, tpr, thresholds = metrics.roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        roc_auc = metrics.auc(fpr, tpr)
        fp[key] = fpr
        tp[key] = tpr
        area = np.append(area, roc_auc)
    plt.figure()
    lw = 2
    for mod, f, t, roc_auc in zip(['k-NN', 'NNMF', 'MF'], fp, tp, area):
        plt.plot(fp[f], tp[t], lw=lw, label='%s' % mod)
    plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.show()
def grid_search(surprise_model):
    """Build a GridSearchCV over a suitable hyper-parameter grid.

    Args:
        surprise_model: a surprise algorithm *class* — one of SVDpp, SVD,
            NMF or BaselineOnly.

    Returns:
        An unfitted GridSearchCV (rmse/mae, 3-fold, refit=True).

    Raises:
        ValueError: for an unsupported class.  Previously an unknown class
        fell through every branch and crashed with NameError on `gs`.
    """
    # exact-type dispatch, as in the original (SVDpp subclasses SVD, so
    # isinstance would be wrong here); one probe instance instead of two
    model_type = type(surprise_model())
    if model_type is SVDpp:
        param_grid = {'n_factors': [20], 'n_epochs': [20],
                      'lr_all': [0.005, 0.007, 0.05, 0.07, 0.5, 0.7, 1.0],
                      'reg_all': [0.02, 0.05, 0.2, 0.5]}
    elif model_type is SVD:
        param_grid = {'n_epochs': [20],
                      'lr_all': [0.005, 0.007, 0.05, 0.07, 0.5, 0.7, 1.0],
                      'reg_all': [0.02, 0.05, 0.2, 0.5]}
    elif model_type is NMF:
        param_grid = {'n_epochs': [20],
                      'reg_pu': [0.02, 0.04, 0.06, 0.08, 0.2],
                      'reg_qi': [0.02, 0.04, 0.06, 0.08, 0.2]}
    elif model_type is BaselineOnly:
        param_grid = {'bsl_options': {'method': ['als', 'sgd'],
                                      'reg': [1, 2],
                                      'learning_rate': [0.005, 0.05, 0.5, 1.0]}}
    else:
        raise ValueError('Unsupported model class: %r' % surprise_model)
    return GridSearchCV(surprise_model, param_grid, measures=['rmse', 'mae'],
                        cv=3, n_jobs=-1, joblib_verbose=1, refit=True)
def MF_bias_filter(ratings, dims):
    """10-fold CV of SVD per factor count; plot RMSE/MAE and report optima."""
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    # False doubles as a "no minimum yet" sentinel for the first pass
    min_RMSE = False
    min_MAE = False
    fac_num_RMSE = 0
    fac_num_MAE = 0
    for idx, dim in enumerate(dims):
        model = SVD(n_factors=dim, random_state=42)
        cv = cross_validate(algo=model, data=data, measures=['RMSE', 'MAE'],
                            cv=10, verbose=True)
        RMSE[idx] = np.mean(cv['test_rmse'])
        if (not min_RMSE) or RMSE[idx] < min_RMSE:
            min_RMSE = RMSE[idx]
            fac_num_RMSE = dim
        MAE[idx] = np.mean(cv['test_mae'])
        if (not min_MAE) or MAE[idx] < min_MAE:
            min_MAE = MAE[idx]
            fac_num_MAE = dim
    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minumun Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minumun Average MAE is ', min_MAE)
def Q24():
    """10-fold CV of SVD for k = 2..50; plot mean RMSE and mean MAE."""
    # so far using same code as Q10, Q12-14 for Q24, Q26-28, can combine code later
    # only using SVD for Q24 for now, but the RMSE and MAE don't change much with latent factor
    data = load_data()
    meanRMSE, meanMAE = [], []
    start = time.time()
    factor_counts = list(range(2, 52, 2))
    for k in factor_counts:
        scores = cross_validate(SVD(n_factors=k), data,
                                measures=['RMSE', 'MAE'], cv=10)
        meanRMSE.append(np.mean(scores['test_rmse']))
        meanMAE.append(np.mean(scores['test_mae']))
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    print("Total time used for cross validation: " + cv_time)
    # plot RMSE and MAE separately: the trend is hard to see on one graph
    make_plot(factor_counts, [[meanRMSE, 'mean RMSE']],
              'Number of Neighbors', 'Error')
    make_plot(factor_counts, [[meanMAE, 'mean MAE']],
              'Number of Neighbors', 'Error')
    return meanRMSE, meanMAE
import matplotlib.pyplot as plt
import numpy as np

# NOTE(review): relies on `os`, Reader, Dataset, SVD and cross_validate
# being imported elsewhere in this file — confirm.
file_path = os.path.expanduser('ml-latest-small/ratings_new.csv')
reader = Reader(sep=',')
data = Dataset.load_from_file(file_path, reader=reader)

# sweep the number of latent factors and record 10-fold CV errors
avg_rmse = []
avg_mae = []
all_k = list(range(2, 52, 2))
for k in all_k:
    print('k = ', k)
    model = SVD(n_factors=k)
    scores = cross_validate(model, data, measures=['RMSE', 'MAE'],
                            cv=10, verbose=True)
    avg_rmse.append(np.mean(scores['test_rmse']))
    avg_mae.append(np.mean(scores['test_mae']))

print("min rmse k:", avg_rmse.index(min(avg_rmse)))
print("min mae k:", avg_mae.index(min(avg_mae)))

plt.plot(all_k, avg_rmse)
plt.savefig('plot/mf_rmse_k.png')
plt.clf()
plt.plot(all_k, avg_mae)
plt.savefig('plot/mf_mae_k.png')
plt.clf()
from surprise import Dataset, evaluate, Reader, KNNBasic
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVD

from DataProcessing.dataprocessing import ratings

print("Training SVD Algorithm")

reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# Fix: dropped data.split(n_folds=5) — Dataset.split() was removed from
# surprise (folds now come from surprise.model_selection), and the folds
# were never used here anyway: the model trains on the full trainset below.
svd = SVD()

trainset = data.build_full_trainset()
svd.fit(trainset)


def recColl(userid, movieid, gt=None):
    """Predict userid's rating for movieid; gt is forwarded as the true
    rating (r_ui) so the Prediction can carry it for evaluation."""
    return svd.predict(userid, movieid, gt)
sim_options = {'name': 'pearson', 'user_based': True } trainset, testset = train_test_split(data, test_size=0.1) algo = KNNWithMeans(k=34, sim_options=sim_options) algo.fit(trainset) predictions1 = algo.test(testset) algo = NMF(n_factors=16) algo.fit(trainset) predictions2 = algo.test(testset) algo = SVD(n_factors=14) algo.fit(trainset) predictions3 = algo.test(testset) y_true = [] y_estimate1 = [] y_estimate2 = [] y_estimate3 = [] for row in predictions1: if row[2] >= threshold: y_true.append(1) else: y_true.append(0) for row in predictions1:
algo.fit(trainset) # print testset predictions = algo.test(testset) Prec, Reca = metrics(predictions, t) pr = pr + Prec re = re + Reca return pr / 10.0, re / 10.0 if __name__ == '__main__': data = retrieve_data() G_max = ret_mod_user_dict(data) algo_NMF = NMF(NMF_no_of_LF, verbose=False) algo_SVD = SVD(n_factors=MF_no_of_LF) algo_KNN = KNNWithMeans(k=KNN_no_of_LF, sim_options=sim_options, verbose=False) # Q36 Pr1 = [] Re1 = [] t = list(range(1, 26)) for l in t: Precision, Recall = cross_val_(data, G_max, l, algo_KNN) Pr1.append(Precision) Re1.append(Recall) plotgraphs(t, Pr1, "Number of Suggestions", "Precision", "Precision Curve for KNN")