def nmf_ratings_predicate(observed_ratings_df, truth_ratings_df, fold='0', phase='eval'):
    """nmf_ratings predicates."""
    print("NMF predicates")
    nmf_model = NMF()
    reader = Reader(rating_scale=(0.2, 1))
    train_dataset = Dataset.load_from_df(
        df=observed_ratings_df.reset_index().loc[:, ['userId', 'movieId', 'rating']],
        reader=reader)
    nmf_model.fit(train_dataset.build_full_trainset())

    # Predict a rating for every (userId, movieId) pair in the truth frame.
    predictions = pd.DataFrame(index=truth_ratings_df.index, columns=['rating'])
    for row in truth_ratings_df.loc[:, ['rating']].iterrows():
        uid = row[0][0]
        iid = row[0][1]
        predictions.loc[(uid, iid), 'rating'] = nmf_model.predict(uid, iid).est

    write(predictions, 'nmf_rating_obs', fold, phase)
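# The write() helper above is not shown. In a PSL-style predicate pipeline
# like this one, it presumably serializes the predicate frame to a
# tab-separated file; a minimal sketch under that assumption (the directory
# layout here is hypothetical):
import os

def write(predictions_df, predicate_name, fold='0', phase='eval'):
    out_dir = os.path.join('data', fold, phase)
    os.makedirs(out_dir, exist_ok=True)
    predictions_df.to_csv(os.path.join(out_dir, predicate_name + '.txt'),
                          sep='\t', header=False)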
def nmf_algorithm() -> NMF:
    user_input = input('Do you want to continue with the default parameters? Y/N ')
    if user_input.lower() == 'y':
        return NMF()
    n_factors = int(input('Enter total number of factors: '))
    n_epochs = int(input('Enter number of epochs: '))
    return NMF(n_factors=n_factors, n_epochs=n_epochs)
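# A minimal usage sketch for nmf_algorithm(); the data-loading step is
# illustrative (ml-100k via Surprise's built-in downloader), not part of
# the original code.
from surprise import Dataset
from surprise.model_selection import cross_validate

if __name__ == '__main__':
    data = Dataset.load_builtin('ml-100k')
    algo = nmf_algorithm()  # prompts on stdin for NMF parameters
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)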
def nmf_compute_high_var_trim_rmse(k):
    # `R`, `frequency`, and `variance` come from the enclosing module scope.
    nmf = NMF(n_factors=k, random_state=42)
    rmse = []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        nmf.fit(trainset)
        # Keep only high-variance movies in the test set before scoring.
        testset_trimmed = high_variance_trimming(testset, frequency, variance)
        pred = nmf.test(testset_trimmed)
        rmse.append(accuracy.rmse(pred, verbose=False))
    print('k: %s | RMSE: %f' % (k, np.mean(rmse)))
    return np.mean(rmse)
def nmf_compute_prec_rec(t):
    # `R`, `nmf_best_k`, and `threshold` come from the enclosing module scope.
    precision, recall = [], []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        nmf = NMF(n_factors=nmf_best_k, random_state=42)
        nmf.fit(trainset)
        trimmed_testset = trim_unpopular_user(testset, t, threshold)
        pred = nmf.test(trimmed_testset)
        precision_dict, recall_dict = calculate_precision_recall(pred, t, threshold)
        precision.append(np.mean(list(precision_dict.values())))
        recall.append(np.mean(list(recall_dict.values())))
    return np.mean(precision), np.mean(recall)
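# calculate_precision_recall() is assumed above but not shown. A sketch in
# the spirit of the Surprise FAQ's precision/recall@k recipe, assuming `t`
# is the recommendation-list size and `threshold` separates relevant from
# irrelevant ratings:
from collections import defaultdict

def calculate_precision_recall(predictions, t, threshold):
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Sort each user's predictions by estimated rating, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_t = sum(est >= threshold for est, _ in user_ratings[:t])
        n_rel_and_rec_t = sum(est >= threshold and true_r >= threshold
                              for est, true_r in user_ratings[:t])
        precisions[uid] = n_rel_and_rec_t / n_rec_t if n_rec_t else 0
        recalls[uid] = n_rel_and_rec_t / n_rel if n_rel else 0
    return precisions, recalls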
def Q23(col=0):
    print('Chosen column is ' + str(col))
    data = np.loadtxt('ml-latest-small/ratings.csv', delimiter=',',
                      skiprows=1, usecols=(0, 1, 2))
    row_userId = data[:, :1].astype(int)
    row_movieId = data[:, 1:2].astype(int)
    row_rating = data[:, 2:3]

    # Map each distinct movieId to a dense column index (sorted order).
    sortedId = np.sort(row_movieId.transpose()[0])
    m = {}
    idx = 0
    last = None
    for i in sortedId.tolist():
        if i != last:
            m[i] = idx
            idx += 1
            last = i

    # Fit NMF on (almost) the full dataset; test_size=0.0001 leaves a
    # negligible hold-out so nearly all ratings are used for training.
    data = load_data()
    model = NMF(n_factors=20)
    trainset, testset = train_test_split(data, test_size=0.0001)
    model.fit(trainset)
    U = model.pu  # user factors (unused below)
    V = model.qi  # item factors

    import csv
    dict_ID_to_genre = {}
    with open('ml-latest-small/movies.csv', 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        cnt = 0
        for row in reader:
            if cnt != 0:
                dict_ID_to_genre[row[0]] = row[1:]
            cnt += 1

    # Invert the movieId -> column mapping. Note: this assumes Surprise's
    # inner item ids follow the same sorted-movieId order as `m`.
    dict_col_to_ID = {v: k for k, v in m.items()}

    # Report the genres of the 10 movies loading most heavily on factor `col`.
    V_col = V[:, col]
    V_col_sort_top10 = np.sort(V_col)[::-1][:10]
    V_col_list = V_col.tolist()
    for val in V_col_sort_top10:
        ind = V_col_list.index(val)
        m_id = dict_col_to_ID[ind]
        genre = dict_ID_to_genre[str(m_id)]
        print(genre[-1])
def NMF_trim_filter(ratings, dims, func, mv_dict):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)
    for k in range(len(dims)):
        nmf = NMF(n_factors=dims[k], random_state=42)
        test_rmse = np.array([])
        test_mae = np.array([])
        for trainset, testset in kf.split(data):
            nmf.fit(trainset)
            full_data = trainset.build_testset() + testset  # computed but not used below
            func(mv_dict, testset)  # func presumably trims the test set in place
            pred = nmf.test(testset)
            test_rmse = np.append(test_rmse, accuracy.rmse(pred, verbose=False))
            test_mae = np.append(test_mae, accuracy.mae(pred, verbose=False))
        RMSE[k] = np.mean(test_rmse)
        if min_RMSE is None or RMSE[k] < min_RMSE:
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]
        MAE[k] = np.mean(test_mae)
        if min_MAE is None or MAE[k] < min_MAE:
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
        print('For k = %i :' % dims[k])
        print('RMSE: ', RMSE[k])
        print('MAE: ', MAE[k])

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minimum Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum Average MAE is ', min_MAE)
def slot_select_algo_combobox(self):
    self.algo_change_flag = True
    self.algo_trained_flag = False
    algo_name = self.select_algo_comboBox.currentText()
    if algo_name == 'SVD':
        self.algo = SVD()
        self.display_process_label.append('Loading SVD model...')
    elif algo_name == 'SVD++':
        self.algo = SVDpp()
        self.display_process_label.append('Loading SVD++ model...')
    elif algo_name == 'NMF':
        self.algo = NMF()
        self.display_process_label.append('Loading NMF model...')
    elif algo_name == 'Slope One':
        self.algo = SlopeOne()
        self.display_process_label.append('Loading Slope One model...')
    elif algo_name == 'k-NN':
        self.algo = KNNBasic()
        self.display_process_label.append('Loading k-NN model...')
    elif algo_name == 'Centered k-NN':
        self.algo = KNNWithMeans()
        self.display_process_label.append('Loading Centered k-NN model...')
    elif algo_name == 'k-NN Baseline':
        self.algo = KNNBaseline()
        self.display_process_label.append('Loading k-NN Baseline model...')
    elif algo_name == 'Co-Clustering':
        self.algo = CoClustering()
        self.display_process_label.append('Loading Co-Clustering model...')
    elif algo_name == 'Baseline':
        self.algo = BaselineOnly()
        self.display_process_label.append('Loading Baseline model...')
    elif algo_name == 'Random':
        self.algo = NormalPredictor()
        self.display_process_label.append('Loading Random model...')
def Q15and22and29(qNum, bestK, thres=[2.5, 3, 3.5, 4]):
    rating_range = 5.0  # full rating scale, used to normalize scores
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    if qNum == 15:
        model = KNNWithMeans(bestK, sim_options=sim_options)
    elif qNum == 22:
        model = NMF(n_factors=bestK)
    else:
        model = SVD(n_factors=bestK)
    model.fit(trainset)
    pred = model.test(testset)

    # Binarize the true ratings at each threshold and plot an ROC curve.
    for thrs in thres:
        np_true = np.array([])
        np_score = np.array([])
        for u, i, t, p, d in pred:
            t = 1 if t >= thrs else 0
            np_true = np.append(np_true, t)
            np_score = np.append(np_score, p / rating_range)
        title = 'Threshold ' + str(thrs)
        plot_ROC(np_true, np_score, title=title)
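# plot_ROC() is assumed above but not shown. A minimal sketch using
# scikit-learn, consistent with the inline ROC code in Q34 further down:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def plot_ROC(y_true, y_score, title='ROC'):
    fpr, tpr, _ = roc_curve(y_true, y_score)
    plt.plot(fpr, tpr, label='AUC = %0.2f' % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()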
def problem_17_rmse_mae_full_dataset():
    x_axis = range(2, 52, 2)
    dim = len(x_axis)
    rmse_test_store = np.zeros(dim)
    mae_test_store = np.zeros(dim)
    for i in x_axis:
        algo = NMF(n_factors=i, verbose=False)  # i = number of latent factors
        result = cross_validate(algo, data, measures=['rmse', 'mae'],
                                cv=10, verbose=True)
        rmse_score = np.mean(result['test_rmse'])
        mae_score = np.mean(result['test_mae'])
        # Index at which to store the scores for this factor count.
        ind = int(i / 2 - 1)
        rmse_test_store[ind] = rmse_score
        mae_test_store[ind] = mae_score
    pd.DataFrame(rmse_test_store).to_csv("rmse_test_store_10.csv")
    pd.DataFrame(mae_test_store).to_csv("mae_test_store_10.csv")
    plotgraphs(x_axis, rmse_test_store, 'K', 'Mean RMSE scores', 'Plot', 'q17_rmse.png')
    plotgraphs(x_axis, mae_test_store, 'K', 'Mean MAE scores', 'Plot', 'q17_Mae.png')
def rank_predictions(model_name):
    k_KNN = 22
    k_NNMF = 20
    k_MF = 26
    if model_name == 'KNN':
        sim_options = {'name': 'pearson_baseline', 'shrinkage': 0}
        model = KNNWithMeans(k_KNN, sim_options=sim_options)
    elif model_name == 'NNMF':
        model = NMF(n_factors=k_NNMF)
    else:
        model = SVD(n_factors=k_MF)

    precision_arr = []
    recall_arr = []
    for t in range(1, 26):
        kf = KFold(n_splits=10)
        print(t)
        p = []
        r = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            p.append(sum(prec for prec in precisions.values()) / len(precisions))
            r.append(sum(rec for rec in recalls.values()) / len(recalls))
        precision_arr.append(np.mean(np.array(p)))
        recall_arr.append(np.mean(np.array(r)))

    # precision vs t
    plt.plot(list(range(1, 26)), precision_arr)
    plt.xlabel("Size")
    plt.ylabel("Precision")
    plt.title("The average precision plot using " + model_name)
    plt.show()

    # recall vs t
    plt.plot(list(range(1, 26)), recall_arr)
    plt.xlabel("Size")
    plt.ylabel("Recall")
    plt.title("The average recall plot using " + model_name)
    plt.show()

    # precision vs recall
    plt.plot(recall_arr, precision_arr)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("The average precision and recall plot using " + model_name)
    plt.show()

    return precision_arr, recall_arr
def __init__(self, category, save_name):
    self.category = category
    self.max_user = 10000  # maximum number of users
    self.price_dict = {}
    self.price_dict_temp = {}
    self.cate_dict = {}
    self.cate_dict_temp = {}
    self.top_value = 15  # top x features in SVD
    self.model = NMF()
    self.topk = 500  # maximum items per category when finding the top-k popular
    self.max_price = {}
    self.save_path = os.path.join("..", "feature", save_name)
    if not os.path.isfile(self.save_path):
        self.load_data()  # load raw data
        # self.create_user_item_matrix()
        self.create_ratings()
        self.gen_new_price_dict()
        self.save_data(self.save_path)  # save the feature
    else:
        self.load(self.save_path)  # load the feature
def train_nmf(data):
    rmse = []
    mae = []
    sim_options = {'name': 'pearson'}  # not used by NMF
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        nmf = NMF(n_factors=k)
        temp = cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=10)
        rmse.append(np.mean(temp['test_rmse']))
        mae.append(np.mean(temp['test_mae']))
    print("k-fold validation finished!")
    return (rmse, mae)
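# A possible driver for train_nmf(); the data source and plotting details
# are illustrative, not from the original code.
from surprise import Dataset
import matplotlib.pyplot as plt

if __name__ == '__main__':
    data = Dataset.load_builtin('ml-100k')
    rmse, mae = train_nmf(data)
    ks = list(range(2, 52, 2))  # matches the sweep inside train_nmf
    plt.plot(ks, rmse, label='Average RMSE')
    plt.plot(ks, mae, label='Average MAE')
    plt.xlabel('Number of latent factors')
    plt.legend()
    plt.show()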
def Q19to21(qNum):
    data = load_data()
    kf = KFold(n_splits=10)
    # Keys match the question numbers this function handles (19-21).
    trimFun = {19: popularTrim, 20: unpopularTrim, 21: highVarTrim}

    RMSE = []
    for k in range(2, 20, 2):
        nmf = NMF(n_factors=k)
        subRMSE = []
        for trainSet, testSet in kf.split(data):
            subsubRMSE = 0
            nmf.fit(trainSet)
            testSet = trimFun[qNum](testSet)
            nTest = len(testSet)
            print("test set size after trimming: %d" % nTest)
            predictions = nmf.test(testSet)
            for p in predictions:
                subsubRMSE += pow(p.est - p.r_ui, 2)
            # RMSE of this train-test split
            subRMSE.append(np.sqrt(subsubRMSE / nTest))
        # average over all train-test splits for this k
        RMSE.append(np.mean(subRMSE))
    return RMSE
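# popularTrim/unpopularTrim/highVarTrim are assumed above but not shown.
# A sketch of plausible definitions for a Surprise testset of
# (user, item, rating) tuples; counting ratings within the test set is an
# assumption here -- some projects count over the full dataset instead, as
# the high-variance fragment near the end of this file does.
from collections import defaultdict
import numpy as np

def _ratings_per_movie(testSet):
    ratings = defaultdict(list)
    for _, iid, r in testSet:
        ratings[iid].append(r)
    return ratings

def popularTrim(testSet):
    ratings = _ratings_per_movie(testSet)
    return [x for x in testSet if len(ratings[x[1]]) > 2]

def unpopularTrim(testSet):
    ratings = _ratings_per_movie(testSet)
    return [x for x in testSet if len(ratings[x[1]]) <= 2]

def highVarTrim(testSet):
    ratings = _ratings_per_movie(testSet)
    return [x for x in testSet
            if len(ratings[x[1]]) >= 5 and np.var(ratings[x[1]]) >= 2.0]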
def train_trim_nmf(data, R):
    kfold = KFold(n_splits=10)
    rmse_list = [[], [], []]  # popular, unpopular, high-variance
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        p_rmse = []
        u_rmse = []
        hv_rmse = []
        nmf = NMF(n_factors=k)
        for trainset, testset in kfold.split(data):
            nmf.fit(trainset)
            (p_testset, u_testset, hv_testset) = trim(testset, R)
            p_pred = nmf.test(p_testset)
            u_pred = nmf.test(u_testset)
            hv_pred = nmf.test(hv_testset)
            p_rmse.append(accuracy.rmse(p_pred))
            u_rmse.append(accuracy.rmse(u_pred))
            hv_rmse.append(accuracy.rmse(hv_pred))
        rmse_list[0].append(np.mean(p_rmse))
        rmse_list[1].append(np.mean(u_rmse))
        rmse_list[2].append(np.mean(hv_rmse))
    print("NMF with trim is finished!")
    return rmse_list
def Q34():
    rang = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    data = load_data()
    trainset, testset = train_test_split(data, test_size=0.1)
    knn = KNNWithMeans(22, sim_options=sim_options)
    nmf = NMF(n_factors=18)
    svd = SVD(n_factors=8)
    fp = {}
    tp = {}
    area = np.array([])
    for model, key in zip([knn, nmf, svd], ['KNN', 'NNMF', 'SVD']):
        model.fit(trainset)
        pred = model.test(testset)
        np_true = np.array([])
        np_score = np.array([])
        for _, _, t, p, _ in pred:
            t = 1 if t >= 3 else 0
            np_true = np.append(np_true, t)
            np_score = np.append(np_score, p / rang)
        fpr, tpr, thresholds = roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        roc_auc = auc(fpr, tpr)
        fp[key] = fpr
        tp[key] = tpr
        area = np.append(area, roc_auc)

    plt.figure()
    lw = 2
    # Iterating the fp/tp dicts yields keys in insertion order (Py3.7+).
    for mod, f, t, roc_auc in zip(['KNN', 'NNMF', 'SVD'], fp, tp, area):
        fpr = fp[f]
        tpr = tp[t]
        plt.plot(fpr, tpr, lw=lw,
                 label='%s ROC curve (area = %0.2f)' % (mod, roc_auc))
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    plt.show()
    plt.close()
def use_nmf():
    start = time.time()
    performance = []
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    print('Using NMF')
    algo_NMF = NMF()
    algo_NMF.fit(trainset)
    # The anti-testset fills every unseen (user, item) pair with the global
    # mean rating, so the RMSE/MAE below measure deviation from that fill
    # value rather than accuracy against held-out ratings.
    testset = trainset.build_anti_testset()
    predictions_NMF = algo_NMF.test(testset)
    accuracy_rmse = accuracy.rmse(predictions_NMF)
    accuracy_mae = accuracy.mae(predictions_NMF)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)
    end = time.time()
    performance.append(end - start)
    return performance
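# Anti-testset predictions like those in use_nmf() are more often turned
# into top-N recommendations than scored directly. A sketch following the
# Surprise FAQ's get_top_n pattern (not part of the original code):
from collections import defaultdict

def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Keep each user's n highest-estimated items.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n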
def NMF_bin_pre(ratings, ts, nmf_fac, thrd):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    trainset, testset = train_test_split(data, test_size=ts)
    algo = NMF(n_factors=nmf_fac, random_state=42)
    algo.fit(trainset)
    pre = algo.test(testset)

    # Split each prediction tuple into true and estimated ratings.
    true_rating = np.empty(len(pre))
    pred_rating = np.empty(len(pre))
    for i in range(len(pre)):
        true_rating[i] = pre[i][2]
        pred_rating[i] = pre[i][3]

    # Binarize the ground truth at the threshold `thrd`.
    bi_rating = np.empty(len(pre))
    one_idx = true_rating >= thrd
    zero_idx = true_rating < thrd
    bi_rating[one_idx] = 1.0
    bi_rating[zero_idx] = 0.0
    return bi_rating, pred_rating
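# How NMF_bin_pre()'s outputs would typically be consumed: an ROC sketch
# using scikit-learn. The `ratings` frame, threshold, and factor count
# below are illustrative.
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

y_true, y_score = NMF_bin_pre(ratings, ts=0.1, nmf_fac=18, thrd=3.0)
fpr, tpr, _ = roc_curve(y_true, y_score)
plt.plot(fpr, tpr, label='NMF (AUC = %0.2f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.legend(loc='lower right')
plt.show()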
def trim_performance(qNum, maxk=0):
    pop, unpop, highVar = trimMovies()
    if maxk == 0:
        if 12 <= qNum <= 14:
            maxk = 100
        elif 19 <= qNum <= 21:
            maxk = 50

    trim_Model = {
        12: (pop, 'KNNWithMeans'),
        13: (unpop, 'KNNWithMeans'),
        14: (highVar, 'KNNWithMeans'),
        19: (pop, 'NMF'),
        20: (unpop, 'NMF'),
        21: (highVar, 'NMF'),
    }
    trimSet, modelName = trim_Model[qNum]

    kf = KFold(n_splits=10)
    RMSE = []
    for k in range(2, maxk + 1, 2):
        print('-' * 20 + 'k = ' + str(k) + ' ' + '-' * 20)
        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k=k, sim_options={'name': 'pearson'})
        elif modelName == 'NMF':
            model = NMF(n_factors=k)
        subRMSE = []
        temp = 1
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            testSet = list(filter(lambda x: int(x[1]) in trimSet, testSet))
            print("Split %d: test set size after trimming: %d" % (temp, len(testSet)))
            temp += 1
            predictions = model.test(testSet)
            subRMSE.append(accuracy.rmse(predictions, verbose=True))
        RMSE.append(np.mean(subRMSE))

    plt.figure()
    plt.plot(list(range(2, maxk + 1, 2)), RMSE)
    plt.xlabel("k")
    plt.ylabel("Average RMSE")
    plt.title("Q" + str(qNum) + ": Average RMSE Along k")
    plt.show()
    print(min(RMSE))
    return min(RMSE)
def Q17():
    data = load_data()
    meanRMSE, meanMAE = [], []
    start = time.time()
    for k in range(16, 24, 2):
        nmf = NMF(n_factors=k)
        out = cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=10)
        meanRMSE.append(np.mean(out['test_rmse']))
        meanMAE.append(np.mean(out['test_mae']))
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    print("Total time used for cross validation: " + cv_time)

    k = list(range(16, 24, 2))
    ys = [[meanRMSE, 'mean RMSE'], [meanMAE, 'mean MAE']]
    make_plot(k, ys, 'Number of Latent Factors', 'ratings')
    return meanRMSE, meanMAE
def plot_all_ROC():
    rang = 5.0
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    trainset, testset = train_test_split(data, test_size=0.1)
    knn = KNNWithMeans(22, sim_options=sim_options)
    nmf = NMF(n_factors=18)
    svd = SVD(n_factors=8)
    fp = {}
    tp = {}
    area = np.array([])
    for model, key in zip([knn, nmf, svd], ['KNN', 'NNMF', 'SVD']):
        model.fit(trainset)
        pred = model.test(testset)
        np_true = np.array([])
        np_score = np.array([])
        for _, _, t, p, _ in pred:
            t = 1 if t >= 3 else 0
            np_true = np.append(np_true, t)
            np_score = np.append(np_score, p / rang)
        fpr, tpr, thresholds = metrics.roc_curve(np_true, np_score)
        print(fpr.shape, tpr.shape)
        roc_auc = metrics.auc(fpr, tpr)
        fp[key] = fpr
        tp[key] = tpr
        area = np.append(area, roc_auc)

    plt.figure()
    lw = 2
    for mod, f, t, roc_auc in zip(['k-NN', 'NNMF', 'MF'], fp, tp, area):
        fpr = fp[f]
        tpr = tp[t]
        plt.plot(fpr, tpr, lw=lw, label='%s' % mod)
    plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.show()
def grid_search(surprise_model):
    # `surprise_model` is the algorithm class itself (e.g. SVD), not an instance.
    if surprise_model is SVDpp:
        param_grid = {'n_factors': [20], 'n_epochs': [20],
                      'lr_all': [0.005, 0.007, 0.05, 0.07, 0.5, 0.7, 1.0],
                      'reg_all': [0.02, 0.05, 0.2, 0.5]}
    elif surprise_model is SVD:
        param_grid = {'n_epochs': [20],
                      'lr_all': [0.005, 0.007, 0.05, 0.07, 0.5, 0.7, 1.0],
                      'reg_all': [0.02, 0.05, 0.2, 0.5]}
    elif surprise_model is NMF:
        param_grid = {'n_epochs': [20],
                      'reg_pu': [0.02, 0.04, 0.06, 0.08, 0.2],
                      'reg_qi': [0.02, 0.04, 0.06, 0.08, 0.2]}
    elif surprise_model is BaselineOnly:
        param_grid = {'bsl_options': {'method': ['als', 'sgd'],
                                      'reg': [1, 2],
                                      'learning_rate': [0.005, 0.05, 0.5, 1.0]}}
    else:
        raise ValueError('Unsupported model: %r' % surprise_model)

    gs = GridSearchCV(surprise_model, param_grid, measures=['rmse', 'mae'],
                      cv=3, n_jobs=-1, joblib_verbose=1, refit=True)
    return gs
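# A usage sketch for grid_search(); the data source is illustrative. The
# algorithm class itself is passed, matching the checks above.
from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
gs = grid_search(NMF)
gs.fit(data)
print(gs.best_score['rmse'], gs.best_params['rmse'])
best_algo = gs.best_estimator['rmse']  # with refit=True, gs.predict() is also available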
def NMF_filter(ratings, dims):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0
    for k in range(len(dims)):
        nmf = NMF(n_factors=dims[k], biased=False)
        cv = cross_validate(algo=nmf, data=data, measures=['RMSE', 'MAE'],
                            cv=10, verbose=True)
        RMSE[k] = np.mean(cv['test_rmse'])
        if min_RMSE is None or RMSE[k] < min_RMSE:
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]
        MAE[k] = np.mean(cv['test_mae'])
        if min_MAE is None or MAE[k] < min_MAE:
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minimum Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum Average MAE is ', min_MAE)
def nmf(dataName, data, biased=True):
    # `ks`, `mae`, and `rmse` come from the enclosing module scope.
    print('Start building NMF with ' + dataName + '!')
    for i, k in enumerate(ks):
        nmf = NMF(n_factors=k, biased=biased)
        scores = cross_validate(nmf, data, cv=10)
        mae[i] = scores['test_mae'].mean()
        rmse[i] = scores['test_rmse'].mean()
        print('k = ' + str(k) + ' finished!')

    plt.figure()
    plt.subplot(211)
    plt.plot(ks, mae)
    plt.xlabel('k')
    plt.ylabel('mean absolute error')
    plt.title('Mean absolute error vs. k of ' + dataName)
    plt.subplot(212)
    plt.plot(ks, rmse)
    plt.xlabel('k')
    plt.ylabel('root mean squared error')
    plt.title('Root mean squared error vs. k of ' + dataName)
    print('mae:')
    print(mae)
    print('rmse:')
    print(rmse)
    print('Finish building NMF with ' + dataName + '!')
def Q12To14And19To21And26To28(qNum, maxk=None):
    data = load_data()
    kf = KFold(n_splits=10)
    if maxk is None:
        if 12 <= qNum <= 14:
            maxk = 100
        elif 19 <= qNum <= 21:
            maxk = 50
        elif 26 <= qNum <= 28:
            maxk = 50

    pop, unpop, highVar = classifyMovies()
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    trimAndModel = {
        12: (pop, 'KNNWithMeans'), 13: (unpop, 'KNNWithMeans'), 14: (highVar, 'KNNWithMeans'),
        19: (pop, 'NMF'), 20: (unpop, 'NMF'), 21: (highVar, 'NMF'),
        26: (pop, 'SVD'), 27: (unpop, 'SVD'), 28: (highVar, 'SVD')
    }

    RMSE = []  # RMSE for each k
    for k in range(2, maxk + 1, 2):  # inclusive
        print('-' * 20 + ' k = ' + str(k) + ' ' + '-' * 20)
        trimSet, modelName = trimAndModel[qNum]
        if modelName == 'KNNWithMeans':
            model = KNNWithMeans(k, sim_options=sim_options)
        elif modelName == 'NMF':
            model = NMF(n_factors=k)
        else:
            model = SVD(n_factors=k)

        subRMSE = []  # RMSE of each train-test split for this k
        splitIdx = 1
        for trainSet, testSet in kf.split(data):
            subsubRMSE = 0
            model.fit(trainSet)
            testSet = list(filter(lambda x: x[1] in trimSet, testSet))
            nTest = len(testSet)
            print("Split %d: test set size after trimming: %d" % (splitIdx, nTest))
            splitIdx += 1
            predictions = model.test(testSet)
            for p in predictions:
                subsubRMSE += pow(p.est - p.r_ui, 2)
            # RMSE of this train-test split
            subRMSE.append(np.sqrt(subsubRMSE / nTest))
        # average over all train-test splits for this k
        RMSE.append(np.mean(subRMSE))

    # plotting
    k = list(range(2, maxk + 1, 2))
    ys = [[RMSE, 'RMSE']]
    xTitle = 'Number of Neighbors' if qNum <= 14 else 'Number of latent factors'
    make_plot(k, ys, xTitle, 'Error')
    return RMSE
    plt.savefig('plot/q22_nmf_roc_' + str(threshold) + '.png')
    plt.clf()


if __name__ == "__main__":
    threshold = [2.5, 3, 3.5, 4]
    file_path = os.path.expanduser("ml-latest-small/ratings_new.csv")
    reader = Reader(sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)
    sim_options = {'name': 'pearson', 'user_based': True}
    trainset, testset = train_test_split(data, test_size=0.1)
    for th in threshold:
        # The model does not depend on the threshold, so fitting once
        # outside this loop would yield the same predictions.
        algo = NMF(n_factors=16)
        algo.fit(trainset)
        predictions = algo.test(testset)
        y_true = []
        y_estimate = []
        for row in predictions:
            # row = (user, item, true rating, estimate, details)
            if row[2] >= th:
                y_true.append(1)
            else:
                y_true.append(0)
            y_estimate.append(row[3])
        plot_roc(y_true, y_estimate, th)
threshold = 3
file_path = os.path.expanduser("ml-latest-small/ratings_new.csv")
reader = Reader(sep=',')
data = Dataset.load_from_file(file_path, reader=reader)
sim_options = {'name': 'pearson', 'user_based': True}
trainset, testset = train_test_split(data, test_size=0.1)

# Fit all three models on the same split so their ROC curves are comparable.
algo = KNNWithMeans(k=34, sim_options=sim_options)
algo.fit(trainset)
predictions1 = algo.test(testset)

algo = NMF(n_factors=16)
algo.fit(trainset)
predictions2 = algo.test(testset)

algo = SVD(n_factors=14)
algo.fit(trainset)
predictions3 = algo.test(testset)

y_true = []
y_estimate1 = []
y_estimate2 = []
y_estimate3 = []
for row in predictions1:
    if row[2] >= threshold:
        y_true.append(1)
print "Fold for" + str(t) algo.fit(trainset) # print testset predictions = algo.test(testset) Prec, Reca = metrics(predictions, t) pr = pr + Prec re = re + Reca return pr / 10.0, re / 10.0 if __name__ == '__main__': data = retrieve_data() G_max = ret_mod_user_dict(data) algo_NMF = NMF(NMF_no_of_LF, verbose=False) algo_SVD = SVD(n_factors=MF_no_of_LF) algo_KNN = KNNWithMeans(k=KNN_no_of_LF, sim_options=sim_options, verbose=False) # Q36 Pr1 = [] Re1 = [] t = list(range(1, 26)) for l in t: Precision, Recall = cross_val_(data, G_max, l, algo_KNN) Pr1.append(Precision) Re1.append(Recall) plotgraphs(t, Pr1, "Number of Suggestions", "Precision",
def Q36To38(qNum):
    print("problem ", qNum)
    data = load_data()
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    models = {
        36: 'KNNWithMeans',
        37: 'NMF',
        38: 'SVD',
    }
    k_KNNWithMeans = 30  # from Q11
    k_NMF = 18  # from Q18
    k_SVD = 8  # from Q25
    modelName = models[qNum]
    if modelName == 'KNNWithMeans':
        model = KNNWithMeans(k_KNNWithMeans, sim_options=sim_options)
    elif modelName == 'NMF':
        model = NMF(n_factors=k_NMF)
    else:
        model = SVD(n_factors=k_SVD)

    # sweep t from 1 to 25
    precision_arr = []
    recall_arr = []
    for t in range(1, 26):
        kf = KFold(n_splits=10)
        sub_precisions = []
        sub_recalls = []
        for trainSet, testSet in kf.split(data):
            model.fit(trainSet)
            predictions = model.test(testSet)
            precisions, recalls = precision_recall(predictions, t)
            fold_precision = sum(prec for prec in precisions.values()) / len(precisions)
            fold_recall = sum(rec for rec in recalls.values()) / len(recalls)
            print(fold_precision)
            print(fold_recall)
            sub_precisions.append(fold_precision)
            sub_recalls.append(fold_recall)
        # average over the 10 folds for this t
        precision_arr.append(np.mean(sub_precisions))
        recall_arr.append(np.mean(sub_recalls))

    t_list = list(range(1, 26))
    print("model name: ", modelName)

    # precision vs t
    title_ = "precision vs t for: " + modelName
    make_plot(t_list, [[precision_arr, 'mean precisions']],
              'recommended item size t', 'Precision', title=title_)

    # recall vs t
    title_ = "recall vs t for: " + modelName
    make_plot(t_list, [[recall_arr, 'mean recalls']],
              'recommended item size t', 'Recall', title=title_)

    # precision vs recall
    title_ = "precision vs recall for: " + modelName
    plt.plot(recall_arr, precision_arr, label=modelName)
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.legend()
    plt.grid()
    plt.title(title_)
    plt.show()
    return precision_arr, recall_arr
    movie_rating_map = defaultdict(list)
    for val in data:
        movie_rating_map[val[1]].append(val[2])
    # Keep movies with at least 5 ratings and rating variance >= 2.0.
    high_var_data = [val for val in data
                     if len(movie_rating_map[val[1]]) >= 5
                     and np.var(movie_rating_map[val[1]]) >= 2.0]
    return high_var_data


print("=====================Non-negative Matrix Factorization based filtering=============================================================")
print("Evaluating NNMF collaborative filtering: number of latent factors vs RMSE and MAE under 10-fold cross-validation")
k_range = range(2, 51, 2)
avg_rmse, avg_mae = [], []
for k in k_range:
    algo = NMF(n_factors=k)
    cv_result = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=False)
    avg_rmse.append(np.mean(cv_result['test_rmse']))
    avg_mae.append(np.mean(cv_result['test_mae']))

plt.plot(k_range, avg_rmse, label="Average RMSE")
plt.plot(k_range, avg_mae, label="Average MAE")
plt.xlabel('Number of latent factors', fontsize=15)
plt.ylabel('Error', fontsize=15)
plt.legend()
plt.show()

print("=================================Optimal Number of Latent Factors=============================================================")
all_genres = set('|'.join(movies.genres).split('|'))
print('#of Genres - ', len(all_genres))
    line_format='user item rating', sep=',', rating_scale=(1, 5), skip_lines=0)
RS_data = Dataset.load_from_df(RS_ratings, RS_reader)

# Benchmark_Algorithm_Metric
benchmark = []
for algorithm in [BaselineOnly(), CoClustering(), KNNBaseline(), KNNBasic(),
                  KNNWithMeans(), KNNWithZScore(), NMF(), NormalPredictor(),
                  SlopeOne(), SVD(), SVDpp()]:
    # Perform cross validation
    results = cross_validate(algorithm, RS_data,
                             measures=['rmse', 'mae', 'mse', 'fcp'],
                             cv=5, verbose=True)
    # Results To Serie List
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(
        pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],