def __build_model(self):
    """Load the cached recommender, or train and cache a new one.

    The cache location is ``self.file_prefix`` concatenated with
    ``self.model_path``. When the dump cannot be loaded, an item-based
    ``KNNWithMeans`` model is fitted on ``self.trainset``, written to the
    cache location and evaluated on ``self.testset`` via ``accuracy.rmse``.

    Returns:
        The loaded or freshly trained algorithm object.
    """
    model_path = '{}{}'.format(self.file_prefix, self.model_path)

    try:
        # Only the load itself is guarded. Any failure (missing file,
        # unreadable dump, ...) deliberately falls back to retraining.
        # NOTE(review): joblib.load unpickles the file; only point this at
        # dumps produced by this code.
        model = joblib.load(model_path)
    except Exception:
        print('recommender does not exist, build new recommender')
    else:
        print('recommender exists, load it')
        return model

    # Item-based KNN with a Pearson-baseline similarity.
    algo = KNNWithMeans(k=50, sim_options={
        'name': 'pearson_baseline',
        'user_based': False
    })
    algo.fit(self.trainset)

    # Cache the trained model so the next call can load it instead.
    joblib.dump(algo, model_path)

    # Validation on the held-out set.
    test_pred = algo.test(self.testset)
    accuracy.rmse(test_pred)
    return algo
def svdpp(trainset, testset, predset):
    """Train an SVD++ model and save its test-set and final predictions.

    Returns immediately when ``is_already_predicted('svdpp')`` is truthy.
    Otherwise the model is fitted on ``trainset``, its RMSE is printed for
    both the training ratings and ``testset``, and the estimated ratings for
    ``testset`` and ``predset`` are handed to ``save_predictions``.

    Args:
        trainset: Surprise trainset used for fitting.
        testset: Held-out ratings used to compute the reported test RMSE.
        predset: Ratings to predict for the final output.
    """
    modelname = 'svdpp'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    bsl_options = {
        'method': 'als',
        'reg_i': 1.e-5,
        'reg_u': 14.6,
        'n_epochs': 10
    }
    algo = SVDpp(n_epochs=40, n_factors=100, bsl_options=bsl_options,
                 lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1, lr_yj=0.01,
                 reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1,
                 reg_yj=0.01)
    print('SVDpp Model')
    # fit() replaces the deprecated train() method, which newer Surprise
    # releases no longer provide.
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print(' RMSE on Train: ', accuracy.rmse(predictions, verbose=False))

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' RMSE on Test: ', rmse)
    # dtype=float keeps the float array that np.zeros() used to provide,
    # even if every estimate happens to be an integer.
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds, 'test')

    print(' Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds)
def baseline(trainset, testset, predset):
    """Train a baseline-only model and save its predictions.

    Returns immediately when ``is_already_predicted('baseline')`` is truthy.
    Otherwise a ``BaselineOnly`` model (ALS baselines) is fitted on
    ``trainset``, its RMSE is printed for the training ratings and for
    ``testset``, and the estimated ratings for ``testset`` and ``predset``
    are handed to ``save_predictions``.

    Args:
        trainset: Surprise trainset used for fitting.
        testset: Held-out ratings used to compute the reported test RMSE.
        predset: Ratings to predict for the final output.
    """
    modelname = 'baseline'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    bsl_options = {
        'method': 'als',
        'reg_i': 1.e-5,
        'reg_u': 14.6,
        'n_epochs': 10
    }
    algo = BaselineOnly(bsl_options=bsl_options)
    print('Baseline Model')
    # fit() replaces the deprecated train() method, which newer Surprise
    # releases no longer provide.
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print(' RMSE on Train: ', accuracy.rmse(predictions, verbose=False))

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' RMSE on Test: ', rmse)
    # dtype=float keeps the float array that np.zeros() used to provide.
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds, 'test')

    print(' Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds)
def svd(trainset, testset, predset):
    """Train an SVD model and save its test-set and final predictions.

    Returns immediately when ``is_already_predicted('svd')`` is truthy.
    Otherwise the model is fitted on ``trainset``, its RMSE is printed for
    the training ratings and for ``testset``, and the estimated ratings for
    ``testset`` and ``predset`` are handed to ``save_predictions``.

    Args:
        trainset: Surprise trainset used for fitting.
        testset: Held-out ratings used to compute the reported test RMSE.
        predset: Ratings to predict for the final output.
    """
    modelname = 'svd'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    algo = SVD(n_factors=100, n_epochs=40,
               lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1,
               reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1)
    print('SVD Model')
    # fit() replaces the deprecated train() method, which newer Surprise
    # releases no longer provide.
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print(' RMSE on Train: ', accuracy.rmse(predictions, verbose=False))

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' RMSE on Test: ', rmse)
    # dtype=float keeps the float array that np.zeros() used to provide.
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds, 'test')

    print(' Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds)
def collaborative_filter(id, new_words):
    """Cross-validate a KNN recommender and write top-3 picks to disk.

    Ratings come from ``calc_collaborative_param(new_words, id)`` and must
    provide the columns ``userID``, ``itemID`` and ``rating``. A ``KNNBasic``
    model is evaluated with 3-fold cross-validation (RMSE printed per fold),
    refitted on all ratings, and then used to predict every user/item pair
    that has no rating. The result of ``get_top_n(predictions, n=3)`` is
    written to ``top_n.json`` and returned.

    Args:
        id: Forwarded to ``calc_collaborative_param``.
        new_words: Forwarded to ``calc_collaborative_param``.

    Returns:
        Whatever ``get_top_n`` returns for the anti-testset predictions.
    """
    ratings_dict = calc_collaborative_param(new_words, id)
    df = pd.DataFrame(ratings_dict)

    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(0.0, 5.0))
    # The columns must correspond to user id, item id and ratings (in that
    # order).
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

    # Cross-validation is only used to report how well the model generalises.
    kf = KFold(n_splits=3)
    algo = KNNBasic()
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        kf_predictions = algo.test(testset)
        # Compute and print Root Mean Squared Error
        accuracy.rmse(kf_predictions, verbose=True)

    # Refit on every rating before recommending: without this the model is
    # whatever the last fold left behind, trained on only part of the data
    # while being asked about users/items from all of it.
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    new_data = trainset.build_anti_testset()
    predictions = algo.test(new_data)

    top_n = get_top_n(predictions, n=3)
    with open('top_n.json', 'w') as fp:
        dump(top_n, fp, indent=4)
    return top_n
def rodar_modelo(data, teste_tamanho, sim_opcoes, k):
    """Fit a ``KNNBasic`` model on a split of ``data`` and report its RMSE.

    Args:
        data: Dataset passed to ``train_test_split``.
        teste_tamanho: Second positional argument of ``train_test_split``
            (the test size).
        sim_opcoes: ``sim_options`` for the KNN model.
        k: Neighbourhood size for the KNN model.

    Returns:
        The fitted ``KNNBasic`` instance.
    """
    conjunto_treino, conjunto_teste = train_test_split(data, teste_tamanho)

    modelo = KNNBasic(k=k, sim_options=sim_opcoes)
    modelo.fit(conjunto_treino)

    # Evaluate on the held-out part; accuracy.rmse reports the score.
    accuracy.rmse(modelo.test(conjunto_teste))
    return modelo
def train_trim_nmf(data, R):
    """Sweep the NMF factor count and score it on three trimmed test sets.

    For every ``k`` in 2, 4, ..., 50 an ``NMF(n_factors=k)`` model is
    evaluated with 10-fold cross-validation. Each fold's test set is split
    by ``trim(testset, R)`` into three subsets (called ``p``, ``u`` and
    ``hv`` in the original code) and the RMSE is computed on each.

    Args:
        data: Surprise dataset to cross-validate on.
        R: Forwarded to ``trim``.

    Returns:
        A list of three lists; list ``i`` holds, for each ``k`` in order,
        the mean RMSE over the folds on the ``i``-th trimmed subset.
    """
    splitter = KFold(n_splits=10)
    rmse_list = [[], [], []]

    for k in range(2, 52, 2):
        print("using k = %d" % k)
        per_subset_scores = ([], [], [])
        model = NMF(n_factors=k)

        for fold_train, fold_test in splitter.split(data):
            model.fit(fold_train)
            (p_testset, u_testset, hv_testset) = trim(fold_test, R)

            # Predict on all three subsets first, then score them, keeping
            # the original call order.
            subset_predictions = [
                model.test(subset)
                for subset in (p_testset, u_testset, hv_testset)
            ]
            for scores, preds in zip(per_subset_scores, subset_predictions):
                scores.append(accuracy.rmse(preds))

        for curve, scores in zip(rmse_list, per_subset_scores):
            curve.append(np.mean(scores))

    print("NMF with trim is finished!!")
    return rmse_list
def evaluate_model(self, data, algo):
    """Fit ``algo`` on 90% of the ratings and report accuracy on both parts.

    The first 90% of ``data.raw_ratings`` (set A) is used for training and
    for a biased accuracy figure; the remaining 10% (set B) gives the
    unbiased figure. RMSE, MAE and the number of predictions are printed
    for each part.

    Side effect: ``data.raw_ratings`` is replaced by set A.

    Args:
        data: Surprise dataset exposing ``raw_ratings``.
        algo: Surprise algorithm to fit and evaluate.
    """
    def report(label, preds):
        # Label and RMSE share one line, hence end=' '.
        print(label, end=' ')
        accuracy.rmse(preds, verbose=True)
        accuracy.mae(preds, verbose=True)
        print('len(predictions)')
        print(len(preds))

    all_ratings = data.raw_ratings
    cut = int(.9 * len(all_ratings))
    ratings_a = all_ratings[:cut]
    ratings_b = all_ratings[cut:]

    # From here on the dataset only contains set A.
    data.raw_ratings = ratings_a
    trainset_a = data.build_full_trainset()
    algo.fit(trainset_a)

    # Biased accuracy: predict the very ratings the model was trained on.
    preds_a = algo.test(trainset_a.build_testset())
    report('Biased accuracy on A,', preds_a)

    # Unbiased accuracy: predict the held-out set B.
    preds_b = algo.test(data.construct_testset(ratings_b))
    report('Unbiased accuracy on B,', preds_b)
def surpriseSVD(movieLensDataPath='data_clean.txt'):
    """Fit Surprise's SVD on a ratings file and return its factor matrices.

    The file is read as tab-separated, header-less text with the columns
    user id, movie id and rating. For a rating matrix Y the goal is
    Y ~= U^T V; the returned values are the ``pu`` and ``qi`` attributes
    of an SVD model fitted on all ratings. A separate 3-fold
    cross-validation is run afterwards, printing the RMSE of each fold.

    Args:
        movieLensDataPath: Path to the (cleaned) MovieLens data file.

    Returns:
        Tuple ``(pu, qi)`` taken from the model fitted on the full data.
    """
    column_names = ["User Id", "Movie Id", "Rating"]

    # Go through pandas: the frame is then handed over to Surprise.
    ratings = pd.read_csv(movieLensDataPath, sep="\t", header=None)
    ratings.columns = column_names

    # Surprise needs the rating scale; columns are user, item, rating.
    data = Dataset.load_from_df(ratings[column_names],
                                Reader(rating_scale=(1, 5)))

    # Fit on every rating to obtain U and V.
    full_model = SVD()
    full_model.fit(data.build_full_trainset())
    user_factors = full_model.pu
    item_factors = full_model.qi

    # Simple cross-validation with a fresh model; only prints RMSE.
    cv_model = SVD()
    for fold_train, fold_test in KFold(n_splits=3).split(data):
        cv_model.fit(fold_train)
        accuracy.rmse(cv_model.test(fold_test), verbose=True)

    return user_factors, item_factors
def DisplayGraphDelta(data):
    """Show a histogram of the gap between true and predicted ratings.

    A ``KNNWithMeans`` model is fitted on 75% of ``data`` and evaluated on
    the remaining 25%. Every prediction is printed, followed by the number
    of predictions, and the differences ``r_ui - est`` are plotted in a
    100-bin histogram.

    Args:
        data: Surprise dataset to split.
    """
    # 75% train / 25% test split.
    train_part, test_part = train_test_split(data, test_size=.25)

    model = KNNWithMeans()
    model.fit(train_part)
    predicted = model.test(test_part)

    # Report the RMSE.
    accuracy.rmse(predicted)

    deltas = []
    for item in predicted:
        print(item)
        # Difference between the real rating and the estimate.
        deltas.append(item.r_ui - item.est)

    print(len(deltas))
    plt.hist(deltas, bins=100)
    plt.show()
def grid():
    """Grid-search SVD++ on 90% of the global ``data`` and evaluate it.

    The first 90% of ``data.raw_ratings`` (set A) is used for a 3-fold grid
    search over ``n_epochs`` and ``lr_all``. The best estimator by RMSE is
    refitted on all of A, then its RMSE is printed on A (biased) and on the
    remaining 10%, set B (unbiased).

    Side effect: the module-level ``data.raw_ratings`` is replaced by set A.
    """
    all_ratings = data.raw_ratings
    cut = int(.9 * len(all_ratings))
    ratings_a, ratings_b = all_ratings[:cut], all_ratings[cut:]
    data.raw_ratings = ratings_a

    search = GridSearchCV(
        SVDpp,
        {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]},
        measures=['rmse'],
        cv=3,
    )
    search.fit(data)
    best_algo = search.best_estimator['rmse']

    # Retrain the winning configuration on the whole of set A.
    trainset_a = data.build_full_trainset()
    best_algo.fit(trainset_a)

    # Biased accuracy on A.
    preds_a = best_algo.test(trainset_a.build_testset())
    print('Biased accuracy on A,', end=' ')
    accuracy.rmse(preds_a)

    # Unbiased accuracy on B.
    preds_b = best_algo.test(data.construct_testset(ratings_b))
    print('Unbiased accuracy on B,', end=' ')
    accuracy.rmse(preds_b)
def main():
    """Plot the prediction errors of KNNWithMeans on MovieLens-100k.

    The built-in ``ml-100k`` dataset is split 85% / 15%, a ``KNNWithMeans``
    model is fitted on the larger part, its RMSE on the smaller part is
    reported, and the differences ``r_ui - est`` are shown in a 100-bin
    histogram.
    """
    # Load the movielens-100k dataset.
    data = Dataset.load_builtin('ml-100k')

    # 85% train / 15% test split.
    train_part, test_part = train_test_split(data, test_size=.15)

    # Algorithm under test.
    model = KNNWithMeans()
    model.fit(train_part)
    predicted = model.test(test_part)

    # Report the RMSE.
    accuracy.rmse(predicted)

    # Gap between the real rating and the estimate, per prediction.
    deltas = [item.r_ui - item.est for item in predicted]

    plt.hist(deltas, bins=100)
    plt.show()
def slope_one(trainset, testset, predset):
    """Train a SlopeOne model and save its test-set and final predictions.

    Returns immediately when ``is_already_predicted('slopeone')`` is truthy.
    Otherwise the model is fitted on ``trainset``, its RMSE is printed for
    the training ratings and for ``testset``, and the estimated ratings for
    ``testset`` and ``predset`` are handed to ``save_predictions``.

    Args:
        trainset: Surprise trainset used for fitting.
        testset: Held-out ratings used to compute the reported test RMSE.
        predset: Ratings to predict for the final output.
    """
    modelname = 'slopeone'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    algo = SlopeOne()
    print('SlopeOne Model')
    # fit() replaces the deprecated train() method, which newer Surprise
    # releases no longer provide.
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print(' RMSE on Train: ', accuracy.rmse(predictions, verbose=False))

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print(' RMSE on Test: ', rmse)
    # dtype=float keeps the float array that np.zeros() used to provide.
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds, 'test')

    print(' Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.array([pred.est for pred in predictions], dtype=float)
    save_predictions(modelname, rmse, preds)
def train_trim_knn(data, R):
    """Sweep the KNN neighbourhood size and score three trimmed test sets.

    For every ``k`` in 2, 4, ..., 100 a Pearson-similarity ``KNNWithMeans``
    model is evaluated with 10-fold cross-validation. Each fold's test set
    is split by ``trim(testset, R)`` into three subsets (called ``p``,
    ``u`` and ``hv`` in the original code) and the RMSE is computed on each.

    Args:
        data: Surprise dataset to cross-validate on.
        R: Forwarded to ``trim``.

    Returns:
        A list of three lists; list ``i`` holds, for each ``k`` in order,
        the mean RMSE over the folds on the ``i``-th trimmed subset.
    """
    splitter = KFold(n_splits=10)
    similarity = {'name': 'pearson'}
    rmse_list = [[], [], []]

    for k in range(2, 102, 2):
        print("using k = %d" % k)
        per_subset_scores = ([], [], [])
        model = KNNWithMeans(k=k, sim_options=similarity)

        for fold_train, fold_test in splitter.split(data):
            model.fit(fold_train)
            (p_testset, u_testset, hv_testset) = trim(fold_test, R)

            # Predict on all three subsets first, then score them, keeping
            # the original call order.
            subset_predictions = [
                model.test(subset)
                for subset in (p_testset, u_testset, hv_testset)
            ]
            for scores, preds in zip(per_subset_scores, subset_predictions):
                scores.append(accuracy.rmse(preds))

        for curve, scores in zip(rmse_list, per_subset_scores):
            curve.append(np.mean(scores))

    print("KNN with trim is finished!!")
    return rmse_list
def main():
    """Plot the prediction errors of KNNWithMeans on MovieLens-100k.

    The built-in ``ml-100k`` dataset is split 85% / 15%, a ``KNNWithMeans``
    model is fitted on the larger part, its RMSE on the smaller part is
    reported, and the differences ``r_ui - est`` are shown in a 100-bin
    histogram.
    """
    # Load the movielens-100k dataset.
    ratings = Dataset.load_builtin('ml-100k')

    # 85% train / 15% test split.
    train_part, test_part = train_test_split(ratings, test_size=.15)

    knn = KNNWithMeans()
    knn.fit(train_part)
    predicted = knn.test(test_part)

    # Report the RMSE.
    accuracy.rmse(predicted)

    # Difference between the real rating and the estimate.
    errors = [p.r_ui - p.est for p in predicted]

    # Histogram of the result.
    plt.hist(errors, bins=100)
    plt.show()
def run_svd(dataset):
    """Fit SVD on a built-in dataset and compare raw vs. rounded estimates.

    The dataset is split with 33% of the ratings held out. After the usual
    RMSE report, the estimates are rounded to the nearest integer, the RMSE
    of the rounded estimates is printed, and ``utils.hist_plot`` is called
    with the true ratings, the raw estimates and the rounded estimates.

    Args:
        dataset: Name of a built-in Surprise dataset (downloaded if needed).
    """
    data = Dataset.load_builtin(dataset)

    # Random split; the test set is made of 33% of the ratings.
    train_part, test_part = train_test_split(data, test_size=.33)

    model = SVD()
    model.fit(train_part)
    predicted = model.test(test_part)

    # RMSE of the raw estimates.
    accuracy.rmse(predicted)

    # Index 2 of a testset entry is the true rating; index 3 of a
    # prediction is the estimate.
    y_test = [entry[2] for entry in test_part]
    preds = [entry[3] for entry in predicted]
    preds_round = np.rint(preds)

    rounding_errors = preds_round - np.array(y_test)
    rmse_round = np.sqrt(np.mean(np.square(rounding_errors)))
    print(f'rmse_round {rmse_round}')

    utils.hist_plot(y_test, preds, preds_round)
def baseline(trainset, testset):
    """Fit ``BaselineOnly`` on ``trainset`` and evaluate it on ``testset``.

    RMSE and MAE are reported through the ``accuracy`` module.

    Returns:
        The list of predictions for ``testset``.
    """
    model = BaselineOnly()
    model.fit(trainset)

    print("Predictions")
    preds = model.test(testset)
    for metric in (accuracy.rmse, accuracy.mae):
        metric(preds)
    return preds
def fit_rmse(algo, data):
    """Fit ``algo`` on all of ``data`` and print dev and training RMSE.

    The dev score is computed on every rating of the module-level
    ``dev_dat`` dataset; the training score on every rating of ``data``.

    Args:
        algo: Surprise algorithm to fit.
        data: Surprise dataset used for fitting.
    """
    # Build the full trainset once and reuse it for fitting and for the
    # training-set evaluation instead of rebuilding it.
    full_trainset = data.build_full_trainset()
    algo.fit(full_trainset)

    dev_pred = algo.test(dev_dat.build_full_trainset().build_testset())
    dev_rmse = accuracy.rmse(dev_pred, verbose=True)

    tr_pred = algo.test(full_trainset.build_testset())
    tr_rmse = accuracy.rmse(tr_pred, verbose=True)

    print("rmse on dev_data: ", dev_rmse, "\n",
          "rmse on traning data: ", tr_rmse)
def svdalgorithm(trainset, testset):
    """Fit a default ``SVD`` model on ``trainset`` and test it on ``testset``.

    RMSE and MAE are reported through the ``accuracy`` module.

    Returns:
        The list of predictions for ``testset``.
    """
    model = SVD()
    model.fit(trainset)

    print("Predictions")
    preds = model.test(testset)
    for metric in (accuracy.rmse, accuracy.mae):
        metric(preds)
    return preds
def eval_model(model):
    """Run 3-fold cross-validation of ``model`` on the global ``data``.

    The RMSE of every fold is printed.

    Args:
        model: Surprise algorithm; it is refitted on each fold.
    """
    splitter = KFold(n_splits=3)
    for fold_train, fold_test in splitter.split(data):
        # Train, then predict the held-out fold.
        model.fit(fold_train)
        fold_predictions = model.test(fold_test)
        # Compute the RMSE.
        accuracy.rmse(fold_predictions, verbose=True)
def func2():
    """Fit SVD on a 75/25 split of MovieLens-100k and report the RMSE."""
    from surprise import SVD, Dataset, accuracy
    from surprise.model_selection import train_test_split

    ratings = Dataset.load_builtin('ml-100k')
    train_part, test_part = train_test_split(ratings, test_size=.25)

    model = SVD()
    model.fit(train_part)
    accuracy.rmse(model.test(test_part))
def cross_validation(self, data, algo):
    """Run a seeded 7-fold cross-validation of ``algo`` on ``data``.

    The RMSE of every fold is printed.

    Args:
        data: Surprise dataset to split.
        algo: Surprise algorithm; it is refitted on each fold.
    """
    # random_state is fixed, so the folds are the same on every call.
    folds = KFold(n_splits=7, random_state=2)
    for fold_train, fold_test in folds.split(data):
        algo.fit(fold_train)
        fold_predictions = algo.test(fold_test)
        accuracy.rmse(fold_predictions, verbose=True)
def generate_svd_recommendation_df() -> pd.DataFrame:
    """Evaluate several Surprise algorithms on the score data.

    SVD and NMF are each cross-validated (5 folds, RMSE and MAE printed),
    fitted on all ratings and used to predict the anti-testset, from which
    ``get_top_n(predictions, n=5)`` is computed. Afterwards a range of other
    algorithms is cross-validated silently.

    NOTE(review): no ``return`` statement is visible in this function, so it
    returns ``None`` despite the ``pd.DataFrame`` annotation, and none of the
    computed results are used. Confirm whether the tail of the function is
    missing.
    """
    # Prepare the input data set.
    svd_data = MyDataSet(genearte_score_df())

    def fit_and_rank(model, report_mae):
        # Cross-validate, fit on everything, then rank the unseen pairs.
        full_train_set = svd_data.build_full_trainset()
        anti_test_set = full_train_set.build_anti_testset()
        # 5 fold validation
        cross_validate(model, svd_data, measures=['RMSE', 'MAE'], cv=5,
                       verbose=True)
        model.fit(full_train_set)
        predicted = model.test(anti_test_set)
        accuracy.rmse(predicted)
        if report_mae:
            accuracy.mae(predicted)
        return get_top_n(predicted, n=5)

    # Try SVD.
    fit_and_rank(SVD(), report_mae=False)

    # Try NMF (the extra silent cross-validation comes first, as before).
    cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    fit_and_rank(NMF(), report_mae=True)

    # As per https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    # KNN-based, matrix-factorisation and other collaborative filtering
    # algorithms, in the original order.
    benchmark_classes = (
        KNNBasic, KNNWithMeans, KNNWithZScore,
        SVD, SVDpp, NMF,
        SlopeOne, CoClustering,
    )
    for algo_class in benchmark_classes:
        cross_validate(algo_class(), svd_data, cv=5, n_jobs=5, verbose=False)
def evaluate_model(model: AlgoBase, test_set: [(int, int, float)]) -> dict:
    """Evaluate a fitted model on ``test_set`` and collect its metrics.

    Args:
        model: Fitted Surprise algorithm.
        test_set: Test ratings passed to ``model.test``.

    Returns:
        A dict with the keys ``'RMSE'``, ``'MAE'``, ``'test_time'`` (seconds
        spent predicting and computing both metrics) and ``'fit_time'`` (the
        current value of the module-level ``fit_time``).
    """
    # fit_time is only read here; it is set elsewhere in the module.
    global fit_time
    starts = time.time()
    predictions = model.test(test_set)

    metrics_dict = {}
    metrics_dict['RMSE'] = accuracy.rmse(predictions, verbose=False)
    # The MAE entry used to be filled with accuracy.rmse, so it merely
    # duplicated the RMSE.
    metrics_dict['MAE'] = accuracy.mae(predictions, verbose=False)
    metrics_dict['test_time'] = time.time() - starts
    metrics_dict['fit_time'] = fit_time
    return metrics_dict
def train_helper(algo, savename, trainset_cv, testset_cv, save=False):
    """Fit ``algo`` and print its RMSE on the training and test ratings.

    Args:
        algo: Surprise algorithm to fit.
        savename: Label used in the printed lines and in the dump file names.
        trainset_cv: Trainset used for fitting; its own ratings form the
            "dev set" evaluation.
        testset_cv: Held-out ratings for the "test set" evaluation.
        save: When true, both prediction lists are dumped together with the
            algorithm under ``models/dump_<savename>_dev`` and
            ``models/dump_<savename>_test``.
    """
    algo.fit(trainset_cv)

    print(f"{savename} on dev set:", end=" ")
    dev_predictions = algo.test(trainset_cv.build_testset())
    rmse(dev_predictions, verbose=True)

    print(f"{savename} on test set:", end=" ")
    test_predictions = algo.test(testset_cv)
    rmse(test_predictions, verbose=True)

    if not save:
        return

    dump.dump(f"models/dump_{savename}_dev", dev_predictions, algo)
    dump.dump(f"models/dump_{savename}_test", test_predictions, algo)
def func6():
    """Run 3-fold cross-validation of SVD on MovieLens-100k, printing RMSE."""
    from surprise import SVD, Dataset, accuracy
    from surprise.model_selection import KFold

    ratings = Dataset.load_builtin('ml-100k')
    splitter = KFold(n_splits=3)
    model = SVD()

    for fold_train, fold_test in splitter.split(ratings):
        model.fit(fold_train)
        accuracy.rmse(model.test(fold_test), verbose=True)
def prediction(self, data, algo):
    """Fit ``algo`` on all ratings and predict every unrated pair.

    The RMSE over those predictions and their count are printed.

    NOTE(review): the anti-testset has no true ratings, so the printed RMSE
    is measured against whatever fill value ``build_anti_testset`` uses.
    Confirm this figure is meaningful to callers.

    Args:
        data: Surprise dataset.
        algo: Surprise algorithm to fit.

    Returns:
        The list of predictions for the anti-testset.
    """
    full_trainset = data.build_full_trainset()
    algo.fit(full_trainset)

    # All user/item pairs that are absent from the trainset.
    unseen_pairs = full_trainset.build_anti_testset()
    predictionsAll = algo.test(unseen_pairs)

    print('Accuracy on whole data set,', end=' ')
    accuracy.rmse(predictionsAll, verbose=True)
    print('len(predictions)')
    print(len(predictionsAll))
    return predictionsAll
def test_rmse():
    """Tests for the RMSE function."""
    # (predictions, expected RMSE) pairs, checked in order.
    cases = [
        ([pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)],
         0),
        ([pred(0, 0), pred(0, 2)],
         sqrt((0 - 2)**2 / 2)),
        ([pred(2, 0), pred(3, 4)],
         sqrt(((2 - 0)**2 + (3 - 4)**2) / 2)),
    ]
    for predictions, expected in cases:
        assert rmse(predictions) == expected

    # An empty prediction list is an error.
    with pytest.raises(ValueError):
        rmse([])
def svd_train_test_split():
    """Fit SVD on an 80/20 split of the custom 100k data and report RMSE."""
    ratings = custom_pandas_100k()

    # 80% of the ratings for training, 20% for testing.
    train_part, test_part = train_test_split(ratings, test_size=.20)

    model = SVD()
    model.fit(train_part)

    # Predict the held-out ratings and report the RMSE.
    accuracy.rmse(model.test(test_part))
def knn_baseline_movie(train, test, ids, Xtest, Xids):
    """Nearest-neighbour approach using the movie (item-based) baseline.

    Args:
        train: The trainset.
        test: The testset.
        ids: Unknown ratings; ``ids[0][i]`` and ``ids[1][i]`` are converted
            to strings and passed to ``algo.predict`` as user and item id.
        Xtest: List collecting testset predictions for the final blending;
            this model's predictions are appended to it.
        Xids: List collecting predictions for the unknown ratings for the
            final blending; this model's predictions are appended to it.

    Returns:
        Tuple ``(rmse, Xtest, Xids, preds_test, preds_ids)`` where ``rmse``
        is the test RMSE, ``preds_test`` a float array of testset estimates
        and ``preds_ids`` a list of estimates for ``ids``.
    """
    print('kNN Baseline Movie')
    algo = KNNBaseline(
        k=100,
        bsl_options={'method': 'als', 'n_epochs': 100,
                     'reg_u': 15, 'reg_i': 0.01},
        sim_options={'name': 'pearson_baseline', 'min_support': 1,
                     'user_based': False},
        verbose=False,
    )

    # Train the algorithm on the training set.
    algo.fit(train)

    # RMSE on the training ratings.
    train_predictions = algo.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    # RMSE on the test ratings.
    test_predictions = algo.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' Test RMSE: ', rmse)

    # dtype=float matches the float array np.zeros() would have produced.
    preds_test = np.array([p.est for p in test_predictions], dtype=float)

    # Estimates for the unknown ratings, looked up by position.
    preds_ids = [
        algo.predict(str(ids[0][i]), str(ids[1][i])).est
        for i in range(len(ids[0]))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def test_sanity_checks(u1_ml100k, pkf):
    """Basic sanity checks for all algorithms: the RMSE must not change.

    Each algorithm is fitted on the first fold of ``pkf.split(u1_ml100k)``
    and its test RMSE is compared for exact equality with a recorded value.
    """
    expected_rmse = {
        BaselineOnly: 1.0268524031297395,
        KNNBasic: 1.1337265249554591,
        KNNWithMeans: 1.1043129441881696,
        KNNBaseline: 1.0700718041752253,
        KNNWithZScore: 1.11179436167853,
        SVD: 1.0077323320656948,
        SVDpp: 1.00284553561452,
        NMF: 1.0865370266372372,
        SlopeOne: 1.1559939123891685,
        CoClustering: 1.0841941385276614,
    }
    # These algorithms are constructed with a fixed seed.
    seeded_classes = (SVD, SVDpp, NMF, CoClustering)

    for algo_class, expected in iteritems(expected_rmse):
        if algo_class in seeded_classes:
            algo = algo_class(random_state=0)
        else:
            algo = algo_class()

        fold_train, fold_test = next(pkf.split(u1_ml100k))
        algo.fit(fold_train)
        fold_predictions = algo.test(fold_test)
        assert accuracy.rmse(fold_predictions, verbose=False) == expected
"""
This module describes how to use the train_test_split() function.
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVD, Dataset, accuracy
from surprise.model_selection import train_test_split

# MovieLens-100k is downloaded on first use if it is not available locally.
data = Dataset.load_builtin('ml-100k')

# Random split: a quarter of the ratings is held out for testing.
trainset, testset = train_test_split(data, test_size=0.25)

# SVD with its default settings.
algo = SVD()

# Fit on the training part, then predict the held-out ratings.
algo.fit(trainset)
predictions = algo.test(testset)

# Report the RMSE of those predictions.
accuracy.rmse(predictions)
# Evaluate an algorithm on the very ratings it was trained on, first on the
# full dataset and then inside a cross-validation loop.
from surprise import Dataset  # used below; the snippet did not import it
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold

data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end=' ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)

    print('On trainset,', end=' ')
    predictions = algo.test(trainset_cv.build_testset())
    # Without this call the 'On trainset,' label was printed with no value
    # after it and the predictions above were never used.
    accuracy.rmse(predictions, verbose=True)
# Cross-validate SVD on the predefined u1..u5 folds of MovieLens-100k.
import os  # needed for os.path.expanduser below

from surprise import SVD  # used below; the snippet did not import it
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

# NOTE(review): confirm that the installed Surprise version accepts a
# rating_scale argument here; some releases take the scale from the Reader.
data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)