def LoadMovieLensData():
    """Load the MovieLens ratings together with the movie popularity ranks.

    Returns:
        A ``(ml, data, rankings)`` tuple: the ``MovieLens`` helper, the data
        set returned by ``loadMovieLensLatestSmall()`` and the result of
        ``getPopularityRanks()``.
    """
    lens = MovieLens()

    print("Loading movie ratings...")
    ratings = lens.loadMovieLensLatestSmall()

    print(
        "\nComputing movie popularity ranks so we can measure novelty later..."
    )
    return (lens, ratings, lens.getPopularityRanks())


# Seed both random number generators so repeated runs are comparable.
np.random.seed(0)
random.seed(0)

# One shared data set for every recommender algorithm registered below.
(ml, evaluationData, rankings) = LoadMovieLensData()

# The evaluator receives the ratings and the popularity ranks.
evaluator = Evaluator(evaluationData, rankings)

contentKNN = ContentKNNAlgorithm()
evaluator.AddAlgorithm(contentKNN, "ContentKNN")

# Random recommendations, registered next to the content-based algorithm.
Random = NormalPredictor()
evaluator.AddAlgorithm(Random, "Random")

evaluator.Evaluate(False)
evaluator.SampleTopNRecs(ml)
sum(rec for rec in recalls_combined.values()) / len(recalls_combined)) print("averaged_precision for SVD algorithm:") print(sum(averaged_precision_SVD) / len(averaged_precision_SVD)) print("averaged_recall for SVD algorithm:") print(sum(averaged_recall_SVD) / len(averaged_recall_SVD)) print("averaged_precision for KNN algorithm:") print(sum(averaged_precision_KNN) / len(averaged_precision_KNN)) print("averaged_recall for KNN algorithm:") print(sum(averaged_recall_KNN) / len(averaged_recall_KNN)) print("averaged_precision for combined algorithm:") print(sum(averaged_precision_combined) / len(averaged_precision_combined)) print("averaged_recall for combined algorithm:") print(sum(averaged_recall_combined) / len(averaged_recall_combined)) #We compare our results with a random Predictor algo_random = NormalPredictor() algo_random.fit(trainset) prediction_random = algo_random.test(testset) #compare rmse and mae of the different algorithms rmse(predictions_SVD) rmse(predictions_KNN) rmse(predictions_combined) rmse(prediction_random) mae(predictions_SVD) mae(predictions_KNN) mae(predictions_combined) mae(prediction_random)
from surprise import accuracy
from surprise.model_selection import PredefinedKFold
from surprise.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm
import math

# Comparison script: compares the Random, KNN and SVD predictors.
# Training and prediction (without the rating) both use the full training set.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

# `user_based` must be passed inside `sim_options`; as a bare keyword argument
# Surprise ignores it and the model silently stays user-based.
algo = KNNBasic(k=17, sim_options={'user_based': False})
algo.fit(trainset)
algo1 = SVD()
algo1.fit(trainset)
algo2 = NormalPredictor()
algo2.fit(trainset)

# Per-algorithm result lists.
random = []
knn = []
svd = []

with open("ml-100k/u.data", encoding='ISO-8859-1') as f:  # <-- use full set
    info = f.readlines()

infoList = tqdm(info)
for message in infoList:
    # Each line is tab-separated; only the first three fields are kept.
    message = message.split("\t")
    message = message[0:3]
    u = message[0]
    m = message[1]
    knnPred = algo.predict(str(u), str(m))
reader = Reader(rating_scale=(0, 489)) # load data as Dataset for surprise library data = Dataset.load_from_df(orig_data[['Customer', 'Product', 'Duration']], reader) # In[10]: benchmark = [] # Iterate over all algorithms for algorithm in [ SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering() ]: # Perform cross validation print('Executing' + str(algorithm)) results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
def train_surprise(self, model_type, trainset, testset, k_recommend, sql_db,
                   k_fold, knowledge, model_name, result_name,
                   system_eval=False):
    """Train one Surprise algorithm, store its predictions and evaluate it.

    Hyper-parameters are read from ``self.config``. When the matching
    ``<algorithm>_params`` SQL table is available and holds a row for
    ``knowledge``, the tuned values of the first such row replace the
    configured ones.

    Args:
        model_type: Case-sensitive algorithm selector: ``"svd"``,
            ``"SVDpp"``, ``"NMF"``, ``"NormalPredictor"``,
            ``"BaselineOnly"``, ``"KNNBasic"``, ``"KNNWithMeans"``,
            ``"KNNWithZScore"``, ``"KNNBaseline"``, ``"SlopeOne"`` or
            ``"CoClustering"``.
        trainset: Passed to ``algo.fit``.
        testset: Passed to ``algo.test``.
        k_recommend: Forwarded unchanged to the evaluation step.
        sql_db: Connection used to look up stored hyper-parameters; also
            forwarded to the evaluation step.
        k_fold: Fold identifier; names the predictions sub-directory.
        knowledge: Label selecting the stored hyper-parameter row.
        model_name: Used in log messages, timing events and as the output
            directory name under ``self.models_path``.
        result_name: Forwarded unchanged to the evaluation step.
        system_eval: When false (the default) the predictions are evaluated
            through ``self.evaluation.surprise_process_evaluate``;
            otherwise evaluation is skipped.

    Returns:
        dict: ``{"status": bool, "result": ...}``. ``result`` is an empty
        string on success, an error message, or the ``OSError`` raised
        while creating an output directory.
    """

    def hyper_params(section, prefix, casts, table, algorithm, tunable):
        """Return constructor kwargs from config, overridden by stored tuning."""
        options = self.config[section]
        params = {}
        for name, cast in casts:
            key = prefix + name
            if cast is bool:
                params[name] = options.getboolean(key)
            else:
                params[name] = cast(options[key])

        # The helper's return type is not visible from here, so the strict
        # comparison is kept instead of a plain truthiness test.
        if self.common_functions.validate_available_sql_data(
                table, sql_db) == True:  # noqa: E712
            stored = pd.read_sql_query('select * from ' + table + ';',
                                       sql_db, index_col='index')
            stored = stored[(stored["knowledge"] == knowledge)
                            & (stored["algorithm"] == algorithm)]
            if not stored.empty:
                best = stored.iloc[0]
                cast_of = dict(casts)
                for name in tunable:
                    params[name] = cast_of[name](best[prefix + name])
        return params

    def knn_params():
        """Return ``k``/``min_k`` for the KNN variant named by model_type."""
        variant = model_type.lower()
        return hyper_params('SURPRISE_KNN', 'knn_',
                            [('k', int), ('min_k', int)],
                            variant + '_params', variant, ('k', 'min_k'))

    def ensure_dir(path):
        """Create ``path`` if missing; return the OSError on failure."""
        if os.path.isdir(path):
            return None
        try:
            os.makedirs(path)
        except OSError as e:
            # A concurrent creation of the same directory is not an error.
            if e.errno != errno.EEXIST:
                return e
        return None

    knn_section = self.config['SURPRISE_KNN']
    sim_options = {
        'name': knn_section['knn_similarity'],
        'user_based': knn_section.getboolean('knn_user_based'),
    }
    verbose_switch = self.config['DEFAULT'].getboolean('verbose_switch')

    # Select the model. Class names are only resolved in the branch taken.
    if model_type == "svd":
        params = hyper_params(
            'SURPRISE_SVD', 'svd_',
            [('n_factors', int), ('n_epochs', int), ('biased', bool),
             ('init_mean', float), ('init_std_dev', float),
             ('lr_all', float), ('reg_all', float)],
            'svd_params', 'svd',
            ('n_factors', 'n_epochs', 'init_std_dev', 'lr_all', 'reg_all'))
        algo = SVD(verbose=verbose_switch, **params)
    elif model_type == "SVDpp":
        params = hyper_params(
            'SURPRISE_SVDPP', 'svdpp_',
            [('n_factors', int), ('n_epochs', int), ('init_mean', float),
             ('init_std_dev', float), ('lr_all', float),
             ('reg_all', float)],
            'svdpp_params', 'svdpp',
            ('n_factors', 'n_epochs', 'init_std_dev', 'lr_all', 'reg_all'))
        algo = SVDpp(verbose=verbose_switch, **params)
    elif model_type == "NMF":
        params = hyper_params(
            'SURPRISE_NMF', 'nmf_',
            [('n_factors', int), ('n_epochs', int), ('biased', bool),
             ('reg_pu', float), ('reg_qi', float), ('reg_bu', float),
             ('reg_bi', float), ('lr_bu', float), ('lr_bi', float),
             ('init_low', float), ('init_high', int)],
            'nmf_params', 'nmf',
            ('n_factors', 'n_epochs', 'reg_pu', 'reg_qi', 'init_low'))
        algo = NMF(verbose=verbose_switch, **params)
    elif model_type == "NormalPredictor":
        algo = NormalPredictor()
    elif model_type == "BaselineOnly":
        algo = BaselineOnly(verbose=verbose_switch)
    elif model_type == "KNNBasic":
        algo = KNNBasic(sim_options=sim_options, verbose=verbose_switch,
                        **knn_params())
    elif model_type == "KNNWithMeans":
        algo = KNNWithMeans(sim_options=sim_options, verbose=verbose_switch,
                            **knn_params())
    elif model_type == "KNNWithZScore":
        algo = KNNWithZScore(sim_options=sim_options,
                             verbose=verbose_switch, **knn_params())
    elif model_type == "KNNBaseline":
        algo = KNNBaseline(sim_options=sim_options, verbose=verbose_switch,
                           **knn_params())
    elif model_type == "SlopeOne":
        algo = SlopeOne()
    elif model_type == "CoClustering":
        params = hyper_params(
            'SURPRISE_COCLUSTERING', 'cc_',
            [('n_cltr_u', int), ('n_cltr_i', int), ('n_epochs', int)],
            'coclustering_params', 'coclustering',
            ('n_cltr_u', 'n_cltr_i', 'n_epochs'))
        algo = CoClustering(verbose=verbose_switch, **params)
    else:
        return {
            "status": False,
            "result": "Defined model_type does not exist"
        }

    # Train the model and record how long training took.
    st = default_timer()
    print("STARTING to train model: " + str(model_name))
    algo.fit(trainset)
    train_model_runtime = default_timer() - st
    self.common_functions.save_process_time(
        st,
        event=str(model_name) + "_training",
        description="Time for model to be trained on dataset")

    # The model directory is created even though the model itself is not
    # dumped to disk; the predictions directory below lives inside it.
    model_dir = self.models_path + model_name
    error = ensure_dir(model_dir)
    if error is not None:
        return {"status": False, "result": error}

    # Generate predictions for the test set.
    st = default_timer()
    print("STARTING to generate predictions with the trained model: " +
          str(model_name))
    predictions = algo.test(testset)
    runtime = default_timer() - st
    print(
        "Tiempo de ejecucion total de la generacion de predicciones para Surprise Time:",
        round(runtime, 2))
    self.common_functions.save_process_time(
        st,
        event=str(model_name) + "_generate_recommendations",
        description="Time for predictions to be generated using the model")

    # Store the predictions so they can be reused for hybridisation.
    predictions_dir = model_dir + "/predictions/" + str(k_fold)
    error = ensure_dir(predictions_dir)
    if error is not None:
        return {"status": False, "result": error}

    # Build the frame in one step; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    eval_result = pd.DataFrame(
        [(uid, iid, true_r, est) for uid, iid, true_r, est, _ in predictions],
        columns=['user_id', 'item_id', 'r_ui', 'est'])
    # str(';') is a native one-character string on Python 2 and 3; the
    # previous .encode('utf-8') produced bytes on Python 3, which to_csv
    # rejects as a delimiter.
    eval_result.to_csv(path_or_buf=predictions_dir + "/predictions.csv",
                       encoding='latin1',
                       sep=str(';'),
                       index=False)

    if not system_eval:
        # Process and evaluate the recommendations of this model.
        st = default_timer()
        print("STARTING to evaluate recommendations with model: " +
              str(model_name))
        process_evaluate_result = self.evaluation.surprise_process_evaluate(
            predictions,
            knowledge,
            model_name,
            result_name,
            train_model_runtime,
            k_recommend,
            sql_db,
            k_fold,
            is_surprise=True)
        self.common_functions.save_process_time(
            st,
            event=str(model_name) + "_evaluate_model",
            description="Time for model to be evaluated in test dataset")
        if process_evaluate_result["status"]:
            return {"status": True, "result": ""}
        return {
            "status": False,
            "result": "no se pudo ejecutar correctamente content_explicit"
        }

    print("decide what to do")
    return {"status": True, "result": ""}
""" This module descibes how to load a dataset from a pandas dataframe. """ from __future__ import (absolute_import, division, print_function, unicode_literals) import pandas as pd from surprise import NormalPredictor from surprise import Dataset from surprise import Reader # Dummy algo algo = NormalPredictor() # Creation of the dataframe. Column names are irrelevant. ratings_dict = {'itemID': [1, 1, 1, 2, 2], 'userID': [9, 32, 2, 45, 'user_foo'], 'rating': [3, 2, 4, 3, 1]} df = pd.DataFrame(ratings_dict) # A reader is still needed but only the rating_scale param is requiered. reader = Reader(rating_scale=(1, 5)) # The columns must correspond to user id, item id and ratings (in that order). data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader) data.split(2) # data can now be used normally for trainset, testset in data.folds(): algo.fit(trainset)
# * using reader to be able to deal with the imported CSV reader = Reader( line_format="user item rating timestamp", sep=",", rating_scale=(1, 5), skip_lines=1 ) # * loading the csv data = Dataset.load_from_file( file_path="../../ML_Dataset/ml-latest-small/ratings.csv", reader=reader ) # * dividing in train and test sets trainset, testset = train_test_split(data, test_size=0.25) # * define a cross-validation iterator kf = KFold(n_splits=5) # * Choosing Normal Predictor as algorithm algo = NormalPredictor() # * Train the algorithm on the trainset, and predict ratings for the testset for trainset, testset in kf.split(data): predictions = algo.fit(trainset).test(testset) precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4) accuracy.rmse(predictions) accuracy.mae(predictions) accuracy.mse(predictions) accuracy.fcp(predictions) print("Precision: ", sum(prec for prec in precisions.values()) / len(precisions)) print("Recall: ", sum(rec for rec in recalls.values()) / len(recalls)) df = pd.DataFrame(predictions, columns=["uid", "iid", "rui", "est", "details"]) df["err"] = abs(df.est - df.rui) df.to_csv("predictions_Normal.csv")
# Factorise the rating matrix R into N x K and M x K factors and rebuild the
# dense prediction matrix nR from them.
N = len(R)
M = len(R[0])
K = 5
P = numpy.random.rand(N, K)
Q = numpy.random.rand(M, K)
nP, nQ = matrix_factorization(R, P, Q, K)
nR = numpy.dot(nP, nQ.T)

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

# `user_based` must be passed inside `sim_options`; as a bare keyword argument
# Surprise ignores it and the model silently stays user-based.
algo = KNNBasic(k=17, sim_options={'user_based': False})
algo.fit(trainset)
algo1 = SVD()
algo1.fit(trainset)
algo2 = NormalPredictor()
algo2.fit(trainset)

# Per-algorithm result lists.
mf = []
knn = []
svd = []
random = []

with open("ml-100k/u.data", encoding='ISO-8859-1') as f:  # <-- use full set
    info = f.readlines()

infoList = tqdm(info)
for message in infoList:
    # Each line is tab-separated; only the first three fields are kept.
    message = message.split("\t")
    message = message[0:3]
    # Ids in the file are 1-based, rows/columns of nR are 0-based.
    mf.append(int(nR[int(message[0]) - 1][int(message[1]) - 1]))
    knnPred = algo.predict(message[0], message[1])
    svdPred = algo1.predict(message[0], message[1])
doTopN = True

# Seed both random number generators for reproducibility.
np.random.seed(0)
random.seed(0)

# Lift every pandas display limit so frames are printed in full.
for display_option in ('display.max_rows', 'display.max_columns',
                       'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Algorithms taking part in the comparison.
SVD_Algorithm = SVD(random_state=10)
# SVDpp_Algorithm = SVDpp(random_state=10)
Normal_Predictor = NormalPredictor()

# (algorithm, label) pairs handed to the comparer.
algo_comparison_set = [
    (SVD_Algorithm, "SVD"),
    (Normal_Predictor, "Normal"),
]

# Data source and comparer.
recommenderData = RecommenderData(ratingsPath, moviesPath, verbose=True)
recommenderComparer = RecommenderComparer(recommenderData,
                                          algo_comparison_set)

# Run the comparison.
comparison = recommenderComparer.Compare(
    doTopN,
    verbose=True,
    sample_topN_for_userIDs=Test_userIDs,
)
# comparison["0000"] = {"sample_topn": }
data6 = Dataset.load_from_file(file_path6, reader=reader)

# Sample a random trainset and testset; the test set holds 25% of the ratings.
trainset6, testset6 = train_test_split(data6, test_size=.25)

# Algorithm whose RMSE is computed. SVD, BaselineOnly, KNNBasic, SlopeOne,
# CoClustering, SVDpp and NMF were previously constructed here and discarded
# at once, so only this last choice ever ran; swap the class to evaluate
# another algorithm.
algo = NormalPredictor()

# Time training and prediction only, so the measurement is not inflated by
# the metric computation, and train the model a single time.
start = time.time()
predictions = algo.fit(trainset6).test(testset6)
end = time.time()
elapsed = end - start
print(elapsed)

# Then compute RMSE
accuracy.rmse(predictions)

predictions
# Figure 2: number of ratings each movie received.
f2 = plt.figure(2)
plt.plot(ratings_per_movie, 'o', color='blue')
plt.xlabel('Item ID')
plt.ylabel('Number of ratings per movie')

# Figure 3: number of ratings each user gave.
f3 = plt.figure(3)
plt.plot(ratings_per_user, 'o', color='red')
plt.xlabel('User ID')
plt.ylabel('Number of ratings per user')

plt.show()

# Both models are cross-validated with k=5 folds, measuring the RMSE.
# Note: verbose=True may print a lot; set it to False to silence the output.
cv_settings = dict(measures=['RMSE'], cv=5, verbose=True)

# Model 1: Random
model_random = NormalPredictor()
print('Model creation successful!')
model_random_results = cross_validate(model_random, data, **cv_settings)
print('Model training successful!')

# Model 2: User-Based Collaborative Filtering
model_user = KNNBasic(sim_options=dict(user_based=True))
print('Model creation successful!')
model_user_results = cross_validate(model_user, data, **cv_settings)
# After running testFactor to derive the appropriate factor, run surpriseLab
# to get the 5-fold cross-validation results for the Random, KNN and SVD
# predictors. The parameters are set according to testFactor.
dataDir = ("ml-100k/")
reader = Reader('ml-100k')
train_file = dataDir + 'u%d.base'
test_file = dataDir + 'u%d.test'
# The five predefined (train, test) file pairs u1..u5.
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]
data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo3 = SVD()
# `user_based` must be passed inside `sim_options`; as a bare keyword argument
# Surprise ignores it and the model silently stays user-based.
algo2 = KNNBasic(k=17, sim_options={'user_based': False})
algo1 = NormalPredictor()

i = 0
for trainset, testset in pkf.split(data):
    i += 1
    print("Random", i)
    # train and test algorithm.
    algo1.fit(trainset)
    predictions = algo1.test(testset)
    # Compute and print Root Mean Squared Error and Mean Absolute Error
    accuracy.rmse(predictions, verbose=True)
    accuracy.mae(predictions, verbose=True)
    print("KNN", i)
''' #algo = BaselineOnly(bsl_options=bsl_options) #algo = BaselineOnly() ''' Estimating biases using als... RMSE: 0.8657 Estimating biases using als... RMSE: 0.8662 Estimating biases using als... RMSE: 0.8659 user: 196 item: 302 r_ui = 4.00 est = 4.19 {'was_impossible': False} ''' algo = NormalPredictor() ''' RMSE: 1.4326 RMSE: 1.4333 RMSE: 1.4316 user: 196 item: 302 r_ui = 4.00 est = 4.84 {'was_impossible': False} ''' # 定义K折交叉验证迭代器,K=3 ''' 交叉验证(Cross Validation)为CV。 基本思想:将原始数据进行分组,一部分作为训练集,另一部分作为测试集,首先用训练集对分类器进行训练,再利用验证集来测试训练 得到的模型,以此作为评价分类器的性能指标。 Kfold: 原始数据分成K组(一般是均分),将每个子集数据分别做一次验证集,其余的k-1组子集数据作为训练集, 这样会得到k个模型,用这k个模型最终的验证集的分类准确率的平均数作为此K-CV下分类器的性能指标。
def load_movielens():
    """Load the MovieLens ratings together with the movie popularity ranks.

    Returns:
        A ``(ml, data, rankings)`` tuple: the ``MovieLens`` helper, the
        result of its ``load()`` and the result of ``get_popularity_ranks()``.
    """
    lens = MovieLens()

    print("Loading movie ratings...")
    ratings = lens.load()

    print("\nComputing movie popularity ranks so we can measure novelty later...")
    return lens, ratings, lens.get_popularity_ranks()


if __name__ == '__main__':
    # One shared data set for every recommender algorithm registered below.
    ml, data, rankings = load_movielens()

    # The evaluator receives the ratings and the popularity ranks.
    evaluator = Evaluator(data, rankings)

    # The two KNN variants differ only in the `user_based` flag.
    user_knn = KNNBasic(sim_options=dict(name='cosine', user_based=True))
    item_knn = KNNBasic(sim_options=dict(name='cosine', user_based=False))
    evaluator.add_algorithm(user_knn, "User KNN")
    evaluator.add_algorithm(item_knn, "Item KNN")

    # Random recommendations, registered next to the KNN variants.
    evaluator.add_algorithm(NormalPredictor(), "Random")

    evaluator.evaluate(False)
    evaluator.sample_topn_recs(ml)
from surprise import NormalPredictor, BaselineOnly, accuracy, KNNBasic, KNNWithMeans, KNNBaseline, SVD
from surprise.model_selection import train_test_split, GridSearchCV, KFold
import random
import pandas as pd
import numpy as np

# Seed both random number generators and widen the pandas column display.
np.random.seed(0)
random.seed(0)
pd.set_option('display.max_columns', 500)

data, items, ratings = GetBookData(density_filter=True)
trainset, testset = train_test_split(data, test_size=0.2)

# Metrics and top-N lists, keyed by algorithm name.
results = {}
top_n = {}

# --- Normal Predictor ---
norm = NormalPredictor()
norm.fit(trainset)
norm_pred = norm.test(testset)

rmse = accuracy.rmse(norm_pred)
precisions, recalls = precision_recall_at_k(norm_pred, k=10, threshold=4.5)

# Precision and recall are averaged over the per-key values returned above.
avg_precision = sum(precisions.values()) / len(precisions)
avg_recall = sum(recalls.values()) / len(recalls)

metrics = dict(rmse=rmse, avg_precision=avg_precision, avg_recall=avg_recall)
results['NormalPredictor'] = metrics
top_n['NormalPredictor'] = get_top_n(norm_pred, n=10)