def test_cross_validate(toy_data):

    # First test with a specified CV iterator.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'], cv=pkf,
                            verbose=1)
    # Basically just test that keys (don't) exist as they should
    assert len(ret['test_rmse']) == 1
    assert len(ret['test_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_rmse' not in ret
    assert 'train_mae' not in ret

    # Test that 5-fold CV is used when cv=None.
    # Also check that train_* keys exist when return_train_measures is True.
    ret = ms.cross_validate(algo, toy_data, measures=['rmse', 'mae'], cv=None,
                            return_train_measures=True, verbose=True)
    assert len(ret['test_rmse']) == 5
    assert len(ret['test_mae']) == 5
    assert len(ret['fit_time']) == 5
    assert len(ret['test_time']) == 5
    assert len(ret['train_rmse']) == 5
    assert len(ret['train_mae']) == 5
def test_user_based_field(u1_ml100k, pkf):
    """Ensure that the user_based field is taken into account (only) when
    needed."""
    algorithms = (KNNBasic, KNNWithMeans, KNNBaseline)
    for klass in algorithms:
        algo = klass(sim_options={'user_based': True})
        rmses_user_based = cross_validate(algo, u1_ml100k, ['rmse'],
                                          pkf)['test_rmse']
        algo = klass(sim_options={'user_based': False})
        rmses_item_based = cross_validate(algo, u1_ml100k, ['rmse'],
                                          pkf)['test_rmse']
        assert rmses_user_based != rmses_item_based
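# The u1_ml100k and pkf names used throughout these tests are pytest
# fixtures that are not shown in this section. A minimal conftest.py
# sketch that would make them resolve; the fold file names and reader
# settings are assumptions, not taken from the source:
import os

import pytest

from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold


@pytest.fixture
def u1_ml100k():
    """A single predefined train/test fold of the ml-100k 'u1' split."""
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/u1_ml100k_train',
                    current_dir + '/u1_ml100k_test')]
    return Dataset.load_from_folds(folds_files, Reader('ml-100k'))


@pytest.fixture
def pkf():
    """A CV iterator that replays the predefined folds above."""
    return PredefinedKFold()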
def test_SVDpp_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVDpp(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = SVDpp(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # The rest is OK but just takes too long for now...
def trainer(self):
    # Pick and set the random seed used by Python's random module and by
    # numpy (which Surprise uses internally). numpy requires a seed
    # strictly below 2**32.
    my_seed = random.randint(0, 2**32 - 1)
    random.seed(my_seed)
    numpy.random.seed(my_seed)

    # Reassurance that the script is actually running.
    self.printer(
        "\nNow training on the MovieLens latest small dataset. (8 folds used)")
    self.printer("Please wait...\n")

    # Define the file's format.
    reader = Reader(line_format='user item rating timestamp', sep=',')

    # Load the data from the ratings.csv file.
    data = Dataset.load_from_file('./ml-latest-small/ratings.csv',
                                  reader=reader)

    # Use the SVD algorithm for prediction.
    method = SVD()

    start = time.time()

    # Use 8-fold cross-validation and evaluate the results with RMSE and MAE.
    measurements = cross_validate(method, data, measures=['RMSE', 'MAE'],
                                  cv=8, verbose=False, n_jobs=-2,
                                  return_train_measures=True)

    # Print the random seed used for fold assignment.
    self.printer("Random seed used for fold assignment: {}\n".format(my_seed))

    # Show the stats.
    meanFitTime = numpy.mean(measurements["fit_time"])
    meanTestTime = numpy.mean(measurements["test_time"])
    meanTestMAE = numpy.mean(measurements["test_mae"])
    meanTestRMSE = numpy.mean(measurements["test_rmse"])
    meanTrainMAE = numpy.mean(measurements["train_mae"])
    meanTrainRMSE = numpy.mean(measurements["train_rmse"])
    self.printer(
        "Mean fit time per fold: {:0.5f} seconds".format(meanFitTime))
    self.printer(
        "Mean test time per fold: {:0.5f} seconds".format(meanTestTime))
    self.printer("Mean train MAE per fold: {:0.5f}".format(meanTrainMAE))
    self.printer("Mean train RMSE per fold: {:0.5f}".format(meanTrainRMSE))
    self.printer("Mean test MAE per fold: {:0.5f}".format(meanTestMAE))
    self.printer("Mean test RMSE per fold: {:0.5f}\n".format(meanTestRMSE))

    # Train on the full dataset.
    trainset = data.build_full_trainset()
    method.fit(trainset)

    end = time.time()
    spent = end - start
    self.printer(
        "Training and testing time: {:0.3f} seconds\n".format(spent))

    process = psutil.Process(os.getpid())
    self.printer("Memory used:")
    self.printer("{:0.5f}".format(process.memory_info().rss / 1048576.0)
                 + " MB Physical")
    self.printer("{:0.5f}".format(process.memory_info().vms / 1048576.0)
                 + " MB Virtual")

    return method, trainset
def test_shrinkage_field(u1_ml100k, pkf):
    """Ensure the shrinkage field is taken into account."""
    sim_options = {'name': 'pearson_baseline',
                   'shrinkage': 0}
    bsl_options = {'n_epochs': 1}
    algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
    rmse_shrinkage_0 = cross_validate(algo, u1_ml100k, ['rmse'],
                                      pkf)['test_rmse']

    sim_options = {'name': 'pearson_baseline',
                   'shrinkage': 100}
    bsl_options = {'n_epochs': 1}
    algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
    rmse_shrinkage_100 = cross_validate(algo, u1_ml100k, ['rmse'],
                                        pkf)['test_rmse']

    assert rmse_shrinkage_0 != rmse_shrinkage_100
def test_sgd_reg_field():
    """Ensure the reg field is taken into account."""
    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   'reg': 0.02,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_reg_002 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   'reg': 1,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_reg_1 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    assert rmse_sgd_reg_002 != rmse_sgd_reg_1
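# The fixture-less tests in this section (this one and the other als/sgd
# tests below) reference module-level data and pkf objects defined
# elsewhere in their file. A sketch of that setup; the fold file names
# are assumptions, not taken from the source:
import os

from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold

train_file = os.path.join(os.path.dirname(__file__), 'u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), 'u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))
pkf = PredefinedKFold()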
def repeat(algo_type, frame, min_, max_):
    reader = Reader(rating_scale=(min_, max_))
    data = Dataset.load_from_df(frame, reader=reader)
    algo = algo_type

    # Evaluate with 3-fold cross-validation...
    print(cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3,
                         verbose=True))

    # ...then predict a single known rating. Note that after
    # cross_validate, algo holds the fit from the last fold only.
    user_id = 'A3R5OBKS7OM2IR'
    movie_id = 'Movie1'
    rating = 5.0
    algo.predict(user_id, movie_id, r_ui=rating, verbose=True)
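# A hypothetical call to repeat(). The randomly generated DataFrame
# below is purely illustrative; load_from_df expects the columns in
# user, item, rating order, which repeat() relies on.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
ratings_frame = pd.DataFrame({
    'user': rng.integers(0, 50, size=500),
    'item': rng.integers(0, 100, size=500),
    'rating': rng.integers(1, 6, size=500),
})
repeat(SVD(), ratings_frame, 1, 5)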
def test_als_n_epochs_field():
    """Ensure the n_epochs field is taken into account."""
    bsl_options = {'method': 'als',
                   'n_epochs': 1,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_n_epochs_1 = cross_validate(algo, data, ['rmse'],
                                         pkf)['test_rmse']

    bsl_options = {'method': 'als',
                   'n_epochs': 5,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_n_epochs_5 = cross_validate(algo, data, ['rmse'],
                                         pkf)['test_rmse']

    assert rmse_als_n_epochs_1 != rmse_als_n_epochs_5
def test_sgd_learning_rate_field():
    """Ensure the learning_rate field is taken into account."""
    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   'learning_rate': .005,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_lr_005 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   'learning_rate': .00005,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_lr_00005 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    assert rmse_sgd_lr_005 != rmse_sgd_lr_00005
def do_predict():
    ratings_dic = {"userId": userGroupId,
                   "itemId": ingredientId,
                   "rating": ratings}
    df = pd.DataFrame(ratings_dic)

    reader = Reader(rating_scale=(1, 4))
    data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    # Report cross-validated RMSE (root mean square error) and MAE
    # (mean absolute error).
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    get_top_n(predictions)
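# get_top_n is not defined in this section. A common definition, adapted
# from the Surprise FAQ, that returns each user's n highest-estimated
# items from a list of predictions:
from collections import defaultdict


def get_top_n(predictions, n=10):
    """Map each user id to a list of (item id, estimated rating) tuples,
    keeping only the n highest estimates per user."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n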
def getResultFilerCollaborative(self):
    # Instantiate a reader.
    reader = Reader()

    # Instantiate the singular value decomposition algorithm.
    svd = SVD()

    # Get the data from the database.
    # self.ratings_data = pd.read_csv('calificaciones.csv')

    # Build a dataset of (user id, questionnaire id, rating) values.
    data = Dataset.load_from_df(
        self.ratings_data[['id_user', 'id_quest', 'calificacion']], reader)

    # Build a training set.
    # trainset = data.build_full_trainset()

    # Train the algorithm on the training set.
    # svd.fit(trainset)

    # Evaluate the predictions with RMSE and MAE.
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    return svd
def test_sgd_n_epoch_field():
    """Ensure the n_epochs field is taken into account."""
    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_1 = \
        cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                       pkf)['test_neg_rmse']

    bsl_options = {'method': 'sgd',
                   'n_epochs': 20,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_20 = \
        cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                       pkf)['test_neg_rmse']

    assert rmse_sgd_n_epoch_1 != rmse_sgd_n_epoch_20
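# neg_rmse is not defined in this section. The [['neg_rmse', neg_rmse]]
# syntax suggests a cross_validate variant that accepts (name, callable)
# measure pairs; a plausible definition under that assumption, negating
# RMSE so that higher values mean better performance:
from surprise import accuracy


def neg_rmse(predictions):
    # accuracy.rmse computes the root mean squared error of a list of
    # Prediction objects; negate it to turn the loss into a score.
    return -accuracy.rmse(predictions, verbose=False)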
def objective(**params):
    print(params)
    svd_algo = SVD(**params, random_state=8)
    results = cross_validate(svd_algo, data, measures=['rmse'], cv=5,
                             n_jobs=-1)
    return np.mean(results['test_rmse'])
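# The **params signature suggests this objective is driven by a
# hyperparameter optimizer. A sketch of how it could be wired up with
# scikit-optimize; the search space below is an assumption, not taken
# from the source:
from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args

space = [Integer(20, 200, name='n_factors'),
         Real(0.001, 0.02, prior='log-uniform', name='lr_all'),
         Real(0.01, 0.1, prior='log-uniform', name='reg_all')]

# use_named_args converts the optimizer's point list into keyword
# arguments, matching the objective's **params signature.
decorated_objective = use_named_args(space)(objective)
result = gp_minimize(decorated_objective, space, n_calls=20, random_state=8)
print('Best mean test RMSE:', result.fun)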
def test_als_reg_i_field():
    """Ensure the reg_i field is taken into account."""
    bsl_options = {'method': 'als',
                   'reg_i': 0,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_0 = \
        cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                       pkf)['test_neg_rmse']

    bsl_options = {'method': 'als',
                   'reg_i': 10,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_10 = \
        cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                       pkf)['test_neg_rmse']

    assert rmse_als_regi_0 != rmse_als_regi_10
def test_cross_validate():

    # First test with a specified CV iterator.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'], cv=pkf,
                            verbose=1)
    # Basically just test that keys (don't) exist as they should
    assert len(ret['test_rmse']) == 1
    assert len(ret['test_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_rmse' not in ret
    assert 'train_mae' not in ret

    # Test that 5-fold CV is used when cv=None.
    # Also check that train_* keys exist when return_train_measures is True.
    data = Dataset.load_from_file(current_dir + '/custom_dataset', reader)
    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'], cv=None,
                            return_train_measures=True, verbose=True)
    assert len(ret['test_rmse']) == 5
    assert len(ret['test_mae']) == 5
    assert len(ret['fit_time']) == 5
    assert len(ret['test_time']) == 5
    assert len(ret['train_rmse']) == 5
    assert len(ret['train_mae']) == 5
def model(data, datacsv):
    """Build the model."""
    # Use the SVD algorithm.
    algo = SVD()
    # Run 5-fold cross-validation.
    # print(cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5,
    #                      verbose=True))
    print(cross_validate(NormalPredictor(), datacsv, cv=2))
def five_fold(df):
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user id', 'movie id', 'rating']], reader)
    algo = SVD()
    out = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5,
                         verbose=True)
    return algo.fit(data.build_full_trainset())
def KNN_best(data, opt=True):
    sim_options = {"name": "msd",
                   "min_support": 3,
                   "user_based": opt}
    algo = KNNWithMeans(sim_options=sim_options)
    pred = cross_validate(algo, data, measures=['mse'], cv=5, verbose=True)
    mean = np.mean(pred['test_mse'])
    return mean
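# A hypothetical comparison using KNN_best to decide between user-based
# and item-based similarities (lower mean MSE wins); `data` is assumed
# to be any loaded Surprise Dataset.
user_mse = KNN_best(data, opt=True)
item_mse = KNN_best(data, opt=False)
print('user-based' if user_mse < item_mse else 'item-based',
      'KNNWithMeans scores better on this dataset')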
def SVDpp_calculation(data, trainset, testset, time, cv):
    # Note: `time` is expected to be the standard time module, passed in
    # by the caller; the name is rebound to the elapsed duration below.
    start = time.time()
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    cross_validate_svdpp_dict = cross_validate(algo, data, measures=['RMSE'],
                                               cv=cv, verbose=True)
    end = time.time()
    time = end - start
    return time, cross_validate_svdpp_dict
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare the input DataFrame and algorithm.
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5-fold cross-validation.
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5,
                           verbose=True)
def test_sgd_n_epoch_field(u1_ml100k, pkf):
    """Ensure the n_epochs field is taken into account."""
    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_1 = cross_validate(algo, u1_ml100k, ['rmse'],
                                        pkf)['test_rmse']

    bsl_options = {'method': 'sgd',
                   'n_epochs': 20,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_20 = cross_validate(algo, u1_ml100k, ['rmse'],
                                         pkf)['test_rmse']

    assert rmse_sgd_n_epoch_1 != rmse_sgd_n_epoch_20
def test_method_field():
    """Ensure the method field is taken into account."""
    bsl_options = {'method': 'als'}
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als = cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                              pkf)['test_neg_rmse']

    bsl_options = {'method': 'sgd'}
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd = cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                              pkf)['test_neg_rmse']

    assert rmse_als != rmse_sgd

    with pytest.raises(ValueError):
        bsl_options = {'method': 'wrong_name'}
        algo = BaselineOnly(bsl_options=bsl_options)
        cross_validate(algo, data, [['neg_rmse', neg_rmse]],
                       pkf)['test_neg_rmse']
def evaluate(self, test_size=.25):
    from surprise.model_selection import cross_validate, train_test_split
    from surprise import accuracy

    recommendation_dataset = RecommendationsDataset()
    cross_validate(self.algorithm, recommendation_dataset.dataset,
                   measures=['RMSE', 'MSE'], cv=5, verbose=True)

    train, test = train_test_split(recommendation_dataset.dataset,
                                   test_size=test_size)
    self.fit(train)
    test_predictions = self.test(test)

    # Report the hold-out results.
    print("MAE: ", accuracy.mae(test_predictions, verbose=0))
    print("RMSE: ", accuracy.rmse(test_predictions, verbose=0))
def test_als_reg_i_field(u1_ml100k, pkf):
    """Ensure the reg_i field is taken into account."""
    bsl_options = {'method': 'als',
                   'reg_i': 0,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_0 = cross_validate(algo, u1_ml100k, ['rmse'],
                                     pkf)['test_rmse']

    bsl_options = {'method': 'als',
                   'reg_i': 10,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_10 = cross_validate(algo, u1_ml100k, ['rmse'],
                                      pkf)['test_rmse']

    assert rmse_als_regi_0 != rmse_als_regi_10
def worker(self, algo, sema):
    sema.acquire()
    print("worker :: " + algo)
    A = self.algo_map.get(algo)
    data_full = pd.read_csv(self.city + '-reviews-user-business.csv')
    data_required = data_full[['user_id', 'business_id', 'stars']]
    reader = Reader(rating_scale=(1.0, 5.0))
    data = Dataset.load_from_df(data_required, reader)
    cv_results = cross_validate(A.algo, data, measures=['RMSE', 'MAE'],
                                cv=5, n_jobs=1, verbose=False)
    res_df = pd.DataFrame.from_dict(cv_results).mean(axis=0)
    self.return_dict[algo] = res_df
    sema.release()
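# A hypothetical driver for worker(): one thread per algorithm, with a
# semaphore bounding how many cross-validations run at once. The
# `recommender` object and its algo_map/return_dict attributes are
# assumptions based on what worker() uses, not shown in the source.
import threading

sema = threading.Semaphore(4)  # at most 4 concurrent workers
threads = [threading.Thread(target=recommender.worker, args=(name, sema))
           for name in recommender.algo_map]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(recommender.return_dict)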
def test_CoClustering_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = CoClustering(n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_cltr_u
    algo = CoClustering(n_cltr_u=1, n_epochs=1, random_state=1)
    rmse_n_cltr_u = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_cltr_u

    # n_cltr_i
    algo = CoClustering(n_cltr_i=1, n_epochs=1, random_state=1)
    rmse_n_cltr_i = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_cltr_i

    # n_epochs
    algo = CoClustering(n_epochs=2, random_state=1)
    rmse_n_epochs = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_epochs
def cv_multiple_models(data, models_dict, cv=3):
    results = pd.DataFrame()
    for model_name, model in models_dict.items():
        print('\n---> CV for %s...' % model_name)
        cv_results = cross_validate(model, data, cv=cv)
        tmp = pd.DataFrame(cv_results).mean()
        tmp['model'] = model_name
        # DataFrame.append was removed in pandas 2.0; concatenate a
        # one-row frame instead.
        results = pd.concat([results, tmp.to_frame().T], ignore_index=True)
    return results
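# A hypothetical comparison run; the particular models chosen here are
# illustrative, not from the source.
from surprise import Dataset, KNNBasic, NormalPredictor, SVD

data = Dataset.load_builtin('ml-100k')
models = {'SVD': SVD(), 'KNNBasic': KNNBasic(), 'random': NormalPredictor()}
print(cv_multiple_models(data, models, cv=3))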
def collaborativeFiltering():
    # Data collection.
    df1 = pd.read_csv('tmdb_5000_credits.csv')
    df2 = pd.read_csv('tmdb_5000_movies.csv')

    # Join the two datasets on the 'id' column.
    df1.columns = ['id', 'title', 'cast', 'crew']
    df2 = df2.merge(df1.drop('title', axis=1), on='id')

    ratings = pd.read_csv('ml-latest-small/ratings.csv')
    reader = Reader()
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    # data.split(n_folds=5)
    svd = SVD()

    # Run 5-fold cross-validation; the root mean square error comes out
    # around 0.869 here, which is good.
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)

    # Now train on the full dataset.
    trainset = data.build_full_trainset()
    svd.fit(trainset)
def test_CoClustering_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = CoClustering(n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    # n_cltr_u
    algo = CoClustering(n_cltr_u=1, n_epochs=1, random_state=1)
    rmse_n_cltr_u = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_cltr_u

    # n_cltr_i
    algo = CoClustering(n_cltr_i=1, n_epochs=1, random_state=1)
    rmse_n_cltr_i = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_cltr_i

    # n_epochs
    algo = CoClustering(n_epochs=2, random_state=1)
    rmse_n_epochs = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_epochs
def Recommendation_MatrixFact_SVD(trainset, testset, dataset):
    # Default parameters: n_epochs=20, lr_all=0.005, reg_all=0.02.
    algo_svd = SVD()
    fit_svd = algo_svd.fit(trainset)
    predictions = fit_svd.test(testset)
    print("Matrix Factorization SVD Prediction Accuracy : ",
          accuracy.rmse(predictions, verbose=False))
    print(" ")
    print("-----ACCURACY using SVD------")
    print(" ")
    # print("Evaluate: ", evaluate(algo_svd, dataset, measures=['RMSE']))
    print(cross_validate(algo_svd, dataset, measures=['RMSE', 'MAE'], cv=5,
                         verbose=True))
    print(" ")
    return predictions
def train_nmf(data):
    rmse = []
    mae = []
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        nmf = NMF(n_factors=k)
        temp = cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=10)
        rmse.append(np.mean(temp['test_rmse']))
        mae.append(np.mean(temp['test_mae']))
    print("k-fold validation finished!")
    return (rmse, mae)
def train_knn(data):
    rmse = []
    mae = []
    sim_options = {'name': 'pearson'}
    for k in range(2, 102, 2):
        print("using k = %d" % k)
        knn = KNNWithMeans(k=k, sim_options=sim_options)
        temp = cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=10)
        rmse.append(np.mean(temp['test_rmse']))
        mae.append(np.mean(temp['test_mae']))
    print("k-fold validation finished!")
    return (rmse, mae)
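# A sketch of how the error curves returned by train_knn (or train_nmf)
# might be plotted to pick k; the matplotlib usage is an assumption, not
# from the source.
import matplotlib.pyplot as plt

rmse, mae = train_knn(data)
ks = list(range(2, 102, 2))  # must match the range used in train_knn
plt.plot(ks, rmse, label='RMSE')
plt.plot(ks, mae, label='MAE')
plt.xlabel('number of neighbors k')
plt.ylabel('mean 10-fold CV error')
plt.legend()
plt.show()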
def get_predictions():
    """Get all the predictions and print them on screen."""
    reader = Reader(line_format='user item rating', sep=',',
                    rating_scale=(1, 5))
    data = Dataset.load_from_file(dataset, reader=reader)

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=False)

    top_n = get_top_n(predictions, n=10)
    return top_n
def compare_models(data):
    # Define the probabilistic matrix factorization algorithm.
    matrix_fact = prediction_algorithms.matrix_factorization.SVD(biased=False)

    # Define the user-based collaborative filtering algorithm.
    sim_options = {'name': 'cosine', 'user_based': True}
    user_based = KNNBasic(sim_options=sim_options, verbose=False)

    # Define the item-based collaborative filtering algorithm.
    sim_options = {'name': 'cosine', 'user_based': False}
    item_based = KNNBasic(sim_options=sim_options, verbose=False)

    # Run 5-fold cross-validation on each algorithm and print results.
    cross_validate(algo=matrix_fact, data=data, measures=['rmse', 'mae'],
                   cv=5, verbose=True)
    cross_validate(algo=user_based, data=data, measures=['rmse', 'mae'],
                   cv=5, verbose=True)
    cross_validate(algo=item_based, data=data, measures=['rmse', 'mae'],
                   cv=5, verbose=True)
def test_name_field(u1_ml100k, pkf):
    """Ensure the name field is taken into account."""
    sim_options = {'name': 'cosine'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_cosine = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'msd'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_msd = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'pearson'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_pearson = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'pearson_baseline'}
    bsl_options = {'n_epochs': 1}
    algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
    rmse_pearson_bsl = cross_validate(algo, u1_ml100k, ['rmse'],
                                      pkf)['test_rmse']

    for rmse_a, rmse_b in combinations(
            (rmse_cosine, rmse_msd, rmse_pearson, rmse_pearson_bsl), 2):
        assert (rmse_a != rmse_b)

    with pytest.raises(NameError):
        sim_options = {'name': 'wrong_name'}
        algo = KNNBasic(sim_options=sim_options)
        cross_validate(algo, u1_ml100k, ['rmse'], pkf)
def test_gridsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [1],
                  'init_std_dev': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['mae'],
                      cv=PredefinedKFold(), joblib_verbose=100)
    gs.fit(u1_ml100k)
    best_estimator = gs.best_estimator['mae']

    # Recompute the MAE of best_estimator.
    mae = cross_validate(best_estimator, u1_ml100k, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
def test_randomizedsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one that gives the best score
    (by re-running it)"""

    param_distributions = {'n_epochs': [5], 'lr_all': uniform(0.002, 0.003),
                           'reg_all': uniform(0.04, 0.02), 'n_factors': [1],
                           'init_std_dev': [0]}
    rs = RandomizedSearchCV(SVD, param_distributions, measures=['mae'],
                            cv=PredefinedKFold(), joblib_verbose=100)
    rs.fit(u1_ml100k)
    best_estimator = rs.best_estimator['mae']

    # Recompute the MAE of best_estimator.
    mae = cross_validate(best_estimator, u1_ml100k, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == rs.best_score['mae']
from surprise import SVD
from surprise import Dataset, print_perf
from surprise.model_selection import cross_validate

# Load the built-in movielens dataset.
data = Dataset.load_builtin('ml-100k')

algo = SVD()

# Evaluate the algorithm on the dataset with 3-fold cross-validation,
# using RMSE (root mean squared error) as the measure.
perf = cross_validate(algo, data, measures=['RMSE'], cv=3)

# Print the results.
print_perf(perf)
""" This module describes the most basic usage of Surprise: you define a prediction algorithm, (down)load a dataset and run a cross-validation procedure. """ from __future__ import (absolute_import, division, print_function, unicode_literals) from surprise import SVD from surprise import Dataset from surprise.model_selection import cross_validate # Load the movielens-100k dataset (download it if needed), data = Dataset.load_builtin('ml-100k') # We'll use the famous SVD algorithm. algo = SVD() # Run 5-fold cross-validation and print results cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
def test_SVD_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVD(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = SVD(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # n_epochs
    algo = SVD(n_factors=1, n_epochs=2, random_state=1)
    rmse_n_epochs = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_epochs

    # biased
    algo = SVD(n_factors=1, n_epochs=1, biased=False, random_state=1)
    rmse_biased = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_biased

    # lr_all
    algo = SVD(n_factors=1, n_epochs=1, lr_all=5, random_state=1)
    rmse_lr_all = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_all

    # reg_all
    algo = SVD(n_factors=1, n_epochs=1, reg_all=5, random_state=1)
    rmse_reg_all = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_all

    # lr_bu
    algo = SVD(n_factors=1, n_epochs=1, lr_bu=5, random_state=1)
    rmse_lr_bu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_bu

    # lr_bi
    algo = SVD(n_factors=1, n_epochs=1, lr_bi=5, random_state=1)
    rmse_lr_bi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_bi

    # lr_pu
    algo = SVD(n_factors=1, n_epochs=1, lr_pu=5, random_state=1)
    rmse_lr_pu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_pu

    # lr_qi
    algo = SVD(n_factors=1, n_epochs=1, lr_qi=5, random_state=1)
    rmse_lr_qi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_qi

    # reg_bu
    algo = SVD(n_factors=1, n_epochs=1, reg_bu=5, random_state=1)
    rmse_reg_bu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_bu

    # reg_bi
    algo = SVD(n_factors=1, n_epochs=1, reg_bi=5, random_state=1)
    rmse_reg_bi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_bi

    # reg_pu
    algo = SVD(n_factors=1, n_epochs=1, reg_pu=5, random_state=1)
    rmse_reg_pu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_pu

    # reg_qi
    algo = SVD(n_factors=1, n_epochs=1, reg_qi=5, random_state=1)
    rmse_reg_qi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_qi
from surprise import SVD
from surprise import Dataset, print_perf, Reader
from surprise.model_selection import cross_validate
import os

# Specify the path to the data file.
file_path = os.path.expanduser('mydata.csv')

# Tell the reader what the text format looks like.
reader = Reader(line_format='user item rating', sep=',')

# Load the data.
data = Dataset.load_from_file(file_path, reader=reader)

algo = SVD()

# Evaluate the algorithm on the dataset.
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3)

# Print the results.
print_perf(perf)
        'http://grouplens.org/datasets/movielens/1m'),
}

# Set RNG seeds.
np.random.seed(0)
random.seed(0)

dataset = 'ml-1m'
data = Dataset.load_builtin(dataset)
kf = KFold(random_state=0)  # folds will be the same for all algorithms

table = []
for klass in classes:
    start = time.time()
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)

header = [LINK[dataset],
          'RMSE',
          'MAE',
          'Time',
          ]

print(tabulate(table, header, tablefmt="pipe"))
""" This module descibes how to load a dataset from a pandas dataframe. """ from __future__ import (absolute_import, division, print_function, unicode_literals) import pandas as pd from surprise import NormalPredictor from surprise import Dataset from surprise.model_selection import cross_validate # Creation of the dataframe. Column names are irrelevant. ratings_dict = {'itemID': [1, 1, 1, 2, 2], 'userID': [9, 32, 2, 45, 'user_foo'], 'rating': [3, 2, 4, 3, 1]} df = pd.DataFrame(ratings_dict) # The columns must correspond to user id, item id and ratings (in that order). data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], rating_scale=(1, 5)) # We can now use this dataset as we please, e.g. calling cross_validate cross_validate(NormalPredictor(), data, cv=2)
def main():

    class MyParser(argparse.ArgumentParser):
        '''A parser which prints the help message when an error occurs. Taken
        from
        http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu.'''  # noqa

        def error(self, message):
            sys.stderr.write('error: %s\n' % message)
            self.print_help()
            sys.exit(2)

    parser = MyParser(
        description='Evaluate the performance of a rating prediction ' +
        'algorithm on a given dataset using cross validation. You can use ' +
        'a built-in or a custom dataset, and you can choose to ' +
        'automatically split the dataset into folds, or manually specify ' +
        'train and test files. Please refer to the documentation page ' +
        '(http://surprise.readthedocs.io/) for more details.',
        epilog="""Example:\n
        surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}"
        -load-builtin ml-100k -n-folds 3""")

    algo_choices = {
        'NormalPredictor': NormalPredictor,
        'BaselineOnly': BaselineOnly,
        'KNNBasic': KNNBasic,
        'KNNBaseline': KNNBaseline,
        'KNNWithMeans': KNNWithMeans,
        'SVD': SVD,
        'SVDpp': SVDpp,
        'NMF': NMF,
        'SlopeOne': SlopeOne,
        'CoClustering': CoClustering,
    }

    parser.add_argument('-algo', type=str,
                        choices=algo_choices,
                        help='The prediction algorithm to use. ' +
                        'Allowed values are ' +
                        ', '.join(algo_choices.keys()) + '.',
                        metavar='<prediction algorithm>')

    parser.add_argument('-params', type=str,
                        metavar='<algorithm parameters>',
                        default='{}',
                        help='A kwargs dictionary that contains all the ' +
                        'algorithm parameters. ' +
                        'Example: "{\'n_epochs\': 10}".')

    parser.add_argument('-load-builtin', type=str, dest='load_builtin',
                        metavar='<dataset name>',
                        default='ml-100k',
                        help='The name of the built-in dataset to use. ' +
                        'Allowed values are ' +
                        ', '.join(dataset.BUILTIN_DATASETS.keys()) +
                        '. Default is ml-100k.')

    parser.add_argument('-load-custom', type=str, dest='load_custom',
                        metavar='<file path>',
                        default=None,
                        help='A file path to custom dataset to use. ' +
                        'Ignored if -load-builtin is set. ' +
                        'The -reader parameter needs to be set.')

    parser.add_argument('-folds-files', type=str, dest='folds_files',
                        metavar='<train1 test1 train2 test2... >',
                        default=None,
                        help='A list of custom train and test files. ' +
                        'Ignored if -load-builtin or -load-custom is set. ' +
                        'The -reader parameter needs to be set.')

    parser.add_argument('-reader', type=str,
                        metavar='<reader>',
                        default=None,
                        help='A Reader to read the custom dataset. ' +
                        'Example: "Reader(line_format=\'user item rating ' +
                        'timestamp\', sep=\'\\t\')"')

    parser.add_argument('-n-folds', type=int, dest='n_folds',
                        metavar="<number of folds>",
                        default=5,
                        help='The number of folds for cross-validation. ' +
                        'Default is 5.')

    parser.add_argument('-seed', type=int,
                        metavar='<random seed>',
                        default=None,
                        help='The seed to use for RNG. ' +
                        'Default is the current system time.')

    parser.add_argument('--with-dump', dest='with_dump', action='store_true',
                        help='Dump the algorithm results in a file ' +
                        '(one file per fold). Default is False.')

    parser.add_argument('-dump-dir', dest='dump_dir', type=str,
                        metavar='<dir>',
                        default=None,
                        help='Where to dump the files. Ignored if ' +
                        'with-dump is not set. Default is ' +
                        os.path.join(get_dataset_dir(), 'dumps/'))

    parser.add_argument('--clean', dest='clean', action='store_true',
                        help='Remove the ' + get_dataset_dir() +
                        ' directory and exit.'
                        )

    parser.add_argument('-v', '--version', action='version',
                        version=__version__)

    args = parser.parse_args()

    if args.clean:
        folder = get_dataset_dir()
        shutil.rmtree(folder)
        print('Removed', folder)
        exit()

    # Set up RNG.
    rd.seed(args.seed)
    np.random.seed(args.seed)

    # Set up the algorithm.
    params = eval(args.params)
    if args.algo is None:
        parser.error('No algorithm was specified.')
    algo = algo_choices[args.algo](**params)

    # Set up the dataset.
    if args.load_custom is not None:
        # Load the custom dataset and split it.
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        data = Dataset.load_from_file(args.load_custom, reader=reader)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)
    elif args.folds_files is not None:
        # Load from files.
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        folds_files = args.folds_files.split()
        folds_files = [(folds_files[i], folds_files[i + 1])
                       for i in range(0, len(folds_files) - 1, 2)]
        data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
        cv = PredefinedKFold()
    else:
        # Load a built-in dataset and split it.
        data = Dataset.load_builtin(args.load_builtin)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    cross_validate(algo, data, cv=cv, verbose=True)
def test_NMF_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = NMF(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = NMF(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # n_epochs
    algo = NMF(n_factors=1, n_epochs=2, random_state=1)
    rmse_n_epochs = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_epochs

    # biased
    algo = NMF(n_factors=1, n_epochs=1, biased=True, random_state=1)
    rmse_biased = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_biased

    # reg_pu
    algo = NMF(n_factors=1, n_epochs=1, reg_pu=1, random_state=1)
    rmse_reg_pu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_pu

    # reg_qi
    algo = NMF(n_factors=1, n_epochs=1, reg_qi=1, random_state=1)
    rmse_reg_qi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_qi

    # reg_bu
    algo = NMF(n_factors=1, n_epochs=1, reg_bu=1, biased=True, random_state=1)
    rmse_reg_bu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_bu

    # reg_bi
    algo = NMF(n_factors=1, n_epochs=1, reg_bi=1, biased=True, random_state=1)
    rmse_reg_bi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_bi

    # lr_bu
    algo = NMF(n_factors=1, n_epochs=1, lr_bu=1, biased=True, random_state=1)
    rmse_lr_bu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_bu

    # lr_bi
    algo = NMF(n_factors=1, n_epochs=1, lr_bi=1, biased=True, random_state=1)
    rmse_lr_bi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_bi

    # init_low
    algo = NMF(n_factors=1, n_epochs=1, init_low=.5, random_state=1)
    rmse_init_low = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_init_low

    # init_low
    with pytest.raises(ValueError):
        algo = NMF(n_factors=1, n_epochs=1, init_low=-1, random_state=1)

    # init_high
    algo = NMF(n_factors=1, n_epochs=1, init_high=.5, random_state=1)
    rmse_init_high = cross_validate(algo, u1_ml100k, ['rmse'],
                                    pkf)['test_rmse']
    assert rmse_default != rmse_init_high
import numpy as np

from surprise import AlgoBase
from surprise import Dataset
from surprise.model_selection import cross_validate


class MyOwnAlgorithm(AlgoBase):

    def __init__(self):
        # Always call the base method before doing anything.
        AlgoBase.__init__(self)

    def estimate(self, u, i):
        # Estimate as the average of the global mean, the user's mean
        # rating, and the item's mean rating (when known).
        sum_means = self.trainset.global_mean
        div = 1

        if self.trainset.knows_user(u):
            sum_means += np.mean([r for (_, r) in self.trainset.ur[u]])
            div += 1
        if self.trainset.knows_item(i):
            sum_means += np.mean([r for (_, r) in self.trainset.ir[i]])
            div += 1

        return sum_means / div


data = Dataset.load_builtin('ml-100k')
algo = MyOwnAlgorithm()

cross_validate(algo, data, verbose=True)