def try_recom_algorithm_grid(data, algo, filename, grid_options, n_splits=5):
    """
    Tune a Surprise recommendation algorithm with grid search and save the
    best RMSE result to a text file.

    :param data: input data containing user, item, rating and timestamp(opt)
    :param algo: the recom. algorithm to be tuned
    :param filename: base name (without extension) of the results file,
        written under ``../results_surprise_163K/``
    :param grid_options: dictionary containing the candidate values for each
        hyperparameter
    :param n_splits: number of folds for the cross validation
    :return: None
    """
    print("\nWorking on " + filename + "\n")

    # The file is still opened before the search so that a bad path fails
    # immediately; the context manager guarantees the handle is closed even
    # when the grid search raises.
    with open("../results_surprise_163K/" + filename + ".txt", "w+") as results_file:
        # grid search cross validation over the given grid options
        gs = GridSearchCV(algo, grid_options, measures=['rmse', 'mae'],
                          cv=n_splits)
        gs.fit(data)

        # best RMSE score
        best_rmse = gs.best_score['rmse']
        print(best_rmse)
        results_file.write("RMSE: %f" % (best_rmse))

        # combination of parameters that gave the best RMSE score
        best_params = gs.best_params['rmse']
        print(best_params)
        results_file.write("Best params:")
        results_file.write(str(best_params))
def tune_and_find_parameter(self, algo_name, algo, rating_data, param_grid):
    """
    Grid-search the hyperparameters of an algorithm with GridSearchCV.

    GridSearchCV (from the Surprise documentation) computes accuracy metrics
    for an algorithm on various combinations of parameters, over a
    cross-validation procedure.

    Args:
        algo_name: name of the algorithm, used only in the printed output
        algo: the algorithm handed to GridSearchCV
        rating_data: the whole dataset
        param_grid: candidate values for each hyperparameter

    Returns:
        The parameter combination that achieved the best RMSE.
    """
    print("tuning for", algo_name, "hyperparameters")

    search = GridSearchCV(algo, param_grid, measures=['rmse', 'mae'])
    search.fit(rating_data)

    rmse_winner = search.best_params['rmse']
    print('best RMSE for ', algo_name, ' ', search.best_score['rmse'])
    print("best params:", rmse_winner)
    return rmse_winner
def perform_grid_search_with_cv(self, train_set):
    """
    Grid-search KNNWithMeans hyper parameters with cross validation.

    The best RMSE and the matching parameter combination are printed and
    written to ``self.LOG_HANDLE``. Nothing happens for a falsy train set.

    :param train_set: The train set
    :return: None
    """
    if not train_set:
        return

    announcement = "Running grid search to find optimal hyper parameters"
    print(announcement)
    self.LOG_HANDLE.info(announcement)

    # item-based similarities only ('user_based' is fixed to False)
    knn_grid = {
        'k': [30, 40, 50],
        'min_k': [1, 3, 5],
        'sim_options': {
            'name': ['cosine', 'pearson', 'msd'],
            'user_based': [False],
        },
    }
    search = GridSearchCV(
        KNNWithMeans,
        knn_grid,
        measures=model_params.all_models_training_error_measures,
        cv=model_params.cross_validation_folds,
    )
    search.fit(train_set)

    # best RMSE score
    best_rmse = search.best_score['rmse']
    print("Best RMSE after CV: ")
    print(best_rmse)
    self.LOG_HANDLE.info(best_rmse)

    # combination of parameters that gave the best RMSE score
    best_combo = search.best_params['rmse']
    print("Best parameters after CV: ")
    print(best_combo)
    self.LOG_HANDLE.info(best_combo)
def test():
    """
    Run a large SVD grid search on the sorted test interactions.

    Seeds ``random`` and ``numpy.random`` with 0, searches the grid with
    5-fold cross validation (``n_jobs=-1``), prints the best RMSE, the best
    parameters and the full CV results, then stores the CV results via
    ``add_results_to_database`` under the name "svd".
    """
    seed = 0
    random.seed(seed)
    np.random.seed(seed)

    param_grid: Dict[str, List[object]] = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 20, 50],
        'biased': [True, False],
        'init_mean': [0, 0.1, 0.5],
        'init_std_dev': [0, 0.1, 0.5],
        'lr_all': [0.001, 0.005, 0.01],
        'reg_all': [0.01, 0.02, 0.05],
        'random_state': [None],
        'verbose': [True],
    }
    searcher = GridSearchCV(algo_class=SVD, param_grid=param_grid,
                            measures=['rmse'], cv=KFold(5), n_jobs=-1)

    parsed = Parser.parse(load_sorted_test_interactions())
    searcher.fit(parsed.whole_data_set)

    for report in (searcher.best_score['rmse'],
                   searcher.best_params['rmse'],
                   searcher.cv_results):
        print(report)

    add_results_to_database(searcher.cv_results, "svd", cls=NumpyEncoder)
def test_gridsearchcv_same_splits():
    """Ensure that all parameter combinations are tested on the same splits
    (we check their RMSE scores are the same once averaged over the splits,
    which should be enough). We use as much parallelism as possible."""
    here = os.path.dirname(__file__)
    data = Dataset.load_from_file(os.path.join(here, './u1_ml100k_test'),
                                  reader=Reader('ml-100k'),
                                  rating_scale=(1, 5))
    kf = KFold(3, shuffle=True, random_state=4)

    # every combination is identical, so every RMSE must be identical too
    param_grid = {'n_epochs': [5], 'lr_all': [.2, .2], 'reg_all': [.4, .4],
                  'n_factors': [5], 'random_state': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=kf, n_jobs=1)

    gs.fit(data)
    rmse_scores = list(gs.cv_results['mean_test_rmse'])
    assert len(set(rmse_scores)) == 1  # all scores are equal

    # Note: even with random_state=None in kf, a single fit uses the same
    # folds for every combination because of product(param_comb,
    # kf.split(...)). The fixed random_state is what keeps the folds the
    # same when fit is called a second time:
    gs.fit(data)
    rmse_scores.extend(gs.cv_results['mean_test_rmse'])
    assert len(set(rmse_scores)) == 1  # still all equal across both fits
def do_grid_search(data):
    """
    Grid-search SVD hyperparameters and return the best-RMSE estimator.

    :param data: ratings handed to ``data_handler.get_data_from_df`` to build
        the dataset the search is fitted on
    :return: ``gs.best_estimator['rmse']``. The search is built without
        ``refit``, so per the Surprise docs this estimator is presumably not
        fitted yet; the caller is expected to fit it.
    """
    print("Doing gridsearch for best model.")
    param_grid = {
        'n_epochs': [10, 20, 30],
        'n_factors': [100, 150, 200],
        # each learning rate is listed once; a repeated value only re-runs
        # the same combinations on every fold
        'lr_all': [0.001, 0.0025, 0.005],
        'reg_all': [0.2, 0.4, 0.6],
    }
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5,
                      joblib_verbose=5, n_jobs=-1)
    gs.fit(data_handler.get_data_from_df(data))

    # best RMSE score
    print(gs.best_score['rmse'])
    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # the algorithm that yields the best rmse
    return gs.best_estimator['rmse']
def param_selection(self):
    """
    Select SVD parameters by 3-fold cross-validated grid search on
    ``self.trainset``.

    Prints the best parameters and scores for both RMSE and MAE, and stores
    ``n_factors`` and ``reg_all`` on the instance.

    :return: None
    """
    candidates = {'n_factors': [20, 50, 100], 'reg_all': [0.04, 0.05]}
    search = GridSearchCV(SVD, candidates, measures=['rmse', 'mae'], cv=3)
    search.fit(self.trainset)

    print("Best parameters using RMSE:")
    print(search.best_params['rmse'])
    print()

    # NOTE(review): the values kept on the instance are the MAE-optimal
    # ones, not the RMSE-optimal ones printed just above.
    mae_winner = search.best_params['mae']
    self.n_factors = mae_winner.get('n_factors')
    self.reg_all = mae_winner.get('reg_all')

    print("Best score using RMSE:")
    print(search.best_score['rmse'])
    print()

    print("Best parameters using MAE:")
    print(mae_winner)
    print()

    print("Best score using MAE:")
    print(search.best_score['mae'])
    print()
def KNN_Tester(trainset, testset, algo):
    """
    Tune a KNN-style algorithm with grid search, then evaluate it.

    The best-RMSE ``k`` and ``sim_options`` found for ``algo`` are used to
    build a fresh ``algo`` model, which is fitted on ``trainset`` and scored
    on ``testset``. The tuned model is built from ``algo`` itself (not a
    hard-coded ``KNNBasic``) so the reported metrics belong to the algorithm
    that was actually tuned.

    :param trainset: trainset the tuned model is fitted on
    :param testset: testset the tuned model is evaluated on
    :param algo: algorithm class to tune; it must accept ``k`` and
        ``sim_options``
    :return: dict with 'rmse', 'avg_precision', 'avg_recall' and
        'best_parameters'
    """
    param_grid = {
        'k': [50, 100],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
        },
    }
    gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=5)
    # NOTE(review): `data` is not a parameter of this function; it is taken
    # from the enclosing module scope.
    gs.fit(data)
    params = gs.best_params['rmse']

    model = algo(k=params['k'], sim_options=params['sim_options'])
    model.fit(trainset)
    predictions = model.test(testset)

    rmse = accuracy.rmse(predictions)
    precisions, recalls = precision_recall_at_k(predictions, k=10,
                                                threshold=4)
    # averages of the per-key precision / recall values
    avg_precision = sum(precisions.values()) / len(precisions)
    avg_recall = sum(recalls.values()) / len(recalls)

    return {
        'rmse': rmse,
        'avg_precision': avg_precision,
        'avg_recall': avg_recall,
        'best_parameters': params,
    }
def grid_search(self):
    """
    Grid-search KNNWithMeans on ``self.model_data`` with 3-fold CV.

    Prints the best score and parameters for RMSE and MAE, applies the
    RMSE-optimal parameters through ``self.set_model_params`` and returns
    them.

    :return: the parameter combination with the best RMSE
    """
    print('grid search...')
    sim_options = {
        "name": ["msd", "cosine"],
        "min_support": [3, 4],
        "user_based": [False],
    }
    param_grid = {
        "sim_options": sim_options,
        "k": [50, 100, 200],
        "min_k": [1],
    }
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"],
                      cv=3)
    gs.fit(self.model_data)

    best_params, best_score = gs.best_params["rmse"], gs.best_score["rmse"]
    print(f'Best score (RMSE): {best_score}')
    print(f'Best params (RMSE): {best_params}')
    print(f'Best score (MAE): {gs.best_score["mae"]}')
    # this line reports the MAE-optimal parameters, so it is labelled MAE
    print(f'Best params (MAE): {gs.best_params["mae"]}')

    self.set_model_params(best_params)
    return best_params
def BaselineOnly_als():
    """
    Grid-search the ALS options of BaselineOnly and export the outcome.

    Runs a 10-fold search on the module-level ``data``, prints the best RMSE
    and parameters, writes the best ``bsl_options`` to 'BaselineOnly.pkl'
    with joblib and dumps the search object to 'BaselineOnly'.
    """
    print('Testing BaselineOnly als parameters')

    als_grid = {
        'bsl_options': {
            'method': ['als'],
            'reg_i': [7, 6.9, 7.1],
            'reg_u': [7, 6.9, 7.1],
        },
    }
    search = GridSearchCV(BaselineOnly, als_grid, measures=['rmse'], cv=10,
                          n_jobs=-2, refit=True)
    search.fit(data)

    # best RMSE score, then the parameter combination that produced it
    print(search.best_score['rmse'])
    winner = search.best_params['rmse']
    print(winner)

    # export model
    joblib.dump(winner['bsl_options'], 'BaselineOnly.pkl', compress=1)
    dump.dump('BaselineOnly', algo=search)
def BaselineOnly_sgd():
    """
    Grid-search the SGD options of BaselineOnly and export the outcome.

    Runs a 10-fold search on the module-level ``data``, prints the best RMSE
    and parameters, writes the best ``bsl_options`` to 'BaselineOnly.pkl'
    with joblib and dumps the search object to 'BaselineOnly'.
    """
    print('Testing BaselineOnly sgd parameters')

    sgd_grid = {
        'bsl_options': {
            'method': ['sgd'],
            'learning_rate': [0.00643, 0.00646, 0.00649],
            'n_epochs': [43, 44, 45, 46, 47],
        },
    }
    search = GridSearchCV(BaselineOnly, sgd_grid, measures=['rmse'], cv=10,
                          n_jobs=-2, refit=True)
    search.fit(data)

    # best RMSE score, then the parameter combination that produced it
    print(search.best_score['rmse'])
    winner = search.best_params['rmse']
    print(winner)

    # export model
    joblib.dump(winner['bsl_options'], 'BaselineOnly.pkl', compress=1)
    dump.dump('BaselineOnly', algo=search)
def SVD_alg():
    """
    Grid-search SVD hyperparameters and export the outcome.

    Runs a 10-fold search on the module-level ``data``, prints the best RMSE
    and parameters, writes the best parameters to 'SVD.pkl' with joblib and
    dumps the search object to 'SVD'.
    """
    print('Testing SVD parameters')

    svd_grid = {
        'n_epochs': [12, 13],
        'lr_all': [0.0013, 0.0015],
        'reg_all': [0.05, 0.06],
    }
    search = GridSearchCV(SVD, svd_grid, measures=['rmse'], cv=10,
                          n_jobs=-2, refit=True)
    # runs fit for all parameter combinations over the splits given by cv
    search.fit(data)

    # best RMSE score, then the parameter combination that produced it
    print(search.best_score['rmse'])
    winner = search.best_params['rmse']
    print(winner)

    # export model
    joblib.dump(winner, 'SVD.pkl', compress=1)
    dump.dump('SVD', algo=search)
def get_surprise_knn_item_model(data, trainset, testset,
                                model_train_evaluation,
                                model_test_evaluation, error_table):
    """
    Tune and evaluate an item-based KNNBaseline model.

    A 3-fold grid search on ``data`` picks ``k`` and the similarity
    ``shrinkage``; a KNNBaseline with those values is then run through
    ``run_surprise`` and its results are stored under "KNNBaseline_Item".

    :param data: dataset used for the grid search
    :param trainset: trainset passed to ``run_surprise``
    :param testset: testset passed to ``run_surprise``
    :param model_train_evaluation: dict updated with the train result
    :param model_test_evaluation: dict updated with the test result
    :param error_table: error table passed through ``run_surprise``
    :return: (model_train_evaluation, model_test_evaluation, error_table)
    """
    param_grid = {
        'sim_options': {
            'name': ["pearson_baseline"],
            "user_based": [False],
            "min_support": [2],
            # each value listed once: a repeated shrinkage only re-runs
            # identical fits on every fold
            "shrinkage": [60, 80, 140],
        },
        'k': [5, 20, 40, 80],
    }
    gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'],
                      cv=3)
    gs.fit(data)
    st.write("GRIDSEARCH best scores", gs.best_score['rmse'])
    st.write("GRIDSEARCH best parameters", gs.best_params['rmse'])

    best = gs.best_params['rmse']
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': False,
        'min_support': 2,
        'shrinkage': best['sim_options']['shrinkage'],
    }
    # NOTE(review): the final model uses SGD baselines although the grid
    # above did not set bsl_options - confirm this is intended.
    bsl_options = {'method': 'sgd'}
    algo = KNNBaseline(k=best['k'], sim_options=sim_options,
                       bsl_options=bsl_options)

    train_result, test_result, error_table = run_surprise(
        algo, trainset, testset, "KNNBaseline_Item", error_table)
    model_train_evaluation["KNNBaseline_Item"] = train_result
    model_test_evaluation["KNNBaseline_Item"] = test_result
    return model_train_evaluation, model_test_evaluation, error_table
def grid_search(data):
    """
    This function was originally used to perform grid search on the
    different algorithms.

    In its current form it runs a 3-fold grid search for NMF over
    ``n_epochs`` and ``n_factors`` and prints the best RMSE together with
    the parameters that produced it.
    """
    # An earlier KNNWithMeans search (sim_options only) is disabled.

    # --------------- NMF --------------
    nmf_grid = {
        "n_epochs": [5, 10, 20],
        "n_factors": [10, 15, 30],
    }
    search = GridSearchCV(NMF, nmf_grid, measures=["rmse"], cv=3)
    search.fit(data)

    for outcome in (search.best_score["rmse"], search.best_params["rmse"]):
        print(outcome)
def load_data():
    """
    Load the built-in ml-100k dataset, fit an item-based KNNWithMeans on the
    full trainset, then grid-search SVD and print its best RMSE.

    Nothing is returned; the fitted KNN model is not used afterwards.
    """
    data = Dataset.load_builtin('ml-100k')

    # item-based KNN with mean squared difference similarity
    knn = KNNWithMeans(sim_options={"name": "msd", "user_based": False})
    knn.fit(data.build_full_trainset())

    # grid search, matrix factorization
    svd_grid = {
        "n_epochs": [5, 10],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6],
    }
    print("Divide matrix in grids")
    search = GridSearchCV(SVD, param_grid=svd_grid, measures=["rmse"], cv=3)
    search.fit(data)
    print(search.best_score['rmse'])
def modelo_svd_best_n(data):
    """
    Tune SVD on the given ratings and predict every unseen user/item pair.

    :param data: DataFrame with 'userid', 'businessid' and
        'mean_by_business' columns (used as user, item and rating)
    :return: predictions of the tuned SVD on the anti-testset of the full
        trainset
    """
    reader = Reader(rating_scale=(1, 5))
    dataset = Dataset.load_from_df(
        data[['userid', 'businessid', 'mean_by_business']], reader)

    # only n_factors and n_epochs are searched; candidate lists for lr_all
    # and reg_all were commented out of the grid
    search_space = {
        'n_factors': [5, 20, 50, 100],
        'n_epochs': [100, 200, 300],
    }
    search = Gridsearch_svd(SVD, search_space, measures=['rmse'], cv=5,
                            n_jobs=5)
    search.fit(dataset)

    # combination of parameters that gave the best RMSE score
    winner = search.best_params['rmse']

    # predictions with the best parameters, trained on everything
    full_trainset = dataset.build_full_trainset()
    model = SVD(n_factors=winner['n_factors'], n_epochs=winner['n_epochs'])
    model.fit(full_trainset)
    return model.test(full_trainset.build_anti_testset())
def find_best_params(data_set, cv=3, param_grid=None):
    """
    Grid-search SVD hyperparameters and log the outcome.

    :param data_set: dataset the grid search is fitted on
    :param cv: cross-validation setting forwarded to GridSearchCV
    :param param_grid: candidate values per hyperparameter; a built-in grid
        over n_factors, n_epochs, lr_all and reg_all is used when None
    :return: tuple (best params for RMSE, best params for MAE)
    """
    param_grid = {
        'n_factors': [10, 30, 50],
        'n_epochs': [10, 30, 50],
        'lr_all': [0.002, 0.005, 0.008, 0.01],
        'reg_all': [0.2, 0.4, 0.6, 0.8],
    } if param_grid is None else param_grid

    log.info(f'Performing Grid Search: {param_grid}')
    search = GridSearchCV(SVD, param_grid=param_grid,
                          measures=['rmse', 'mae'], cv=cv, n_jobs=4,
                          joblib_verbose=2)

    # wall-clock duration of the fit only
    started = time.time()
    search.fit(data_set)
    elapsed = time.time() - started
    log.info(f'Time spend on Grid Search: {elapsed}')

    rmse_params = search.best_params['rmse']
    mae_params = search.best_params['mae']
    log.info(
        f"Best RMSE score: {search.best_score['rmse']} with params: {rmse_params}"
    )
    log.info(
        f"Best MAE score: {search.best_score['mae']} with params: {mae_params}"
    )
    return rmse_params, mae_params
def __init__(self, data, score_index, user_index, items_index):
    """
    Tune an SVD model on the given ratings and fit it on all of them.

    :param data: DataFrame holding the ratings
    :param score_index: column of ``data`` holding the rating; its min and
        max define the rating scale
    :param user_index: column of ``data`` identifying the user
    :param items_index: column of ``data`` identifying the item
    """
    self.data = data
    self.user_index = user_index
    self.items_index = items_index

    scores = data[score_index]
    reader = Reader(rating_scale=(scores.min(), scores.max()))
    dataset = Dataset.load_from_df(
        data[[user_index, items_index, score_index]], reader)

    search_space = {
        'n_factors': [50, 100, 150],
        'n_epochs': [25, 50, 75],
        'lr_all': [0.005, 0.01],
        'reg_all': [0.02, 0.1, 0.5],
    }
    search = GridSearchCV(SVD, search_space, measures=['rmse'], cv=3)
    search.fit(dataset)

    # the best-RMSE combination holds exactly the four searched keys
    self.model = SVD(**search.best_params['rmse'])
    self.model.fit(dataset.build_full_trainset())
def tuneHyperParams(algtype, trainset, testset, df, param_grid):
    """
    Tune hyper parameters for Surprise library models via grid search.

    Args:
        algtype (surprise.prediction_algorithms): type of the surprise
            algorithm
        trainset: not used by this function
        testset: not used by this function
        df (pandas.DataFrame): ratings with 'User', 'Movie' and 'Rating'
            columns
        param_grid: parameters to try

    Returns:
        surprise.GridSearchCV: the fitted grid search (3-fold CV, RMSE);
        read ``best_score['rmse']`` / ``best_params['rmse']`` from it
    """
    ratings = Dataset.load_from_df(df[['User', 'Movie', 'Rating']],
                                   Reader(rating_scale=(1, 5)))

    search = GridSearchCV(algtype, param_grid, measures=['rmse'], cv=3)
    search.fit(ratings)
    return search
def test_best_estimator():
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""
    here = os.path.dirname(__file__)
    folds = [(os.path.join(here, './u1_ml100k_train'),
              os.path.join(here, './u1_ml100k_test'))]
    data = Dataset.load_from_folds(folds, Reader('ml-100k'))

    param_grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0],
    }
    gs = GridSearchCV(SVD, param_grid, measures=['mae'],
                      cv=PredefinedKFold(), joblib_verbose=100)
    gs.fit(data)

    # recompute the MAE of the best estimator on the same predefined fold
    rerun = cross_validate(gs.best_estimator['mae'], data, measures=['MAE'],
                           cv=PredefinedKFold())
    assert rerun['test_mae'] == gs.best_score['mae']
def tune_and_find_param(self, algo_name, algo, rating_data, param_grid=None):
    """
    Grid-search the hyperparameters of an algorithm with GridSearchCV.

    GridSearchCV (from the Surprise documentation) computes accuracy metrics
    for an algorithm on various combinations of parameters, over a
    cross-validation procedure.

    Args:
        algo_name: name of the algorithm, used only in the printed output
        algo: the algorithm handed to GridSearchCV
        rating_data: the dataset the search is fitted on
        param_grid: candidate values per hyperparameter. When None, a grid
            over n_factors [50, 100], n_epochs [20, 30] and lr_all
            [0.005, 0.010] is used.

    Returns:
        The parameter combination that achieved the best RMSE.
    """
    # The default grid is built per call: a dict literal in the signature
    # would be one object shared by every call.
    if param_grid is None:
        param_grid = {
            'n_factors': [50, 100],
            'n_epochs': [20, 30],
            'lr_all': [0.005, 0.010],
        }

    print("tuning for", algo_name, "hyperparameters")

    grid_search = GridSearchCV(algo, param_grid, measures=['rmse', 'mae'])
    grid_search.fit(rating_data)

    # print the best RMSE
    print('best RMSE for ', algo_name, ' ', grid_search.best_score['rmse'])
    best_params = grid_search.best_params['rmse']
    # print the best set of parameters
    print("best params:", best_params)
    return best_params
def grid_search():
    """
    grid search template

    Sweeps ``reg_all`` for SVD (other parameters fixed) with 6-fold CV on the
    module-level ``data``, prints the best RMSE and parameters, and plots
    mean test RMSE against ``reg_all``.
    """
    # The friend graph and its normalized Laplacian are computed here, but
    # the grid entry that would consume them ('L') is disabled.
    friend_graph = gsp.graphs.Graph(dd.build_friend_friend())
    friend_graph.compute_laplacian('normalized')

    # Set grid parameters: only reg_all actually varies
    search_space = dict(
        n_factors=[5],
        n_epochs=[30],
        lr_all=[1.e-3],
        reg_all=np.logspace(-6, -1, 20),
    )

    # Init grid search
    searcher = GridSearchCV(SVD, search_space, measures=['rmse'], cv=6,
                            n_jobs=1, joblib_verbose=10000)
    searcher.fit(data)

    # Print best score and best parameters
    print('Best Score: ', searcher.best_score['rmse'])
    print('Best parameters: ', searcher.best_params['rmse'])

    # Plot RMSE
    outcome = searcher.cv_results
    plt.plot(outcome['param_reg_all'], outcome['mean_test_rmse'])
def grid():
    """
    Tune SVDpp on the first 90% of the ratings (set A) and report accuracy
    on A (biased) and on the remaining 10% (set B, unbiased).

    Works on the module-level ``data`` and replaces its ``raw_ratings`` with
    set A.
    """
    all_ratings = data.raw_ratings
    cut = int(.9 * len(all_ratings))
    held_out = all_ratings[cut:]          # set B
    data.raw_ratings = all_ratings[:cut]  # data is now the set A

    search = GridSearchCV(SVDpp,
                          {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]},
                          measures=['rmse'], cv=3)
    search.fit(data)
    best_algo = search.best_estimator['rmse']

    # retrain on the whole set A
    full_a = data.build_full_trainset()
    best_algo.fit(full_a)

    # Compute biased accuracy on A
    print('Biased accuracy on A,', end=' ')
    accuracy.rmse(best_algo.test(full_a.build_testset()))

    # Compute unbiased accuracy on B
    print('Unbiased accuracy on B,', end=' ')
    accuracy.rmse(best_algo.test(data.construct_testset(held_out)))
def test_gridsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""
    search_space = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0],
    }
    search = GridSearchCV(SVD, search_space, measures=['mae'],
                          cv=PredefinedKFold(), joblib_verbose=100)
    search.fit(u1_ml100k)

    # recompute the MAE of the best estimator on the same predefined fold
    rerun = cross_validate(search.best_estimator['mae'], u1_ml100k,
                           measures=['MAE'], cv=PredefinedKFold())
    assert rerun['test_mae'] == search.best_score['mae']
def tune(self, opt_field='rmse', param_grid=None, SHOW_RESULT=False):
    """
    Grid-search the hyperparameters of the configured algorithm and keep
    the best estimator.

    The search runs on ``self.data`` with 3-fold CV; the estimator that is
    best for ``opt_field`` is stored in ``self.algo`` and fitted on
    ``self.trainset``.

    :param opt_field: measure used to pick the best estimator
        ('rmse' or 'mae')
    :param param_grid: candidate values per hyperparameter. When None, a
        grid over n_epochs [5, 10], lr_all [0.002, 0.005] and reg_all
        [0.4, 0.6] is used.
    :param SHOW_RESULT: when true, print the best RMSE score and the
        parameters that produced it
    :return: self
    :raises ValueError: if ``self.algorithm`` is not 'svd' (the only
        algorithm handled here)
    """
    # The default grid is built per call: a dict literal in the signature
    # would be one object shared by every call.
    if param_grid is None:
        param_grid = {
            'n_epochs': [5, 10],
            'lr_all': [0.002, 0.005],
            'reg_all': [0.4, 0.6],
        }

    # Fail with a clear message instead of hitting an unbound `gs` below.
    if self.algorithm != 'svd':
        raise ValueError(
            "tune() only supports the 'svd' algorithm, got %r"
            % (self.algorithm,))
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

    # Start tuning
    gs.fit(self.data)

    # Save to self.algo
    self.algo = gs.best_estimator[opt_field]
    self.algo.fit(self.trainset)

    if SHOW_RESULT:
        # best RMSE score
        print(gs.best_score['rmse'])
        # combination of parameters that gave the best RMSE score
        print(gs.best_params['rmse'])
    return self
def perform_grid_search_with_cv(self, train_set):
    """
    Grid-search SVDpp hyper parameters with cross validation.

    The best RMSE and the matching parameter combination are printed and
    written to ``self.LOG_HANDLE``. Nothing happens for a falsy train set.

    :param train_set: The train set
    :return: None
    """
    if not train_set:
        return

    announcement = "Running grid search to find optimal hyper parameters"
    print(announcement)
    self.LOG_HANDLE.info(announcement)

    svdpp_grid = {
        'n_epochs': [10, 20, 30],
        'lr_all': [0.005, 0.006, 0.007, 0.008],
        'reg_all': [0.01, 0.02, 0.03, 0.2],
    }
    search = GridSearchCV(
        SVDpp,
        svdpp_grid,
        measures=model_params.all_models_training_error_measures,
        cv=model_params.cross_validation_folds,
    )
    search.fit(train_set)

    # best RMSE score
    best_rmse = search.best_score['rmse']
    print(best_rmse)
    self.LOG_HANDLE.info(best_rmse)

    # combination of parameters that gave the best RMSE score
    best_combo = search.best_params['rmse']
    print(best_combo)
    self.LOG_HANDLE.info(best_combo)
def gridsearch(data, algo, param_grid):
    """
    Run a 3-fold grid search and print the best-RMSE parameters.

    :param data: dataset the search is fitted on
    :param algo: algorithm handed to GridSearchCV
    :param param_grid: candidate values per hyperparameter, e.g.
        {'n_factors': [50, 100, 150], 'n_epochs': [20, 30],
         'lr_all': [0.005, 0.01], 'reg_all': [0.02, 0.1]}
    :return: None (the result is only printed)
    """
    search = GridSearchCV(algo, param_grid, measures=['rmse'], cv=3)
    search.fit(data)
    print(search.best_params['rmse'])
def best_params(self):
    """
    Grid-search SVD on ``self.data`` with 3-fold CV.

    :return: tuple (best RMSE score, parameters that produced it)
    """
    search_space = {
        'n_factors': list(range(50, 500, 50)),
        'n_epochs': [10, 20, 50, 75, 100],
        'lr_all': [.001, .003, .005, .008],
    }
    search = GridSearchCV(SVD, search_space, measures=['rmse'], cv=3)
    search.fit(self.data)

    top_score = search.best_score['rmse']
    top_params = search.best_params['rmse']
    return top_score, top_params
def best_params():
    """
    Grid-search SVD on the reviews dataset and append the best RMSE and MAE
    results to './data/results.txt'.
    """
    # Reviews dataset to use
    file_path = os.path.expanduser('./data/reviews_stars.csv')
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Each candidate list is start + start * i, i.e. multiples of its start.
    # 15 factor counts in steps of 2
    n_factors_start = 2
    n_factors_values = [n_factors_start + (n_factors_start * i)
                        for i in range(15)]
    # 10 epoch counts in steps of 5
    n_epochs_start = 5
    n_epochs_values = [n_epochs_start + (n_epochs_start * i)
                       for i in range(10)]
    # 5 regularization terms in steps of 0.2
    reg_all_start = 0.2
    reg_all_values = [reg_all_start + (reg_all_start * i) for i in range(5)]
    # 5 learning rates in steps of 0.002
    lr_all_start = 0.002
    lr_all_values = [lr_all_start + (lr_all_start * i) for i in range(5)]

    # Parameter dictionary
    param_grid = {
        'n_factors': n_factors_values,
        'n_epochs': n_epochs_values,
        'lr_all': lr_all_values,
        'reg_all': reg_all_values,
        'biased': [True],
    }

    # Try the parameters using MAE and RMSE
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
    gs.fit(data)

    # Write the best-parameter results for RMSE and MAE to a file
    report = [
        'Score rmse: ' + str(gs.best_score['rmse']) + '\n',
        'Best parameters rmse: ' + str(gs.best_params['rmse']) + '\n',
        'Score mae: ' + str(gs.best_score['mae']) + '\n',
        'Best parameters mae: ' + str(gs.best_params['mae']) + '\n',
    ]
    with open('./data/results.txt', 'a') as file:
        file.writelines(report)
def recomendacion(usuario):
    """
    Recommend the subjects a user has not rated yet, best prediction first.

    An SVD model is tuned by grid search on every stored rating, refitted on
    the full trainset with the best-RMSE parameters, and used to predict the
    user's rating for each subject they have not rated.

    :param usuario: user to recommend for; its ``id`` is used to look up
        ratings and as the user id for predictions
    :return: list of dicts {'asignatura': subject, 'svd': estimated rating},
        sorted by estimated rating in descending order
    """
    # one [user, subject, rating] row per stored rating
    rows = [[rate.usuario_id, rate.asignatura_id, rate.calificacion]
            for rate in Calificacion.objects.all()]
    df = pd.DataFrame(data=rows)
    reader = Reader(rating_scale=(0, 10))
    data = Dataset.load_from_df(df, reader)
    trainingSet = data.build_full_trainset()

    param_grid = {
        'n_factors': [50, 100, 150],
        "n_epochs": [40, 50, 60],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6],
    }
    gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
    gs.fit(data)

    # optimal parameters
    params = gs.best_params["rmse"]
    SVDoptimized = SVD(n_factors=params['n_factors'],
                       n_epochs=params['n_epochs'],
                       lr_all=params['lr_all'],
                       reg_all=params['reg_all'])
    SVDoptimized.fit(trainingSet)

    # Codes of the subjects this user already rated, gathered once so each
    # subject needs a single set lookup rather than a scan of all ratings.
    rated_codes = {
        rating.asignatura_id
        for rating in Calificacion.objects.all().filter(usuario_id=usuario.id)
    }

    # recommended subjects: every subject without a rating from this user
    asignaturas_rec = [
        {
            'asignatura': asignatura,
            'svd': SVDoptimized.predict(usuario.id, asignatura.codigo).est,
        }
        for asignatura in Asignatura.objects.all()
        if asignatura.codigo not in rated_codes
    ]

    # highest estimated rating first
    asignaturas_rec.sort(reverse=True, key=lambda rec: rec['svd'])
    return asignaturas_rec
def test_gridsearchcv_cv_results():
    """Test the cv_results attribute"""
    f = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(f, Reader('ml-100k'), rating_scale=(1, 5))
    kf = KFold(3, shuffle=True, random_state=4)
    param_grid = {'n_epochs': [5], 'lr_all': [.2, .2], 'reg_all': [.4, .4],
                  'n_factors': [5], 'random_state': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['RMSE', 'mae'], cv=kf,
                      return_train_measures=True)
    gs.fit(data)

    results = gs.cv_results
    per_comb = (4,)  # 4 param comb.

    # keys split*_<phase>_rmse, plus their mean and std dev., for both the
    # test and the train measures
    for phase in ('test', 'train'):
        splits = [results['split%d_%s_rmse' % (i, phase)] for i in range(3)]
        for scores in splits:
            assert scores.shape == per_comb
        assert results['mean_%s_rmse' % phase].shape == per_comb
        assert np.allclose(results['mean_%s_rmse' % phase],
                           np.mean(splits, axis=0))
        assert np.allclose(results['std_%s_rmse' % phase],
                           np.std(splits, axis=0))

    # fit and test times dimensions
    for key in ('mean_fit_time', 'std_fit_time',
                'mean_test_time', 'std_test_time'):
        assert results[key].shape == per_comb

    assert results['params'] is gs.param_combinations

    # the best-ranked entry of cv_results['rank_test_<measure>'] is indeed
    # the best_params attribute
    for measure in ('rmse', 'mae'):
        best_index = np.argmin(results['rank_test_' + measure])
        assert results['params'][best_index] == gs.best_params[measure]
def _perform_grid_search(algo_class: AlgoBase, param_grid: Dict[str, Any],
                         dataset: Dataset,
                         random_state: int) -> pd.DataFrame:
    """
    Run a 5-fold grid search and return its results as a table.

    :param algo_class: algorithm handed to GridSearchCV
    :param param_grid: candidate values per hyperparameter
    :param dataset: dataset the search is fitted on
    :param random_state: seed passed to the KFold splitter
    :return: ``cv_results`` as a DataFrame sorted by 'rank_test_rmse'
    """
    folds = KFold(5, random_state=random_state)
    search = GridSearchCV(algo_class, param_grid,
                          measures=['rmse', 'mae', 'fcp'], cv=folds,
                          n_jobs=2, joblib_verbose=100, pre_dispatch=2)
    search.fit(dataset)

    results = pd.DataFrame.from_dict(search.cv_results)
    return results.sort_values('rank_test_rmse')
def test_gridsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""
    grid = dict([
        ('n_epochs', [5]),
        ('lr_all', [0.002, 0.005]),
        ('reg_all', [0.4, 0.6]),
        ('n_factors', [1]),
        ('init_std_dev', [0]),
    ])
    gs = GridSearchCV(SVD, grid, measures=['mae'], cv=PredefinedKFold(),
                      joblib_verbose=100)
    gs.fit(u1_ml100k)
    winner = gs.best_estimator['mae']

    # recompute the MAE of the winner on the same predefined fold
    recomputed = cross_validate(winner, u1_ml100k, measures=['MAE'],
                                cv=PredefinedKFold())['test_mae']
    assert recomputed == gs.best_score['mae']
parameter combination of a given algorithm. """
from __future__ import (absolute_import, division, print_function, unicode_literals)
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

# Use movielens-100K
data = Dataset.load_builtin('ml-100k')

# Candidate values for each SVD hyperparameter; GridSearchCV evaluates the
# combinations with 3-fold cross validation on both RMSE and MAE.
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# We can now use the algorithm that yields the best rmse:
# it is fitted here on the full trainset.
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

# Load the complete cross-validation results into a DataFrame for inspection.
import pandas as pd  # noqa
results_df = pd.DataFrame.from_dict(gs.cv_results)
def test_gridsearchcv_refit(u1_ml100k):
    """Test refit function of GridSearchCV."""
    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'),
                                  rating_scale=(1, 5))
    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [2]}

    def full_testset():
        # a testset made of every raw rating of the dataset
        return data.construct_testset(data.raw_ratings)

    # refit=True: gs.fit() and gs.test use the best estimator for mae (the
    # first measure listed); refit='rmse': they use the best one for rmse
    for refit, measure in ((True, 'mae'), ('rmse', 'rmse')):
        gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                          refit=refit)
        gs.fit(data)
        gs_preds = gs.test(full_testset())
        expected_preds = gs.best_estimator[measure].test(full_testset())
        assert gs_preds == expected_preds

    # test that predict() can be called (on the rmse-refitted search)
    gs.predict(2, 4)

    # test() and predict() cannot be used when refit is false
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=False)
    gs.fit(data)
    with pytest.raises(ValueError):
        gs_preds = gs.test(full_testset())
    with pytest.raises(ValueError):
        gs.predict('1', '2')

    # an error is raised if refit is used with load_from_folds data
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=True)
    with pytest.raises(ValueError):
        gs.fit(u1_ml100k)