Example #1
def test_gridsearchcv_refit(u1_ml100k):
    """Test refit function of GridSearchCV."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'))

    param_grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [2]
    }

    # assert gs.fit() and gs.test will use best estimator for mae (first
    # appearing in measures)
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae', 'rmse'],
                      cv=2,
                      refit=True)
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    mae_preds = gs.best_estimator['mae'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == mae_preds

    # assert gs.fit() and gs.test will use best estimator for rmse
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae', 'rmse'],
                      cv=2,
                      refit='rmse')
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    rmse_preds = gs.best_estimator['rmse'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == rmse_preds
    # test that predict() can be called
    gs.predict(2, 4)

    # assert test() and predict() cannot be used when refit is false
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae', 'rmse'],
                      cv=2,
                      refit=False)
    gs.fit(data)
    with pytest.raises(ValueError):
        gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    with pytest.raises(ValueError):
        gs.predict('1', '2')

    # test that error is raised if used with load_from_folds
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae', 'rmse'],
                      cv=2,
                      refit=True)
    with pytest.raises(ValueError):
        gs.fit(u1_ml100k)
def gridS(data):  # Find the best parameters for KNNWithMeans and SVDpp
    print('\nLet us check best parameters for KNN Means algorithm\n')
    options = {
        "name": ["msd", "cosine"],
        "min_support": [2, 3, 4, 5],
        "user_based": [False, True],
    }

    knn_grid = {"sim_options": options}
    knn = GridSearchCV(KNNWithMeans,
                       knn_grid,
                       measures=["rmse", "mae", "mse"],
                       cv=5,
                       n_jobs=5)
    knn.fit(data)
    print("\nKNN Means Analysis\n")
    print("\nRMSE: {}, MAE: {}, MSE: {}\n".format(knn.best_score["rmse"],
                                                  knn.best_score["mae"],
                                                  knn.best_score["mse"]))
    print("\nBest Combination of Parameters\n")
    print("\nRMSE: {}, MAE: {}, MSE: {}\n".format(knn.best_params["rmse"],
                                                  knn.best_params["mae"],
                                                  knn.best_params["mse"]))

    print('\nWe will see which options are best fit for SVDPP algorithm')
    svd_grid = {
        "n_epochs": [5, 10, 15, 20, 25],
        "lr_all": [0.002, 0.005, 0.008, 0.009],
        "reg_all": [0.4, 0.6, 0.8]
    }
    '''svd = GridSearchCV(SVD, svd_grid, measures=["rmse", "mae", "mse"], cv=5, n_jobs=5)
    svd.fit(data) 
    print("\nSVD Analysis\n")
    print("\nRMSE: {}, MAE: {}, MSE: {}\n".format(svd.best_score["rmse"],svd.best_score["mae"],svd.best_score["mse"]))
    print("\nBest Combination of Parameters\n")
    print("\nRMSE: {}, MAE: {}, MSE: {}\n".format(svd.best_params["rmse"],svd.best_params["mae"],svd.best_params["mse"]))'''

    svdpp = GridSearchCV(SVDpp,
                         svd_grid,
                         measures=["rmse", "mae", "mse"],
                         cv=5,
                         n_jobs=5)
    svdpp.fit(data)
    print("\nSVDpp Analysis\n")
    print("\nRMSE: {}, MAE: {}, MSE: {}\n".format(svdpp.best_score["rmse"],
                                                  svdpp.best_score["mae"],
                                                  svdpp.best_score["mse"]))
    print("\nBest Combination of Parameters\n")
    print("\nRMSE: {}, MAE: {}, MSE: {}\n".format(svdpp.best_params["rmse"],
                                                  svdpp.best_params["mae"],
                                                  svdpp.best_params["mse"]))

    print(
        "\nWe will train model based on parameter values best suited for RMSE reduction\n"
    )
    return [knn.best_params["rmse"]["sim_options"], svdpp.best_params["rmse"]]
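The function returns the best sim_options for KNNWithMeans together with the full best-parameter dict for SVDpp. A minimal sketch of how a caller might consume those return values, assuming data is an already-loaded surprise Dataset (the variable names below are illustrative):

from surprise import KNNWithMeans, SVDpp

# Hypothetical follow-up to gridS(); 'data' is assumed to be a loaded surprise Dataset.
best_sim_options, best_svdpp_params = gridS(data)
trainset = data.build_full_trainset()

# Rebuild both models with the tuned parameters and fit them on the full training set.
knn = KNNWithMeans(sim_options=best_sim_options)
knn.fit(trainset)

svdpp = SVDpp(**best_svdpp_params)
svdpp.fit(trainset)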
Example #3
def find_best_params(data_set, cv=3, param_grid=None):

    if param_grid is None:
        param_grid = {
            'n_factors': [10, 30, 50],
            'n_epochs': [10, 30, 50],
            'lr_all': [0.002, 0.005, 0.008, 0.01],
            'reg_all': [0.2, 0.4, 0.6, 0.8]
        }

    log.info(f'Performing Grid Search: {param_grid}')

    gs = GridSearchCV(SVD,
                      param_grid=param_grid,
                      measures=['rmse', 'mae'],
                      cv=cv,
                      n_jobs=4,
                      joblib_verbose=2)
    start_time = time.time()
    gs.fit(data_set)
    end_time = time.time()
    log.info(f'Time spent on Grid Search: {end_time - start_time}')

    log.info(
        f"Best RMSE score: {gs.best_score['rmse']} with params: {gs.best_params['rmse']}"
    )
    log.info(
        f"Best MAE score: {gs.best_score['mae']} with params: {gs.best_params['mae']}"
    )

    return gs.best_params['rmse'], gs.best_params['mae']
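A hedged sketch of how a caller might train a final model from the RMSE-optimal parameters returned above, assuming data_set is the same surprise Dataset that was searched:

from surprise import SVD

# Hypothetical follow-up: retrain SVD with the RMSE-optimal parameters on all data.
rmse_params, mae_params = find_best_params(data_set)
algo = SVD(**rmse_params)
algo.fit(data_set.build_full_trainset())

# Single prediction for raw user '196' and item '302'; the ids are illustrative.
print(algo.predict('196', '302').est)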
    def perform_grid_search_with_cv(self, train_set):
        """
        Perform grid search to get optimal parameters and get metrics after cross validation
        :param train_set: The train set
        :return: Different RMSE and MAE for the different hyper parameters
        """
        if train_set:
            print("Running grid search to find optimal hyper parameters")
            self.LOG_HANDLE.info(
                "Running grid search to find optimal hyper parameters")

            param_grid = {
                'n_epochs': [10, 20, 30],
                'lr_all': [0.005, 0.006, 0.007, 0.008],
                'reg_all': [0.01, 0.02, 0.03, 0.2]
            }
            gs = GridSearchCV(
                SVDpp,
                param_grid,
                measures=model_params.all_models_training_error_measures,
                cv=model_params.cross_validation_folds)
            gs.fit(train_set)

            # best RMSE score
            print(gs.best_score['rmse'])
            self.LOG_HANDLE.info(gs.best_score['rmse'])

            # combination of parameters that gave the best RMSE score
            print(gs.best_params['rmse'])
            self.LOG_HANDLE.info(gs.best_params['rmse'])
    def perform_grid_search_with_cv(self, train_set):
        """
        Perform grid search to get optimal parameters and get metrics after cross validation
        :param train_set: The train set
        :return: Different RMSE and MAE for the different hyper parameters
        """
        if train_set:
            print("Running grid search to find optimal hyper parameters")
            self.LOG_HANDLE.info(
                "Running grid search to find optimal hyper parameters")

            param_grid = {
                'k': [30, 40, 50],
                'min_k': [1, 3, 5],
                'sim_options': {
                    'name': ['cosine', 'pearson', 'msd'],
                    'user_based': [False]
                }
            }
            gs = GridSearchCV(
                KNNWithMeans,
                param_grid,
                measures=model_params.all_models_training_error_measures,
                cv=model_params.cross_validation_folds)
            gs.fit(train_set)

            # best RMSE score
            print("Best RMSE after CV: ")
            print(gs.best_score['rmse'])
            self.LOG_HANDLE.info(gs.best_score['rmse'])

            # combination of parameters that gave the best RMSE score
            print("Best parameters after CV: ")
            print(gs.best_params['rmse'])
            self.LOG_HANDLE.info(gs.best_params['rmse'])
Example #6
def KNN_Tester(trainset, testset, algo):
    param_grid = {
        'k': [50, 100],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson']
        }
    }

    gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=5)
    gs.fit(data)  # 'data' is assumed to be a module-level surprise Dataset
    params = gs.best_params['rmse']
    algo = KNNBasic(k=params['k'], sim_options=params['sim_options'])
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
    avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    metrics = {
        'rmse': rmse,
        'avg_precision': avg_precision,
        'avg_recall': avg_recall,
        'best_parameters': params
    }
    return metrics
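precision_recall_at_k is not defined in this snippet; a minimal sketch following the recipe from the Surprise FAQ, assuming relevance is defined by a rating threshold:

from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k for each user (Surprise FAQ recipe)."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Sort this user's predictions by estimated rating, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls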
Example #7
def BaselineOnly_sgd():
    print('Testing BaselineOnly sgd parameters')
    param_grid = {
        'bsl_options': {
            'method': ['sgd'],
            'learning_rate': [0.00643, 0.00646, 0.00649],
            'n_epochs': [43, 44, 45, 46, 47]
        }
    }
    gs = GridSearchCV(BaselineOnly,
                      param_grid,
                      measures=['rmse'],
                      cv=10,
                      n_jobs=-2,
                      refit=True)
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # Export the tuned bsl_options and the fitted grid search
    joblib.dump(gs.best_params['rmse']['bsl_options'],
                'BaselineOnly.pkl',
                compress=1)
    dump.dump('BaselineOnly', algo=gs)
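The exported artifacts can be reloaded later; a minimal sketch using the same file names, assuming the files exist on disk:

import joblib
from surprise import BaselineOnly, dump

# Rebuild the algorithm from the tuned baseline options saved with joblib.
bsl_options = joblib.load('BaselineOnly.pkl')
algo = BaselineOnly(bsl_options=bsl_options)

# dump.load returns a (predictions, algo) pair; algo here is the dumped GridSearchCV object.
_, loaded_gs = dump.load('BaselineOnly')
print(loaded_gs.best_params['rmse'])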
Example #8
def grid_search():
    """ grid search template """
    
    # Set Grid Parameters
    G = gsp.graphs.Graph(dd.build_friend_friend())
    G.compute_laplacian('normalized')
    param_grid = {
            #'L' : [G.L.todense()],
            'n_factors' : [5],
            'n_epochs' : [30],
            'lr_all' : [1.e-3],
            'reg_all' : np.logspace(-6,-1, 20),
            #'reg' : np.logspace(-6,-1,15)
    }
    
    # Init grid_search
    grid = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=6, n_jobs=1, joblib_verbose=10000)
    grid.fit(data)
    
    # Print best score and best parameters
    print('Best Score: ', grid.best_score['rmse'])
    print('Best parameters: ', grid.best_params['rmse'])
    
    # Plot RMSE
    plt.plot(grid.cv_results['param_reg_all'], grid.cv_results['mean_test_rmse'])
Example #9
def SVD_alg():
    print('Testing SVD parameters')
    param_grid = {
        'n_epochs': [12, 13],
        'lr_all': [0.0013, 0.0015],
        'reg_all': [0.05, 0.06]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['rmse'],
                      cv=10,
                      n_jobs=-2,
                      refit=True)

    # Run fit for every parameter combination over the cv splits
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # Export the best parameters and the fitted grid search
    joblib.dump(gs.best_params['rmse'], 'SVD.pkl', compress=1)
    dump.dump('SVD', algo=gs)
def get_surprise_knn_item_model(data, trainset, testset,
                                model_train_evaluation, model_test_evaluation,
                                error_table):
    param_grid = {
        'sim_options': {
            'name': ["pearson_baseline"],
            "user_based": [False],
            "min_support": [2],
            "shrinkage": [60, 80, 80, 140]
        },
        'k': [5, 20, 40, 80]
    }
    gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3)
    gs.fit(data)
    st.write("GRIDSEARCH best scores", gs.best_score['rmse'])
    st.write("GRIDSEARCH best parameters", gs.best_params['rmse'])
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': False,
        'min_support': 2,
        'shrinkage': gs.best_params['rmse']['sim_options']['shrinkage']
    }
    bsl_options = {'method': 'sgd'}
    algo = KNNBaseline(k=gs.best_params['rmse']['k'],
                       sim_options=sim_options,
                       bsl_options=bsl_options)
    train_result, test_result, error_table = run_surprise(
        algo, trainset, testset, "KNNBaseline_Item", error_table)
    model_train_evaluation["KNNBaseline_Item"] = train_result
    model_test_evaluation["KNNBaseline_Item"] = test_result
    return model_train_evaluation, model_test_evaluation, error_table
def grid_search(data):
    """
        This function was originally used to perform grid search on the different algorithms
    """
    # ---------------------KNN--------------------
    #sim_options = {
    #    "name": "mcd",
    #    "min_support": 3,
    #    "user_based": True
    #}
    #param_grid = {"sim_options": sim_options}
    #gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse"], cv=3)
    #gs.fit(data)
    #print(gs.best_score["rmse"])
    #print(gs.best_params["rmse"])

    # ---------------NMF--------------
    param_grid = {
        "n_epochs": [5, 10, 20],
        "n_factors": [10, 15, 30]
    }
    gs = GridSearchCV(NMF, param_grid, measures=["rmse"], cv=3)

    gs.fit(data)

    print(gs.best_score["rmse"])
    print(gs.best_params["rmse"])
    def tune(self,
             opt_field='rmse',
             param_grid={
                 'n_epochs': [5, 10],
                 'lr_all': [0.002, 0.005],
                 'reg_all': [0.4, 0.6]
             },
             SHOW_RESULT=False):

        if self.algorithm == 'svd':
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

        ## Start tuning
        gs.fit(self.data)

        ## Save to self.algo
        self.algo = gs.best_estimator[opt_field]
        self.algo.fit(self.trainset)

        if SHOW_RESULT:
            # best RMSE score
            print(gs.best_score['rmse'])

            # combination of parameters that gave the best RMSE score
            print(gs.best_params['rmse'])

        return self
Example #13
def try_recom_algorithm_grid(data, algo, filename, grid_options, n_splits=5):
    """
    Try out a recommendation algorithm supported by the Surprise library,
    tuning its hyperparameters with grid search first
    :param data: input data containing user, item, rating and timestamp(opt)
    :param algo: the recom. algorithm to be used
    :param filename: name of the file the results should be saved into
    :param grid_options: dictionary containing possible values range for each parameter
    :param n_splits: number of folds for the cross validation
    :return:
    """
    print("\nWorking on " + filename + "\n")
    file = open("../results_surprise_163K/" + filename + ".txt", "w+")

    # use grid search cross validation using the given grid options
    gs = GridSearchCV(algo,
                      grid_options,
                      measures=['rmse', 'mae'],
                      cv=n_splits)
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])
    file.write("RMSE: %f" % (gs.best_score['rmse']))

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])
    file.write("Best params:")
    file.write(str(gs.best_params['rmse']))
    file.close()
Example #14
    def tune_and_find_parameter(self, algo_name, algo, rating_data, param_grid):
        """
            Use GridSearchCV, which (per the Surprise documentation) computes
            accuracy metrics for an algorithm on various combinations of
            parameters, over a cross-validation procedure.

            Args:
                algo_name: the name of the algorithm
                algo: the algorithm class itself
                rating_data: the whole dataset
                param_grid: the parameter combinations to try

            Return: the best parameter combination found (by RMSE)
        """


        print("tuning for", algo_name, "hyperparameters")

        # algo: algo class name
        grid_search = GridSearchCV(algo, param_grid, measures=['rmse', 'mae'])
        grid_search.fit(rating_data)

        print('best RMSE for ', algo_name, ' ', grid_search.best_score['rmse'])

        best_params = grid_search.best_params['rmse']
        # print the best set of parameters
        print("best params:", best_params)
        return best_params
Example #15
def BaselineOnly_als():
    print('Testing BaselineOnly als parameters')
    param_grid = {
        'bsl_options': {
            'method': ['als'],
            'reg_i': [7, 6.9, 7.1],
            'reg_u': [7, 6.9, 7.1]
        }
    }
    gs = GridSearchCV(BaselineOnly,
                      param_grid,
                      measures=['rmse'],
                      cv=10,
                      n_jobs=-2,
                      refit=True)
    gs.fit(data)

    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # Export the tuned bsl_options and the fitted grid search
    joblib.dump(gs.best_params['rmse']['bsl_options'],
                'BaselineOnly.pkl',
                compress=1)
    dump.dump('BaselineOnly', algo=gs)
Example #16
def grid():
    raw_ratings = data.raw_ratings
    threshold = int(.9 * len(raw_ratings))
    A_raw_ratings = raw_ratings[:threshold]
    B_raw_ratings = raw_ratings[threshold:]

    data.raw_ratings = A_raw_ratings
    param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
    grid_search = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)
    grid_search.fit(data)
    algo = grid_search.best_estimator['rmse']

    # retrain on the whole set A
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # Compute biased accuracy on A
    predictions = algo.test(trainset.build_testset())
    print('Biased accuracy on A,', end='   ')
    accuracy.rmse(predictions)

    # Compute unbiased accuracy on B
    testset = data.construct_testset(B_raw_ratings)  # testset is now the set B
    predictions = algo.test(testset)
    print('Unbiased accuracy on B,', end=' ')
    accuracy.rmse(predictions)
Example #17
def test_gridsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    param_grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae'],
                      cv=PredefinedKFold(),
                      joblib_verbose=100)
    gs.fit(u1_ml100k)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator,
                         u1_ml100k,
                         measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
    def grid_search(self):
        print('grid search...')
        sim_options = {
            "name": ["msd", "cosine"],
            "min_support": [3, 4],
            "user_based": [False]
        }
        param_grid = {
            "sim_options": sim_options,
            "k": [50, 100, 200],
            "min_k": [1]
        }
        gs = GridSearchCV(KNNWithMeans,
                          param_grid,
                          measures=["rmse", "mae"],
                          cv=3)
        gs.fit(self.model_data)
        best_params, best_score = gs.best_params["rmse"], gs.best_score["rmse"]
        print(f'Best score (RMSE): {best_score}')
        print(f'Best params (RMSE): {best_params}')

        print(f'Best score (MAE): {gs.best_score["mae"]}')
        print(f'Best params (MAE): {gs.best_params["mae"]}')

        self.set_model_params(best_params)

        return best_params
def test():
    seed = 0
    random.seed(seed)
    np.random.seed(seed)

    param_grid: Dict[str, List[object]] = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 20, 50],
        'biased': [True, False],
        'init_mean': [0, 0.1, 0.5],
        'init_std_dev': [0, 0.1, 0.5],
        'lr_all': [0.001, 0.005, 0.01],
        'reg_all': [0.01, 0.02, 0.05],
        'random_state': [None],
        'verbose': [True]
    }

    grid_search = GridSearchCV(
        algo_class=SVD,
        param_grid=param_grid,
        measures=['rmse'],
        cv=KFold(5),
        n_jobs=-1
    )

    interactions = load_sorted_test_interactions()
    parsed_data = Parser.parse(interactions)
    grid_search.fit(parsed_data.whole_data_set)

    print(grid_search.best_score['rmse'])
    print(grid_search.best_params['rmse'])
    print(grid_search.cv_results)

    add_results_to_database(grid_search.cv_results, "svd", cls=NumpyEncoder)
 def param_selection(self):
     """
     select the best parameter for SVD, using cross-validation
     :param data:
     :return: SVD paramters
     """
     tuned_parameters = {
         'n_factors': [20, 50, 100],
         'reg_all': [0.04, 0.05]
     }
     grid_search = GridSearchCV(SVD,
                                tuned_parameters,
                                measures=['rmse', 'mae'],
                                cv=3)
     grid_search.fit(self.trainset)
     print("Best parameters using RMSE:")
     print(grid_search.best_params['rmse'])
     print()
     # Note: the stored parameters come from the best MAE combination.
     self.n_factors = grid_search.best_params['mae'].get('n_factors')
     self.reg_all = grid_search.best_params['mae'].get('reg_all')
     print("Best score using RMSE:")
     print(grid_search.best_score['rmse'])
     print()
     print("Best parameters using MAE:")
     print(grid_search.best_params['mae'])
     print()
     print("Best score using MAE:")
     print(grid_search.best_score['mae'])
     print()
Example #21
def do_grid_search(data):
    print("Doing gridsearch for best model.")
    param_grid = {
        'n_epochs': [10, 20, 30],
        'n_factors': [100, 150, 200],
        'lr_all': [0.001, 0.0025, 0.005, 0.001],
        'reg_all': [0.2, 0.4, 0.6]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['rmse', 'mae'],
                      cv=5,
                      joblib_verbose=5,
                      n_jobs=-1)

    gs.fit(data_handler.get_data_from_df(data))
    # best RMSE score
    print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])

    # We can now use the algorithm that yields the best rmse:
    algo = gs.best_estimator['rmse']
    return algo
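The returned best_estimator['rmse'] carries the best parameters but, with refit left at its default, it has not been trained on the full data yet; a hedged usage sketch (ratings_df is an assumed ratings DataFrame):

# Hypothetical follow-up to do_grid_search().
best_algo = do_grid_search(ratings_df)

# Fit the tuned estimator on the full training set before requesting predictions.
full_data = data_handler.get_data_from_df(ratings_df)
best_algo.fit(full_data.build_full_trainset())
print(best_algo.predict('1', '10').est)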
Example #22
def test_gridsearchcv_same_splits():
    """Ensure that all parameter combinations are tested on the same splits (we
    check their RMSE scores are the same once averaged over the splits, which
    should be enough). We use as much parallelism as possible."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, reader=Reader('ml-100k'))
    kf = KFold(3, shuffle=True, random_state=4)

    # all RMSE should be the same (as param combinations are the same)
    param_grid = {'n_epochs': [5], 'lr_all': [.2, .2],
                  'reg_all': [.4, .4], 'n_factors': [5], 'random_state': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['RMSE'], cv=kf,
                      n_jobs=-1)
    gs.fit(data)

    rmse_scores = [m for m in gs.cv_results['mean_test_rmse']]
    assert len(set(rmse_scores)) == 1  # assert rmse_scores are all equal

    # Note: actually, even when setting random_state=None in kf, the same folds
    # are used because we use product(param_comb, kf.split(...)). However, it's
    # needed to have the same folds when calling fit again:
    gs.fit(data)
    rmse_scores += [m for m in gs.cv_results['mean_test_rmse']]
    assert len(set(rmse_scores)) == 1  # assert rmse_scores are all equal
Example #23
def load_data():
    data = Dataset.load_builtin('ml-100k')
    # similarity options
    sim_options = {"name": "msd", "user_based": False}

    param_grid = {
        "n_epochs": [5, 10],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6]
    }

    # algorithm
    algo = KNNWithMeans(sim_options=sim_options)

    # computation
    training_set = data.build_full_trainset()

    algo.fit(training_set)

    # GRID SEARCH, MATRIX FACTORIZATION
    print("Running grid search over SVD hyperparameters")
    gs = GridSearchCV(SVD, param_grid=param_grid, measures=["rmse"], cv=3)
    gs.fit(data)

    print(gs.best_score['rmse'])
Example #24
def tuneHyperParams(algtype, trainset, testset, df, param_grid):
    """
    Tune Hyper Parameters for Surprise library models
    Args:
        algtype (surprise.prediction_algorithms): type of the surprise algorithm
        trainset(pandas.Dataframe) :
        testset(pandas.Dataframe) :
        df(pandas.Dataframe) :
        param_grid : parameters to try
    Returns:
        surprise.GridSearchCV: gs
    """
    #TUNE HYPERPARAM VIA GRIDSEARCH
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['User', 'Movie', 'Rating']], reader)
    #trainset, testset = train_test_split(data, test_size=.25, random_state=20)
    gs = GridSearchCV(algtype, param_grid, measures=['rmse'], cv=3)

    gs.fit(data)  # GridSearchCV.fit has no return value; the fitted search lives in gs

    # best RMSE score
    #print(gs.best_score['rmse'])

    # combination of parameters that gave the best RMSE score
    #print(gs.best_params['rmse'])

    return gs
Example #25
    def __init__(self, data, score_index, user_index, items_index):

        self.items_index = items_index
        self.user_index = user_index
        self.data = data

        scale = (data[score_index].min(), data[score_index].max())
        reader = Reader(rating_scale=scale)
        dataset = Dataset.load_from_df(
            data[[user_index, items_index, score_index]], reader)

        param_grid = {
            'n_factors': [50, 100, 150],
            'n_epochs': [25, 50, 75],
            'lr_all': [0.005, 0.01],
            'reg_all': [0.02, 0.1, 0.5]
        }

        gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
        gs.fit(dataset)

        params = gs.best_params['rmse']

        self.model = SVD(reg_all=params['reg_all'],
                         n_factors=params['n_factors'],
                         n_epochs=params['n_epochs'],
                         lr_all=params['lr_all'])
        self.model.fit(dataset.build_full_trainset())
    def tune_and_find_param(self,
                            algo_name,
                            algo,
                            rating_data,
                            param_grid={
                                'n_factors': [50, 100],
                                'n_epochs': [20, 30],
                                'lr_all': [0.005, 0.010]
                            }):
        # Use GridSearchCV, which (per the Surprise documentation) computes accuracy
        # metrics for an algorithm on various combinations of parameters, over a cross-validation procedure.
        print("tuning for", algo_name, "hyperparameters")

        # algo: algo class name
        grid_search = GridSearchCV(algo, param_grid, measures=['rmse', 'mae'])

        # fitting data
        grid_search.fit(rating_data)

        # print the best RMSE
        print('best RMSE for ', algo_name, ' ', grid_search.best_score['rmse'])

        best_params = grid_search.best_params['rmse']
        # print the best set of parameters
        print("best params:", best_params)
        return best_params
Example #27
def test_best_estimator():
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))

    param_grid = {
        'n_epochs': [5],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0]
    }
    gs = GridSearchCV(SVD,
                      param_grid,
                      measures=['mae'],
                      cv=PredefinedKFold(),
                      joblib_verbose=100)
    gs.fit(data)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator,
                         data,
                         measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
Example #28
def gridsearch(data, algo, param_grid):
    # param_grid = {'n_factors': [50, 100, 150], 'n_epochs': [20, 30],
    #               'lr_all': [0.005, 0.01], 'reg_all': [0.02, 0.1]}

    gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=3)
    gs.fit(data)
    params = gs.best_params['rmse']
    print(params)
Example #29
 def best_params(self):
     param_grid = {
         'n_factors': [x for x in range(50, 500, 50)],
         'n_epochs': [10, 20, 50, 75, 100],
         'lr_all': [.001, .003, .005, .008]
     }
     gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
     gs.fit(self.data)
     return gs.best_score['rmse'], gs.best_params['rmse']
Example #30
def best_params():
    # Review dataset to use
    file_path = os.path.expanduser('./data/reviews_stars.csv')
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)

    # Build a list of candidate n_factors values
    n_factors_values = []
    n_factors_initial_value = 2
    # Try 15 different factor counts in steps of 2
    for i in range(0, 15):
        n_factors_values.append(n_factors_initial_value +
                                (n_factors_initial_value * i))

    # Build a list of candidate epoch counts
    n_epochs_values = []
    n_epochs_initial_value = 5
    # Try 10 different values in steps of 5
    for i in range(0, 10):
        n_epochs_values.append(n_epochs_initial_value +
                               (n_epochs_initial_value * i))

    # Build a list of candidate regularization values
    reg_all_values = []
    reg_all_initial_value = 0.2
    # Try 5 different values in steps of 0.2
    for i in range(0, 5):
        reg_all_values.append(reg_all_initial_value +
                              (reg_all_initial_value * i))

    # Build a list of candidate learning rates
    lr_all_values = []
    lr_all_initial_value = 0.002
    # Try 5 different values in steps of 0.002
    for i in range(0, 5):
        lr_all_values.append(lr_all_initial_value + (lr_all_initial_value * i))

    # Build the parameter grid dictionary
    param_grid = {
        'n_factors': n_factors_values,
        'n_epochs': n_epochs_values,
        'lr_all': lr_all_values,
        'reg_all': reg_all_values,
        'biased': [True]
    }
    # Evaluate the parameter grid using MAE and RMSE
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

    gs.fit(data)

    # Write the best RMSE and MAE parameters and scores to a file
    with open('./data/results.txt', 'a') as file:
        file.write('Score rmse: ' + str(gs.best_score['rmse']) + '\n')
        file.write('Best parameters rmse: ' + str(gs.best_params['rmse']) +
                   '\n')
        file.write('Score mae: ' + str(gs.best_score['mae']) + '\n')
        file.write('Best parameters mae: ' + str(gs.best_params['mae']) + '\n')