Example #1
def test_cross_validate(toy_data):

    # First test with a specified CV iterator.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'], cv=pkf,
                            verbose=1)
    # Basically just test that keys (dont) exist as they should
    assert len(ret['test_rmse']) == 1
    assert len(ret['test_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_rmse' not in ret
    assert 'train_mae' not in ret

    # Test that 5 fold CV is used when cv=None
    # Also check that train_* key exist when return_train_measures is True.
    ret = ms.cross_validate(algo, toy_data, measures=['rmse', 'mae'], cv=None,
                            return_train_measures=True, verbose=True)
    assert len(ret['test_rmse']) == 5
    assert len(ret['test_mae']) == 5
    assert len(ret['fit_time']) == 5
    assert len(ret['test_time']) == 5
    assert len(ret['train_rmse']) == 5
    assert len(ret['train_mae']) == 5
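For context, the fold files this Reader expects are plain text with three header lines (hence skip_lines=3) and space-separated user, item and rating fields. A hypothetical sketch that writes such a pair of files (the ids and ratings below are made up):

def write_fold_file(path, ratings):
    with open(path, 'w') as f:
        # Three header lines that the Reader will skip.
        f.write('# custom fold file\n# user item rating\n# space separated\n')
        for user, item, rating in ratings:
            f.write('{} {} {}\n'.format(user, item, rating))

write_fold_file('custom_train', [('user0', 'item0', 4), ('user1', 'item1', 2)])
write_fold_file('custom_test', [('user0', 'item1', 3)])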
Example #2
def test_user_based_field(u1_ml100k, pkf):
    """Ensure that the user_based field is taken into account (only) when
    needed."""

    algorithms = (KNNBasic, KNNWithMeans, KNNBaseline)
    for klass in algorithms:
        algo = klass(sim_options={'user_based': True})
        rmses_user_based = cross_validate(algo, u1_ml100k, ['rmse'],
                                          pkf)['test_rmse']
        algo = klass(sim_options={'user_based': False})
        rmses_item_based = cross_validate(algo, u1_ml100k, ['rmse'],
                                          pkf)['test_rmse']
        assert rmses_user_based != rmses_item_based
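The tests below rely on the pytest fixtures u1_ml100k and pkf, which are not shown in these snippets. A minimal sketch of how such fixtures might be defined in a conftest.py, assuming the u1.base/u1.test split of ml-100k is available locally (the paths are assumptions):

import os

import pytest
from surprise import Dataset, Reader
from surprise.model_selection import PredefinedKFold


@pytest.fixture
def u1_ml100k():
    # One predefined train/test fold built from the ml-100k u1 split.
    data_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')
    folds_files = [(data_dir + 'u1.base', data_dir + 'u1.test')]
    return Dataset.load_from_folds(folds_files, reader=Reader('ml-100k'))


@pytest.fixture
def pkf():
    # Iterator over the predefined folds above.
    return PredefinedKFold()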
Example #3
def test_SVDpp_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVDpp(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = SVDpp(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # The rest is OK but just takes too long for now...
Example #4
 def trainer(self):
     # Set the random seed that numpy (used internally by Surprise) will use.
     my_seed = random.randint(0, 2**32 - 1)
     random.seed(my_seed)
     numpy.random.seed(my_seed)
     # Reassurance that the script is actually running.
     self.printer(
         "\nNow training on the MovieLens latest small dataset. (8 folds used)"
     )
     self.printer("Please wait...\n")
     # Define the file's format
     reader = Reader(line_format='user item rating timestamp', sep=',')
     # Load the data from the ratings.csv file
     data = Dataset.load_from_file('./ml-latest-small/ratings.csv',
                                   reader=reader)
     # Use the SVD algorithm for prediction
     method = SVD()
     start = time.time()
     # Use 8-fold cross validation and evaluate the results with RMSE and MAE
     measurements = cross_validate(method,
                                   data,
                                   measures=['RMSE', 'MAE'],
                                   cv=8,
                                   verbose=False,
                                   n_jobs=-2,
                                   return_train_measures=True)
     # Print the random seed used for fold assignments
     self.printer(
         "Random seed used for fold assignment: {}\n".format(my_seed))
     # Show the stats
     meanFitTime = numpy.mean(measurements["fit_time"])
     meanTestTime = numpy.mean(measurements["test_time"])
     meanTestMAE = numpy.mean(measurements["test_mae"])
     meanTestRMSE = numpy.mean(measurements["test_rmse"])
     meanTrainMAE = numpy.mean(measurements["train_mae"])
     meanTrainRMSE = numpy.mean(measurements["train_rmse"])
     self.printer(
         "Mean fit time per fold: {:0.5f} seconds".format(meanFitTime))
     self.printer(
         "Mean test time per fold: {:0.5f} seconds".format(meanTestTime))
     self.printer("Mean train MAE per fold: {:0.5f}".format(meanTrainMAE))
     self.printer("Mean train RMSE per fold: {:0.5f}".format(meanTrainRMSE))
     self.printer("Mean test MAE per fold: {:0.5f}".format(meanTestMAE))
     self.printer("Mean test RMSE per fold: {:0.5f}\n".format(meanTestRMSE))
     # Train with the dataset
     trainset = data.build_full_trainset()
     method.fit(trainset)
     end = time.time()
     spent = end - start
     self.printer(
         "Training and testing time: {:0.3f} seconds\n".format(spent))
     process = psutil.Process(os.getpid())
     self.printer("Memory used:")
     self.printer("{:0.5f}".format(process.memory_info().rss / 1048576.0) +
                  " MB Physical")
     self.printer("{:0.5f}".format(process.memory_info().vms / 1048576.0) +
                  " MB Virtual")
     return method, trainset
Example #5
def test_shrinkage_field(u1_ml100k, pkf):
    """Ensure the shrinkage field is taken into account."""

    sim_options = {'name': 'pearson_baseline',
                   'shrinkage': 0
                   }
    bsl_options = {'n_epochs': 1}
    algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
    rmse_shrinkage_0 = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'pearson_baseline',
                   'shrinkage': 100
                   }
    bsl_options = {'n_epochs': 1}
    algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
    rmse_shrinkage_100 = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    assert rmse_shrinkage_0 != rmse_shrinkage_100
Example #6
def test_sgd_reg_field():
    """Ensure the reg field is taken into account."""

    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   'reg': 0.02,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_reg_002 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   'reg': 1,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_reg_1 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    assert rmse_sgd_reg_002 != rmse_sgd_reg_1
Example #7
def repeat(algo_type, frame, min_, max_):
    reader = Reader(rating_scale=(min_, max_))
    data = Dataset.load_from_df(frame, reader=reader)
    algo = algo_type
    print(cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3,
                         verbose=True))
    user_id = 'A3R5OBKS7OM2IR'
    movie_id = 'Movie1'
    rating = 5.0
    algo.predict(user_id, movie_id, r_ui=rating, verbose=True)
Example #8
def test_als_n_epochs_field():
    """Ensure the n_epochs field is taken into account."""

    bsl_options = {'method': 'als',
                   'n_epochs': 1,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_n_epochs_1 = cross_validate(algo, data, ['rmse'],
                                         pkf)['test_rmse']

    bsl_options = {'method': 'als',
                   'n_epochs': 5,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_n_epochs_5 = cross_validate(algo, data, ['rmse'],
                                         pkf)['test_rmse']

    assert rmse_als_n_epochs_1 != rmse_als_n_epochs_5
Example #9
def test_sgd_learning_rate_field():
    """Ensure the learning_rate field is taken into account."""

    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   'learning_rate': .005,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_lr_005 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   'learning_rate': .00005,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_lr_00005 = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    assert rmse_sgd_lr_005 != rmse_sgd_lr_00005
Example #10
def do_predict():
    ratings_dic = {
        "userId": userGroupId,
        "itemId": ingredientId,
        "rating": ratings
    }
    df = pd.DataFrame(ratings_dic)
    reader = Reader(rating_scale=(1, 4))
    data = Dataset.load_from_df(df[['userId', 'itemId', 'rating']], reader)
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    cross_validate(
        algo, data, measures=['RMSE', 'MAE'], cv=5,
        verbose=True)  # Root Mean Square Error # Mean absolute error
    get_top_n(predictions)
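do_predict relies on a get_top_n helper that is not shown here. A sketch along the lines of the helper in the Surprise FAQ, assumed to be what is meant:

from collections import defaultdict


def get_top_n(predictions, n=10):
    # Map each user to their n items with the highest estimated rating.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n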
Example #11
 def getResultFilerCollaborative(self):
     # Instantiate a reader
     reader = Reader()
     # Instantiate the singular value decomposition algorithm
     svd = SVD()
     # Get the data from the database
     #self.ratings_data = pd.read_csv('calificaciones.csv')
     # Build a dataset made up of user id, quiz id and rating values.
     data = Dataset.load_from_df(
         self.ratings_data[['id_user', 'id_quest', 'calificacion']], reader)
     # Build a trainset
     #trainset = data.build_full_trainset()
     # Train the algorithm on the trainset
     #svd.fit(trainset)
     # Evaluate the predictions with RMSE and MAE
     cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
     # Return the algorithm
     return svd
Example #12
def test_sgd_n_epoch_field():
    """Ensure the n_epoch field is taken into account."""

    bsl_options = {'method': 'sgd',
                   'n_epochs': 1,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_1 = \
    cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)['test_neg_rmse']

    bsl_options = {'method': 'sgd',
                   'n_epochs': 20,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_5 = \
    cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)['test_neg_rmse']

    assert rmse_sgd_n_epoch_1 != rmse_sgd_n_epoch_5
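The test above passes a custom measure as a ['neg_rmse', neg_rmse] pair, a form that is not part of every Surprise release. Assuming the callable simply receives the list of predictions for a fold, a minimal sketch of what neg_rmse might look like:

from surprise import accuracy


def neg_rmse(predictions):
    # Negated RMSE, so that "higher is better" comparisons can be used.
    return -accuracy.rmse(predictions, verbose=False)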
Example #13
def objective(**params):
    print(params)
    svd_algo = SVD(**params, random_state=8)
    results = cross_validate(svd_algo,
                             data,
                             measures=['rmse'],
                             cv=5,
                             n_jobs=-1)
    return np.mean(results['test_rmse'])
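This objective takes keyword parameters and returns the mean cross-validated RMSE, which suggests a keyword-args optimizer. A hypothetical wiring with scikit-optimize (the search space, bounds and n_calls are illustrative assumptions, and data is assumed to be a loaded Surprise Dataset):

from skopt import gp_minimize
from skopt.space import Integer, Real
from skopt.utils import use_named_args

# Illustrative search space; the names must match SVD's keyword arguments.
space = [Integer(20, 200, name='n_factors'),
         Real(0.002, 0.02, prior='log-uniform', name='lr_all'),
         Real(0.01, 0.1, prior='log-uniform', name='reg_all')]


@use_named_args(space)
def skopt_objective(**params):
    return objective(**params)


result = gp_minimize(skopt_objective, space, n_calls=20, random_state=8)
print(result.x, result.fun)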
Example #14
def test_als_reg_i_field():
    """Ensure the reg_i field is taken into account."""

    bsl_options = {'method': 'als',
                   'reg_i': 0,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_0 = cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)[
        'test_neg_rmse']

    bsl_options = {'method': 'als',
                   'reg_i': 10,
                   }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_10 = \
    cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)['test_neg_rmse']

    assert rmse_als_regi_0 != rmse_als_regi_10
Example #15
def test_cross_validate():

    # First test with a specified CV iterator.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating',
                    sep=' ',
                    skip_lines=3,
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo,
                            data,
                            measures=['rmse', 'mae'],
                            cv=pkf,
                            verbose=1)
    # Basically just test that keys (dont) exist as they should
    assert len(ret['test_rmse']) == 1
    assert len(ret['test_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_rmse' not in ret
    assert 'train_mae' not in ret

    # Test that 5 fold CV is used when cv=None
    # Also check that train_* key exist when return_train_measures is True.
    data = Dataset.load_from_file(current_dir + '/custom_dataset', reader)
    ret = ms.cross_validate(algo,
                            data,
                            measures=['rmse', 'mae'],
                            cv=None,
                            return_train_measures=True,
                            verbose=True)
    assert len(ret['test_rmse']) == 5
    assert len(ret['test_mae']) == 5
    assert len(ret['fit_time']) == 5
    assert len(ret['test_time']) == 5
    assert len(ret['train_rmse']) == 5
    assert len(ret['train_mae']) == 5
Example #16
def model(data, datacsv):
    """
	建立模型
	:return:
	"""
    #使用SVD模型
    algo = SVD()
    #进行5折交叉验证
    # print(cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True))
    print(cross_validate(NormalPredictor(), datacsv, cv=2))
Example #17
def five_fold(df):
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user id', 'movie id', 'rating']], reader)
    algo = SVD()
    out = cross_validate(algo,
                         data,
                         measures=['RMSE', 'MAE'],
                         cv=5,
                         verbose=True)
    return algo.fit(data.build_full_trainset())
Example #18
def KNN_best(data, opt=True):
    sim_options = {
        "name": "msd",
        "min_support": 3,
        "user_based": opt
    }
    algo = KNNWithMeans(sim_options=sim_options)
    pred = cross_validate(algo, data, measures=['mse'], cv=5, verbose=True)
    mean = np.mean(pred['test_mse'])
    return mean
Example #19
def SVDpp_calculation(data, trainset, testset, time, cv):
    start = time.time()
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)
    cross_validate_svdpp_dict = cross_validate(algo, data, measures=['RMSE'],
                                               cv=cv, verbose=True)
    end = time.time()
    elapsed = end - start

    return elapsed, cross_validate_svdpp_dict
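A hypothetical call site for the function above, assuming data is a loaded Surprise Dataset and that the time module itself is passed through as the time parameter:

import time as time_module

from surprise.model_selection import train_test_split

# Hold out 25% of the ratings for the explicit test() call inside the function.
trainset, testset = train_test_split(data, test_size=0.25, random_state=0)
elapsed, cv_results = SVDpp_calculation(data, trainset, testset, time_module, cv=5)
print('SVD++ took {:.1f} seconds'.format(elapsed))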
Example #20
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    
    # 5-fold cross-validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5,
                           verbose=True)
Example #21
def test_sgd_n_epoch_field(u1_ml100k, pkf):
    """Ensure the n_epoch field is taken into account."""

    bsl_options = {
        'method': 'sgd',
        'n_epochs': 1,
    }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_1 = cross_validate(algo, u1_ml100k, ['rmse'],
                                        pkf)['test_rmse']

    bsl_options = {
        'method': 'sgd',
        'n_epochs': 20,
    }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd_n_epoch_5 = cross_validate(algo, u1_ml100k, ['rmse'],
                                        pkf)['test_rmse']

    assert rmse_sgd_n_epoch_1 != rmse_sgd_n_epoch_5
Example #22
def test_method_field():
    """Ensure the method field is taken into account."""

    bsl_options = {'method': 'als'}
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als = cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)[
        'test_neg_rmse']

    bsl_options = {'method': 'sgd'}
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_sgd = cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)[
        'test_neg_rmse']

    assert rmse_als != rmse_sgd

    with pytest.raises(ValueError):
        bsl_options = {'method': 'wrong_name'}
        algo = BaselineOnly(bsl_options=bsl_options)
        cross_validate(algo, data, [['neg_rmse', neg_rmse]], pkf)[
            'test_neg_rmse']
Example #23
    def evaluate(self, test_size=.25):
        from surprise.model_selection import cross_validate, train_test_split
        from surprise import accuracy

        recommendation_dataset = RecommendationsDataset()
        cross_validate(self.algorithm,
                       recommendation_dataset.dataset,
                       measures=['RMSE', 'MSE'],
                       cv=5,
                       verbose=True)

        train, test = train_test_split(recommendation_dataset.dataset,
                                       test_size=test_size)
        # train.ur
        # train.ir
        # test
        self.fit(train)
        test_predictions = self.test(test)
        # result
        print("MAE: ", accuracy.mae(test_predictions, verbose=0))
        print("RMSE: ", accuracy.rmse(test_predictions, verbose=0))
Example #24
def test_als_reg_i_field(u1_ml100k, pkf):
    """Ensure the reg_i field is taken into account."""

    bsl_options = {
        'method': 'als',
        'reg_i': 0,
    }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_0 = cross_validate(algo, u1_ml100k, ['rmse'],
                                     pkf)['test_rmse']

    bsl_options = {
        'method': 'als',
        'reg_i': 10,
    }
    algo = BaselineOnly(bsl_options=bsl_options)
    rmse_als_regi_10 = cross_validate(algo, u1_ml100k, ['rmse'],
                                      pkf)['test_rmse']

    assert rmse_als_regi_0 != rmse_als_regi_10
Example #25
 def worker(self, algo, sema):
     sema.acquire()
     print("worker :: "+algo)
     A = self.algo_map.get(algo)
     data_full = pd.read_csv(self.city+'-reviews-user-business.csv')
     data_required = data_full[['user_id', 'business_id', 'stars']]
     reader = Reader(rating_scale=(1.0, 5.0))
     data = Dataset.load_from_df(data_required, reader)
     cv_results = cross_validate(A.algo, data, measures=['RMSE', 'MAE'], cv=5, n_jobs=1, verbose=False)
     res_df = pd.DataFrame.from_dict(cv_results).mean(axis=0)
     self.return_dict[algo] = res_df
     sema.release()
Example #26
def test_CoClustering_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = CoClustering(n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_cltr_u
    algo = CoClustering(n_cltr_u=1, n_epochs=1, random_state=1)
    rmse_n_cltr_u = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_cltr_u

    # n_cltr_i
    algo = CoClustering(n_cltr_i=1, n_epochs=1, random_state=1)
    rmse_n_cltr_i = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_cltr_i

    # n_epochs
    algo = CoClustering(n_epochs=2, random_state=1)
    rmse_n_epochs = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_epochs
Example #27
def cv_multiple_models(data, models_dict, cv=3):
    results = pd.DataFrame()

    for model_name, model in models_dict.items():
        print('\n---> CV for %s...' % model_name)

        cv_results = cross_validate(model, data, cv=cv)
        tmp = pd.DataFrame(cv_results).mean()
        tmp['model'] = model_name
        # DataFrame.append was removed in pandas 2.0; concat works everywhere.
        results = pd.concat([results, tmp.to_frame().T], ignore_index=True)

    return results
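A hypothetical usage of cv_multiple_models, assuming pandas and cross_validate are imported as in the surrounding snippets:

from surprise import Dataset, SVD, KNNBasic, NormalPredictor

data = Dataset.load_builtin('ml-100k')
models = {'SVD': SVD(),
          'KNNBasic': KNNBasic(verbose=False),
          'NormalPredictor': NormalPredictor()}
print(cv_multiple_models(data, models, cv=3))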
Example #28
def collaborativeFiltering():
    # Data collection
    df1 = pd.read_csv('tmdb_5000_credits.csv')
    df2 = pd.read_csv('tmdb_5000_movies.csv')

    # Join the two datasets on the 'id' column
    df1.columns = ['id', 'title', 'cast', 'crew']
    df2 = df2.merge(df1.drop('title', axis=1), on='id')
    ratings = pd.read_csv('ml-latest-small/ratings.csv')

    reader = Reader()
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    # data.split(n_folds=5)
    svd = SVD()

    # Run 5-fold cross-validation and then print results
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5)
    # Here the root mean squared error is 0.869, which is great!
    # Now train on the full dataset
    trainset = data.build_full_trainset()
    svd.fit(trainset)
Example #29
def test_CoClustering_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = CoClustering(n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']

    # n_cltr_u
    algo = CoClustering(n_cltr_u=1, n_epochs=1, random_state=1)
    rmse_n_cltr_u = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_cltr_u

    # n_cltr_i
    algo = CoClustering(n_cltr_i=1, n_epochs=1, random_state=1)
    rmse_n_cltr_i = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_cltr_i

    # n_epochs
    algo = CoClustering(n_epochs=2, random_state=1)
    rmse_n_epochs = cross_validate(algo, data, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_epochs
Example #30
def Recommendation_MatrixFact_SVD(trainset, testset, dataset):
    algo_svd = SVD()  # with default parameters: n_epochs=20, lr_all=0.005, reg_all=0.02
    fit_svd = algo_svd.fit(trainset)
    predictions = fit_svd.test(testset)
    print("Matrix Factorization SVD Prediction Accuracy : " , accuracy.rmse(predictions, verbose=False))
    print(" ")
    print("-----ACCURACY using SVD------")
    print(" ")
    #print("Evaluate: ", evaluate(algo_svd, dataset, measures=['RMSE']))
    print(cross_validate(algo_svd, dataset, measures=['RMSE', 'MAE'], cv=5,  verbose=True))
    print(" ")
    return predictions
Example #31
def train_nmf(data):
    rmse = []
    mae = []
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        nmf = NMF(n_factors=k)
        temp = cross_validate(nmf, data, measures=['RMSE', 'MAE'], cv=10)
        rmse.append(np.mean(temp['test_rmse']))
        mae.append(np.mean(temp['test_mae']))
    print("k-fold validation finished!")
    return (rmse, mae)
Example #32
def train_knn(data):
    rmse = []
    mae = []
    sim_options = {'name': 'pearson'}
    for k in range(2, 102, 2):
        print("using k = %d" % k)
        knn = KNNWithMeans(k=k, sim_options=sim_options)
        temp = cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=10)
        rmse.append(np.mean(temp['test_rmse']))
        mae.append(np.mean(temp['test_mae']))
    print("k-fold validation finished!")
    return (rmse, mae)
Example #33
def get_predictions():
    """
    Get all the predictions and print it on screen
    :return:
    """
    reader = Reader(line_format='user item rating',
                    sep=',',
                    rating_scale=(1, 5))
    data = Dataset.load_from_file(dataset, reader=reader)
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=False)

    top_n = get_top_n(predictions, n=10)

    return top_n
Example #34
def compare_models(data):
    # Define probabilistic matrix factorization algorithm
    matrix_fact = prediction_algorithms.matrix_factorization.SVD(biased=False)

    # Define user-based collaborative filtering algorithm
    sim_options = {'name': 'cosine', 'user_based': True}
    user_based = KNNBasic(sim_options=sim_options, verbose=False)

    # Define item-based collaborative filtering algorithm
    sim_options = {'name': 'cosine', 'user_based': False}
    item_based = KNNBasic(sim_options=sim_options, verbose=False)

    # Run 5-fold cross validation on each algorithm and print results.
    cross_validate(algo=matrix_fact,
                   data=data,
                   measures=['rmse', 'mae'],
                   cv=5,
                   verbose=True)
    cross_validate(algo=user_based,
                   data=data,
                   measures=['rmse', 'mae'],
                   cv=5,
                   verbose=True)
    cross_validate(algo=item_based,
                   data=data,
                   measures=['rmse', 'mae'],
                   cv=5,
                   verbose=True)
Example #35
def test_name_field(u1_ml100k, pkf):
    """Ensure the name field is taken into account."""

    sim_options = {'name': 'cosine'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_cosine = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'msd'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_msd = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'pearson'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_pearson = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'pearson_baseline'}
    bsl_options = {'n_epochs': 1}
    algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
    rmse_pearson_bsl = cross_validate(algo, u1_ml100k, ['rmse'],
                                      pkf)['test_rmse']

    for rmse_a, rmse_b in combinations(
        (rmse_cosine, rmse_msd, rmse_pearson, rmse_pearson_bsl), 2):
        assert (rmse_a != rmse_b)

    with pytest.raises(NameError):
        sim_options = {'name': 'wrong_name'}
        algo = KNNBasic(sim_options=sim_options)
        cross_validate(algo, u1_ml100k, ['rmse'], pkf)
Example #36
def test_name_field(u1_ml100k, pkf):
    """Ensure the name field is taken into account."""

    sim_options = {'name': 'cosine'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_cosine = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'msd'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_msd = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'pearson'}
    algo = KNNBasic(sim_options=sim_options)
    rmse_pearson = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    sim_options = {'name': 'pearson_baseline'}
    bsl_options = {'n_epochs': 1}
    algo = KNNBasic(sim_options=sim_options, bsl_options=bsl_options)
    rmse_pearson_bsl = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    for rmse_a, rmse_b in combinations((rmse_cosine, rmse_msd, rmse_pearson,
                                        rmse_pearson_bsl), 2):
        assert (rmse_a != rmse_b)

    with pytest.raises(NameError):
        sim_options = {'name': 'wrong_name'}
        algo = KNNBasic(sim_options=sim_options)
        cross_validate(algo, u1_ml100k, ['rmse'], pkf)
Example #37
def test_gridsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)"""

    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [1], 'init_std_dev': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['mae'],
                      cv=PredefinedKFold(), joblib_verbose=100)
    gs.fit(u1_ml100k)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator, u1_ml100k, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
Example #38
def test_randomizedsearchcv_best_estimator(u1_ml100k):
    """Ensure that the best estimator is the one that gives the best score (by
    re-running it)"""

    param_distributions = {'n_epochs': [5], 'lr_all': uniform(0.002, 0.003),
                           'reg_all': uniform(0.04, 0.02), 'n_factors': [1],
                           'init_std_dev': [0]}
    rs = RandomizedSearchCV(SVD, param_distributions, measures=['mae'],
                            cv=PredefinedKFold(), joblib_verbose=100)
    rs.fit(u1_ml100k)
    best_estimator = rs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator, u1_ml100k, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == rs.best_score['mae']
Example #39
from surprise import SVD
from surprise import Dataset, print_perf
from surprise.model_selection import cross_validate

# Load the built-in MovieLens dataset by default
data = Dataset.load_builtin('ml-100k')
algo = SVD()
# Evaluate the algorithm on the dataset
perf = cross_validate(algo, data, measures=['RMSE'], cv=3)  # RMSE (root mean squared error)
# Print the results
print_perf(perf)
Example #40
"""
This module describes the most basic usage of Surprise: you define a prediction
algorithm, (down)load a dataset and run a cross-validation procedure.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate


# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

# We'll use the famous SVD algorithm.
algo = SVD()

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
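For reference, cross_validate returns a dict of per-fold arrays, so the same report can be produced by hand when verbose=False; a small sketch using the algo and data defined above:

import numpy as np

results = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=False)
print('RMSE: {:.4f} +/- {:.4f}'.format(np.mean(results['test_rmse']),
                                        np.std(results['test_rmse'])))
print('MAE:  {:.4f} +/- {:.4f}'.format(np.mean(results['test_mae']),
                                       np.std(results['test_mae'])))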
Example #41
def test_SVD_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVD(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = SVD(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # n_epochs
    algo = SVD(n_factors=1, n_epochs=2, random_state=1)
    rmse_n_epochs = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_epochs

    # biased
    algo = SVD(n_factors=1, n_epochs=1, biased=False, random_state=1)
    rmse_biased = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_biased

    # lr_all
    algo = SVD(n_factors=1, n_epochs=1, lr_all=5, random_state=1)
    rmse_lr_all = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_all

    # reg_all
    algo = SVD(n_factors=1, n_epochs=1, reg_all=5, random_state=1)
    rmse_reg_all = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_all

    # lr_bu
    algo = SVD(n_factors=1, n_epochs=1, lr_bu=5, random_state=1)
    rmse_lr_bu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_bu

    # lr_bi
    algo = SVD(n_factors=1, n_epochs=1, lr_bi=5, random_state=1)
    rmse_lr_bi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_bi

    # lr_pu
    algo = SVD(n_factors=1, n_epochs=1, lr_pu=5, random_state=1)
    rmse_lr_pu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_pu

    # lr_qi
    algo = SVD(n_factors=1, n_epochs=1, lr_qi=5, random_state=1)
    rmse_lr_qi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_qi

    # reg_bu
    algo = SVD(n_factors=1, n_epochs=1, reg_bu=5, random_state=1)
    rmse_reg_bu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_bu

    # reg_bi
    algo = SVD(n_factors=1, n_epochs=1, reg_bi=5, random_state=1)
    rmse_reg_bi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_bi

    # reg_pu
    algo = SVD(n_factors=1, n_epochs=1, reg_pu=5, random_state=1)
    rmse_reg_pu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_pu

    # reg_qi
    algo = SVD(n_factors=1, n_epochs=1, reg_qi=5, random_state=1)
    rmse_reg_qi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_qi
Example #42
from surprise import SVD
from surprise import Dataset, print_perf, Reader
from surprise.model_selection import cross_validate
import os

# Path to the data file
file_path = os.path.expanduser('mydata.csv')
# Tell the Reader what format the file is in
reader = Reader(line_format='user item rating', sep=',')
# Load the data
data = Dataset.load_from_file(file_path, reader=reader)
algo = SVD()
# Evaluate the algorithm on the dataset
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3)
# Print the results
print_perf(perf)
Example #43
                                   'http://grouplens.org/datasets/movielens/1m'),
        }


# set RNG
np.random.seed(0)
random.seed(0)

dataset = 'ml-1m'
data = Dataset.load_builtin(dataset)
kf = KFold(random_state=0)  # folds will be the same for all algorithms.

table = []
for klass in classes:
    start = time.time()
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)

header = [LINK[dataset],
          'RMSE',
          'MAE',
          'Time'
          ]
print(tabulate(table, header, tablefmt="pipe"))
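The benchmark above assumes a classes iterable and a LINK dict (the LINK definition is truncated at the top of the snippet). A hypothetical minimal version of those definitions, using the algorithms Surprise ships with:

from surprise import (SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans,
                      KNNBaseline, CoClustering, BaselineOnly, NormalPredictor)

classes = (SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans, KNNBaseline,
           CoClustering, BaselineOnly, NormalPredictor)
# Plain names instead of the markdown links used in the real benchmark table.
LINK = {klass.__name__: klass.__name__ for klass in classes}
LINK['ml-1m'] = '[Movielens 1M](http://grouplens.org/datasets/movielens/1m)'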
Example #44
"""
This module describes how to load a dataset from a pandas dataframe.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise.model_selection import cross_validate


# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                            rating_scale=(1, 5))

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(NormalPredictor(), data, cv=2)
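Depending on the Surprise version, the rating scale may need to be passed through a Reader instead of directly to load_from_df; an equivalent sketch for that API:

from surprise import Reader

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
cross_validate(NormalPredictor(), data, cv=2)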
Example #45
def main():

    class MyParser(argparse.ArgumentParser):
        '''A parser which prints the help message when an error occurs. Taken from
        http://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu.'''  # noqa

        def error(self, message):
            sys.stderr.write('error: %s\n' % message)
            self.print_help()
            sys.exit(2)

    parser = MyParser(
        description='Evaluate the performance of a rating prediction ' +
        'algorithm ' +
        'on a given dataset using cross validation. You can use a built-in ' +
        'or a custom dataset, and you can choose to automatically split the ' +
        'dataset into folds, or manually specify train and test files. ' +
        'Please refer to the documentation page ' +
        '(http://surprise.readthedocs.io/) for more details.',
        epilog="""Example:\n
        surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}"
        -load-builtin ml-100k -n-folds 3""")

    algo_choices = {
        'NormalPredictor': NormalPredictor,
        'BaselineOnly': BaselineOnly,
        'KNNBasic': KNNBasic,
        'KNNBaseline': KNNBaseline,
        'KNNWithMeans': KNNWithMeans,
        'SVD': SVD,
        'SVDpp': SVDpp,
        'NMF': NMF,
        'SlopeOne': SlopeOne,
        'CoClustering': CoClustering,
    }

    parser.add_argument('-algo', type=str,
                        choices=algo_choices,
                        help='The prediction algorithm to use. ' +
                        'Allowed values are ' +
                        ', '.join(algo_choices.keys()) + '.',
                        metavar='<prediction algorithm>')

    parser.add_argument('-params', type=str,
                        metavar='<algorithm parameters>',
                        default='{}',
                        help='A kwargs dictionary that contains all the ' +
                        'algorithm parameters. ' +
                        'Example: "{\'n_epochs\': 10}".'
                        )

    parser.add_argument('-load-builtin', type=str, dest='load_builtin',
                        metavar='<dataset name>',
                        default='ml-100k',
                        help='The name of the built-in dataset to use. ' +
                        'Allowed values are ' +
                        ', '.join(dataset.BUILTIN_DATASETS.keys()) +
                        '. Default is ml-100k.'
                        )

    parser.add_argument('-load-custom', type=str, dest='load_custom',
                        metavar='<file path>',
                        default=None,
                        help='A file path to custom dataset to use. ' +
                        'Ignored if ' +
                        '-load-builtin is set. The -reader parameter needs ' +
                        'to be set.'
                        )

    parser.add_argument('-folds-files', type=str, dest='folds_files',
                        metavar='<train1 test1 train2 test2... >',
                        default=None,
                        help='A list of custom train and test files. ' +
                        'Ignored if -load-builtin or -load-custom is set. '
                        'The -reader parameter needs to be set.'
                        )

    parser.add_argument('-reader', type=str,
                        metavar='<reader>',
                        default=None,
                        help='A Reader to read the custom dataset. Example: ' +
                        '"Reader(line_format=\'user item rating timestamp\',' +
                        ' sep=\'\\t\')"'
                        )

    parser.add_argument('-n-folds', type=int, dest='n_folds',
                        metavar="<number of folds>",
                        default=5,
                        help='The number of folds for cross-validation. ' +
                        'Default is 5.'
                        )

    parser.add_argument('-seed', type=int,
                        metavar='<random seed>',
                        default=None,
                        help='The seed to use for RNG. ' +
                        'Default is the current system time.'
                        )

    parser.add_argument('--with-dump', dest='with_dump', action='store_true',
                        help='Dump the algorithm ' +
                        'results in a file (one file per fold). ' +
                        'Default is False.'
                        )

    parser.add_argument('-dump-dir', dest='dump_dir', type=str,
                        metavar='<dir>',
                        default=None,
                        help='Where to dump the files. Ignored if ' +
                        'with-dump is not set. Default is ' +
                        os.path.join(get_dataset_dir(), 'dumps/')
                        )

    parser.add_argument('--clean', dest='clean', action='store_true',
                        help='Remove the ' + get_dataset_dir() +
                        ' directory and exit.'
                        )

    parser.add_argument('-v', '--version', action='version',
                        version=__version__)

    args = parser.parse_args()

    if args.clean:
        folder = get_dataset_dir()
        shutil.rmtree(folder)
        print('Removed', folder)
        exit()

    # setup RNG
    rd.seed(args.seed)
    np.random.seed(args.seed)

    # setup algorithm
    params = eval(args.params)
    if args.algo is None:
        parser.error('No algorithm was specified.')
    algo = algo_choices[args.algo](**params)

    # setup dataset
    if args.load_custom is not None:  # load custom and split
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        data = Dataset.load_from_file(args.load_custom, reader=reader)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    elif args.folds_files is not None:  # load from files
        if args.reader is None:
            parser.error('-reader parameter is needed.')
        reader = eval(args.reader)
        folds_files = args.folds_files.split()
        folds_files = [(folds_files[i], folds_files[i + 1])
                       for i in range(0, len(folds_files) - 1, 2)]
        data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
        cv = PredefinedKFold()

    else:  # load builtin dataset and split
        data = Dataset.load_builtin(args.load_builtin)
        cv = KFold(n_splits=args.n_folds, random_state=args.seed)

    cross_validate(algo, data, cv=cv, verbose=True)
Example #46
def test_NMF_parameters(u1_ml100k, pkf):
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = NMF(n_factors=1, n_epochs=1, random_state=1)
    rmse_default = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']

    # n_factors
    algo = NMF(n_factors=2, n_epochs=1, random_state=1)
    rmse_factors = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_factors

    # n_epochs
    algo = NMF(n_factors=1, n_epochs=2, random_state=1)
    rmse_n_epochs = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_n_epochs

    # biased
    algo = NMF(n_factors=1, n_epochs=1, biased=True, random_state=1)
    rmse_biased = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_biased

    # reg_pu
    algo = NMF(n_factors=1, n_epochs=1, reg_pu=1, random_state=1)
    rmse_reg_pu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_pu

    # reg_qi
    algo = NMF(n_factors=1, n_epochs=1, reg_qi=1, random_state=1)
    rmse_reg_qi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_qi

    # reg_bu
    algo = NMF(n_factors=1, n_epochs=1, reg_bu=1, biased=True, random_state=1)
    rmse_reg_bu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_bu

    # reg_bi
    algo = NMF(n_factors=1, n_epochs=1, reg_bi=1, biased=True, random_state=1)
    rmse_reg_bi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_reg_bi

    # lr_bu
    algo = NMF(n_factors=1, n_epochs=1, lr_bu=1, biased=True, random_state=1)
    rmse_lr_bu = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_bu

    # lr_bi
    algo = NMF(n_factors=1, n_epochs=1, lr_bi=1, biased=True, random_state=1)
    rmse_lr_bi = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_lr_bi

    # init_low
    algo = NMF(n_factors=1, n_epochs=1, init_low=.5, random_state=1)
    rmse_init_low = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_init_low

    # init_low
    with pytest.raises(ValueError):
        algo = NMF(n_factors=1, n_epochs=1, init_low=-1, random_state=1)

    # init_high
    algo = NMF(n_factors=1, n_epochs=1, init_high=.5, random_state=1)
    rmse_init_high = cross_validate(algo, u1_ml100k, ['rmse'], pkf)['test_rmse']
    assert rmse_default != rmse_init_high
Example #47
import numpy as np

from surprise import AlgoBase, Dataset
from surprise.model_selection import cross_validate


class MyOwnAlgorithm(AlgoBase):

    def __init__(self):

        # Always call base method before doing anything.
        AlgoBase.__init__(self)

    def estimate(self, u, i):

        sum_means = self.trainset.global_mean
        div = 1

        if self.trainset.knows_user(u):
            sum_means += np.mean([r for (_, r) in self.trainset.ur[u]])
            div += 1
        if self.trainset.knows_item(i):
            sum_means += np.mean([r for (_, r) in self.trainset.ir[i]])
            div += 1

        return sum_means / div


data = Dataset.load_builtin('ml-100k')
algo = MyOwnAlgorithm()

cross_validate(algo, data, verbose=True)
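As a short follow-up sketch, the custom algorithm can also be fit on the full trainset and queried for a single prediction (raw ml-100k ids are strings; the ids below are just examples):

trainset = data.build_full_trainset()
algo.fit(trainset)
pred = algo.predict('196', '302', r_ui=4, verbose=True)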