Example #1
0
    def __build_model(self):
        """Return the saved recommender, training and saving one if loading fails.

        The model file location is ``self.file_prefix`` immediately followed
        by ``self.model_path``.  When ``joblib.load`` raises, a
        ``KNNWithMeans`` model (``k=50``, ``pearson_baseline`` similarity,
        ``user_based`` disabled) is fitted on ``self.trainset``, dumped to
        that location, and scored on ``self.testset`` via ``accuracy.rmse``.

        Returns:
            The loaded model, or the freshly fitted algorithm.
        """
        location = f'{self.file_prefix}{self.model_path}'
        try:
            cached = joblib.load(location)
            print('recommender exists, load it')
            return cached
        except Exception:
            # NOTE(review): every load failure, not only a missing file, is
            # treated as "no saved model" and leads to a rebuild.
            print('recommender does not exist, build new recommender')

            similarity = {'name': 'pearson_baseline', 'user_based': False}
            recommender = KNNWithMeans(k=50, sim_options=similarity)

            # Fit first, persist second, so only a trained model is written.
            recommender.fit(self.trainset)
            joblib.dump(recommender, location)

            # Report the RMSE of the new model on the held-out ratings.
            accuracy.rmse(recommender.test(self.testset))

            return recommender
def svdpp(trainset, testset, predset):
    """Train an SVD++ model, print its RMSE and save its estimated ratings.

    Returns immediately when ``is_already_predicted('svdpp')`` is truthy.
    Otherwise the model is trained on ``trainset``, RMSE is printed for the
    training ratings and for ``testset``, and the estimates for ``testset``
    and ``predset`` are passed to ``save_predictions``.

    Args:
        trainset: Training data; must provide ``build_testset()``.
        testset: Held-out ratings used to compute the reported test RMSE.
        predset: Entries for which final estimates are produced.
    """
    model_key = 'svdpp'
    if is_already_predicted(model_key):
        # Predictions for this model are already stored.
        return

    baseline_cfg = {
        'method': 'als',
        'reg_i': 1.e-5,
        'reg_u': 14.6,
        'n_epochs': 10,
    }
    model = SVDpp(
        n_epochs=40,
        n_factors=100,
        bsl_options=baseline_cfg,
        lr_bu=0.01,
        lr_bi=0.01,
        lr_pu=0.1,
        lr_qi=0.1,
        lr_yj=0.01,
        reg_bu=0.05,
        reg_bi=0.05,
        reg_pu=0.09,
        reg_qi=0.1,
        reg_yj=0.01,
    )
    print('SVDpp Model')
    # NOTE(review): newer Surprise releases replaced ``train`` with ``fit`` —
    # confirm against the installed version.
    model.train(trainset)

    train_rmse = accuracy.rmse(model.test(trainset.build_testset()), verbose=False)
    print('   RMSE on Train: ', train_rmse)

    held_out = model.test(testset)
    test_rmse = accuracy.rmse(held_out, verbose=False)
    print('   RMSE on Test: ', test_rmse)
    held_out_estimates = np.array([p.est for p in held_out], dtype=float)
    save_predictions(model_key, test_rmse, held_out_estimates, 'test')

    print('   Evaluate predicted ratings...')
    final_estimates = np.array([p.est for p in model.test(predset)], dtype=float)
    # The test RMSE is stored alongside the final estimates as well.
    save_predictions(model_key, test_rmse, final_estimates)
    
def baseline(trainset, testset, predset):
    """Train a ``BaselineOnly`` model, print its RMSE and save its estimates.

    Returns immediately when ``is_already_predicted('baseline')`` is truthy.
    Otherwise the model is trained on ``trainset``, RMSE is printed for the
    training ratings and for ``testset``, and the estimates for ``testset``
    and ``predset`` are passed to ``save_predictions``.

    Args:
        trainset: Training data; must provide ``build_testset()``.
        testset: Held-out ratings used to compute the reported test RMSE.
        predset: Entries for which final estimates are produced.
    """
    model_key = 'baseline'
    if is_already_predicted(model_key):
        # Predictions for this model are already stored.
        return

    baseline_cfg = {
        'method': 'als',
        'reg_i': 1.e-5,
        'reg_u': 14.6,
        'n_epochs': 10,
    }
    model = BaselineOnly(bsl_options=baseline_cfg)
    print('Baseline Model')
    # NOTE(review): newer Surprise releases replaced ``train`` with ``fit`` —
    # confirm against the installed version.
    model.train(trainset)

    train_rmse = accuracy.rmse(model.test(trainset.build_testset()), verbose=False)
    print('   RMSE on Train: ', train_rmse)

    held_out = model.test(testset)
    test_rmse = accuracy.rmse(held_out, verbose=False)
    print('   RMSE on Test: ', test_rmse)
    held_out_estimates = np.array([p.est for p in held_out], dtype=float)
    save_predictions(model_key, test_rmse, held_out_estimates, 'test')

    print('   Evaluate predicted ratings...')
    final_estimates = np.array([p.est for p in model.test(predset)], dtype=float)
    # The test RMSE is stored alongside the final estimates as well.
    save_predictions(model_key, test_rmse, final_estimates)
def svd(trainset, testset, predset):
    """Train an SVD model, print its RMSE and save its estimated ratings.

    Returns immediately when ``is_already_predicted('svd')`` is truthy.
    Otherwise the model is trained on ``trainset``, RMSE is printed for the
    training ratings and for ``testset``, and the estimates for ``testset``
    and ``predset`` are passed to ``save_predictions``.

    Args:
        trainset: Training data; must provide ``build_testset()``.
        testset: Held-out ratings used to compute the reported test RMSE.
        predset: Entries for which final estimates are produced.
    """
    model_key = 'svd'
    if is_already_predicted(model_key):
        # Predictions for this model are already stored.
        return

    model = SVD(
        n_factors=100,
        n_epochs=40,
        lr_bu=0.01,
        lr_bi=0.01,
        lr_pu=0.1,
        lr_qi=0.1,
        reg_bu=0.05,
        reg_bi=0.05,
        reg_pu=0.09,
        reg_qi=0.1,
    )
    print('SVD Model')
    # NOTE(review): newer Surprise releases replaced ``train`` with ``fit`` —
    # confirm against the installed version.
    model.train(trainset)

    train_rmse = accuracy.rmse(model.test(trainset.build_testset()), verbose=False)
    print('   RMSE on Train: ', train_rmse)

    held_out = model.test(testset)
    test_rmse = accuracy.rmse(held_out, verbose=False)
    print('   RMSE on Test: ', test_rmse)
    held_out_estimates = np.array([p.est for p in held_out], dtype=float)
    save_predictions(model_key, test_rmse, held_out_estimates, 'test')

    print('   Evaluate predicted ratings...')
    final_estimates = np.array([p.est for p in model.test(predset)], dtype=float)
    # The test RMSE is stored alongside the final estimates as well.
    save_predictions(model_key, test_rmse, final_estimates)
Example #5
0
def collaborative_filter(id, new_words):
    """Compute top-3 KNN recommendations and write them to ``top_n.json``.

    Ratings are obtained from ``calc_collaborative_param(new_words, id)`` and
    loaded with a 0.0-5.0 rating scale.  A ``KNNBasic`` model is first
    evaluated with 3-fold cross-validation (RMSE printed per fold), then
    refitted on the full trainset before predicting every user/item pair
    that has no rating.

    Args:
        id: Forwarded as the second argument of ``calc_collaborative_param``.
        new_words: Forwarded as the first argument of
            ``calc_collaborative_param``.

    Returns:
        The value returned by ``get_top_n(predictions, n=3)``; the same
        object is serialized to ``top_n.json``.
    """
    ratings_dict = calc_collaborative_param(new_words, id)
    df = pd.DataFrame(ratings_dict)

    # Only the rating scale is needed from the reader when loading a frame.
    reader = Reader(rating_scale=(0.0, 5.0))
    # Column order must be user id, item id, rating.
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

    algo = KNNBasic()

    # Cross-validation is used for reporting only.
    kf = KFold(n_splits=3)
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        kf_predictions = algo.test(testset)
        accuracy.rmse(kf_predictions, verbose=True)

    # Refit on every rating: without this the final predictions would come
    # from a model that only saw the last cross-validation fold.
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # The anti-testset holds the user/item pairs absent from the trainset.
    new_data = trainset.build_anti_testset()
    predictions = algo.test(new_data)

    top_n = get_top_n(predictions, n=3)

    with open('top_n.json', 'w') as fp:
        dump(top_n, fp, indent=4)

    return top_n
def rodar_modelo(data, teste_tamanho, sim_opcoes, k):
    """Fit a ``KNNBasic`` model on a train/test split and report its RMSE.

    Args:
        data: Dataset handed to ``train_test_split``.
        teste_tamanho: Second positional argument of ``train_test_split``
            (presumably the test-set size — confirm against the import).
        sim_opcoes: Passed to ``KNNBasic`` as ``sim_options``.
        k: Passed to ``KNNBasic`` as ``k``.

    Returns:
        The fitted ``KNNBasic`` instance.
    """
    train_part, test_part = train_test_split(data, teste_tamanho)

    model = KNNBasic(k=k, sim_options=sim_opcoes)
    model.fit(train_part)

    # ``accuracy.rmse`` is called for its report; its value is not returned.
    accuracy.rmse(model.test(test_part))
    return model
def train_trim_nmf(data, R):
    """Cross-validate NMF on three trimmed test subsets for several sizes.

    For every ``n_factors`` in 2, 4, ..., 50 a fresh ``NMF`` model is fitted
    on each of 10 folds.  Each fold's test set is split by ``trim(testset, R)``
    into three subsets, and the RMSE of each subset is recorded.

    Args:
        data: Dataset split by ``KFold``.
        R: Second argument forwarded to ``trim``.

    Returns:
        A list of three lists; list ``i`` holds, per ``n_factors`` value, the
        mean RMSE over folds for the ``i``-th subset returned by ``trim``.
    """
    splitter = KFold(n_splits=10)
    averaged = [[], [], []]

    for n_factors in range(2, 52, 2):
        print("using k = %d" % n_factors)
        per_subset_scores = ([], [], [])
        model = NMF(n_factors=n_factors)

        for train_fold, test_fold in splitter.split(data):
            model.fit(train_fold)
            first_part, second_part, third_part = trim(test_fold, R)

            # Predict all three subsets before scoring any of them.
            fold_predictions = [
                model.test(part)
                for part in (first_part, second_part, third_part)
            ]
            for scores, predicted in zip(per_subset_scores, fold_predictions):
                scores.append(accuracy.rmse(predicted))

        for bucket, scores in zip(averaged, per_subset_scores):
            bucket.append(np.mean(scores))

    print("NMF with trim is finished!!")
    return averaged
    def evaluate_model(self, data, algo):
        """Fit ``algo`` on 90% of the ratings and report accuracy on both parts.

        The first 90% of ``data.raw_ratings`` (set A) becomes the training
        data; the remaining 10% (set B) is held out.  RMSE, MAE and the number
        of predictions are printed first for A (biased, the model has seen
        these ratings) and then for B (unbiased).

        Note:
            ``data.raw_ratings`` is replaced by set A and is not restored.

        Args:
            data: Dataset exposing ``raw_ratings``, ``build_full_trainset``
                and ``construct_testset``.
            algo: Algorithm exposing ``fit`` and ``test``.
        """
        def report(predictions):
            # Shared reporting for both evaluation passes.
            accuracy.rmse(predictions, verbose=True)
            accuracy.mae(predictions, verbose=True)
            print('len(predictions)')
            print(len(predictions))

        all_ratings = data.raw_ratings
        cut = int(.9 * len(all_ratings))
        part_a, part_b = all_ratings[:cut], all_ratings[cut:]

        # From here on the dataset only contains set A.
        data.raw_ratings = part_a
        trainset = data.build_full_trainset()
        algo.fit(trainset)

        biased = algo.test(trainset.build_testset())
        print('Biased accuracy on A,', end='   ')
        report(biased)

        unbiased = algo.test(data.construct_testset(part_b))
        print('Unbiased accuracy on B,', end=' ')
        report(unbiased)
def surpriseSVD(movieLensDataPath='data_clean.txt'):
    """Fit Surprise's SVD on a ratings file and return its factor matrices.

    The file is read as tab-separated text without a header; its columns are
    labelled user id, movie id and rating, and ratings use a 1-5 scale.  The
    goal is factors U and V such that the rating matrix Y is approximated by
    U^T V.  A separate SVD model is also evaluated with 3-fold
    cross-validation, printing the RMSE of each fold.

    Args:
        movieLensDataPath: Path to the ratings file.

    Returns:
        A tuple ``(pu, qi)`` taken from the model fitted on the full data.
    """
    # Loaded through pandas rather than Surprise's own file reader.
    ratings = pd.read_csv(movieLensDataPath, sep="\t", header=None)
    ratings.columns = ["User Id", "Movie Id", "Rating"]

    scale_reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        ratings[["User Id", "Movie Id", "Rating"]], scale_reader
    )

    # The returned factors come from a model trained on every rating.
    full_model = SVD()
    full_model.fit(data.build_full_trainset())
    user_factors = full_model.pu
    item_factors = full_model.qi

    # Cross-validation uses its own model, so the factors above are untouched.
    folds = KFold(n_splits=3)
    cv_model = SVD()
    for train_fold, test_fold in folds.split(data):
        cv_model.fit(train_fold)
        accuracy.rmse(cv_model.test(test_fold), verbose=True)

    return user_factors, item_factors
Example #10
0
def DisplayGraphDelta(data):
    """Show a histogram of the gap between actual and predicted ratings.

    A ``KNNWithMeans`` model is fitted on 75% of ``data`` and evaluated on the
    remaining 25%.  The RMSE is reported, every prediction is printed, and the
    differences ``r_ui - est`` are plotted in a 100-bin histogram.

    Args:
        data: Dataset handed to ``train_test_split``.
    """
    # 75% train / 25% test.
    train_part, test_part = train_test_split(data, test_size=.25)

    model = KNNWithMeans()
    model.fit(train_part)
    predictions = model.test(test_part)

    accuracy.rmse(predictions)

    deltas = []
    for item in predictions:
        print(item)
        # Actual rating minus estimated rating.
        deltas.append(item.r_ui - item.est)

    print(len(deltas))
    plt.hist(deltas, 100)
    plt.show()
Example #11
0
def grid():
    """Tune SVD++ by grid search on 90% of the ratings, then score both parts.

    Works on the module-level ``data`` object: its first 90% of raw ratings
    (set A) are used for a 3-fold grid search over ``n_epochs`` and
    ``lr_all``; the best estimator by RMSE is refitted on all of A and its
    RMSE is reported on A (biased) and on the held-out 10% (set B, unbiased).

    Note:
        ``data.raw_ratings`` is replaced by set A and is not restored.
    """
    ratings = data.raw_ratings
    cut = int(.9 * len(ratings))
    part_a = ratings[:cut]
    part_b = ratings[cut:]

    # The grid search must not see set B.
    data.raw_ratings = part_a
    search_space = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005]}
    search = GridSearchCV(SVDpp, search_space, measures=['rmse'], cv=3)
    search.fit(data)
    best = search.best_estimator['rmse']

    # Retrain the selected configuration on the whole of set A.
    trainset = data.build_full_trainset()
    best.fit(trainset)

    seen_predictions = best.test(trainset.build_testset())
    print('Biased accuracy on A,', end='   ')
    accuracy.rmse(seen_predictions)

    unseen_predictions = best.test(data.construct_testset(part_b))
    print('Unbiased accuracy on B,', end=' ')
    accuracy.rmse(unseen_predictions)
Example #12
0
def main():
    """Fit KNNWithMeans on ml-100k and plot the prediction-error histogram.

    The built-in ``ml-100k`` dataset is split 85% train / 15% test.  The RMSE
    on the test part is reported and the differences ``r_ui - est`` are shown
    in a 100-bin histogram.
    """
    dataset = Dataset.load_builtin('ml-100k')

    # 85% train / 15% test.
    train_part, test_part = train_test_split(dataset, test_size=.15)

    model = KNNWithMeans()
    model.fit(train_part)
    predictions = model.test(test_part)

    accuracy.rmse(predictions)

    # Actual rating minus estimated rating, one value per prediction.
    deltas = [item.r_ui - item.est for item in predictions]

    plt.hist(deltas, 100)
    plt.show()
Example #13
0
def slope_one(trainset, testset, predset):
    """Train a SlopeOne model, print its RMSE and save its estimated ratings.

    Returns immediately when ``is_already_predicted('slopeone')`` is truthy.
    Otherwise the model is trained on ``trainset``, RMSE is printed for the
    training ratings and for ``testset``, and the estimates for ``testset``
    and ``predset`` are passed to ``save_predictions``.

    Args:
        trainset: Training data; must provide ``build_testset()``.
        testset: Held-out ratings used to compute the reported test RMSE.
        predset: Entries for which final estimates are produced.
    """
    model_key = 'slopeone'
    if is_already_predicted(model_key):
        # Predictions for this model are already stored.
        return

    model = SlopeOne()
    print('SlopeOne Model')
    # NOTE(review): newer Surprise releases replaced ``train`` with ``fit`` —
    # confirm against the installed version.
    model.train(trainset)

    train_rmse = accuracy.rmse(model.test(trainset.build_testset()), verbose=False)
    print('   RMSE on Train: ', train_rmse)

    held_out = model.test(testset)
    test_rmse = accuracy.rmse(held_out, verbose=False)
    print('   RMSE on Test: ', test_rmse)
    held_out_estimates = np.array([p.est for p in held_out], dtype=float)
    save_predictions(model_key, test_rmse, held_out_estimates, 'test')

    print('  Evaluate predicted ratings...')
    final_estimates = np.array([p.est for p in model.test(predset)], dtype=float)
    # The test RMSE is stored alongside the final estimates as well.
    save_predictions(model_key, test_rmse, final_estimates)
def train_trim_knn(data, R):
    """Cross-validate KNNWithMeans on three trimmed test subsets per ``k``.

    For every ``k`` in 2, 4, ..., 100 a fresh ``KNNWithMeans`` model with
    Pearson similarity is fitted on each of 10 folds.  Each fold's test set is
    split by ``trim(testset, R)`` into three subsets, and the RMSE of each
    subset is recorded.

    Args:
        data: Dataset split by ``KFold``.
        R: Second argument forwarded to ``trim``.

    Returns:
        A list of three lists; list ``i`` holds, per ``k`` value, the mean
        RMSE over folds for the ``i``-th subset returned by ``trim``.
    """
    splitter = KFold(n_splits=10)
    similarity = {'name': 'pearson'}
    averaged = [[], [], []]

    for neighbours in range(2, 102, 2):
        print("using k = %d" % neighbours)
        per_subset_scores = ([], [], [])
        model = KNNWithMeans(k=neighbours, sim_options=similarity)

        for train_fold, test_fold in splitter.split(data):
            model.fit(train_fold)
            first_part, second_part, third_part = trim(test_fold, R)

            # Predict all three subsets before scoring any of them.
            fold_predictions = [
                model.test(part)
                for part in (first_part, second_part, third_part)
            ]
            for scores, predicted in zip(per_subset_scores, fold_predictions):
                scores.append(accuracy.rmse(predicted))

        for bucket, scores in zip(averaged, per_subset_scores):
            bucket.append(np.mean(scores))

    print("KNN with trim is finished!!")
    return averaged
Example #15
0
def main():
    """Fit KNNWithMeans on ml-100k and plot the prediction-error histogram.

    The built-in ``ml-100k`` dataset is split 85% train / 15% test.  The RMSE
    on the test part is reported and the differences ``r_ui - est`` are shown
    in a 100-bin histogram.
    """
    ratings_data = Dataset.load_builtin('ml-100k')

    # 85% train / 15% test.
    train_part, test_part = train_test_split(ratings_data, test_size=.15)

    model = KNNWithMeans()
    model.fit(train_part)
    predictions = model.test(test_part)

    accuracy.rmse(predictions)

    # Actual rating minus estimated rating, one value per prediction.
    errors = [item.r_ui - item.est for item in predictions]

    plt.hist(errors, 100)
    plt.show()
Example #16
0
def run_svd(dataset):
    """Fit SVD on a built-in dataset and compare raw and rounded estimates.

    The dataset is split with 33% of the ratings held out for testing.  The
    RMSE of the raw estimates is reported by ``accuracy.rmse``; the RMSE of
    the estimates rounded to the nearest integer is printed separately, and
    the actual, raw and rounded values are passed to ``utils.hist_plot``.

    Args:
        dataset: Name handed to ``Dataset.load_builtin``.
    """
    data = Dataset.load_builtin(dataset)

    # Random split; the test part holds 33% of the ratings.
    train_part, test_part = train_test_split(data, test_size=.33)

    model = SVD()
    model.fit(train_part)
    predictions = model.test(test_part)

    accuracy.rmse(predictions)

    # Index 2 of a test row and index 3 of a prediction are read here
    # (presumably the true rating and the estimate — confirm).
    actual = [row[2] for row in test_part]
    estimated = [item[3] for item in predictions]
    rounded = np.rint(estimated)

    residuals = np.array(rounded - np.array(actual))
    rmse_round = np.sqrt(np.mean(np.square(residuals)))
    print(f'rmse_round {rmse_round}')
    utils.hist_plot(actual, estimated, rounded)
Example #17
0
def baseline(trainset, testset):
    """Fit ``BaselineOnly`` and return its predictions for ``testset``.

    RMSE and MAE are reported through the ``accuracy`` helpers.

    Args:
        trainset: Training data passed to ``fit``.
        testset: Ratings passed to ``test``.

    Returns:
        The predictions returned by ``algo.test(testset)``.
    """
    model = BaselineOnly()
    model.fit(trainset)
    print("Predictions")
    results = model.test(testset)
    for metric in (accuracy.rmse, accuracy.mae):
        metric(results)
    return results
def fit_rmse(algo, data):
    """Fit ``algo`` on all of ``data`` and print RMSE on dev and training data.

    The dev score is computed on the ratings of the module-level ``dev_dat``
    dataset; the training score is computed on the ratings the model was
    fitted on.

    Args:
        algo: Algorithm exposing ``fit`` and ``test``.
        data: Dataset exposing ``build_full_trainset``.
    """
    full_trainset = data.build_full_trainset()
    algo.fit(full_trainset)

    dev_ratings = dev_dat.build_full_trainset().build_testset()
    dev_rmse = accuracy.rmse(algo.test(dev_ratings), verbose=True)

    train_predictions = algo.test(full_trainset.build_testset())
    tr_rmse = accuracy.rmse(train_predictions, verbose=True)

    print("rmse on dev_data: ", dev_rmse, "\n",
          "rmse on traning data: ", tr_rmse)
Example #19
0
def svdalgorithm(trainset, testset):
    """Fit ``SVD`` and return its predictions for ``testset``.

    RMSE and MAE are reported through the ``accuracy`` helpers.

    Args:
        trainset: Training data passed to ``fit``.
        testset: Ratings passed to ``test``.

    Returns:
        The predictions returned by ``algo.test(testset)``.
    """
    model = SVD()
    model.fit(trainset)
    print("Predictions")
    results = model.test(testset)
    for metric in (accuracy.rmse, accuracy.mae):
        metric(results)
    return results
def eval_model(model):
    """Run 3-fold cross-validation of ``model`` and print each fold's RMSE.

    The folds are drawn from the module-level ``data`` object.

    Args:
        model: Algorithm exposing ``fit`` and ``test``.
    """
    folds = KFold(n_splits=3)
    for train_fold, test_fold in folds.split(data):
        # Train on the fold, then predict its held-out part.
        model.fit(train_fold)
        fold_predictions = model.test(test_fold)
        # Compute and print the RMSE.
        accuracy.rmse(fold_predictions, verbose=True)
Example #21
0
def func2():
    """Fit SVD on a 75/25 split of ml-100k and report the test RMSE."""
    from surprise import SVD, Dataset, accuracy
    from surprise.model_selection import train_test_split

    ratings_data = Dataset.load_builtin('ml-100k')
    # A quarter of the ratings is held out for testing.
    train_part, test_part = train_test_split(ratings_data, test_size=.25)

    model = SVD()
    model.fit(train_part)
    accuracy.rmse(model.test(test_part))
    def cross_validation(self, data, algo):
        """Run 7-fold cross-validation of ``algo`` and print each fold's RMSE.

        The fold iterator is created with ``random_state=2``.

        Args:
            data: Dataset split by ``KFold``.
            algo: Algorithm exposing ``fit`` and ``test``.
        """
        folds = KFold(n_splits=7, random_state=2)

        for train_fold, test_fold in folds.split(data):
            algo.fit(train_fold)
            fold_predictions = algo.test(test_fold)
            # Compute and print the root mean squared error of this fold.
            accuracy.rmse(fold_predictions, verbose=True)
Example #23
0
def generate_svd_recommendation_df() -> pd.DataFrame:
    """Evaluate several Surprise algorithms on the score data.

    The dataset is built from ``genearte_score_df()`` via ``MyDataSet``.  SVD
    and then NMF are each cross-validated (5 folds, RMSE and MAE), fitted on
    the full trainset and used to predict the anti-testset, from which
    ``get_top_n(predictions, n=5)`` is computed.  Finally a range of
    algorithms is cross-validated with 5 folds and ``n_jobs=5``.

    NOTE(review): the annotation promises a DataFrame, but no ``return``
    statement is visible, so the function returns ``None`` — confirm intent.
    NOTE(review): the top-N result of SVD is overwritten by the NMF result and
    none of the cross-validation results are used — confirm intent.
    """
    svd_data = MyDataSet(genearte_score_df())

    def fit_and_rank(algo, report_mae):
        # Cross-validate, refit on everything, then rank the unseen pairs.
        full_train_set = svd_data.build_full_trainset()
        anti_test_set = full_train_set.build_anti_testset()
        cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
        algo.fit(full_train_set)
        predictions = algo.test(anti_test_set)
        accuracy.rmse(predictions)
        if report_mae:
            accuracy.mae(predictions)
        return get_top_n(predictions, n=5)

    # SVD pass: only the RMSE is reported.
    recommendation_df_svd = fit_and_rank(SVD(), report_mae=False)

    # NMF pass: an extra silent cross-validation runs first, then RMSE and MAE
    # are both reported.
    cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    recommendation_df_svd = fit_and_rank(NMF(), report_mae=True)

    # ---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    # Neighbourhood models, matrix factorization models, then the other
    # collaborative filtering algorithms, in that order.
    benchmarked = (
        KNNBasic,
        KNNWithMeans,
        KNNWithZScore,
        SVD,
        SVDpp,
        NMF,
        SlopeOne,
        CoClustering,
    )
    for algo_cls in benchmarked:
        cross_validate(algo_cls(), svd_data, cv=5, n_jobs=5, verbose=False)
Example #24
0
def evaluate_model(model: AlgoBase, test_set: [(int, int, float)]) -> dict:
    """Predict ``test_set`` with ``model`` and collect accuracy and timings.

    Args:
        model: Fitted algorithm exposing ``test``.
        test_set: Ratings to predict, passed unchanged to ``model.test``.

    Returns:
        A dict with the keys ``'RMSE'``, ``'MAE'``, ``'test_time'`` (seconds
        elapsed from just before prediction until both metrics are computed)
        and ``'fit_time'`` (the current value of the module-level
        ``fit_time``).
    """
    # ``fit_time`` is only read here; it is set elsewhere in the module.
    global fit_time
    starts = time.time()
    predictions = model.test(test_set)

    metrics_dict = {}
    metrics_dict['RMSE'] = accuracy.rmse(predictions, verbose=False)
    # The 'MAE' entry previously reused ``accuracy.rmse`` and so duplicated
    # the RMSE value.
    metrics_dict['MAE'] = accuracy.mae(predictions, verbose=False)
    metrics_dict['test_time'] = time.time() - starts
    metrics_dict['fit_time'] = fit_time
    return metrics_dict
Example #25
0
def train_helper(algo, savename, trainset_cv, testset_cv, save=False):
    """Fit ``algo``, print RMSE on two sets and optionally dump the results.

    The "dev set" figure is computed on the ratings of ``trainset_cv`` itself;
    the "test set" figure is computed on ``testset_cv``.

    Args:
        algo: Algorithm exposing ``fit`` and ``test``.
        savename: Label used in the printed lines and in the dump file names.
        trainset_cv: Training data; must provide ``build_testset()``.
        testset_cv: Held-out ratings.
        save: When truthy, both prediction sets are written with
            ``dump.dump`` under ``models/``.
    """
    algo.fit(trainset_cv)

    print(f"{savename} on dev set:", end=" ")
    dev_predictions = algo.test(trainset_cv.build_testset())
    rmse(dev_predictions, verbose=True)

    print(f"{savename} on test set:", end=" ")
    test_predictions = algo.test(testset_cv)
    rmse(test_predictions, verbose=True)

    if not save:
        return

    # One dump per prediction set, each bundled with the fitted algorithm.
    for suffix, stored in (("dev", dev_predictions), ("test", test_predictions)):
        dump.dump(f"models/dump_{savename}_{suffix}", stored, algo)
Example #26
0
def func6():
    """Run 3-fold cross-validation of SVD on ml-100k, printing each RMSE."""
    from surprise import SVD, Dataset, accuracy
    from surprise.model_selection import KFold

    ratings_data = Dataset.load_builtin('ml-100k')
    folds = KFold(n_splits=3)
    model = SVD()

    for train_fold, test_fold in folds.split(ratings_data):
        model.fit(train_fold)
        accuracy.rmse(model.test(test_fold), verbose=True)
    def prediction(self, data, algo):
        """Fit ``algo`` on all ratings and predict the anti-testset.

        RMSE and the number of predictions are printed.

        NOTE(review): the RMSE is computed on the anti-testset, whose pairs
        have no observed rating — confirm the printed figure is meaningful.

        Args:
            data: Dataset exposing ``build_full_trainset``.
            algo: Algorithm exposing ``fit`` and ``test``.

        Returns:
            The predictions for the anti-testset.
        """
        full_trainset = data.build_full_trainset()
        algo.fit(full_trainset)

        all_predictions = algo.test(full_trainset.build_anti_testset())

        print('Accuracy on whole data set,', end='   ')
        accuracy.rmse(all_predictions, verbose=True)
        print('len(predictions)')
        print(len(all_predictions))

        return all_predictions
Example #28
0
def test_rmse():
    """Check ``rmse`` on hand-computed cases and on an empty input."""

    # Every estimate equals its true rating: the error is zero.
    exact = [pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)]
    assert rmse(exact) == 0

    # One exact estimate and one that is off by two.
    one_miss = [pred(0, 0), pred(0, 2)]
    assert rmse(one_miss) == sqrt((0 - 2)**2 / 2)

    # Both estimates are off, by two and by one.
    two_misses = [pred(2, 0), pred(3, 4)]
    assert rmse(two_misses) == sqrt(((2 - 0)**2 + (3 - 4)**2) / 2)

    # An empty prediction list must be rejected.
    with pytest.raises(ValueError):
        rmse([])
Example #29
0
def svd_train_test_split():
    """Fit SVD on an 80/20 split of the custom dataset and report the RMSE.

    The dataset comes from ``custom_pandas_100k()``.
    """
    ratings_data = custom_pandas_100k()

    # 80% of the ratings for training, 20% for testing.
    train_part, test_part = train_test_split(ratings_data, test_size=.20)

    model = SVD()
    model.fit(train_part)

    # Predict the held-out ratings and report the RMSE.
    accuracy.rmse(model.test(test_part))
Example #30
0
def test_rmse():
    """Verify ``rmse`` against closed-form values and its empty-input error."""

    # Perfect predictions give an error of exactly zero.
    perfect = [pred(0, 0), pred(1, 1), pred(2, 2), pred(100, 100)]
    assert rmse(perfect) == 0

    # A single deviation of two across two predictions.
    single_deviation = [pred(0, 0), pred(0, 2)]
    assert rmse(single_deviation) == sqrt((0 - 2)**2 / 2)

    # Deviations of two and one across two predictions.
    double_deviation = [pred(2, 0), pred(3, 4)]
    assert rmse(double_deviation) == sqrt(((2 - 0)**2 + (3 - 4)**2) / 2)

    # No predictions at all is an error.
    with pytest.raises(ValueError):
        rmse([])
def knn_baseline_movie(train, test, ids, Xtest, Xids):
    """Fit a KNNBaseline model and collect its estimates for blending.

    The model uses ``k=100``, ALS baselines and ``pearson_baseline``
    similarity with ``user_based`` disabled.  Training and test RMSE are
    printed.

    Args:
        train: Trainset; must provide ``build_testset()``.
        test: Testset passed to ``algo.test``.
        ids: Indexable pair; ``ids[0][i]`` and ``ids[1][i]`` are converted to
            ``str`` and passed to ``algo.predict`` as its two positional
            arguments (the unknown ratings to estimate).
        Xtest: List to which the test-set estimates are appended.
        Xids: List to which the estimates for ``ids`` are appended.

    Returns:
        A tuple ``(rmse, Xtest, Xids, preds_test, preds_ids)`` where ``rmse``
        is the test RMSE, ``preds_test`` is a NumPy array of test-set
        estimates and ``preds_ids`` is a list of estimates for ``ids``.
    """
    print('kNN Baseline Movie')

    baseline_cfg = {'method': 'als', 'n_epochs': 100, 'reg_u': 15, 'reg_i': 0.01}
    similarity_cfg = {
        'name': 'pearson_baseline',
        'min_support': 1,
        'user_based': False,
    }
    model = KNNBaseline(
        k=100,
        bsl_options=baseline_cfg,
        sim_options=similarity_cfg,
        verbose=False,
    )
    model.fit(train)

    # RMSE on the ratings the model was trained on.
    train_rmse = accuracy.rmse(model.test(train.build_testset()), verbose=False)
    print('   Training RMSE: ', train_rmse)

    # RMSE on the held-out ratings.
    test_predictions = model.test(test)
    test_rmse = accuracy.rmse(test_predictions, verbose=False)
    print('   Test RMSE: ', test_rmse)

    test_estimates = np.array([p.est for p in test_predictions], dtype=float)

    # Estimate each unknown rating individually from its pair of raw ids.
    id_estimates = [
        model.predict(str(ids[0][i]), str(ids[1][i])).est
        for i in range(len(ids[0]))
    ]

    Xtest.append(test_estimates)
    Xids.append(id_estimates)
    return test_rmse, Xtest, Xids, test_estimates, id_estimates
Example #32
0
def test_sanity_checks(u1_ml100k, pkf):
    """Check that every algorithm still yields its recorded RMSE.

    Each algorithm is fitted on the first split produced by
    ``pkf.split(u1_ml100k)`` and its test RMSE must equal the stored value
    exactly.
    """
    # These four are constructed with ``random_state=0``; the others are
    # built with default arguments.
    seeded = (SVD, SVDpp, NMF, CoClustering)

    expected_rmse = {
        BaselineOnly: 1.0268524031297395,
        KNNBasic: 1.1337265249554591,
        KNNWithMeans: 1.1043129441881696,
        KNNBaseline: 1.0700718041752253,
        KNNWithZScore: 1.11179436167853,
        SVD: 1.0077323320656948,
        SVDpp: 1.00284553561452,
        NMF: 1.0865370266372372,
        SlopeOne: 1.1559939123891685,
        CoClustering: 1.0841941385276614,
    }

    for klass, expected in iteritems(expected_rmse):
        init_kwargs = {'random_state': 0} if klass in seeded else {}
        algo = klass(**init_kwargs)

        train_part, test_part = next(pkf.split(u1_ml100k))
        algo.fit(train_part)
        measured = accuracy.rmse(algo.test(test_part), verbose=False)
        assert measured == expected
Example #33
0
"""
This module describes how to use the train_test_split() function.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

# Load the built-in 'ml-100k' dataset.
data = Dataset.load_builtin('ml-100k')

# Split the ratings into a trainset and a testset;
# the testset holds 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

# The model evaluated in this example is SVD.
algo = SVD()

# Fit on the trainset, then predict the ratings of the testset.
algo.fit(trainset)
predictions = algo.test(testset)

# Compute the RMSE of those predictions.
accuracy.rmse(predictions)
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold


data = Dataset.load_builtin('ml-100k')

algo = SVD()

# Fit on every available rating, then score the model on those same ratings.
trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end='  ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)

    # Score the fold's own training ratings as well; without this call the
    # 'On trainset,' label is printed with no value after it.
    print('On trainset,', end=' ')
    predictions = algo.test(trainset_cv.build_testset())
    accuracy.rmse(predictions, verbose=True)
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# Folder holding the ml-100k files, under the user's home directory.
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# The reader is built from the 'ml-100k' name.
reader = Reader('ml-100k')

# folds_files pairs each training file with its test file:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [
    (train_file % fold, test_file % fold)
    for fold in range(1, 6)
]

# NOTE(review): whether ``load_from_folds`` accepts ``rating_scale`` depends
# on the Surprise version — confirm against the installed release.
data = Dataset.load_from_folds(folds_files, reader=reader, rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):
    # Fit on the fold's training file and predict its test file.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print the root mean squared error of this fold.
    accuracy.rmse(predictions, verbose=True)