Example #1
0
def grid_search(setNum):
    """Grid-search SVD's n_factors on train_<setNum>.csv and print the best
    RMSE score and the parameter combination that produced it.

    Parameters
    ----------
    setNum : int or str
        Suffix of the training CSV under ../data/ (semicolon-separated,
        with 'User-ID', 'ISBN', 'Book-Rating' columns).
    """
    reader = Reader(rating_scale=(0, 10))
    train = pd.read_csv('../data/train_' + str(setNum) + '.csv', sep=';')
    # GridSearchCV performs its own cross-validation splits, so it must be
    # given the raw Dataset.  (The original passed the result of
    # build_full_trainset(), a Trainset, which GridSearchCV.fit cannot split.)
    data = Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']],
                                reader=reader)

    param_grid = {'n_factors': [200, 400, 600, 800, 1000]}
    gs = GridSearchCV(SVD, param_grid)
    gs.fit(data)

    ## best RMSE score
    print(gs.best_score['rmse'])

    ## combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])
Example #2
0
# Load the ratings dataframe into a surprise Dataset; the rating-scale bounds
# are defined upstream of this snippet.
reader_x = Reader(rating_scale=(lower_bound, upper_bound))
data = Dataset.load_from_df(df=dfx[['CustomerID', 'StockCode', 'Norm_Tot_Amnt']],
                            reader=reader_x)

# Sanity check: the first raw rating should equal the source column value
# (difference of 0 means the Dataset did not transform the ratings).
# Fixed: this was a Python 2 print statement in an otherwise Python 3 file.
print('difference in processed and pre-processed dataset = ',
      data.raw_ratings[0][2] - data.df['Norm_Tot_Amnt'][0])

import time
start_time = time.time()

# Grid-search SVD++ hyper-parameters with 3-fold CV on a single process.
# Fixed: the 'reg_all' list was missing its closing ']' (SyntaxError).
param_grid = {'n_factors': [5, 10, 50, 100],
              'n_epochs': [5, 10, 20, 50, 100],
              'lr_all': [0.1, 0.01, 0.001],
              'reg_all': [0.1, 0.01, 0.001]}
grid_search = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=1)

grid_search.fit(data)

# Remaining Python 2 print statements converted to print() calls as well.
print('best RMSE score')
print(grid_search.best_score['rmse'])

print('combination of parameters that gave the best RMSE score')
print(grid_search.best_params['rmse'])

print('best MAE score')
print(grid_search.best_score['mae'])

print('combination of parameters that gave the best MAE score')
print(grid_search.best_params['mae'])
Example #3
0
# In[ ]:

start = default_timer()

# Hyper-parameter grid explored by the search.
param_grid = {
    "n_factors": [50, 200, 700],
    "n_epochs": [50, 70, 150],
    "lr_all": [0.003, 0.008, 0.01],
    "reg_all": [0.0, 0.06, 0.08, 0.1],
}

# Grid-search SVD over the predefined folds `kf`, scored on RMSE only,
# running four search jobs in parallel.
grid = GridSearchCV(
    SVD,
    param_grid=param_grid,
    measures=["rmse"],
    cv=kf,
    n_jobs=4,
)
grid.fit(data)

stop = default_timer()

# In[ ]:

# the total time of execution
print("total time", stop - start)

# the optimal parameters found
print("optimal parameters", grid.best_params)

# the associated root mean squared error
Example #4
0
#%% Train knn with means

# Scores noted from a previous run (presumably RMSE / MAE — unverified):
# 0.590432 / 0.144686.
cv_method = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

# Neighbourhood sizes and minimum-neighbour counts to try.
param_grid = {
    'k': [4, 8, 16, 32, 64],
    'min_k': [1, 2, 3, 4],
}

clf_knn = GridSearchCV(KNNBasic, param_grid, cv=cv_method)
clf_knn.fit(data)

# Tabulate every CV combination and show the ten best (lowest mean RMSE).
results_df = pd.DataFrame.from_dict(clf_knn.cv_results)
results_df.sort_values('mean_test_rmse', inplace=True)
results_df.head(10)

#%% Train SVD

# Scores noted from a previous run (metric not stated here):

#0.587609
#0.134100

# SVD hyper-parameter grid using numpy ranges for factors and epochs.
# NOTE(review): this dict is cut off in this excerpt — the closing brace and
# any further keys lie outside the visible chunk; confirm against the full file.
param_grid = {'n_factors'  : np.arange(25, 150, 25),
              'n_epochs'   : np.arange(5, 20, 5),
Example #5
0
# Baseline: unbiased SVD evaluated with 3-fold cross-validation on RMSE/MAE.
algo = SVD(biased=False)
res = cross_validate(algo, data, measures=['rmse', 'mae'],
                     cv=3, return_train_measures=False,
                     verbose=False)
print(res)

# Grid search
# GridSearchCV
from surprise.model_selection.search import GridSearchCV

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6], 'biased': [False]}


gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# We can now use the algorithm that yields the best rmse:
model = gs.best_estimator['rmse']
model.fit(x_train)

# Fixed: AlgoBase.predict(uid, iid) scores a single user/item pair; a whole
# testset is scored with test(), which returns the list of Prediction
# objects that get_top_n() expects.
predictions = model.test(x_test)
top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
Example #6
0
# Build a surprise Dataset from the ratings dataframe (reader defined upstream).
dataset = Dataset.load_from_df(df_ratings, reader)

"""## Split del dataset"""

# Hold out 20% of the ratings with a fixed seed.  NOTE(review): this split is
# not used by the GridSearchCV below, which runs its own internal CV on
# `dataset`; trainset/testset are presumably consumed later in the notebook.
trainset, testset = train_test_split(data=dataset, test_size=0.2, random_state=42)

"""## Model selection

In questo notebook si vanno ad utilizzare dei metodi di *filtering collaborativo*. Questi metodi fanno uso delle interazioni tra utenti ed item: Queste interazioni possono essere prese da uno storico o possono derivare da dei feedback implici/espliciti.

Viene utilizzato l'algortimo **SVD** che implemente la *matrice di fattorizzazione probabilistica*.

### **SVD**
"""

# SVD hyper-parameter grid: epoch count, learning rate and regularisation
# applied to all parameters.
param_grid = {
   "n_epochs": [5, 10],
   "lr_all": [0.002, 0.004],
   "reg_all": [0.4, 0.5] 
}

# 5-fold grid search scored on both RMSE and MAE.
gridCV = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gridCV.fit(dataset)

# Per-combination CV results (bare expression: notebook cell display).
gridCV.cv_results

"""Si illustrino delle valutazioni *offline*, ovvero, l'utente non è direttamente coinvolto nella valutazione del sistema di raccomandazione."""

# Best score per measure (notebook cell display).
gridCV.best_score

# Unfitted estimator configured with the best-RMSE parameter combination.
predictor = gridCV.best_estimator['rmse']
Example #7
0
def _write_top_n(path, results, key_label, entry_label):
    """Write per-ID top-10 recommendation lists to *path*.

    *results* maps an outer ID to a list of (inner ID, score) pairs.
    *key_label* / *entry_label* are the header words ('User ID' / 'Item ID'
    or vice versa), so the same writer serves both the user-based and the
    item-based output files with byte-identical formatting.
    """
    with open(path, 'w') as f:
        for outer_id, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('%s %s top-10 results\n' % (key_label, outer_id))
            for inner_id, score in ratings:
                f.write('%s %s\tscore %s\n' % (entry_label, inner_id, str(score)))
            f.write('\n')


def _grid_search_best(data, candidates, kfold):
    """Grid-search every candidate on RMSE and return the overall winner.

    *candidates* is an iterable of (name, algo_class, param_grid) triples.
    Returns (name, best_estimator, best_params, best_score).  Ties go to the
    later candidate, matching the original strict-less-than comparison.
    """
    best = None
    for name, algo_class, parameters in candidates:
        grid = GridSearchCV(algo_class,
                            measures=['rmse'],
                            param_grid=parameters,
                            cv=kfold)
        grid.fit(data)
        score = grid.best_score['rmse']
        # <= so a tie is won by the later candidate (original else-branch).
        if best is None or score <= best[3]:
            best = (name, grid.best_estimator['rmse'],
                    grid.best_params['rmse'], score)
    return best


def part3():
    """Requirements 3-2/3-3/3-4: user-based, item-based and matrix-
    factorization recommendation, each followed by grid-search model
    selection on RMSE.  Writes 3-2-*.txt, 3-3-*.txt, 3-4-*.txt files and
    prints the winning algorithm of each family.
    """
    file_path = 'DMA_project2_team%02d_part2_UIR.csv' % team
    reader = Reader(line_format='user item rating',
                    sep=',',
                    rating_scale=(1, 10),
                    skip_lines=1)
    data = Dataset.load_from_file(file_path, reader=reader)

    trainset = data.build_full_trainset()
    # Anti-testset: every (user, item) pair NOT rated in the training data.
    testset = trainset.build_anti_testset()

    # Requirement 3-2. User-based Recommendation
    uid_list = [
        'ffffbe8d854a4a5a8ab1a381224f5b80', 'ffe2f26d5c174e13b565d026e1d8c503',
        'ffdccaff893246519b64d76c3561d8c7', 'ffdb001850984ce69c5f91360ac16e9c',
        'ffca7b070c9d41e98eba01d23a920d52'
    ]
    # 3-2-1: user-based KNNBasic, cosine similarity.
    algo = surprise.KNNBasic(k=40,
                             min_k=1,
                             sim_options={
                                 'name': 'cosine',
                                 'user_based': True
                             },
                             verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    _write_top_n('3-2-1.txt', results, 'User ID', 'Item ID')

    # 3-2-2: user-based KNNWithMeans, Pearson similarity.
    algo = surprise.KNNWithMeans(k=40,
                                 min_k=1,
                                 sim_options={
                                     'name': 'pearson',
                                     'user_based': True
                                 },
                                 verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    _write_top_n('3-2-2.txt', results, 'User ID', 'Item ID')

    # 3-2-3. Best user-based model via grid search.
    kfold = KFold(n_splits=5, random_state=0)
    parameters = {
        'k': [30, 40, 50],
        'min_k': [1],
        'sim_options': {
            'name': ['pearson', 'cosine'],
            'user_based': [True]
        }
    }
    print('Grid Search for user based model...')
    algo_name, best_algo_ub, with_parameters, score = _grid_search_best(
        data,
        [('KNNBasic', surprise.KNNBasic, parameters),
         ('KNNWithMeans', surprise.KNNWithMeans, parameters)],
        kfold)
    print('The best UB algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)

    # Requirement 3-3. Item-based Recommendation
    iid_list = ['art', 'teaching', 'career', 'college', 'medicine']
    # 3-3-1: item-based KNNBasic, cosine similarity.
    algo = surprise.KNNBasic(k=40,
                             min_k=1,
                             sim_options={
                                 'name': 'cosine',
                                 'user_based': False
                             },
                             verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, iid_list, n=10, user_based=False)
    _write_top_n('3-3-1.txt', results, 'Item ID', 'User ID')

    # 3-3-2: item-based KNNWithMeans, Pearson similarity.
    algo = surprise.KNNWithMeans(k=40,
                                 min_k=1,
                                 sim_options={
                                     'name': 'pearson',
                                     'user_based': False
                                 },
                                 verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, iid_list, n=10, user_based=False)
    _write_top_n('3-3-2.txt', results, 'Item ID', 'User ID')

    # 3-3-3. Best item-based model via grid search.
    kfold = KFold(n_splits=5, random_state=0)
    parameters = {
        'k': [30, 40, 50],
        'min_k': [1],
        'sim_options': {
            'name': ['pearson', 'cosine'],
            'user_based': [False]
        }
    }
    print('Grid Search for item based model...')
    algo_name, best_algo_ub, with_parameters, score = _grid_search_best(
        data,
        [('KNNBasic', surprise.KNNBasic, parameters),
         ('KNNWithMeans', surprise.KNNWithMeans, parameters)],
        kfold)
    print('The best IB algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)

    # Requirement 3-4. Matrix-factorization Recommendation (3-4-1 .. 3-4-4):
    # four fixed configurations, each fitted on the full trainset and used
    # to produce user-based top-10 files.
    mf_configs = [
        ('3-4-1.txt', surprise.SVD(n_factors=100, n_epochs=50, biased=False)),
        ('3-4-2.txt', surprise.SVD(n_factors=200, n_epochs=100, biased=True)),
        ('3-4-3.txt', surprise.SVDpp(n_factors=100, n_epochs=50)),
        ('3-4-4.txt', surprise.SVDpp(n_factors=100, n_epochs=100)),
    ]
    for out_path, algo in mf_configs:
        algo.fit(trainset)
        results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
        _write_top_n(out_path, results, 'User ID', 'Item ID')

    # 3-4-5. Best matrix-factorization model via grid search.
    kfold = KFold(n_splits=5, random_state=0)
    parameters_SVD = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200],
        'biased': [True, False]
    }
    parameters_SVDpp = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200]
    }
    algo_name, best_algo_mf, with_parameters, score = _grid_search_best(
        data,
        [('SVD', surprise.SVD, parameters_SVD),
         ('SVDpp', surprise.SVDpp, parameters_SVDpp)],
        kfold)
    print('The best MF algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)