def grid_search(setNum):
    """Grid-search the SVD ``n_factors`` hyper-parameter on one training split.

    Parameters
    ----------
    setNum : int or str
        Suffix of the split file ``../data/train_<setNum>.csv`` (';'-separated,
        with 'User-ID', 'ISBN' and 'Book-Rating' columns on a 0-10 scale).

    Prints the best cross-validated RMSE and the parameter combination that
    achieved it, and returns the fitted GridSearchCV (additive return value;
    the original returned None, so callers are unaffected).
    """
    reader = Reader(rating_scale=(0, 10))
    train = pd.read_csv('../data/train_' + str(setNum) + '.csv', sep=';')
    data = Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']],
                                reader=reader)
    param_grid = {'n_factors': [200, 400, 600, 800, 1000]}
    gs = GridSearchCV(SVD, param_grid)
    # Fix: GridSearchCV.fit() must receive the Dataset itself so the CV
    # splitters can re-split the raw ratings; the original passed
    # build_full_trainset(), which is a Trainset and is rejected by fit().
    gs.fit(data)
    ## best RMSE score
    print(gs.best_score['rmse'])
    ## combination of parameters that gave the best RMSE score
    print(gs.best_params['rmse'])
    return gs
# Build a surprise Dataset from the pre-processed dataframe (dfx, lower_bound
# and upper_bound are defined earlier in the file) and grid-search SVD++.
reader_x = Reader(rating_scale=(lower_bound, upper_bound))
data = Dataset.load_from_df(df=dfx[['CustomerID', 'StockCode', 'Norm_Tot_Amnt']],
                            reader=reader_x)
# Sanity check: the rating surprise stored should equal the source column
# (the difference should print as 0).
# Fix: converted Python-2 print statements to print() calls so this runs on
# Python 3, matching the print() calls already used below.
print('difference in processed and pre-processed dataset = ',
      (data.raw_ratings[0][2] - data.df['Norm_Tot_Amnt'][0]))

import time
start_time = time.time()

# Hyper-parameter grid for SVD++ (global learning-rate / regularization).
# Fix: the 'reg_all' list was missing its closing ']', a syntax error.
param_grid = {'n_factors': [5, 10, 50, 100],
              'n_epochs': [5, 10, 20, 50, 100],
              'lr_all': [0.1, 0.01, 0.001],
              'reg_all': [0.1, 0.01, 0.001]}
# n_jobs=1: SVD++ fits are memory-heavy, so avoid parallel workers.
grid_search = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'],
                           cv=3, n_jobs=1)
grid_search.fit(data)

print('best RMSE score')
print(grid_search.best_score['rmse'])
print('combination of parameters that gave the best RMSE score')
print(grid_search.best_params['rmse'])
print('best MAE score')
print(grid_search.best_score['mae'])
print('combination of parameters that gave the best MAE score')
print(grid_search.best_params['mae'])
# In[ ]:

# Time the whole hyper-parameter search.
start = default_timer()

# Hyper-parameter grid explored for SVD.
param_grid = dict(
    n_factors=[50, 200, 700],
    n_epochs=[50, 70, 150],
    lr_all=[0.003, 0.008, 0.01],
    reg_all=[0.0, 0.06, 0.08, 0.1],
)

grid = GridSearchCV(SVD, param_grid=param_grid, measures=["rmse"],
                    cv=kf, n_jobs=4)
grid.fit(data)

stop = default_timer()

# In[ ]:

# Wall-clock time of the whole search.
print("total time", stop - start)
# Best hyper-parameter combination found.
print("optimal parameters", grid.best_params)
# the associated root mean squared error
#%% Train knn with means # 0.590432 #0.144686 cv_method = RepeatedKFold(n_splits = 5, n_repeats = 5, random_state = 1) #algo = KNNBasic() param_grid = {'k' : [4, 8, 16, 32, 64], 'min_k' : [1, 2, 3, 4]} clf_knn = GridSearchCV(KNNBasic, param_grid, cv = cv_method) clf_knn.fit(data) results_df = pd.DataFrame.from_dict(clf_knn.cv_results) results_df.sort_values('mean_test_rmse', inplace = True) results_df.head(10) #%% Train SVD #0.587609 #0.134100 param_grid = {'n_factors' : np.arange(25, 150, 25), 'n_epochs' : np.arange(5, 20, 5),
# Baseline: unbiased SVD scored with 3-fold cross-validation (RMSE / MAE).
algo = SVD(biased=False)
res = cross_validate(algo, data, measures=['rmse', 'mae'], cv=3,
                     return_train_measures=False, verbose=False)
print(res)

# Grid search
# GridSearchCV
from surprise.model_selection.search import GridSearchCV

param_grid = {'n_epochs': [5, 10],
              'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6],
              'biased': [False]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

# We can now use the algorithm that yields the best rmse:
model = gs.best_estimator['rmse']
model.fit(x_train)
# Fix: AlgoBase.predict() scores a single (user, item) pair; producing the
# prediction list that get_top_n() consumes requires test() on the testset.
predictions = model.test(x_test)
top_n = get_top_n(predictions, n=10)
# Print the recommended items for each user
# Load the ratings dataframe into a surprise Dataset.
dataset = Dataset.load_from_df(df_ratings, reader)

"""## Split del dataset"""

# Hold out 20% of the ratings for testing; fixed seed for reproducibility.
trainset, testset = train_test_split(data=dataset, test_size=0.2,
                                     random_state=42)

"""## Model selection In questo notebook si vanno ad utilizzare dei metodi di *filtering collaborativo*. Questi metodi fanno uso delle interazioni tra utenti ed item: Queste interazioni possono essere prese da uno storico o possono derivare da dei feedback implici/espliciti. Viene utilizzato l'algortimo **SVD** che implemente la *matrice di fattorizzazione probabilistica*. ### **SVD** """

# Small SVD hyper-parameter grid, scored by RMSE and MAE over 5 folds.
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.004],
    "reg_all": [0.4, 0.5],
}
gridCV = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gridCV.fit(dataset)

# Per-fold / per-combination results of the search.
gridCV.cv_results

"""Si illustrino delle valutazioni *offline*, ovvero, l'utente non è direttamente coinvolto nella valutazione del sistema di raccomandazione."""

gridCV.best_score

# Keep the estimator that achieved the best cross-validated RMSE.
predictor = gridCV.best_estimator['rmse']
def part3():
    """Part 3: train user-based, item-based and matrix-factorization
    recommenders on the part-2 UIR file and write top-10 lists to disk.

    Relies on module-level names: ``team`` (int, used in the input filename),
    ``get_top_n`` (helper returning ``{id: [(other_id, score), ...]}``), and
    the surprise / Reader / Dataset / KFold / GridSearchCV imports.
    """
    file_path = 'DMA_project2_team%02d_part2_UIR.csv' % team
    reader = Reader(line_format='user item rating', sep=',',
                    rating_scale=(1, 10), skip_lines=1)
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()
    # Anti-testset: every (user, item) pair NOT rated in the trainset,
    # i.e. the candidate pool for recommendations.
    testset = trainset.build_anti_testset()

    # TODO: Requirement 3-2. User-based Recommendation
    uid_list = [
        'ffffbe8d854a4a5a8ab1a381224f5b80',
        'ffe2f26d5c174e13b565d026e1d8c503',
        'ffdccaff893246519b64d76c3561d8c7',
        'ffdb001850984ce69c5f91360ac16e9c',
        'ffca7b070c9d41e98eba01d23a920d52'
    ]
    # TODO - set algorithm for 3-2-1
    # User-based KNNBasic, cosine similarity, up to 40 neighbours.
    algo = surprise.KNNBasic(k=40, min_k=1, sim_options={
        'name': 'cosine',
        'user_based': True
    }, verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-2-1.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-2-2
    # User-based KNNWithMeans (mean-centred ratings), Pearson similarity.
    algo = surprise.KNNWithMeans(k=40, min_k=1, sim_options={
        'name': 'pearson',
        'user_based': True
    }, verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-2-2.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - 3-2-3. Best Model
    # 5-fold CV with a fixed seed; shared by both user-based grid searches.
    kfold = KFold(n_splits=5, random_state=0)
    parameters = {
        'k': [30, 40, 50],
        'min_k': [1],
        'sim_options': {
            'name': ['pearson', 'cosine'],
            'user_based': [True]
        }
    }
    # Select the best algo with grid search.
    print('Grid Search for user based model...')
    grid_KNNBasic = GridSearchCV(surprise.KNNBasic, measures=['rmse'],
                                 param_grid=parameters, cv=kfold)
    grid_KNNWithMeans = GridSearchCV(surprise.KNNWithMeans, measures=['rmse'],
                                     param_grid=parameters, cv=kfold)
    grid_KNNBasic.fit(data)
    grid_KNNWithMeans.fit(data)
    best_KNNBasic_score = grid_KNNBasic.best_score['rmse']
    best_KNNWithMeans_score = grid_KNNWithMeans.best_score['rmse']
    # Lower RMSE wins; on a tie KNNWithMeans is kept (else branch).
    if best_KNNBasic_score < best_KNNWithMeans_score:
        algo_name = 'KNNBasic'
        best_algo_ub = grid_KNNBasic.best_estimator['rmse']
        with_parameters = grid_KNNBasic.best_params['rmse']
        score = best_KNNBasic_score
    else:
        algo_name = 'KNNWithMeans'
        best_algo_ub = grid_KNNWithMeans.best_estimator['rmse']
        with_parameters = grid_KNNWithMeans.best_params['rmse']
        score = best_KNNWithMeans_score
    print('The best UB algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)

    # TODO: Requirement 3-3. Item-based Recommendation
    iid_list = ['art', 'teaching', 'career', 'college', 'medicine']
    # TODO - set algorithm for 3-3-1
    # Item-based KNNBasic, cosine similarity.
    algo = surprise.KNNBasic(k=40, min_k=1, sim_options={
        'name': 'cosine',
        'user_based': False
    }, verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, iid_list, n=10, user_based=False)
    with open('3-3-1.txt', 'w') as f:
        for iid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('Item ID %s top-10 results\n' % iid)
            for uid, score in ratings:
                f.write('User ID %s\tscore %s\n' % (uid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-3-2
    # Item-based KNNWithMeans, Pearson similarity.
    algo = surprise.KNNWithMeans(k=40, min_k=1, sim_options={
        'name': 'pearson',
        'user_based': False
    }, verbose=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, iid_list, n=10, user_based=False)
    with open('3-3-2.txt', 'w') as f:
        for iid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('Item ID %s top-10 results\n' % iid)
            for uid, score in ratings:
                f.write('User ID %s\tscore %s\n' % (uid, str(score)))
            f.write('\n')

    # TODO - 3-3-3. Best Model
    # Same grid as 3-2-3 but with item-based similarities.
    kfold = KFold(n_splits=5, random_state=0)
    parameters = {
        'k': [30, 40, 50],
        'min_k': [1],
        'sim_options': {
            'name': ['pearson', 'cosine'],
            'user_based': [False]
        }
    }
    # Select the best algo with grid search.
    print('Grid Search for item based model...')
    grid_KNNBasic = GridSearchCV(surprise.KNNBasic, measures=['rmse'],
                                 param_grid=parameters, cv=kfold)
    grid_KNNWithMeans = GridSearchCV(surprise.KNNWithMeans, measures=['rmse'],
                                     param_grid=parameters, cv=kfold)
    grid_KNNBasic.fit(data)
    grid_KNNWithMeans.fit(data)
    best_KNNBasic_score = grid_KNNBasic.best_score['rmse']
    best_KNNWithMeans_score = grid_KNNWithMeans.best_score['rmse']
    if best_KNNBasic_score < best_KNNWithMeans_score:
        algo_name = 'KNNBasic'
        # NOTE(review): this is the item-based winner but is stored in
        # 'best_algo_ub', apparently copy-pasted from 3-2-3 — left unchanged
        # in case later (unseen) code reads this name.
        best_algo_ub = grid_KNNBasic.best_estimator['rmse']
        with_parameters = grid_KNNBasic.best_params['rmse']
        score = best_KNNBasic_score
    else:
        algo_name = 'KNNWithMeans'
        best_algo_ub = grid_KNNWithMeans.best_estimator['rmse']
        with_parameters = grid_KNNWithMeans.best_params['rmse']
        score = best_KNNWithMeans_score
    print('The best IB algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)

    # TODO: Requirement 3-4. Matrix-factorization Recommendation
    # TODO - set algorithm for 3-4-1
    # Unbiased SVD, 100 factors, 50 epochs.
    algo = surprise.SVD(n_factors=100, n_epochs=50, biased=False)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-1.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-4-2
    # Biased SVD, 200 factors, 100 epochs.
    algo = surprise.SVD(n_factors=200, n_epochs=100, biased=True)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-2.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-4-3
    # SVD++, 100 factors, 50 epochs.
    algo = surprise.SVDpp(n_factors=100, n_epochs=50)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-3.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - set algorithm for 3-4-4
    # SVD++, 100 factors, 100 epochs.
    algo = surprise.SVDpp(n_factors=100, n_epochs=100)
    algo.fit(trainset)
    results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
    with open('3-4-4.txt', 'w') as f:
        for uid, ratings in sorted(results.items(), key=lambda x: x[0]):
            f.write('User ID %s top-10 results\n' % uid)
            for iid, score in ratings:
                f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
            f.write('\n')

    # TODO - 3-4-5. Best Model
    # Grid-search SVD and SVD++ separately, then compare their best RMSEs.
    kfold = KFold(n_splits=5, random_state=0)
    parameters_SVD = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200],
        'biased': [True, False]
    }
    grid_SVD = GridSearchCV(surprise.SVD, measures=['rmse'],
                            param_grid=parameters_SVD, cv=kfold)
    parameters_SVDpp = {
        'n_factors': [50, 100, 200],
        'n_epochs': [10, 50, 100, 200]
    }
    grid_SVDpp = GridSearchCV(surprise.SVDpp, measures=['rmse'],
                              param_grid=parameters_SVDpp, cv=kfold)
    grid_SVD.fit(data)
    grid_SVDpp.fit(data)
    best_SVD_score = grid_SVD.best_score['rmse']
    best_SVDpp_score = grid_SVDpp.best_score['rmse']
    # Lower RMSE wins; on a tie SVDpp is kept (else branch).
    if best_SVD_score < best_SVDpp_score:
        algo_name = 'SVD'
        best_algo_mf = grid_SVD.best_estimator['rmse']
        with_parameters = grid_SVD.best_params['rmse']
        score = best_SVD_score
    else:
        algo_name = 'SVDpp'
        best_algo_mf = grid_SVDpp.best_estimator['rmse']
        with_parameters = grid_SVDpp.best_params['rmse']
        score = best_SVDpp_score
    print('The best MF algorithm is', algo_name, 'with', with_parameters,
          '\nscore:', score)