def co_clustering():
    """Run the CoClustering algorithm over the chosen dataset (Android or
    WordPress) and report RMSE/MAE over 10-fold cross-validation."""
    print('Algoritmo CoClustering...')
    print('Que data desea utilizar')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding workaround so the file can be read without decode errors.
    # (Python 2 only; the second call wins, so latin-1 is the effective
    # default encoding.)
    reload(sys)
    sys.setdefaultencoding('utf8')
    sys.setdefaultencoding('latin-1')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
        # BUG FIX: this branch used to fall through to
        # `file_path_corregido`, which is only defined in the WordPress
        # branch, raising a NameError. Load the Android file directly.
        data = Dataset.load_from_file(file_path, reader=reader)
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        file_path_corregido = configuration.FILE_PATH_WORDPRESS_CORREGIDA
        # Rewrite the raw CSV into a corrected, '|'-separated copy first.
        util.corregir_csv(file_path, file_path_corregido, sep="|")
        reader = Reader(line_format='user item rating', sep='|')
        data = Dataset.load_from_file(file_path_corregido, reader=reader)

    data.split(n_folds=10)
    algo = CoClustering()
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
def surpriseTesting():
    """Smoke-test of the scikit-surprise library on MovieLens 100k."""
    # Download (if needed) and load the builtin movielens-100k dataset,
    # then split it into 3 folds for cross-validation.
    data = surprise.Dataset.load_builtin('ml-100k')
    full_trainset = data.build_full_trainset()
    data.split(n_folds=3)

    # Dump every (user, item, rating) triple, then the item count.
    for triple in data.build_full_trainset().all_ratings():
        print(triple)
    print(full_trainset.n_items)

    model = SVD()
    model.fit(full_trainset)

    # Evaluate performances of the model across the folds.
    results = surprise.evaluate(model, data, measures=['RMSE', 'MAE'])
    surprise.print_perf(results)

    user_id = str(196)  # raw user id (as in the ratings file) — a string!
    item_id = str(242)  # raw item id (as in the ratings file) — a string!

    # Get a prediction for this specific user/item pair.
    prediction = model.predict(user_id, item_id, r_ui=-1, verbose=True)
    print(prediction.est)
def ibcf_eval(co_pe): kfold = input("Enter number of folds required to Evaluate:") reader = Reader(line_format="user item rating", sep='\t', rating_scale=(1, 5)) df = Dataset.load_from_file('ml-100k/u.data', reader=reader) splitter(kfold, df) # SIMILARITY & ALGORITHM DEFINING sim_op = {'name': co_pe, 'user_based': False} algo = KNNBasic(sim_options=sim_op) # RESPONSIBLE TO EXECUTE DATA SPLITS MENTIONED IN STEP 4 start = time.time() perf = evaluate( algo, df, measures=['RMSE', 'MAE'], ) end = time.time() print_perf(perf) print "\nTotal Time elapsed =", (end - start) print "Average time per fold =", (end - start) / kfold, "\n" return perf
def batchrunSVDpp(data, al, folds): ''' define a function to run batches of data Args: data: data file name in string. al: algorithm name in string. folds: split the data into x folds for cross-validation, interger Returns: None ''' #load the data with given data format print "load data..." data = Dataset.load_from_file(path + data, reader=reader) #split the data into x folds for cross-validation. print "Split data...." data.split(n_folds=folds) # We'll use the famous SVDpp algorithm. if al == 'SVDpp': algo = SVDpp() elif al == 'Base': algo = BaselineOnly(bsl_options=bsl_options) # Evaluate performances of the algorithm on the dataset. perf = evaluate(algo, data, measures=['RMSE', 'MAE']) print_perf(perf)
def knn(data):
    """3-fold cross-validate a plain KNNBasic model and print RMSE/MAE."""
    data.split(n_folds=3)

    # The classic neighbourhood-based collaborative-filtering baseline.
    # (Renamed from `knn` so the local no longer shadows this function.)
    model = KNNBasic()

    results = evaluate(model, data, measures=['RMSE', 'MAE'])
    print_perf(results)
def svd_pp():
    """Run the SVD++ algorithm over the chosen dataset (Android or
    WordPress) and report RMSE/MAE over 10-fold cross-validation."""
    # BUG FIX: the banner used to say 'Algoritmo Baseline Only...' even
    # though this function runs SVD++.
    print('Algoritmo SVD++...')
    print('Que data desea utilizar?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding workaround so the file can be read without decode errors
    # (Python 2 only).
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=10)

    algo = SVDpp()
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
def ubcf_eval(co_pe):
    # User-based collaborative filtering: evaluate KNNBasic with the given
    # similarity measure name (`co_pe`), then derive confusion-matrix style
    # statistics from a previously written CSV file.
    kfold = input("Enter number of folds required to Evaluate:")
    reader = Reader(line_format="user item rating", sep='\t', rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)
    splitter(kfold,df)
    # SIMILARITY & ALGORITHM DEFINING
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)
    # RESPONSIBLE TO EXECUTE DATA SPLITS MENTIONED IN STEP 4
    start = time.time()
    perf = evaluate(algo, df, measures=['RMSE', 'MAE'], )
    end = time.time()
    print_perf(perf)
    print "\nTotal Time elapsed =", (end - start)
    print "Average time per fold =", (end - start)/kfold, "\n"
    print perf
    # NOTE(review): the CSV below is treated as a confusion matrix, but its
    # name suggests a full prediction matrix — confirm the file really
    # contains per-class counts before trusting the statistics below.
    ds = pd.read_csv("pred_matrix-full_ubcf.csv")
    confusion_matrix = np.matrix(ds)
    # Off-diagonal column sums = false positives; off-diagonal row sums =
    # false negatives; diagonal = true positives; remainder = true negatives.
    FP = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)
    FN = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
    TP = np.diag(confusion_matrix)
    TN = confusion_matrix.sum() - (FP + FN + TP)
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP / (TP + FN)
    # Specificity or true negative rate
    TNR = TN / (TN + FP)
    # Precision or positive predictive value
    PPV = TP / (TP + FP)
    # Negative predictive value
    NPV = TN / (TN + FN)
    # Fall out or false positive rate
    FPR = FP / (FP + TN)
    # False negative rate
    FNR = FN / (TP + FN)
    # False discovery rate
    FDR = FP / (TP + FP)
    # Overall accuracy
    ACC = (TP + TN) / (TP + FP + FN + TN)
    print "\nTrue Positive:\n", TP, "\n\nTrue Negative\n", TN, "\n\nFalse Positive\n", FP, "\n\nFalse Negative\n", FN
    print "-" * 30
    print "\nTrue Postive Ratio =", TPR, "\n\nFalse Positive Ratio =", FPR
    print "-" * 30
    print "*" * 20
    print confusion_matrix
    print "Accuracy with current Algorithm", algo, "is ", ACC.mean(axis=0)
def q7():
    """Evaluate NMF on the restaurant ratings file with 3-fold CV."""
    ratings_file = os.path.expanduser('restaurant_ratings.txt')
    fmt = Reader(line_format='user item rating timestamp', sep='\t')
    data = Dataset.load_from_file(ratings_file, reader=fmt)
    data.split(n_folds=3)

    model = NMF()
    results = evaluate(model, data, measures=['RMSE', 'MAE'])
    print_perf(results)
def IBCFpearson():
    """Item-based CF with Pearson similarity on the restaurant ratings
    file, 3-fold cross-validated, printing RMSE/MAE."""
    ratings_file = os.path.expanduser('restaurant_ratings.txt')
    fmt = Reader(line_format='user item rating timestamp', sep='\t')
    data = Dataset.load_from_file(ratings_file, reader=fmt)
    data.split(n_folds=3)

    model = KNNBasic(sim_options={'name': 'pearson', 'user_based': False})
    results = evaluate(model, data, measures=['RMSE', 'MAE'])
    print_perf(results)
def runSurprise(algo, train, test, algo_string, n_folds=5, writeCSV=False, file_name="result.csv"):
    """Run the fitting procedure on the training data. Write the result for
    the test data in its "Result" data field.

    Args:
        algo: Surprise algorithm (SVD, SVDpp, NMF, etc)
        train (Pandas DataFrame): training data
        test (Pandas DataFrame): test data
        algo_string (string): printable name of the algorithm
        n_folds (int): Number of k-folds
        writeCSV (bool): set to True to write a .csv submission
        file_name (string): name for the .csv file
    """
    if writeCSV:
        sub = datahelper.load_submission()

    df = pd.DataFrame(train)

    # A reader is needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(1, 5))

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(df[['User', 'Item', 'Rating']], reader)
    random.seed(42)
    data.split(n_folds=n_folds)

    # Evaluate performances of our algorithm on the dataset.
    perf = evaluate(algo, data, measures=['RMSE'])
    print_perf(perf)

    # Raw ids are 1-based while estimate() expects inner 0-based ids,
    # hence the `- 1`.
    for index, row in test.iterrows():
        test.at[index, "Result"] = algo.estimate(row['User'] - 1, row['Item'] - 1)

    if writeCSV:
        # BUG FIX: use a context manager so the handle is closed even if
        # estimate() raises; mode 'w' already truncates, so the explicit
        # truncate() call was redundant and has been dropped.
        with open(file_name, 'w') as file_out:
            file_out.write('Id,Prediction\n')
            for index, row in sub.iterrows():
                file_out.write("r{us}_c{mo},{res}\n".format(
                    us=row['User'],
                    mo=row['Item'],
                    res=algo.estimate(row['User'] - 1, row['Item'] - 1)))
def surprise_cross_validate(algo, data, *options):
    """
    3-Fold cross-validation on surprise recommendation model.

    Args:
        algo: instantiated recommender model
        data: surprise dataframe
        *options: additional parameter options to gridsearch on

    Returns:
        Mean RMSE of 3-Fold cross-validated model.
    """
    perf = evaluate(algo, data, measures=['RMSE'])
    print_perf(perf)
    # BUG FIX: the docstring promised the mean RMSE but the function
    # returned None; average the per-fold scores and return them.
    rmse_scores = perf['RMSE']
    return sum(rmse_scores) / float(len(rmse_scores))
def number15():
    """Sweep the neighbourhood size k (1, 11, ..., 101) for user- and
    item-based KNNBasic with MSD similarity, then bar-plot the mean RMSEs."""
    data.split(n_folds=3)

    k_values = []
    ubcf_rmses = []
    ibcf_rmses = []

    k = 1
    while k <= 101:
        # User-based run at this k.
        ubcf_model = KNNBasic(k=k, sim_options={
            'name': 'MSD',
            'user_based': True
        })
        ubcf_perf = evaluate(ubcf_model, data, measures=['RMSE'])
        print_perf(ubcf_perf)
        for measure, fold_scores in ubcf_perf.items():
            total = 0
            for score in fold_scores:
                total = total + score
            print(total)
            fold_mean = total / 3
            print(fold_mean)
            ubcf_rmses.append(fold_mean)

        # Item-based run at this k.
        ibcf_model = KNNBasic(k=k, sim_options={
            'name': 'MSD',
            'user_based': False
        })
        ibcf_perf = evaluate(ibcf_model, data, measures=['RMSE'])
        print_perf(ibcf_perf)
        for measure, fold_scores in ibcf_perf.items():
            total = 0
            for score in fold_scores:
                total = total + score
            print(total)
            fold_mean = total / 3
            print(fold_mean)
            ibcf_rmses.append(fold_mean)

        print(k)
        k_values.append(k)
        k += 10

    plt.bar(k_values, ubcf_rmses)
    plt.show()
    plt.bar(k_values, ibcf_rmses)
    plt.show()
def knn_baseline():
    """Run KNNBaseline over the chosen dataset (Android or WordPress) and
    report RMSE/MAE over 10-fold cross-validation."""
    print('Algoritmo KNN Baseline...')
    print('Que data desea utilizar?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding workaround so the file can be read without decode errors
    # (Python 2 only).
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=10)

    # KNN configuration:
    #   'k'       maximum number of neighbours used for aggregation
    #   'min_k'   minimum number of neighbours; if there are not enough,
    #             the prediction falls back to the global mean rating
    #   'sim_options'  similarity options used by the KNN
    #   'bsl_options'  baseline-estimate configuration
    k = 40
    min_k = 1
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': 0  # no shrinkage
    }
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

    # BUG FIX: `min_k=k` was being passed, silently discarding the
    # `min_k = 1` configured above; pass the actual `min_k` value.
    algo = KNNBaseline(k=k, min_k=min_k, sim_options=sim_options,
                       bsl_options=bsl_options)
    perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
    print_perf(perf)
def number12():
    """Cross-validate SVD, PMF, NMF and both KNN flavours on the
    module-level dataset (3 folds), printing RMSE/MAE for each."""
    data.split(n_folds=3)

    # Plain (biased) SVD.
    model = SVD()
    print_perf(evaluate(model, data, measures=['RMSE', 'MAE']))

    # PMF: SVD without the baseline terms.
    model = SVD(biased=False)
    print_perf(evaluate(model, data, measures=['RMSE', 'MAE']))

    # Non-negative matrix factorisation.
    model = NMF()
    print_perf(evaluate(model, data, measures=['RMSE', 'MAE']))

    # User-based collaborative filtering.
    model = KNNBasic(sim_options={'user_based': True})
    print_perf(evaluate(model, data, measures=['RMSE', 'MAE']))

    # Item-based collaborative filtering.
    model = KNNBasic(sim_options={'user_based': False})
    print_perf(evaluate(model, data, measures=['RMSE', 'MAE']))
# Load the restaurant ratings (tab-separated: user, item, rating, timestamp)
# and prepare a 3-fold split for cross-validation.
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data3Folds = Dataset.load_from_file(file_path, reader=reader)
data3Folds.split(n_folds=3)

#
# 3-Folds Comparison: time and score each algorithm over the same folds.
#
if threeFolds == True:
    print('SVD')
    algoSVD = SVD()
    start_time = time.time()
    perfSVD = evaluate(algoSVD,data3Folds,measures=['RMSE','MAE'])
    end_time = time.time()
    print_perf(perfSVD)
    print(end_time - start_time, '\n\n')

    # PMF (SVD without baselines)
    algoPMF = SVD(biased=False)
    start_time = time.time()
    perfPMF = evaluate(algoPMF,data3Folds,measures=['RMSE','MAE'])
    end_time = time.time()
    print_perf(perfPMF)
    print(end_time - start_time, '\n\n')

    print('NMF')
    algoNMF = NMF()
    start_time = time.time()
    # (the NMF evaluation continues past this chunk)
def evaluate(self):
    """Compute this object's metrics (with no argument) and print the
    resulting performance table."""
    performance = self.metrics(None)
    print_perf(performance)
# Attach browse counts to each (user, hotel) pair; a missing count means the
# pair was never browsed, so fill with 0.
data=data.merge(activity_count, on=['user', 'hotel'], how='left')
data['browse']=data.browse.fillna(0)
data=data[['user', 'hotel', 'browse']]

# tentatively CV test for some algorithms
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(data, reader)
# NOTE(review): `data_cv` is the same object as `data`, not a copy — the
# split() below therefore also affects `data`.
data_cv=data
data_cv.split(n_folds=5)

# SVD test
svd = SVD()
perf = evaluate(svd, data, measures=['RMSE'])
print_perf(perf)
# MSE 0.052
param_svd = {'n_factors': [50, 100], 'lr_all': [0.003, 0.005], 'reg_all': [0.05, 0.1, 0.5]}
gs = GridSearch(SVD, param_svd, measures=['RMSE'])
gs.evaluate(data_cv)
# RMSE 0.2272 ~ 0.2284, after many tests notice 0.2272 is a benchmark, 100, 0.003, 0.1

# Co-clustering test
coc=CoClustering()
perf = evaluate(coc, data, measures=['RMSE'])
print_perf(perf)
# MSE 0.053
param_svd = {'n_cltr_u': [3, 5, 7], 'n_cltr_i': [3, 5, 7], 'n_epochs': [10, 20]}
gs = GridSearch(CoClustering, param_svd, measures=['RMSE'])
gs.evaluate(data_cv)
# generally worse than SVD here, especially for larger cluster numbers
#file_path = os.path.expanduser('restaurant_ratings') reader = Reader(line_format='user item rating timestamp', sep='\t') data = Dataset.load_from_file('restaurant_ratings.txt', reader=reader) data.split(n_folds=3) #Starting dataframe to store needed values df = pd.DataFrame([], index=[0, 1, 2, 3, 4, 5, 6, 7], columns=[ 'Algorithm', 'RMSE Fold 1', 'RMSE Fold 2', 'RMSE Fold 3', 'RMSE Mean', 'MAE Fold 1', 'MAE Fold 2', 'MAE Fold 3', 'MAE Mean' ]) ''' #SVD algorithm algo = SVD() perf = evaluate(algo,data,measures=['RMSE','MAE']) print_perf(perf) setDF(perf,'SVD',0) print '\n' #PMF algorithm algo = SVD(biased=False) perf = evaluate(algo,data,measures=['RMSE','MAE']) print_perf(perf) setDF(perf,'PMF',1) print '\n' #NMF algorithm
import os # 指定文件所在路径 file_path = os.path.expanduser('Surprise.csv') # 告诉文本阅读器,文本的格式是怎么样的 reader = Reader(line_format='user item rating', sep=',') # 加载数据 data = Dataset.load_from_file(file_path, reader=reader) #data = Dataset.load_builtin('ml-100k') ### 使用NormalPredictor from surprise import NormalPredictor algo = NormalPredictor() perf = cross_validate(algo, data, measures=['RMSE', 'MAE', 'FCP'], cv=3) print_perf(perf) ### 使用BaselineOnly from surprise import BaselineOnly algo = BaselineOnly() perf = cross_validate(algo, data, measures=['RMSE', 'MAE', 'FCP'], cv=3) print_perf(perf) ### 使用基础版协同过滤 from surprise import KNNBasic, evaluate algo = KNNBasic() perf = cross_validate(algo, data, measures=['RMSE', 'MAE', 'FCP'], cv=3) print_perf(perf)
def surprise_algorithms_print_perf():
    """Evaluate the full set of Surprise algorithms on the chosen dataset
    (Android or WordPress) with 5-fold cross-validation and print a final
    results table."""
    print('Surprise Algorithms (Tabla de resultados finales)...')
    print('Que data desea utilizar?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding workaround so the file can be read without decode errors
    # (Python 2 only).
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=5)

    # NormalPredictor (random baseline)
    algo_normal_predictor = NormalPredictor()
    perf_normal_predictor = evaluate(algo_normal_predictor, data, measures=['RMSE', 'MAE'], verbose=False)

    # SVD
    algo_svd = SVD()
    perf_svd = evaluate(algo_svd, data, measures=['RMSE', 'MAE'], verbose=False)

    # BaselineOnly
    algo_baseline_only = BaselineOnly()
    perf_baseline_only = evaluate(algo_baseline_only, data, measures=['RMSE', 'MAE'], verbose=False)

    # SVDpp
    algo_svdpp = SVDpp()
    perf_svdpp = evaluate(algo_svdpp, data, measures=['RMSE', 'MAE'], verbose=False)

    # NMF
    algo_nmf = NMF()
    perf_nmf = evaluate(algo_nmf, data, measures=['RMSE', 'MAE'], verbose=False)

    # SlopeOne
    algo_slope_one = SlopeOne()
    perf_slope_one = evaluate(algo_slope_one, data, measures=['RMSE', 'MAE'], verbose=False)

    # CoClustering
    algo_coclustering = CoClustering()
    perf_coclustering = evaluate(algo_coclustering, data, measures=['RMSE', 'MAE'], verbose=False)

    # KNN configuration:
    #   'k'       maximum number of neighbours used for aggregation
    #   'min_k'   minimum number of neighbours; if there are not enough,
    #             the prediction falls back to the global mean rating
    #   'sim_options'  similarity options used by the KNN
    #   'bsl_options'  baseline-estimate configuration
    k = 40
    min_k = 1
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': 0  # no shrinkage
    }
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

    # BUG FIX: the three KNN variants were constructed with `min_k=k`,
    # silently discarding the `min_k = 1` configured above.
    algo_knn_basic = KNNBasic(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_basic = evaluate(algo_knn_basic, data, measures=['RMSE', 'MAE'], verbose=False)

    algo_knn_with_means = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_with_means = evaluate(algo_knn_with_means, data, measures=['RMSE', 'MAE'], verbose=False)

    algo_knn_base_line = KNNBaseline(k=k, min_k=min_k, sim_options=sim_options, bsl_options=bsl_options)
    perf_knn_base_line = evaluate(algo_knn_base_line, data, measures=['RMSE', 'MAE'], verbose=False)

    # Print the collected results.
    print('')
    print('Printing results from algorithms...')
    print('- Normal predictor')
    print_perf(perf_normal_predictor)
    print('')
    print('- Normal SVD')
    print_perf(perf_svd)
    print('')
    print('- Normal Baseline Only')
    print_perf(perf_baseline_only)
    print('')
    print('- Normal SVD++')
    print_perf(perf_svdpp)
    print('')
    print('- Normal NMF')
    print_perf(perf_nmf)
    print('')
    print('- Normal Slope One')
    print_perf(perf_slope_one)
    print('')
    print('- Normal Co-Clustering')
    print_perf(perf_coclustering)
    print('')
    print('- Normal KNN Basic')
    print_perf(perf_knn_basic)
    print('')
    print('- Normal KNN With Means')
    print_perf(perf_knn_with_means)
    print('')
    print('- Normal KNN Base Line')
    print_perf(perf_knn_base_line)
import pandas as pd
from surprise import prediction_algorithms as pa
from surprise import Dataset, Reader, GridSearch
from surprise import evaluate, print_perf
import datetime

# Load the small MovieLens ratings and drop the unused timestamp column.
data = pd.read_csv('./movielens_small/ratings.csv')
df = pd.DataFrame(data)
df.drop('timestamp', axis=1, inplace=True)
print df.head()

# Build a Surprise dataset (ratings on a 1-5 scale) with 5 CV folds.
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)
dataset.split(n_folds=5)
"""
#Sample Run
algo = pa.KNNBasic(k=10, min_k=5)
perf = evaluate(algo, dataset, measures=['MAE', 'RMSE', 'FCP'])
print_perf(perf)
"""
# Grid of similarity measures and neighbourhood sizes to search over.
# (The param_grid dict continues past this chunk.)
similarities = ['cosine', 'msd', 'pearson', 'pearson_baseline']
user_based = [True, False]
start_time = ('Timestamp: {:%Y-%b-%d %H:%M:%S}'.format(
    datetime.datetime.now()))
sim_options = {'name': similarities, 'user_based': user_based}
param_grid = {
    'k': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'min_k': [5],
    'sim_options': sim_options
from surprise import SVD
from surprise import Dataset, print_perf
from surprise.model_selection import cross_validate

# Load the builtin movielens dataset by default.
data = Dataset.load_builtin('ml-100k')
algo = SVD()
# Quick check of the algorithm's performance on the dataset.
perf = cross_validate(algo, data, measures=['RMSE'], cv=3)# RMSE (root mean squared error)
# Print the results.
print_perf(perf)
def eval(self):
    """Cross-validate this object's SVD model on its dataset and print
    the RMSE performance table."""
    results = evaluate(self.svd, self.data, measures=['RMSE'])
    print_perf(results)
from surprise import KNNBasic
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader
import os

# Load tab-separated ratings (user, item, rating, timestamp) from a file
# and prepare 3 folds for cross-validation.
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=3)

# User-based collaborative filtering with Pearson similarity.
algo = KNNBasic(sim_options={'name': 'pearson', 'user_based': True})
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)
def number14():
    """Compare MSD/cosine/pearson similarities for user- and item-based
    KNNBasic (3-fold CV), printing RMSE/MAE for each combination."""
    data.split(n_folds=3)

    # Same evaluation order as before: all three user-based runs first,
    # then all three item-based runs.
    for user_based in (True, False):
        for similarity in ('MSD', 'cosine', 'pearson'):
            model = KNNBasic(sim_options={
                'name': similarity,
                'user_based': user_based
            })
            results = evaluate(model, data, measures=['RMSE', 'MAE'])
            print_perf(results)