def randomize():
    """Return one randomly chosen ``(label, algorithm)`` pair.

    The candidate pool holds every combination of four KNN families with four
    item-based similarity settings, followed by six non-KNN algorithms.

    Returns:
        A tuple ``(label, algorithm)`` where ``label`` is a display name and
        ``algorithm`` is a freshly constructed algorithm instance.
    """
    # The same four option dicts are shared by every KNN family, as before.
    similarities = [
        ('Cosine', {'name': 'cosine', 'user_based': False}),
        ('MSD', {'name': 'msd', 'user_based': False}),
        ('Pearson', {'name': 'pearson', 'user_based': False}),
        ('Pearson B', {
            'name': 'pearson_baseline',
            'user_based': False,
            'shrinkage': 0,
        }),
    ]
    knn_families = [
        ('kNN Basic', KNNBasic),
        ('kNN Means', KNNWithMeans),
        ('kNN Z', KNNWithZScore),
        ('kNN Baseline', KNNBaseline),
    ]

    algorithms = [
        ('{} - {}'.format(family, sim_label),
         knn_class(sim_options=sim_options, verbose=False))
        for family, knn_class in knn_families
        for sim_label, sim_options in similarities
    ]
    algorithms += [
        ('SVD', SVD(verbose=False)),
        ('SVDpp', SVDpp(verbose=False)),
        ('Baseline Only', BaselineOnly(verbose=False)),
        ('CoClustering', CoClustering(verbose=False)),
        ('SlopeOne', SlopeOne()),
        ('NMF', NMF(verbose=False)),
    ]

    # random.randint(0, len(algorithms)) includes its upper bound and could
    # index one past the end of the list; random.choice always picks a
    # valid element.
    return random.choice(algorithms)
def get_algo(algo_id):
    """Select the algorithm from the second command-line parameter.

    Args:
        algo_id: numeric selector; ``2`` requests the item-based model,
            every other value the user-based one.

    Returns:
        A ``KNNWithZScore`` instance configured as item- or user-based.
    """
    # Surprise reads ``user_based`` from the ``sim_options`` dict; given as a
    # bare keyword argument it is not applied and the model stays user-based.
    if algo_id == 2:
        # KNN with Z-score, item-based
        return KNNWithZScore(sim_options={'user_based': False})

    # KNN with Z-score, user-based (also used for algo_id == 3).
    # NOTE(review): the original comment for id 3 said "SVD, user-based" but
    # the code has always built KNNWithZScore here -- confirm the intent.
    return KNNWithZScore(sim_options={'user_based': True})
def generate_knn(self, rating_data):
    """Build the KNN algorithms to evaluate, untuned and tuned.

    The untuned and tuned algorithms are kept apart because tuning can take
    a very long time; that way the tuning part is easy to comment out.

    Args:
        rating_data: the main data set

    Return:
        a dictionary of algorithms; key: name of algo, val: algo object
    """
    untuned = (
        ('bcKNN', KNNBasic),
        ('wmKNN', KNNWithMeans),
        ('wzKNN', KNNWithZScore),
        ('blKNN', KNNBaseline),
    )
    # Every untuned model gets its own user-based cosine options dict.
    algo = {
        key: knn_class(sim_options={'name': 'cosine', 'user_based': True})
        for key, knn_class in untuned
    }

    # tune param for knnBaseline, since it has best accuracy
    k_candidates = {'k': [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]}
    tuned_params = self.tune_and_find_parameter('blKNN', KNNBaseline,
                                                rating_data, k_candidates)
    algo['blKNN_tuned'] = KNNBaseline(k=tuned_params['k'])
    return algo
def CFZ(self):
    """Fit a user-based cosine KNNWithZScore model and score known ratings.

    For every user id in ``self.list`` the rows of ``self.data`` belonging to
    that user are predicted with the fitted model. Results are stored on the
    instance:

    * ``self.df_est``: DataFrame with columns ``uid``, ``Iid``, ``r_ui``
      and ``est``
    * ``self.arr``: the unique ``uid`` values of ``self.df_est``
    * ``self.CFWZ_ndcg_``: whatever ``self.Calculate_NDCG()`` returns
    """
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)
    algo.fit(self.trainset)

    # Plain lists are converted once at the end; growing a numpy array with
    # np.append inside the loop copies the whole array on every prediction.
    u_id = []
    I_id = []
    true_ratings = []
    estimates = []

    for uid in self.list:
        user_rows = self.data[self.data.uid == uid]
        # NOTE(review): the original loop (range(1, len(rows)) with slices
        # [i - 1:i]) never reached a user's last row; that is preserved here
        # via [:-1] -- confirm that skipping it is intended.
        rated = zip(user_rows.lid.values[:-1], user_rows.rate.values[:-1])
        for lid, r_ui in rated:
            pred = algo.predict(uid, lid, r_ui, verbose=True)
            u_id.append(int(pred.uid))
            I_id.append(int(pred.iid))
            true_ratings.append(pred.r_ui)
            estimates.append(pred.est)

    self.df_est = pd.DataFrame({
        'uid': u_id,
        'Iid': I_id,
        # float dtype matches the arrays np.append used to build
        'r_ui': np.array(true_ratings, dtype=float),
        'est': np.array(estimates, dtype=float),
    })
    self.arr = self.df_est['uid'].unique()
    self.CFWZ_ndcg_ = self.Calculate_NDCG()
def crossvalidate(data):
    """Cross-validate a fixed set of algorithms and rank them by RMSE.

    Each algorithm is run through 5-fold ``cross_validate`` with the RMSE
    measure; the per-fold results are averaged into one row per algorithm.

    Args:
        data: the dataset handed to ``cross_validate``.

    Returns:
        A DataFrame indexed by algorithm class name, sorted by ascending
        ``test_rmse``.
    """
    algorithms = [
        NormalPredictor(),
        KNNBaseline(k=15, sim_options=similarity_measure('pearson', 1)),
        KNNBasic(k=15, sim_options=similarity_measure('pearson', 1)),
        KNNWithMeans(k=15, sim_options=similarity_measure('pearson', 1)),
        KNNWithZScore(k=15, sim_options=similarity_measure('pearson', 1)),
        BaselineOnly(),
        SVD(),
        SVDpp(),
        NMF(),
        SlopeOne(),
        CoClustering(),
    ]

    results = []
    for algorithm in algorithms:
        result = cross_validate(algorithm, data, measures=['RMSE'], cv=5,
                                verbose=False)
        mean_scores = pd.DataFrame.from_dict(result).mean(axis=0)
        # type(...).__name__ yields the same label the original parsed out of
        # str(algorithm), without depending on the repr format.
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        # Series.append was removed in pandas 2.0; pd.concat is the
        # replacement and also works on older versions.
        results.append(pd.concat([mean_scores, label]))

    return pd.DataFrame(results).set_index('Algorithm').sort_values(
        'test_rmse')
def generate_knn(self, rating_data):
    """Return the KNN algorithms to compare, keyed by a short name.

    Four user-based cosine models are created with default neighbourhood
    size, plus one KNNBaseline whose ``k`` comes from
    ``self.tune_and_find_parameter``.

    Args:
        rating_data: data set forwarded to the tuning helper.

    Returns:
        dict mapping algorithm name to algorithm object.
    """
    def cosine_user_based():
        # A fresh dict per model, so no two models share an options object.
        return {'name': 'cosine', 'user_based': True}

    algo = {}
    for short_name, knn_class in (('bcKNN', KNNBasic),
                                  ('wmKNN', KNNWithMeans),
                                  ('wzKNN', KNNWithZScore),
                                  ('blKNN', KNNBaseline)):
        algo[short_name] = knn_class(sim_options=cosine_user_based())

    # tune param for knnBaseline, since it has best accuracy
    grid = {'k': [10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100]}
    best = self.tune_and_find_parameter('blKNN', KNNBaseline, rating_data,
                                        grid)
    algo['blKNN_tuned'] = KNNBaseline(k=best['k'])
    return algo
def compAlgos(data):
    """Print mean RMSE and MAE for three KNN and two SVD algorithms.

    Every algorithm is evaluated with 5-fold ``cross_validate`` using default
    settings. All evaluations run first; the report lines are printed
    afterwards in the same order. The two closing statements are fixed text
    and are not derived from the measured scores.

    Args:
        data: the dataset handed to ``cross_validate``.
    """
    print("\nLet us compare performance of KNN and SVD algorithms\n")

    # (report template, algorithm class); KNN family first, then SVD family.
    candidates = (
        ('\nKNN Basic: RMSE: {}, MAE: {}', KNNBasic),
        ('\nKNN Means: RMSE: {}, MAE: {}', KNNWithMeans),
        ('\nKNN Z Score: RMSE: {}, MAE: {}', KNNWithZScore),
        ('\nSVD: RMSE: {}, MAE: {}', SVD),
        ('\nSVD ++: RMSE: {}, MAE: {}', SVDpp),
    )

    scored = [
        (template,
         cross_validate(algo_class(), data, cv=5, n_jobs=5, verbose=False))
        for template, algo_class in candidates
    ]

    for template, scores in scored:
        print(template.format(scores['test_rmse'].mean(),
                              scores['test_mae'].mean()))

    print('\nBoth SVDs perform better on the dataset\n')
    print(
        '\nWe will test with KNN means from KNN family and SVDPP from svd family\n'
    )
def cal_KNNWithZScore(trainset, df):
    """Fit KNNWithZScore and write per-row predictions and errors to CSV.

    Args:
        trainset: training set the model is fitted on.
        df: DataFrame with ``user``, ``store`` and ``stars`` columns; every
            row is predicted.

    Side effects:
        Prints each prediction (``verbose=True``) and a final ``"end"``, then
        writes a frame with columns ``user``, ``item``, ``r_ui``, ``est`` and
        ``err`` to the module-level path ``save_file2``.
    """
    # The key is 'user_based'; the original spelling 'user-based' is not a
    # key Surprise looks up.
    sim_options = {'name': 'cosine', 'user_based': True}
    algo_knnz = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)
    algo_knnz.fit(trainset)

    users = []
    items = []
    real = []
    estimate = []
    for uid, iid, r_ui in zip(df.user.values, df.store.values,
                              df.stars.values):
        # Predict with the model fitted above; the original referenced an
        # undefined (or unrelated global) name ``algo`` here.
        pred = algo_knnz.predict(uid, iid, r_ui, verbose=True)
        users.append(uid)
        items.append(iid)
        real.append(r_ui)
        estimate.append(pred.est)
    print("end")

    df5 = pd.DataFrame({
        'user': users,
        'item': items,
        'r_ui': real,
        'est': estimate,
    })
    df5['err'] = (df5.est - df5.r_ui).abs()
    df5.to_csv(save_file2)
def EvaluateDifferentAlgorithms():
    """Cross-validate eleven algorithms and print them ranked by RMSE.

    Each algorithm is evaluated with 3-fold ``cross_validate`` on the
    module-level dataset ``data_6months``. The printed table has one row per
    algorithm class name, sorted by ascending ``test_rmse``.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering(),
    ]:
        # Perform cross validation
        results = cross_validate(algorithm, data_6months, measures=['RMSE'],
                                 cv=3, verbose=False)

        # Average the folds and attach the algorithm name. Series.append was
        # removed in pandas 2.0, so the label is joined with pd.concat.
        mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        benchmark.append(pd.concat([mean_scores, label]))

    print(
        pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
            'test_rmse'))
def benchmark(data):
    """Cross-validate all algorithms and store the ranking as a CSV.

    Each algorithm is evaluated with 3-fold ``cross_validate`` on RMSE, MAE
    and FCP. The averaged results, indexed by algorithm class name and sorted
    by ascending ``test_rmse``, are passed to
    ``store_dataframe(..., 'Algorithm_Benchmark.csv')``.

    Args:
        data: the dataset handed to ``cross_validate``.
    """
    algorithms = [
        SVD(),
        SVDpp(),
        SlopeOne(),
        NMF(),
        NormalPredictor(),
        KNNBaseline(),
        KNNBasic(),
        KNNWithMeans(),
        KNNWithZScore(),
        BaselineOnly(),
        CoClustering(),
        SVD_SGD_momentum(),
        SVDpp_SGD_momentum(),
    ]

    performance = []
    for algorithm in algorithms:
        results = cross_validate(algorithm, data,
                                 measures=['RMSE', 'MAE', 'FCP'], cv=3,
                                 verbose=False)
        mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        # Series.append was removed in pandas 2.0; pd.concat replaces it.
        performance.append(pd.concat([mean_scores, label]))

    output_df = pd.DataFrame(performance).set_index('Algorithm').sort_values(
        'test_rmse')
    store_dataframe(output_df, 'Algorithm_Benchmark.csv')
def get_model(model_name):
    """Build an algorithm from an underscore-separated model name.

    KNN names have the form ``<Family>[_<U|I>[_<similarity>[_<k>]]]``: the
    second field selects item-based when it is ``'I'`` (user-based
    otherwise), the similarity defaults to ``'msd'`` and ``k`` to 20.
    Matrix-factorisation names have the form ``<SVDpp|SVD|NMF>[_<n_factors>]``
    with ``n_factors`` defaulting to 25.

    Args:
        model_name: the encoded model name.

    Returns:
        The constructed algorithm, or ``None`` when the name is not
        recognised.
    """
    if 'KNN' in model_name:
        fields = model_name.split('_')
        family = fields[0]
        item_based = len(fields) > 1 and fields[1] == 'I'
        similarity = fields[2] if len(fields) >= 3 else 'msd'
        k = int(fields[3]) if len(fields) >= 4 else 20
        sim_options = {'user_based': not item_based, 'name': similarity}

        if family == 'KNNBasic':
            return KNNBasic(sim_options=sim_options, k=k)
        if family == 'KNNWithMeans':
            return KNNWithMeans(sim_options=sim_options, k=k)
        if family == 'KNNWithZScore':
            return KNNWithZScore(sim_options=sim_options, k=k)
        return None

    if any(tag in model_name for tag in ('SVDpp', 'SVD', 'NMF')):
        fields = model_name.split('_')
        n_factors = 25 if len(fields) == 1 else int(fields[1])
        family = fields[0]

        if family == 'SVDpp':
            return SVDpp(n_factors=n_factors)
        if family == 'SVD':
            return SVD(n_factors=n_factors)
        if family == 'NMF':
            return NMF(n_factors=n_factors)

    return None
def check_for_args():
    """Append one algorithm per recognised command-line argument.

    Every entry of ``sys.argv`` that equals a known algorithm name adds a
    default-constructed instance of that algorithm to the module-level
    ``alg_list``; other entries are ignored.

    Returns:
        The (mutated) module-level ``alg_list``.
    """
    algorithms_by_name = {
        'SVD': SVD,
        'SVDpp': SVDpp,
        'SlopeOne': SlopeOne,
        'NMF': NMF,
        'NormalPredictor': NormalPredictor,
        'KNNBaseline': KNNBaseline,
        'KNNBasic': KNNBasic,
        'KNNWithMeans': KNNWithMeans,
        'KNNWithZScore': KNNWithZScore,
        'BaselineOnly': BaselineOnly,
        'CoClustering': CoClustering,
    }

    for arg in sys.argv:
        algo_class = algorithms_by_name.get(arg)
        if algo_class is not None:
            alg_list.append(algo_class())
    return alg_list
def knn_z(data, training, testing):
    '''
    Tune KNN with Z-score parameters, then compute the RMSE and top-n
    predictions of the tuned model

    Args:
        data(Dataset): the whole dataset divided into 5 folds
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of KNN with Z-score with optimized parameters
        top_n: result of get_top_n(predictions, n=5)
    '''

    # candidate parameters
    knn_param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }

    # optimize parameters
    knnz_grid_search = GridSearch(KNNWithZScore, knn_param_grid,
                                  measures=['RMSE'], verbose=False)
    knnz_grid_search.evaluate(data)
    param = knnz_grid_search.best_params['RMSE']
    print('KNNWithZScore:', param)

    # fit model using the optimized parameters. The similarity settings must
    # be passed as the ``sim_options`` dict: given as separate ``name`` /
    # ``min_support`` / ``user_based`` keywords they are not applied and the
    # model falls back to the default similarity configuration.
    knnz = KNNWithZScore(k=param['k'],
                         sim_options=dict(param['sim_options']))
    knnz.train(training)

    # evaluate the model using test data
    predictions = knnz.test(testing)
    rmse = accuracy.rmse(predictions, verbose=True)
    top_n = get_top_n(predictions, n=5)

    return rmse, top_n
def run_baselines(ratings_dict, compressed_test_ratings_dict, data_origin):
    """Run every baseline algorithm in ``algos`` and print its metrics.

    For each name in the module-level ``algos`` a default-configured
    algorithm is built and handed to ``testing`` together with the rating
    dictionaries and ``data_origin``; the returned metrics are printed.

    Args:
        ratings_dict: training ratings, forwarded to ``testing``.
        compressed_test_ratings_dict: test ratings, forwarded to ``testing``.
        data_origin: one of ``'netflix'``, ``'small'`` or ``'100k'``.

    Raises:
        ValueError: if ``data_origin`` is not one of the supported values, or
            if ``algos`` contains a name with no matching algorithm.
    """
    # Previously an unsupported origin skipped the call to ``testing`` and
    # the prints then failed on unbound (or stale) result variables.
    if data_origin not in ('netflix', 'small', '100k'):
        raise ValueError("Unknown data origin: %r" % (data_origin,))

    baseline_classes = {
        "KNNBasic": KNNBasic,
        "KNNWithZScore": KNNWithZScore,
        "SVD": SVD,
        "NMF": NMF,
        "SlopeOne": SlopeOne,
        "CoClustering": CoClustering,
    }

    for alg in algos:
        # An unknown name used to silently reuse the previous iteration's
        # algorithm (or fail with an unbound variable on the first one).
        if alg not in baseline_classes:
            raise ValueError("Unknown algorithm name: %r" % (alg,))
        algo = baseline_classes[alg]()

        nr_predictions, accuracy, rmse, mae, precision, recall, f1 = testing(
            algo, ratings_dict, compressed_test_ratings_dict, data_origin)

        # print results
        print("\n\nAlg %s" % alg)
        print("Number of user-items pairs: %d" % nr_predictions)
        print("Accuracy: %.2f " % accuracy)
        print("RMSE: %.2f" % rmse)
        print("MAE: %.2f" % mae)
        print("Precision: %.2f" % precision)
        print("Recall: %.2f" % recall)
        print("F1: %.2f" % f1)
def to_test(k, option, model):
    """Fit an item-based KNN model on ``training_set.dat``.

    Both ``training_set.dat`` and ``test_set.dat`` are read and converted to
    Surprise sets; only the training set is used for fitting. Nothing is
    returned, and an unrecognised ``model`` fits nothing.

    Args:
        k: number of neighbours for the KNN model.
        option: similarity name placed in ``sim_options['name']``.
        model: one of ``'Basic'``, ``'WithMeans'``, ``'WithZScore'`` or
            ``'Baseline'``.
    """
    train_frame = pd.read_csv('training_set.dat')
    test_frame = pd.read_csv('test_set.dat')
    reader = Reader(rating_scale=(1, 5))

    trainingSet = Dataset.load_from_df(train_frame,
                                       reader).build_full_trainset()
    # Built like the original even though it is not used afterwards.
    testSet = (Dataset.load_from_df(test_frame, reader)
               .build_full_trainset()
               .build_testset())

    opt = {'name': option, 'user_based': False}

    knn_by_model = {
        'Basic': KNNBasic,
        'WithMeans': KNNWithMeans,
        'WithZScore': KNNWithZScore,
        'Baseline': KNNBaseline,
    }
    knn_class = knn_by_model.get(model)
    if knn_class is None:
        return

    algo = knn_class(k=k, sim_options=opt)
    algo.fit(trainingSet)
    # (dumping the fitted model to disk was disabled in the original)
def _hyperopt(self, params):
    """Return the mean cross-validated score for one parameter set.

    Args:
        params: keyword arguments used to construct ``KNNWithZScore``.

    Returns:
        The mean over folds of the ``self._metric`` entry of the
        ``cross_validate`` result.
    """
    model = KNNWithZScore(**params)
    cv_scores = cross_validate(
        model,
        self._data,
        measures=ACCURACY_METRICS,
        cv=self._cv,
        n_jobs=self._cv_n_jobs,
        verbose=self._debug,
    )
    per_fold = cv_scores[self._metric]
    return per_fold.mean()
def get_model(model_name, sim_options):
    """Construct the KNN model named by ``model_name``.

    Args:
        model_name: one of ``'KNNBasic'``, ``'KNNWithMeans'``,
            ``'KNNWithZScore'`` or ``'KNNBaseline'``.
        sim_options: similarity configuration passed to the model.

    Returns:
        The constructed model, created with ``verbose=False``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError on return).
    """
    if model_name == 'KNNBasic':
        model = KNNBasic(sim_options=sim_options, verbose=False)
    elif model_name == 'KNNWithMeans':
        model = KNNWithMeans(sim_options=sim_options, verbose=False)
    elif model_name == 'KNNWithZScore':
        model = KNNWithZScore(sim_options=sim_options, verbose=False)
    elif model_name == 'KNNBaseline':
        model = KNNBaseline(sim_options=sim_options, verbose=False)
    else:
        raise ValueError('Unknown model name: {!r}'.format(model_name))
    return model
def get_model_old(model_name):
    """Build an algorithm from a legacy model name.

    Exact names are matched first (KNN variants with ``k=20`` and default
    matrix-factorisation models); after that, names containing a
    ``<Family>_`` prefix take a numeric parameter from the name itself.

    Args:
        model_name: the legacy model name.

    Returns:
        The constructed algorithm, or ``None`` when nothing matches.
    """
    # --- exact names: KNN families, "_U" = user-based, "_I" = item-based ---
    if model_name in ('KNNBasic_U', 'KNNBasic_I'):
        return KNNBasic(
            sim_options={'user_based': model_name == 'KNNBasic_U'}, k=20)
    if model_name in ('KNNWithMeans_I', 'KNNWithMeans_U'):
        return KNNWithMeans(
            sim_options={'user_based': model_name == 'KNNWithMeans_U'}, k=20)
    if model_name in ('KNNWithZScore_I', 'KNNWithZScore_U'):
        return KNNWithZScore(
            sim_options={'user_based': model_name == 'KNNWithZScore_U'},
            k=20)

    # --- exact names: matrix factorisation with default settings ---
    if model_name == 'SVDpp':
        return SVDpp()
    if model_name == 'SVD':
        return SVD()
    if model_name == 'NMF':
        return NMF()

    # --- "<Family>_<n_factors>"; SVDpp_ is tested before SVD_ ---
    if 'NMF_' in model_name:
        return NMF(n_factors=int(model_name.split("_")[1]))
    if 'SVDpp_' in model_name:
        return SVDpp(n_factors=int(model_name.split("_")[1]))
    if 'SVD_' in model_name:
        return SVD(n_factors=int(model_name.split("_")[1]))

    # --- "KNNBasic_<U|I>_<k>": the last field is the neighbourhood size ---
    if 'KNNBasic_U_' in model_name:
        return KNNBasic(sim_options={'user_based': True},
                        k=int(model_name.split("_")[-1]))
    if 'KNNBasic_I_' in model_name:
        return KNNBasic(sim_options={'user_based': False},
                        k=int(model_name.split("_")[-1]))

    return None
def generate_svd_recommendation_df() -> pd.DataFrame:
    """Evaluate several algorithms on the score data and build top-5 lists.

    The visible body (1) builds the dataset from ``genearte_score_df()``,
    (2) cross-validates, fits and tests SVD and then NMF on the full train
    set / anti-test set, calling ``get_top_n(predictions, n=5)`` after each,
    and (3) cross-validates a series of KNN, matrix-factorisation and other
    algorithms with default settings.

    NOTE(review): the signature promises a DataFrame, but no ``return``
    statement is visible in this block, and the cross-validation results are
    only assigned to locals -- the definition may continue beyond what is
    shown here; confirm against the full file.
    """
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)

    # Try SVD
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    # Anti-test set of the full train set is what gets predicted below
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)

    # Try the NMF
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the NMF model (``algo`` is NMF at this point)
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame; this overwrites the SVD result
    # stored under the same name above
    recommendation_df_svd = get_top_n(predictions, n=5)
    #print (recommendation_df)

    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    # KNN based algorithms
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(),svd_data, cv=5, n_jobs=5, verbose=False)
    # Rebinds ``nmf_cv``, which was already computed once above
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Other Collaborative Filtering Algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)
def knnz_running_time(data):
    '''
    Calculates the running times for training and predictions for KNN with
    Z-score

    Parameters are tuned once by grid search on ``data[3]`` and then reused
    for every dataset in ``data``.

    Args:
        data(Dataset): a list of datasets with different numbers of users

    Returns:
        elapsedtime_KnnZtrain: running time for training
        elapsedtime_KnnZtest: running time for predictions on testset
    '''
    elapsedtime_KnnZtrain = []
    elapsedtime_KnnZtest = []

    # tune the parameters on the entire data
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearch(KNNWithZScore, param_grid, measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']
    k = param['k']
    tuned_sim_options = param['sim_options']

    # using the tuned parameters calculate running times
    for dataset in data:
        # training running time (as before, the timed span also covers
        # building the train set and the anti-test set)
        training_start = time.time()
        training = dataset.build_full_trainset()
        testing = training.build_anti_testset()
        # The similarity settings must go in as the ``sim_options`` dict;
        # passed as separate ``name`` / ``min_support`` / ``user_based``
        # keywords they are not applied and defaults are used instead.
        knnz = KNNWithZScore(k=k, sim_options=dict(tuned_sim_options))
        knnz.train(training)
        elapsedtime_KnnZtrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knnz.test(testing)
        elapsedtime_KnnZtest.append(time.time() - test_start)

    return elapsedtime_KnnZtrain, elapsedtime_KnnZtest
def EvaluateAllModels(self):
    """Cross-validate eleven algorithms on ``self.data`` and rank them.

    Example of a previously recorded result::

                         test_rmse   fit_time  test_time
        Algorithm
        SVDpp             0.965824   9.401286   0.151476
        SVD               0.967286   1.474139   0.062471
        BaselineOnly      0.972408   0.108964   0.057277
        NMF               0.992677   4.073005   0.171846
        KNNWithZScore     1.001898   0.620192   0.083341
        KNNWithMeans      1.002924   0.489803   0.078121
        SlopeOne          1.006664  19.091191   1.275676
        KNNBaseline       1.007437   0.890452   0.088495
        KNNBasic          1.016717   0.432159   0.072929
        NormalPredictor   1.253265   0.041646   0.078105
        CoClustering      1.828291   3.020921   0.052071

    :return: DataFrame indexed by algorithm name and sorted by ascending
        ``test_rmse``; the algorithm with the lowest test_rmse is the one
        to pick.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NMF(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering(),
    ]:
        # Perform cross validation
        results = cross_validate(algorithm, self.data, measures=['RMSE'],
                                 cv=3, verbose=False)

        # Average the folds and attach the algorithm name. Series.append was
        # removed in pandas 2.0, so the label is joined with pd.concat.
        mean_scores = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        benchmark.append(pd.concat([mean_scores, label]))

    result = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
    print(result)
    return result
def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
    """Train one algorithm on the stored ratings and return its test RMSE.

    Ratings are read from the ``ratings`` table of ``DATABASE_NAME``, split
    80/20 into train and test sets, and the selected algorithm is fitted and
    evaluated.

    Args:
        method: algorithm selector -- 1 SVD, 2 SlopeOne, 3 NMF,
            4 NormalPredictor, 5 KNNBaseline, 6 KNNBasic, 7 KNNWithMeans,
            8 KNNWithZScore, 9 BaselineOnly, anything else CoClustering.
        similarityMeasure: 1 cosine, 2 pearson, anything else
            pearson_baseline; only used by the KNN methods (5-8).
        isUserBased: ``"Yes"`` for user-based similarities, any other value
            for item-based.

    Returns:
        The RMSE on the test split, rounded to 4 decimals.
    """
    conn = sqlite3.connect(DATABASE_NAME)
    try:
        df = pd.read_sql_query(
            "SELECT userID, glassID, relativeRating FROM ratings", conn)
    finally:
        # The frame is fully loaded at this point; closing here also releases
        # the connection when the query or any later step raises.
        conn.close()

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        df[['userID', 'glassID', 'relativeRating']], reader)
    trainset, testset = train_test_split(data, test_size=.20)

    similarity_names = {1: "cosine", 2: "pearson"}
    sim_options = {
        'name': similarity_names.get(similarityMeasure, "pearson_baseline"),
        'user_based': isUserBased == "Yes",
    }

    knn_methods = {
        5: KNNBaseline,
        6: KNNBasic,
        7: KNNWithMeans,
        8: KNNWithZScore,
    }
    other_methods = {
        1: SVD,
        2: SlopeOne,
        3: NMF,
        4: NormalPredictor,
        9: BaselineOnly,
    }
    if method in knn_methods:
        algo = knn_methods[method](sim_options=sim_options)
    else:
        algo = other_methods.get(method, CoClustering)()

    algo.fit(trainset)
    predictions = algo.test(testset)
    return round(accuracy.rmse(predictions, verbose=False), 4)
def collab_recommender(train_data,
                       test_data,
                       user_based=True,
                       normalization=False,
                       k=100,
                       sim='cosine'):
    """Predict ratings with a KNN collaborative-filtering model.

    Input:
    - train_data: dataframe, n*3, columns are ['userid','movieid','rating']
    - test_data: dataframe, n*2, columns are ['userid', 'movieid']
    - user_base: boolean, use user-based knn algorithm if True, use
      item-based knn algorithm if False
    - normalization: boolean, use KNNWithZScore if True, KNNWithMeans
      otherwise
    - k: int, number of nearest neighbors
    - sim: string, define the similarity matrix from
      ['cosine', 'pearson', 'msd', 'pearson_baseline']

    Output:
    - pred_rating: dataframe, n*2, columns are ['movieid', 'rating'];
      ``None`` if a non-ValueError exception was logged and suppressed

    Raises:
    - ValueError: re-raised unchanged after a warning is logged
    """
    try:
        function_log.trace('Start collaborative recommendation function')

        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(train_data, reader)

        sim_options = {'name': sim, 'user_based': user_based}
        knn_class = KNNWithZScore if normalization else KNNWithMeans
        algo = knn_class(k=k, sim_options=sim_options, verbose=False)

        train_set = data.build_full_trainset()
        algo.fit(train_set)

        # Walk the two columns positionally; label-based .loc lookups per row
        # return whole Series when the index has duplicate labels.
        user_ids = test_data['userid'].values
        movie_ids = test_data['movieid'].values
        pred_rating = {
            'movieid': list(movie_ids),
            'rating': [
                algo.predict(uid, iid).est
                for uid, iid in zip(user_ids, movie_ids)
            ],
        }

        function_log.trace('Finish collaborative recommendation function')
        return pd.DataFrame(pred_rating)

    except ValueError:
        function_log.warn("Training and test data cannot be none.")
        # Bare raise keeps the original message and traceback; raising a new
        # empty ValueError discarded both.
        raise
    except Exception as x:
        function_log.exception(
            f'collaborative recommendation function failed {x}')
        # Failure is logged and reported to the caller as None (unchanged
        # behaviour, now explicit).
        return None
def CFZ(self):
    """Fit a user-based cosine KNNWithZScore model and predict known ratings.

    For every user id in ``self.list`` the rows of ``self.data`` belonging to
    that user are predicted (each prediction is printed because
    ``verbose=True``).

    Returns:
        The last prediction made, or ``None`` if no row was predicted.
    """
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)
    algo.fit(self.trainset)

    # Defined up front so that an empty user list (or users with fewer than
    # two rows) no longer ends in an unbound-variable error at return.
    pred = None
    for uid in self.list:
        user_rows = self.data[self.data.uid == uid]
        # NOTE(review): the original loop (range(1, len(rows)) with slices
        # [i - 1:i]) never reached a user's last row; that is preserved here
        # via [:-1] -- confirm that skipping it is intended.
        rated = zip(user_rows.lid.values[:-1], user_rows.rate.values[:-1])
        for lid, r_ui in rated:
            pred = algo.predict(uid, lid, r_ui, verbose=True)
    return pred
def __init__(self, modelName, dataPath):
    """Create all supported models, select one and load the data.

    Args:
        modelName: key of the model to use; must be one of the names in
            ``self.modelDict`` (a ``KeyError`` is raised otherwise).
        dataPath: path to the data, ``~`` is expanded before it is handed
            to ``self.loadData``.
    """
    available = (
        ("KNNBasic", KNNBasic),
        ("KNNWithMeans", KNNWithMeans),
        ("KNNWithZScore", KNNWithZScore),
        ("SVD", SVD),
        ("SVDpp", SVDpp),
        ("NMF", NMF),
        ("SlopeOne", SlopeOne),
        ("CoClustering", CoClustering),
    )
    # One default-configured instance per supported model name.
    self.modelDict = {name: model_class() for name, model_class in available}

    # Placeholders, set before loadData runs.
    self.trainset = self.testset = self.data = None

    self.model = self.modelDict[modelName]

    expanded_path = os.path.expanduser(dataPath)
    self.loadData(expanded_path)
def CFZ(self):
    """Print precision, recall and F1 of KNNWithZScore for each of 5 folds.

    A user-based cosine ``KNNWithZScore`` model is refitted on every
    train/test split of ``self.data``; per-user precision and recall from
    ``self.precision_recall_at_k`` are averaged and combined into F1.
    """
    kf = KFold(n_splits=5)
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNWithZScore(k=40, min_k=1, sim_options=sim_options)

    for trainset, testset in kf.split(self.data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = self.precision_recall_at_k(predictions)

        P = sum(precisions.values()) / len(precisions)
        R = sum(recalls.values()) / len(recalls)
        # F1 is reported as 0 when both precision and recall are 0; the
        # plain formula would divide by zero in that case.
        F1 = 2 * P * R / (P + R) if (P + R) else 0.0

        print("Precision : ", P)
        print("Recall : ", R)
        print("F1 : ", F1)
def main():
    """Fit several recommenders on the book ratings and print their scores.

    The filtered ratings CSV is loaded, split 80/20 into train and test
    sets, and every algorithm in the list is passed to
    ``basic_model_based``; its precision, recall and F1 are printed.
    """
    ratings = pd.read_csv("../../data/processed/filtered_ratings.csv")
    # Drop the column named 'Unnamed: 0' that comes with the CSV.
    ratings = ratings.drop('Unnamed: 0', axis=1)

    # Reader object and rating scale specification
    reader = Reader(rating_scale=(1, 5))

    # Load data and split it into train and test sets
    data = Dataset.load_from_df(ratings[["user_id", "book_id", "rating"]],
                                reader)
    train_set, test_set = train_test_split(data, test_size=0.20)

    algorithm_list = [NormalPredictor(), BaselineOnly()]
    for knn_class in (KNNWithZScore, KNNWithMeans, KNNBaseline, KNNBasic):
        algorithm_list.append(
            knn_class(k=10, sim_options=similarity_measure('pearson', 1)))
    algorithm_list.extend([SVDpp(), SVD(), NMF()])

    # (single-model fits, the RMSE plot and the cross-validation run were
    # disabled in the original and are not executed here either)

    separator = (
        "**------------------------------------------------------------------------------------------**"
    )
    for algo in algorithm_list:
        _, preci, recall, f1 = basic_model_based(train_set, test_set, algo)
        print("Algorithm:", algo, preci, recall, f1)
        print(separator)
def checkBestAlgorithm(self):
    """Cross-validate ten algorithms and report the one with lowest RMSE.

    Ratings are read from the module-level ``csv_name`` into ``self.df``
    (columns ``user_id``, ``item_id``, ``rating``; scale 1-10). Every
    algorithm is evaluated with 3-fold ``cross_validate``; the ranking table
    and the name of the best algorithm are printed.

    Returns:
        A tuple ``(algorithm, mean_test_rmse)`` for the best algorithm.
    """
    self.df = pd.read_csv(csv_name)
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']],
                                reader)
    benchmark = []
    rmseTuple = []
    # Run the loop over every algorithm.
    for algorithm in [
            SVD(),
            SVDpp(),
            SlopeOne(),
            NormalPredictor(),
            KNNBaseline(),
            KNNBasic(),
            KNNWithMeans(),
            KNNWithZScore(),
            BaselineOnly(),
            CoClustering(),
    ]:
        # Cross-validation step.
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3,
                                 verbose=False)

        # Store the result and add the algorithm name. Series.append was
        # removed in pandas 2.0, so the label is joined with pd.concat.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        rmseTuple.append((algorithm, tmp['test_rmse']))
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        benchmark.append(pd.concat([tmp, label]))

    print(
        pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
            'test_rmse'))
    print("\n")

    rmseTuple.sort(key=lambda x: x[1])
    print("Best algorithm : ")
    # Same name the original extracted by parsing str() of the tuple.
    print(type(rmseTuple[0][0]).__name__)
    return rmseTuple[0]
def computeKNNZScoreMovie(data, test_np):
    """Compute item-based k-NN with z-score predictions for the test set.

    The model is fitted with the following settings:
        - Similarity function : Pearson baseline, item based
        - Number of closest neighbors : 108

    data : data frame which represents the train set
    test_np : data frame on which the prediction will be returned

    return : the test frame produced by ``dataTrainSurprise`` with an added
             prediction column named 'knnzscore_item_rating'
    """
    trainset, test = dataTrainSurprise(data, test_np)

    item_pearson_baseline = {'name': 'pearson_baseline', 'user_based': False}
    model = KNNWithZScore(k=108, sim_options=item_pearson_baseline)
    knnz_algo = model.fit(trainset)

    def estimate(row):
        # Element 3 of the prediction is the estimated rating.
        return knnz_algo.predict(row['user_id'], row['movie_id'])[3]

    id_columns = test[['user_id', 'movie_id']]
    test['knnzscore_item_rating'] = id_columns.apply(estimate, axis=1)
    return test
def set_algo(name="cosine", user_based=True, algo_type="KNNBasic"):
    '''Create a KNN algorithm, to make switching between variants easy.

    Args:
        name: similarity measure name placed in ``sim_options``.
        user_based: compute similarities between users (True) or items
            (False).
        algo_type: ``"KNNBasic"``, ``"KNNWithMeans"`` or ``"KNNWithZScore"``.

    Returns:
        The requested algorithm, built with ``k=10`` and ``min_k=1``.

    Raises:
        NameError: if ``algo_type`` is not one of the supported names.
    '''
    knn_kwargs = {
        "k": 10,
        "min_k": 1,
        "sim_options": {
            "name": name,
            "user_based": user_based,  # similarities between users or items
        },
    }

    if algo_type == "KNNBasic":
        return KNNBasic(**knn_kwargs)
    if algo_type == "KNNWithMeans":
        return KNNWithMeans(**knn_kwargs)
    if algo_type == "KNNWithZScore":
        return KNNWithZScore(**knn_kwargs)
    raise NameError('Unknown algorithm type.')