import heapq

import pandas as pd
from surprise import Dataset, KNNBasic, Reader


def getSimilarUsers(data, n, user_id):
    """Return the inner ids of the n users most similar to user_id."""
    if user_id not in data['userID']:
        return []
    df = pd.DataFrame(data)
    reader = Reader()
    dataset = Dataset.load_from_df(df[['userID', 'eventID', 'rating']], reader)
    trainSet = dataset.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': True}
    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()
    testUserInnerID = trainSet.to_inner_uid(user_id)
    similarityRow = simsMatrix[testUserInnerID]
    # Pair every other user with their similarity score, then keep the top n.
    candidates = [(innerID, score)
                  for innerID, score in enumerate(similarityRow)
                  if innerID != testUserInnerID]
    kNeighbors = heapq.nlargest(n, candidates, key=lambda t: t[1])
    # Note: these are trainset inner ids; use trainSet.to_raw_uid() to map back.
    return [innerID for innerID, _ in kNeighbors]
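# Minimal usage sketch for getSimilarUsers. The toy ratings dict below is
# hypothetical; any mapping with 'userID', 'eventID' and 'rating' keys works.
toy_ratings = {
    'userID': [1, 1, 2, 2, 3, 3],
    'eventID': [10, 20, 10, 30, 20, 30],
    'rating': [5, 3, 4, 2, 5, 1],
}
print(getSimilarUsers(toy_ratings, n=2, user_id=1))  # two inner user ids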
import json

from surprise import KNNBasic


def build_rec_list(**params):
    # RatingsLoader, get_top_n and Object are project-level helpers (not shown).
    ml = RatingsLoader()
    data = ml.loadDataset()
    trainset = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)
    # Predict every (user, item) pair that has no rating yet.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions)
    # Persist one Recommend object per user (Parse-style Object.extend API).
    Recommend = Object.extend('Recommend')
    for uid, user_ratings in top_n.items():
        rec = Recommend()
        rec.set('uId', uid)
        rec.set('pIds', [iid for (iid, _) in user_ratings])
        rec.set('pTitles', [ml.getProductName(int(iid)) for (iid, _) in user_ratings])
        rec.set('products',
                [json.dumps(ml.getProduct(int(iid))) for (iid, _) in user_ratings])
        rec.save()
    print('Recommendation list built successfully.')
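# Several snippets in this section call a get_top_n() helper that is not shown.
# A minimal sketch along the lines of the example in the surprise documentation:
from collections import defaultdict


def get_top_n(predictions, n=10):
    '''Map each user id to their n highest-estimated (item id, rating) pairs.'''
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n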
import os

from surprise import Dataset, KNNBasic, Reader


def test_nearest_neighbors():
    """Ensure the nearest neighbors differ between user-user and item-item similarity."""
    # rating_scale belongs on the Reader, not on Dataset.load_from_file().
    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))
    data_file = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(data_file, reader)
    trainset = data.build_full_trainset()
    algo_ub = KNNBasic(sim_options={'user_based': True})
    algo_ub.fit(trainset)
    algo_ib = KNNBasic(sim_options={'user_based': False})
    algo_ib.fit(trainset)
    assert algo_ub.get_neighbors(0, k=10) != algo_ib.get_neighbors(0, k=10)
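# The 'custom_train' fixture is not shown. Given the Reader above
# (space-separated 'user item rating', three skipped header lines), a
# compatible file would look something like:
#
#     this is a small
#     custom training set
#     user item rating
#     user0 item0 4
#     user0 item1 2
#     user1 item0 3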
def EvaluateAllModels(self):
    """Cross-validate all surprise algorithms and rank them by test RMSE.

    Example output:

                     test_rmse   fit_time  test_time
    Algorithm
    SVDpp             0.965824   9.401286   0.151476
    SVD               0.967286   1.474139   0.062471
    BaselineOnly      0.972408   0.108964   0.057277
    NMF               0.992677   4.073005   0.171846
    KNNWithZScore     1.001898   0.620192   0.083341
    KNNWithMeans      1.002924   0.489803   0.078121
    SlopeOne          1.006664  19.091191   1.275676
    KNNBaseline       1.007437   0.890452   0.088495
    KNNBasic          1.016717   0.432159   0.072929
    NormalPredictor   1.253265   0.041646   0.078105
    CoClustering      1.828291   3.020921   0.052071

    :return: the results table; the algorithm with the lowest test_rmse is selected.
    """
    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(),
            KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(),
            BaselineOnly(), CoClustering()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm, self.data, measures=['RMSE'],
                                 cv=3, verbose=False)
        # Average the fold results and append the algorithm name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        # Series.append() was removed in pandas 2.0; use pd.concat instead.
        tmp = pd.concat([tmp,
                         pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                                   index=['Algorithm'])])
        benchmark.append(tmp)

    result = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    print(result)
    return result
import pandas as pd
from surprise import KNNBasic, accuracy
from surprise.model_selection import GridSearchCV


def knn(data, training, testing):
    '''
    Tune Basic KNN parameters, then calculate RMSE, coverage and running time
    of Basic KNN.

    Args:
        data(Dataset): the whole dataset, divided into 5 folds
        training(Trainset): training dataset
        testing(list): test dataset

    Returns:
        rmse: RMSE of Basic KNN with optimized parameters
        top_n: number of unique predictions for top n items
    '''
    # candidate parameters
    knn_param_grid = {'k': [5, 10, 20],
                      'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                                      'min_support': [1, 5],
                                      'user_based': [False]}}

    # optimize parameters (GridSearchCV replaces the removed GridSearch class)
    knn_grid_search = GridSearchCV(KNNBasic, knn_param_grid,
                                   measures=['rmse'], cv=5)
    knn_grid_search.fit(data)
    param = knn_grid_search.best_params['rmse']
    print('KNNBasic:', param)

    # RMSE against parameters
    result_df = pd.DataFrame.from_dict(knn_grid_search.cv_results)
    result_df.to_csv('data/knn_rmse_against_param.csv')

    # fit model using the optimized parameters; similarity settings must be
    # passed through sim_options, not as bare keyword arguments
    knn = KNNBasic(k=param['k'], sim_options=param['sim_options'])
    knn.fit(training)

    # evaluate the model using test data
    predictions = knn.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
import csv

from surprise import Dataset, KNNBasic, Reader


def gen_pred_matrix_ubcf(co_pe):
    # ---------------------------------------------------- UBCF as is
    # INITIALIZE REQUIRED PARAMETERS
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic (train() was renamed fit() in surprise 1.05)
    algo.fit(trainset)
    print("ALGORITHM USED", co_pe)
    print("CF Type:", prnt, "BASED")

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)
    # ---------------------------------------------------- UBCF as is

    csvfile = 'pred_matrix-full_ubcf.csv'
    with open(csvfile, "w") as output:
        writer = csv.writer(output, delimiter=',', lineterminator='\n')
        writer.writerow(['uid', 'iid', 'rat'])
        for uid, user_ratings in top_n.items():
            for (iid, r) in user_ratings:
                writer.writerow((uid, iid, r))

    print("Done! You may now check the file in the same directory as the program.")
def user_based_cf(co_pe):
    # INITIALIZE REQUIRED PARAMETERS
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    reader = Reader(line_format="user item rating", sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic
    algo.fit(trainset)
    print("ALGORITHM USED", co_pe)

    # --------------------------------------------- MARKERS
    with open("AlgoHist_ub.txt", "w") as f:
        f.write(repr(co_pe))
    # --------------------------------------------- MARKERS END

    print("CF Type:", prnt, "BASED")

    # PEEKING PREDICTED VALUES
    search_key = input("Enter User ID:")
    item_id = input("Enter Item ID:")
    actual_rating = float(input("Enter actual Rating:"))
    print(algo.predict(str(search_key), item_id, r_ui=actual_rating))

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)

    top_n = get_top_n(predictions, 5)
    result_u = True

    k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))
    # For a user-based model the neighbors are users, so the raw *user* id
    # must be converted with to_inner_uid (to_inner_iid looks up items).
    inner_id = algo.trainset.to_inner_uid(search_key)
    neighbors = algo.get_neighbors(inner_id, k=k)
    print("Nearest Matching users are:")
    for i in neighbors:
        print("\t" * 6, i)
    return top_n, result_u
class RecommenderUserBased(Recommender):
    def __init__(self, movies, similarity='cosine'):
        super(RecommenderUserBased, self).__init__(movies)
        sim_options = {'name': similarity, 'user_based': True}
        self.algorithm = KNNBasic(sim_options=sim_options)

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20, k_inner_item=200):
        full_dataset = self.algorithm.trainset

        # watched movies, keyed by inner item id
        watched = {
            full_dataset.to_inner_iid(key): value
            for key, value in watched.items()
        }

        # get similar users
        similar_users = self.get_similar_user_ids(watched, k=k_inner_item)

        # score unseen items by similarity-weighted ratings of the similar users
        candidates = defaultdict(float)
        for user_id, similarity in similar_users.items():
            for inner_movie_id, rate in full_dataset.ur[user_id]:
                if inner_movie_id not in watched:
                    candidates[inner_movie_id] += similarity * rate

        # return top-n movies
        movie_ids = [
            full_dataset.to_raw_iid(i)
            for i in heapq.nlargest(k, candidates, key=candidates.get)
        ]
        return self.movies.get_movie_by_movie_ids(movie_ids)
def algoFunc(train_data, test_data):
    print("Singular Value Decomposition :\n")
    SVD_var = SVD()
    SVD_var.fit(train_data)
    predict_var = SVD_var.test(test_data)
    SVD_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    SVD_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nProbabilistic Matrix Factorization :\n")
    PMF_var = SVD(biased=False)  # unbiased SVD is equivalent to PMF
    PMF_var.fit(train_data)
    predict_var = PMF_var.test(test_data)
    PMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    PMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nNon-negative Matrix Factorization :\n")
    NMF_var = NMF()
    NMF_var.fit(train_data)
    predict_var = NMF_var.test(test_data)
    NMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    NMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nUser based Collaborative Filtering algorithm :\n")
    UB_var = KNNBasic(sim_options={'user_based': True})
    UB_var.fit(train_data)
    predict_var = UB_var.test(test_data)
    user_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    user_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nItem based Collaborative Filtering algorithm :\n")
    IB_var = KNNBasic(sim_options={'user_based': False})
    IB_var.fit(train_data)
    predict_var = IB_var.test(test_data)
    item_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    item_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\n")
    return (SVD_RMSE_var, SVD_MAE_var, PMF_RMSE_var, PMF_MAE_var,
            NMF_RMSE_var, NMF_MAE_var, user_RMSE_var, user_MAE_var,
            item_RMSE_var, item_MAE_var)
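# Hedged usage sketch for algoFunc: split a built-in dataset and pass the two
# halves in. Dataset and train_test_split come from surprise; everything else
# is defined above.
from surprise import Dataset
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
train_data, test_data = train_test_split(data, test_size=0.25)
scores = algoFunc(train_data, test_data)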
def problem14():
    plotRMSE = []
    plotMAE = []

    print("-----MSD similarity in User based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'MSD', 'user_based': True})
    user_MSD = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["User-based filtering", 1, user_MSD["test_rmse"].mean()])
    plotMAE.append(["User-based filtering", 1, user_MSD["test_mae"].mean()])

    print("-----Cosine similarity in User based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    user_COS = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["User-based filtering", 2, user_COS["test_rmse"].mean()])
    plotMAE.append(["User-based filtering", 2, user_COS["test_mae"].mean()])

    print("-----Pearson similarity in User based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'pearson', 'user_based': True})
    user_Pearson = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["User-based filtering", 3, user_Pearson["test_rmse"].mean()])
    plotMAE.append(["User-based filtering", 3, user_Pearson["test_mae"].mean()])

    print("-----MSD similarity in Item based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'MSD', 'user_based': False})
    item_MSD = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["Item-based filtering", 1, item_MSD["test_rmse"].mean()])
    plotMAE.append(["Item-based filtering", 1, item_MSD["test_mae"].mean()])

    print("-----Cosine similarity in Item based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    item_Cos = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["Item-based filtering", 2, item_Cos["test_rmse"].mean()])
    plotMAE.append(["Item-based filtering", 2, item_Cos["test_mae"].mean()])

    print("-----Pearson similarity in Item based Collaborative Filtering----")
    algo = KNNBasic(sim_options={'name': 'pearson', 'user_based': False})
    item_Pearson = cross_validate(algo, data, cv=3, verbose=False)
    plotRMSE.append(["Item-based filtering", 3, item_Pearson["test_rmse"].mean()])
    plotMAE.append(["Item-based filtering", 3, item_Pearson["test_mae"].mean()])

    # DataFrame.pivot takes keyword arguments in recent pandas versions.
    plotRMSE = pd.DataFrame(data=plotRMSE, columns=["Filter", "Similarity", "RMSE"])
    plotRMSE.pivot(index="Similarity", columns="Filter", values="RMSE").plot(kind="bar")
    plt.title("User vs Item (RMSE)")
    plt.ylabel("RMSE")
    plt.ylim(.9, 1.1)
    plt.show()

    plotMAE = pd.DataFrame(data=plotMAE, columns=["Filter", "Similarity", "MAE"])
    plotMAE.pivot(index="Similarity", columns="Filter", values="MAE").plot(kind="bar")
    plt.title("User vs Item (MAE)")
    plt.ylabel("MAE")
    plt.ylim(.7, .9)
    plt.show()
import time

from surprise import Dataset, KNNBasic, accuracy


def use_knn():
    start = time.time()
    performance = []
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    print('Using KNN')
    algo_KNN = KNNBasic()
    algo_KNN.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions_KNN = algo_KNN.test(testset)
    # Caveat: the anti-testset fills unknown ratings with the global mean, so
    # RMSE/MAE here measure deviation from that placeholder, not true accuracy;
    # evaluate on a held-out testset for a meaningful score.
    accuracy_rmse = accuracy.rmse(predictions_KNN)
    accuracy_mae = accuracy.mae(predictions_KNN)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)
    end = time.time()
    performance.append(end - start)
    return performance
def __recommend_movies(self, username):
    reader = Reader(rating_scale=(1, 10))
    df = pd.DataFrame(self.ratings_dict)
    data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
    sim_options = {
        "name": "cosine",
        "user_based": True,
        # "min_support": 2,
    }
    algo = KNNBasic(sim_options=sim_options)
    # algo = SVD()
    algo.fit(data.build_full_trainset())
    self.__get_all_movies()

    # Estimate a rating for every movie, then drop the ones already rated.
    for movie in self.movies:
        prediction = algo.predict(username, movie)
        self.predictions[movie] = prediction.est

    for user_rated_movie in self.__get_user_rated_movies(
            self.__get_username_id(username)):
        del self.predictions[user_rated_movie]
import os

from surprise import Dataset, KNNBasic, Reader
from surprise.model_selection import cross_validate


def UBCFMSD():
    file_path = os.path.expanduser('restaurant_ratings.txt')
    reader = Reader(line_format='user item rating timestamp', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)

    algo = KNNBasic(
        # k=x,
        sim_options={
            'name': 'MSD',
            'user_based': True
        })

    # evaluate()/print_perf() and data.split() were removed from surprise;
    # cross_validate covers the same use case.
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
def problem15():
    plotNeighbors = []
    # Sweep the neighborhood size k from 1 to 16 for both CF variants.
    for i in range(1, 17):
        algo = KNNBasic(k=i, sim_options={'name': 'MSD', 'user_based': True})
        user = cross_validate(algo, data, cv=3, verbose=False)
        plotNeighbors.append([
            "User based Collaborative Filtering", i, user["test_rmse"].mean()
        ])

        algo = KNNBasic(k=i, sim_options={'name': 'MSD', 'user_based': False})
        item_MSD = cross_validate(algo, data, cv=3, verbose=False)
        plotNeighbors.append([
            "Item based Collaborative Filtering", i, item_MSD["test_rmse"].mean()
        ])

    plotDF = pd.DataFrame(data=plotNeighbors,
                          columns=["Classifier", "K", "Score"])
    plotDF.pivot(index="K", columns="Classifier", values="Score").plot(kind="bar")
    plt.ylim(0.8, 1.6)
    plt.title("User/Item based collaborative filtering in terms of k-value")
    plt.ylabel("RMSE")
    plt.show()
def fit_model_surprise_basic(df, k):
    import time

    import pandas as pd
    from sklearn.metrics import roc_auc_score
    from surprise import SVD, Dataset, KNNBasic, KNNWithMeans, Reader, SVDpp
    from surprise.model_selection import train_test_split

    start_time = time.time()
    reader = Reader(rating_scale=(0, 1))
    data_r = Dataset.load_from_df(df[['userid', 'itemid', 'event']], reader)
    daftar_algo = {
        "KNNBasicUser": KNNBasic(sim_options={"user_based": True}),
        "KNNBasicItem": KNNBasic(sim_options={"user_based": False}),
        "KNNWithMeanItem": KNNWithMeans(sim_options={"user_based": False}),
        "KNNWithMeanUser": KNNWithMeans(sim_options={"user_based": True}),
        "SVD": SVD(),
        "SVDnoBias": SVD(biased=False),
        "SVDpp": SVDpp()
    }
    trainset, testset = train_test_split(data_r, test_size=0.25)
    algo = daftar_algo[k]
    algo.fit(trainset)

    # Make predictions, then map the true ratings to binary labels so that
    # ROC AUC can be computed against the estimated scores.
    predictions = algo.test(testset)
    pred = pd.DataFrame(predictions)
    pred['r_ui'] = pred['r_ui'].replace({1.0: "transaction", 0.0: "view"})
    pred['r_ui'] = pred['r_ui'].replace({"view": 0, "addtocart": 0, "transaction": 1})
    auc = roc_auc_score(pred.r_ui, pred.est)
    end_time = time.time()
    return auc, end_time - start_time
def calculateRMSE(self, method=9, similarityMeasure=1, isUserBased="Yes"):
    conn = sqlite3.connect(DATABASE_NAME)
    df = pd.read_sql_query(
        "SELECT userID, glassID, relativeRating FROM ratings", conn)

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'glassID', 'relativeRating']],
                                reader)
    trainset, testset = train_test_split(data, test_size=.20)

    isUserBased = (isUserBased == "Yes")
    if similarityMeasure == 1:
        similarityMeasure = "cosine"
    elif similarityMeasure == 2:
        similarityMeasure = "pearson"
    else:
        similarityMeasure = "pearson_baseline"

    sim_options = {'name': similarityMeasure, 'user_based': isUserBased}

    if method == 1:
        algo = SVD()
    elif method == 2:
        algo = SlopeOne()
    elif method == 3:
        algo = NMF()
    elif method == 4:
        algo = NormalPredictor()
    elif method == 5:
        algo = KNNBaseline(sim_options=sim_options)
    elif method == 6:
        algo = KNNBasic(sim_options=sim_options)
    elif method == 7:
        algo = KNNWithMeans(sim_options=sim_options)
    elif method == 8:
        algo = KNNWithZScore(sim_options=sim_options)
    elif method == 9:
        algo = BaselineOnly()
    else:
        algo = CoClustering()

    algo.fit(trainset)
    predictions = algo.test(testset)
    conn.close()

    # cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
    return round(accuracy.rmse(predictions, verbose=False), 4)
def train_model(new_users, data, neighbors=30, min_neighbors=5, seed=12345):
    """
    Trains the KNNBasic model from the surprise package on the existing
    ratings data combined with all the new user's possible combinations.

    Args:
        new_users (pandas.DataFrame): The dataframe with the 'ratings' of all
            the possible combinations of user input
        data (pandas.DataFrame): The existing ratings dataframe
        neighbors (int): the number of nearest neighbors to train the model
            on, default is 30
        min_neighbors (int): the minimum number of neighbors a user must have
            to receive a prediction. If there are not enough neighbors, the
            prediction is set to the global mean of all ratings. Default is 5.
        seed (int): the random seed, default is 12345 (kept for interface
            compatibility; KNNBasic itself is deterministic)

    Returns:
        predictions (list of Prediction objects): The predicted
            recommendations from the model
    """
    # ensure a nice distribution of ratings
    ratings_counts = data['rating'].value_counts().to_dict()
    logger.info("Ratings Distributions:")
    logger.info(ratings_counts)

    # combine actual ratings with all possible ratings users could input
    # (DataFrame.append was removed in pandas 2.0; use pd.concat)
    full_data = pd.concat([new_users, data])

    # use the surprise Reader to read in data in surprise format
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(full_data[['user_id', 'book_id', 'rating']],
                                reader)
    trainset = data.build_full_trainset()

    algo = KNNBasic(k=neighbors, min_k=min_neighbors)
    algo.fit(trainset)

    # predict all the cells without values
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    return predictions
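# Hedged usage sketch for train_model. The column names follow the docstring
# (user_id / book_id / rating); the data values are made up, and the sketch
# assumes the module-level imports and logger from the original script.
import pandas as pd

existing = pd.DataFrame({'user_id': [1, 1, 2],
                         'book_id': [10, 11, 10],
                         'rating': [5, 3, 4]})
# one synthetic user (id 999) with candidate ratings for each book
new_users = pd.DataFrame({'user_id': [999, 999],
                          'book_id': [10, 11],
                          'rating': [5, 1]})
preds = train_model(new_users, existing)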
def fit(self, trainset):
    self.trainset = trainset
    self.trainset.rating_scale = (1, 13)
    AlgoBase.fit(self, trainset)

    # Build a plain KNNBasic model just to obtain the similarity matrix.
    model = KNNBasic(sim_options=self.sim_options, k=self.k)
    model.fit(trainset)
    simsMatrix = model.compute_similarities()

    # Cache the 10 most similar users for every user.
    for userId in range(trainset.n_users):
        similarityRow = simsMatrix[userId]
        kNeighbors = heapq.nlargest(
            10,
            [(innerId, score) for (innerId, score) in enumerate(similarityRow)
             if innerId != userId],
            key=lambda t: t[1])
        self.nearestNeigbors[userId] = kNeighbors

    print("...done.")
    return self
def RecommendMovie(user_id):
    # user_id = input((" UserID "))
    np.random.seed(0)
    random.seed(0)

    (ml, evaluationData, rankings) = LoadMovieLensData()
    evaluator = Evaluator(evaluationData, rankings)

    UserKNN = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    evaluator.AddAlgorithm(UserKNN, "User KNN")

    res = evaluator.SampleTopNRecs(ml, testSubject=user_id)
    return res
def __init__(self, modelName, dataPath):
    self.modelDict = {
        "KNNBasic": KNNBasic(),
        "KNNWithMeans": KNNWithMeans(),
        "KNNWithZScore": KNNWithZScore(),
        "SVD": SVD(),
        "SVDpp": SVDpp(),
        "NMF": NMF(),
        "SlopeOne": SlopeOne(),
        "CoClustering": CoClustering()
    }
    self.trainset = None
    self.testset = None
    self.data = None
    self.model = self.modelDict[modelName]
    self.loadData(os.path.expanduser(dataPath))
def data_model_forcrossvalidation(data, config_train):
    """Creates data and an untrained model for use in the kfold_crossvalidation function

    Arguments:
        data {pd.DataFrame} -- Pandas DataFrame
        config_train {dict} -- Dictionary of configurations corresponding to
            the train_model script

    Returns:
        data {surprise.dataset.DatasetAutoFolds} -- Surprise Dataset ready for
            cross validation
        model {surprise.prediction_algorithms.knns.KNNBasic} -- Surprise
            KNNBasic model
    """
    t_configs = config_train['build_trainset']  # configurations for trainset
    data = data[t_configs['colnames']]  # colnames configuration
    reader = Reader()
    data = Dataset.load_from_df(data, reader)  # create surprise dataset
    model = KNNBasic(**config_train['create_KNNmodel'])  # create KNN model
    return data, model
def ComputeCollaborativeFiltering_User_User(recipe_df, train_rating_df, pd,
                                            benchmark, knnmeans=False):
    print("\n###### Compute CollaborativeFiltering_User_User ######")
    df = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'recipe_id', 'rating']], reader)
    trainSet, testSet = train_test_split(data, test_size=.2, random_state=0)

    # compute similarities between users (not items: user_based is True)
    sim_options = {'name': 'cosine', 'user_based': True}
    if knnmeans:
        algo = KNNWithMeans(sim_options=sim_options, verbose=False)
    else:
        algo = KNNBasic(sim_options=sim_options, verbose=False)

    algo.fit(trainSet)
    predictions = algo.test(testSet)
    Evaluators.RunAllEvals(predictions, benchmark)
def __init__(self, df, algo='KNN', user_based=False):
    self.df = df
    self.algo = algo
    self.user_based = user_based

    # For load_from_df only the Reader's rating_scale matters; line_format
    # is used only when parsing files.
    reader = Reader()
    data = Dataset.load_from_df(df=self.df, reader=reader)
    self.eval_data = EvaluationData(data)

    if self.algo == 'KNN':
        sim_options = {'name': 'cosine', 'user_based': self.user_based}
        self.model = KNNBasic(sim_options=sim_options)
    elif self.algo == 'SVD':
        self.model = SVD()
    elif self.algo == 'SVD++':
        self.model = SVDpp()
    elif self.algo == 'Random':
        self.model = NormalPredictor()
def knn_running_time(data):
    '''
    Calculates the running times for training and predictions for Basic KNN

    Args:
        data(list of Dataset): a list of datasets with different numbers of users

    Returns:
        elapsedtime_KnnBasictrain: running time for training
        elapsedtime_KnnBasictest: running time for predictions on testset
    '''
    elapsedtime_KnnBasictrain = []
    elapsedtime_KnnBasictest = []

    # tune the parameters on the entire data
    # (GridSearchCV replaces the removed GridSearch class)
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearchCV(KNNBasic, param_grid, measures=['rmse'], cv=5)
    grid_search.fit(data[3])
    param = grid_search.best_params['rmse']
    k = param['k']
    sim_options = param['sim_options']

    # using the tuned parameters, measure the running times
    for i in range(len(data)):
        # training running time
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        # similarity settings must be passed through sim_options,
        # not as bare keyword arguments
        knn = KNNBasic(k=k, sim_options=sim_options)
        knn.fit(training)
        elapsedtime_KnnBasictrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        knn.test(testing)
        elapsedtime_KnnBasictest.append(time.time() - test_start)
    return elapsedtime_KnnBasictrain, elapsedtime_KnnBasictest
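# Hedged usage sketch for knn_running_time: 'data' is assumed to be a list of
# surprise Datasets of increasing size, with at least four entries (data[3]
# is used for tuning). make_subset is a hypothetical helper for illustration.
datasets = [make_subset(n) for n in (100, 500, 1000, 5000)]
train_times, test_times = knn_running_time(datasets)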
def main():
    book_df = pd.read_csv("../../data/processed/filtered_ratings.csv")

    # Reader object and rating scale specification
    book_df = book_df.drop('Unnamed: 0', axis=1)
    reader = Reader(rating_scale=(1, 5))

    # Load data
    data = Dataset.load_from_df(book_df[["user_id", "book_id", "rating"]], reader)

    # Split data into train and test sets
    train_set, test_set = train_test_split(data, test_size=0.20)

    algorithm_list = [
        NormalPredictor(),
        BaselineOnly(),
        KNNWithZScore(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNWithMeans(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBaseline(k=10, sim_options=similarity_measure('pearson', 1)),
        KNNBasic(k=10, sim_options=similarity_measure('pearson', 1)),
        SVDpp(),
        SVD(),
        NMF()
    ]

    # # Fit model for normal predictor and get rmse
    # basic_model_based(train_set, test_set, NormalPredictor())
    #
    # # Fit model for Baselineonly algorithm
    # basic_model_based(train_set, test_set, BaselineOnly())
    #
    # # Fit model for KNN algorithms
    # basic_model_based(train_set, test_set, KNNBasic(k=10, sim_options=similarity_measure('pearson', 1)))
    #
    # plot_for_rmse(train_set, test_set)

    # Crossvalidation results
    # res = crossvalidate(data)
    # print(res)

    results = {}
    for algo in algorithm_list:
        rmse, preci, recall, f1 = basic_model_based(train_set, test_set, algo)
        results[algo.__class__.__name__] = (rmse, preci, recall, f1)
        print("Algorithm:", algo, preci, recall, f1)
        print("**------------------------------------------------------------------------------------------**")
def get(self):
    (gb, evaluationData, rankings) = self.LoadGoodBooksData()
    evaluator = Evaluator(evaluationData, rankings)
    ItemKNN = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    evaluator.AddAlgorithm(ItemKNN, "Item KNN")
    evaluator.Evaluate(False)
    book_ids = evaluator.SampleTopNRecs(gb, testSubject=12)

    conn = self.get_db()
    cur = conn.cursor()
    # convert each book_id to a string, then join them with ', ' as a separator
    SQL_book_ids = ', '.join([str(x) for x in book_ids])
    cur.execute(
        "SELECT * FROM books WHERE book_id IN ({})".format(SQL_book_ids))
    books = cur.fetchall()
    conn.commit()
    return books
def checkBestAlgorithm(self):
    self.df = pd.read_csv(csv_name)
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']],
                                reader)
    benchmark = []
    rmseTuple = []
    # Iterate over every candidate algorithm.
    for algorithm in [
            SVD(), SVDpp(), SlopeOne(), NormalPredictor(), KNNBaseline(),
            KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(),
            CoClustering()
    ]:
        # Run cross-validation.
        results = cross_validate(algorithm, data, measures=['RMSE'], cv=3,
                                 verbose=False)
        # Store the result together with the algorithm name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        rmseTuple.append((algorithm, tmp['test_rmse']))
        # Series.append() was removed in pandas 2.0; use pd.concat instead.
        tmp = pd.concat([tmp,
                         pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                                   index=['Algorithm'])])
        benchmark.append(tmp)

    print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))
    print("\n")
    rmseTuple.sort(key=lambda x: x[1])
    print("Best algorithm : ")
    print(str(rmseTuple[0]).split(' ')[0].split('.')[-1])
    return rmseTuple[0]
def build_collaborative_model(userData, mode='svd'):
    mode_opts = ['knn', 'knn_with_means', 'svd', 'svd++']
    assert mode in mode_opts, "Invalid mode. Choose from " + str(mode_opts)

    from surprise import Reader, Dataset
    # from surprise.model_selection import cross_validate, train_test_split
    # from surprise import accuracy

    reader = Reader()
    userData = Dataset.load_from_df(
        userData[['userId', 'movieId', 'rating']].astype('str'), reader)
    # trainset, testset = train_test_split(userData, test_size=0)
    trainset = userData.build_full_trainset()

    model = None
    if mode == "knn":
        from surprise import KNNBasic
        model = KNNBasic(verbose=True)
    elif mode == 'knn_with_means':
        from surprise import KNNWithMeans
        # To use item-based cosine similarity set user_based = False
        sim_options = {
            "name": "cosine",
            "user_based": True,  # compute similarities between users
        }
        model = KNNWithMeans(verbose=True, sim_options=sim_options)
    elif mode == "svd":
        from surprise import SVD
        model = SVD(verbose=True)
    elif mode == "svd++":
        from surprise import SVDpp
        model = SVDpp(verbose=True)

    model.fit(trainset)
    return model
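# Hedged usage sketch for build_collaborative_model, assuming a MovieLens-style
# ratings dataframe with userId / movieId / rating columns (values made up).
import pandas as pd

ratings = pd.DataFrame({'userId': [1, 1, 2, 2],
                        'movieId': [100, 101, 100, 102],
                        'rating': [4.0, 3.0, 5.0, 2.0]})
model = build_collaborative_model(ratings, mode='knn')
print(model.predict('1', '102').est)  # ids are strings because of astype('str')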
def knn_basic_movie(train, test, ids, Xtest, Xids):
    """
    kNN basic approach on movies

    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('kNN Basic Movie')
    # similarity settings must go through sim_options; passed as bare keyword
    # arguments they would be silently ignored
    algo = KNNBasic(k=21,
                    sim_options={'name': 'msd',
                                 'min_support': 2,
                                 'user_based': False},
                    verbose=False)

    # Train algorithm on training set
    algo.fit(train)

    # Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    # Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    # Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def set_algo(name="cosine", user_based=True, algo_type="KNNBasic"): '''Function to facilitate switching between different algorithms ''' # To use item-based cosine similarity sim_options = { "name": name, "user_based": user_based, # Compute similarities between user or items } if algo_type == "KNNBasic": algo = KNNBasic(k=10, min_k=1, sim_options=sim_options) elif algo_type == "KNNWithMeans": algo = KNNWithMeans(k=10, min_k=1, sim_options=sim_options) elif algo_type == "KNNWithZScore": algo = KNNWithZScore(k=10, min_k=1, sim_options=sim_options) else: raise NameError('Unknown algorithm type.') return algo
def __init__(self, MainDir, ExpName):
    # configuration keys
    self.MODEL_NAME = "MODEL_NAME"
    self.FEATURE_PATH = "FEATURE_PATH"
    self.TEST_FEATURE_PATH = "TEST_FEATURE_PATH"
    self.MODEL_DICT = {
        "KNNBasic": KNNBasic(),
        "KNNWithMeans": KNNWithMeans(),
        "KNNWithZScore": KNNWithZScore(),
        "SVD": SVD(),
        "SVDpp": SVDpp(),
        "NMF": NMF(),
        "SlopeOne": SlopeOne(),
        "CoClustering": CoClustering()
    }
    self.TOP_RECOMMEND_RESULT_NUM = "TOP_RECOMMEND_RESULT_NUM"
    self.MODEL_PATH = "MODEL_PATH"
    self.HYPER_PARAMETER = "HYPER_PARAMETER"
    self.ONLINE_EXP_TYPE = "ONLINE"
    self.OFFLINE_EXP_TYPE = "OFFLINE"
    self.CONFIG_RELATIVE_PATH = "/inference/configuration/config.json"
    self.CONTENT_CONFIG = "CONTENT_CONFIG"
    self.REC_NUM = "REC_NUM"
    self.CONTENT_FEATURE_PATH = "CONTENT_FEATURE_PATH"

    self.MAIN_DIR_PATH = MainDir
    self.ExpName = ExpName
    self.ExpType = None
    self.trainset, self.testset, self.rawMovieList, self.rawUserList = (
        None, None, None, None)
    self.loadConfig(self.MAIN_DIR_PATH + self.CONFIG_RELATIVE_PATH, ExpName)

    if (self.MODEL_NAME in self.config
            and self.config[self.MODEL_NAME] in self.MODEL_DICT):
        self.model = self.MODEL_DICT[self.config[self.MODEL_NAME]]
    else:
        raise AttributeError("Model initialization error")

    self.contentModel = ContentBaseModel.ContentModel(
        self.content_config[self.REC_NUM],
        self.MAIN_DIR_PATH + self.content_config[self.CONTENT_FEATURE_PATH])