def results():
    """Train an item-based cosine KNN recommender on the ratings CSV and
    render the top-10 recommendations per user.

    Returns:
        Rendered 'secondpage.html' with {'data': [[uid, [(iid, est), ...]], ...]}.
    """
    names = ['userID', 'itemID', 'rating']
    df = pd.read_csv('~/.surprise_data/ratings.csv', names=names)
    # NOTE(review): df1 is loaded but never used below — confirm before removing.
    names1 = ['itemID', 'Profession', 'City']
    df1 = pd.read_csv('~/.surprise_data/workers1.csv', names=names1)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    trainset = data.build_full_trainset()
    # BUG FIX: sim_options was built but an empty dict was passed to KNNBasic,
    # silently falling back to the default MSD/user-based similarity.
    sim_options = {'name': 'cosine', 'user_based': False}
    algo = KNNBasic(k=40, min_k=1, sim_options=sim_options)
    algo.fit(trainset)
    # Predict ratings for every (user, item) pair absent from the trainset.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)
    # Each entry: [user id, list of that user's (item id, estimated rating) pairs].
    myArray = [[uid, list(user_ratings)] for uid, user_ratings in top_n.items()]
    print(myArray)
    return render_template('secondpage.html', returned={'data': myArray})
    # BUG FIX: removed the unreachable `return ('results working')` that
    # followed the render_template return.
class ItemCF():
    """Item-based collaborative filtering backed by a surprise KNNBasic model."""

    def __init__(self):
        csv_path = os.path.expanduser('user_item_rate.csv')
        rating_reader = Reader(line_format='user item rating', sep=',')
        dataset = Dataset.load_from_file(csv_path, reader=rating_reader)
        full_trainset = dataset.build_full_trainset()
        # Train the model on item-item similarity.
        # sim_options={'name': 'cosine','user_based': True} cosine/msd/pearson/pearson_baseline
        self.item_algo = KNNBasic(k=10, min_k=3, sim_options={'user_based': False})
        self.item_algo.fit(full_trainset)

    def get_similar_items(self, top_k, item_id):
        """Find the items most similar to a given item.

        Args:
            top_k (int): number of similar items to return.
            item_id (str): raw item id.

        Returns:
            generator of raw item ids.
        """
        inner_id = self.item_algo.trainset.to_inner_iid(item_id)
        neighbor_inner_ids = self.item_algo.get_neighbors(inner_id, k=top_k)
        return (self.item_algo.trainset.to_raw_iid(nid)
                for nid in neighbor_inner_ids)
def train_model():
    """Fetch booking histories from the API and train the global item-based
    cosine KNN model on (tid, gid, rating) triples.

    Side effects: rebinds the module-level `algo` to the freshly trained model.

    Returns:
        flask JSON response confirming training.
    """
    histories = requests.get(
        'https://whispering-refuge-67560.herokuapp.com/api/histories')
    history_data = json.loads(histories.content.decode('utf-8'))
    data_train = pd.DataFrame.from_dict(history_data, orient='columns')
    # BUG FIX: the original called data_train.drop(columns=[...]) and discarded
    # the returned frame (drop is not in-place by default). The explicit column
    # subset below achieves the intended result, so the no-op drop is removed.
    data_train = data_train[['tid', 'gid', 'rating']]
    sim_options = {'name': 'cosine', 'user_based': False}
    global algo
    algo = KNNBasic(sim_options=sim_options)
    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(0, 5))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(data_train[['tid', 'gid', 'rating']], reader)
    # Train on the full dataset (no held-out split here).
    trainingSet = data.build_full_trainset()
    algo.fit(trainingSet)
    # BUG FIX: removed dead code that followed the return in the original
    # (`global all_guides; all_guides = []; get_all_guides()`) — it was
    # unreachable and never executed.
    return jsonify(status="training in progress")
def content(self):
    """Content-based recommendation for self.user_id.

    Returns:
        dict mapping sequential ids to {'id': n, 'business id': iid} entries,
        or None when no data is available.
    """
    surprise_data = self.prepare_Data()
    if surprise_data == []:
        print("No data provided")
        return
    # Compute similarities between items (item-item cosine).
    sim_options = {'name': 'cosine', 'user_based': False}
    algo = KNNBasic(sim_options=sim_options)
    trainset = surprise_data.build_full_trainset()
    algo.fit(trainset)
    predictions = algo.test(trainset.build_testset())
    recommendation = self.get_top_n(predictions)
    # Number the user's recommendations sequentially and key the result by id.
    entries = [
        {'id': idx, 'business id': business_id}
        for idx, (business_id, _) in enumerate(recommendation[self.user_id])
    ]
    return {entry['id']: entry for entry in entries}
def collaborative_filtering():
    """Dump rating histories to CSV, train a user-based Pearson KNN model,
    and write a predicted rating for every (user, alcohol) pair to
    recommend/answer_cf.csv.
    """
    history_list = History.objects.all()
    # Export every History row to a CSV snapshot that feeds the training data.
    with open('recommend/dataset_cf.csv', 'w', encoding='utf-8', newline='') as csv_file:
        header = ['history_id', 'user_id', 'alco_name', 'data_joined', 'review']
        writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        for history in history_list:
            row = []
            row += [history.history_id, history.user_id, history.alco_name,
                    history.data_joined, history.review]
            writer.writerow(row)
    alco = pandas.read_csv("recommend/alcohol_cf.csv", encoding='utf-8')
    alco = alco.set_index('alco_name')
    data = pandas.read_csv("recommend/dataset_cf.csv", encoding='utf-8').fillna(0)
    data = data.drop('history_id', axis=1)
    data = data.drop('data_joined', axis=1)
    # Map each alcohol name to its numeric id via the alcohol catalogue.
    alcohol_id_list = []
    for i in range(len(data.index)):
        alcohol_id_list.append(alco.at[data['alco_name'][i], 'alcohol_id'])
    data = data.drop('alco_name', axis=1)
    data['alcohol_id'] = alcohol_id_list
    data = data.loc[:, ["user_id", "alcohol_id", "review"]]
    # Space-separated score file in the `user item rating` layout surprise expects.
    data.to_csv("recommend/dataset_cf.score", sep=' ',
                header=None, index=False, encoding='utf-8')
    reader = Reader(line_format='user item rating', sep=' ')
    dataset = Dataset.load_from_file("recommend/dataset_cf.score", reader=reader)
    trainset = dataset.build_full_trainset()
    sim_options = {
        'name': 'pearson',  # similarity measure (cosine, msd, pearson, pearson_baseline)
        'user_based': True  # False would switch to item-based CF
    }
    algo = KNNBasic(k=5, min_k=1, sim_options=sim_options)
    algo.fit(trainset)
    # algo = SVD()
    # algo.train(trainset)
    # print(algo.sim)
    alcohol_num = Alcohol.objects.latest('alcohol_id').alcohol_id
    user_num = History.objects.latest('user_id').user_id
    # Predict a rating for every (user, alcohol) id pair and persist estimates.
    # NOTE(review): assumes user/alcohol ids are contiguous starting at 1.
    with open('recommend/answer_cf.csv', 'w', encoding='utf-8', newline='') as csv_file:
        header = ['user_id', 'alcohol_id', 'predicted_value']
        writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL)
        writer.writerow(header)
        for j in range(1, user_num + 1):
            user_id = j
            for i in range(1, alcohol_num + 1):
                item_id = i
                pred = algo.predict(uid=str(user_id), iid=str(item_id))
                row = []
                row += [pred.uid, pred.iid, pred.est]
                writer.writerow(row)
def collaborative_filter(id, new_words):
    """Cross-validate a KNN recommender on the ratings built from new_words,
    then predict unseen items and persist the top-3 per user to top_n.json.

    Returns:
        dict of top-3 recommendations per user.
    """
    ratings_dict = calc_collaborative_param(new_words, id)
    df = pd.DataFrame(ratings_dict)
    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(0.0, 5.0))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    algo = KNNBasic()
    # 3-fold cross-validation; report RMSE for each fold.
    kf = KFold(n_splits=3)
    for fold_train, fold_test in kf.split(data):
        algo.fit(fold_train)
        fold_predictions = algo.test(fold_test)
        accuracy.rmse(fold_predictions, verbose=True)
    # Predict every pair absent from the full trainset with the last-fold model.
    full_trainset = data.build_full_trainset()
    predictions = algo.test(full_trainset.build_anti_testset())
    top_n = get_top_n(predictions, n=3)
    with open('top_n.json', 'w') as fp:
        dump(top_n, fp, indent=4)
    return top_n
def run_KNN(x_train, x_test, k):
    """Fit KNNBasic on x_train, evaluate on x_test, and print precision,
    recall and APK metrics at cutoff k."""
    reader = Reader(rating_scale=(1, 5))
    train_dataset = Dataset.load_from_df(
        x_train[['userId', 'movieId', 'rating']], reader)
    test_dataset = Dataset.load_from_df(
        x_test[['userId', 'movieId', 'rating']], reader)
    fitted_trainset = train_dataset.build_full_trainset()
    # The test frame is loaded as a full trainset, then flattened to a testset.
    eval_testset = test_dataset.build_full_trainset().build_testset()
    algo = KNNBasic()
    algo.fit(fitted_trainset)
    pr = algo.test(eval_testset)
    rec = format_baselines(pr)
    seen = format_baselines_apk(pr, x_test)
    predicted, actual = format_baselines_third(pr, x_test)
    print(predicted)
    print(actual)
    print(f'Alternative Precision {recommender_precision(predicted, actual)}')
    print(f'Alternative Recall {recommender_recall(predicted, actual)}')
    print(f'APK: {yallah(seen, k)}')
    precisions, recalls = precision_recall_at_k(rec, k)
    print(
        f'|KNN : Precision| = {sum(prec for prec in precisions.values()) / len(precisions)}'
    )
    print(
        f'|KNN : Recall| = {sum(rec for rec in recalls.values()) / len(recalls)}'
    )
def user_based_rec_loader(data, ml, userID, no_recs):
    """Generate recommendations for userID via user-user cosine KNN."""
    trainSet = data.build_full_trainset()
    model = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    model.fit(trainSet)
    similarity_matrix = model.compute_similarities()
    userIDInnerID = trainSet.to_inner_uid(userID)
    # Pair every *other* user with their similarity to the test user.
    similarUsers = [
        (inner_id, score)
        for inner_id, score in enumerate(similarity_matrix[userIDInnerID])
        if inner_id != userIDInnerID
    ]
    # Keep the k users with the largest similarities.
    k = 15
    kNeighbours = heapq.nlargest(k, similarUsers, key=lambda pair: pair[1])
    # Alternative tuning: keep users whose similarity exceeds a threshold
    # instead of a fixed k.
    return get_recommendations(ml, no_recs, trainSet, similarity_matrix,
                               kNeighbours, userIDInnerID, rec_type='user')
def item_based_rec_loader(data, ml, userID, no_recs):
    """Generate recommendations for userID via item-item cosine KNN."""
    trainSet = data.build_full_trainset()
    # user_based=False tells KNN to build an item-item similarity matrix.
    model = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    model.fit(trainSet)
    similarity_matrix = model.compute_similarities()
    userIDInnerID = trainSet.to_inner_uid(userID)
    # Seed with the k items this user rated highest.
    k = 15
    kNeighbours = heapq.nlargest(k, trainSet.ur[userIDInnerID],
                                 key=lambda pair: pair[1])
    # Alternative tuning: keep only items rated above a threshold instead
    # of a fixed top-k.
    return get_recommendations(ml, no_recs, trainSet, similarity_matrix,
                               kNeighbours, userIDInnerID, rec_type='item')
def use_cosine_similarity():
    """Train item-based cosine KNN on ml-100k.

    Returns:
        list: [rmse, mae, elapsed_seconds].
    """
    start = time.time()
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    print('Using cosine similarity')
    # Compute similarities between items rather than users.
    sim_options = {'name': 'cosine', 'user_based': False}
    algo_cosine = KNNBasic(sim_options=sim_options)
    algo_cosine.fit(trainset)
    predictions_KNN = algo_cosine.test(trainset.build_anti_testset())
    performance = [accuracy.rmse(predictions_KNN), accuracy.mae(predictions_KNN)]
    performance.append(time.time() - start)
    return performance
def KNN_Tester(trainset, testset, algo, data=None):
    """Grid-search k and similarity metric for a KNN algorithm, retrain the
    best configuration, and report accuracy metrics.

    Args:
        trainset: surprise trainset for the final fit.
        testset: held-out test set for the final evaluation.
        algo: KNN algorithm *class* (e.g. KNNBasic) handed to GridSearchCV.
        data: surprise Dataset for the cross-validated grid search.
            BUG FIX: the original referenced a bare global `data` inside the
            function; it is now an explicit, backward-compatible parameter
            that falls back to the module-level global when omitted.

    Returns:
        dict with 'rmse', 'avg_precision', 'avg_recall', 'best_parameters'.
    """
    if data is None:
        data = globals()['data']  # legacy behavior: use the module-level dataset
    param_grid = {
        'k': [50, 100],
        'sim_options': {'name': ['msd', 'cosine', 'pearson']}
    }
    gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=5)
    gs.fit(data)
    params = gs.best_params['rmse']
    # Retrain the winning configuration on the provided trainset.
    best_algo = KNNBasic(k=params['k'], sim_options=params['sim_options'])
    best_algo.fit(trainset)
    predictions = best_algo.test(testset)
    rmse = accuracy.rmse(predictions)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
    avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    return {
        'rmse': rmse,
        'avg_precision': avg_precision,
        'avg_recall': avg_recall,
        'best_parameters': params,
    }
class KNN_Basic(BaseSurpriseSTLEstimator):
    """Surprise KNNBasic wrapped as an STL estimator.

    Args:
        :attr:`k` (int): number of neighbors
        :attr:`sim_options` (optional): option from surprise for a similarity metric
    """

    def __init__(self, k, name='KNN_Basic', sim_options=None):
        super().__init__(name, 'non_feature_based')
        self.k = k
        # Retained so the model can be rebuilt when hyper-params change.
        self.sim_options = sim_options
        self._build_model()

    def _build_model(self):
        # (Re)create the underlying surprise model from current hyper-params.
        if self.sim_options is not None:
            self.model = KNNBasic(k=self.k, verbose=False,
                                  sim_options=self.sim_options)
        else:
            self.model = KNNBasic(k=self.k, verbose=False)

    def _fit(self, x):
        self.model.fit(x)

    def _predict(self, x):
        return self.model.test(x)

    def get_hyper_params(self):
        """Search space: integer k in [2, 13]."""
        hparams = {'k': {'type': 'integer', 'values': [2, 13]}}
        return hparams

    def set_hyper_params(self, **kwargs):
        # BUG FIX: the original only updated self.k, leaving self.model built
        # with the old k — hyper-parameter tuning silently had no effect.
        # Rebuild the model so the new k is actually used.
        self.k = kwargs['k']
        self._build_model()

    def similarity_matrix(self):
        return self.model.compute_similarities()
def train_item_rec_sys():
    """Train the item-recommendation KNNBasic model and pickle it.

    Yields
    ------
    similar_items_algo.pkl
    """
    ratings_df = pd.read_csv("datasets/item_rec_sys_data.csv")
    # Build the surprise Dataset on a 1-5 rating scale.
    data = Dataset.load_from_df(df=ratings_df, reader=Reader(rating_scale=(1, 5)))
    trainset = data.build_full_trainset()
    # Item-item cosine KNN with 10 neighbors.
    algo = KNNBasic(k=10, sim_options={"name": "cosine", "user_based": False},
                    verbose=False)
    algo.fit(trainset)
    # Persist the raw<->inner item id mappings.
    _compute_inner_item_ids(ratings_df, algo=algo, trainset=trainset)
    # Pickle the trained model next to the other artifacts.
    dump.dump(Path.cwd() / "models/similar_items_algo.pkl", algo=algo)
    return
def cal_KNNBasic(trainset, df):
    """Fit user-based cosine KNNBasic, predict each (user, store) rating in df,
    and save the predictions with absolute error to save_file2.

    BUG FIXES vs. the original:
    - the sim_options key was 'user-based' (hyphen); surprise only reads
      'user_based', so the intended similarity configuration was silently
      ignored.
    - predictions were made with an undefined global `algo` instead of the
      `algo_knnb` model trained here.
    """
    sim_options = {'name': 'cosine', 'user_based': True}
    algo_knnb = KNNBasic(k=40, min_k=1, sim_options=sim_options)
    algo_knnb.fit(trainset)
    users, items, real, estimate = [], [], [], []
    # itertuples replaces the original O(n) row-slicing per index.
    for row in df.itertuples(index=False):
        uid, iid, r_ui = row.user, row.store, row.stars
        users.append(uid)
        items.append(iid)
        real.append(r_ui)
        pred = algo_knnb.predict(uid, iid, r_ui, verbose=True)
        # pred.est is what the original recovered via pred[-2] on the
        # Prediction namedtuple.
        estimate.append(pred.est)
    print("end")
    df3 = pd.DataFrame({'user': users, 'item': items, 'r_ui': real, 'est': estimate})
    df3['err'] = abs(df3.est - df3.r_ui)
    df3.to_csv(save_file2)
def predict_ratings(data):
    """Fit SVD on the full dataset and predict ratings for all unseen pairs.

    Instead of running cross-validation, the algorithm is fit on the whole
    dataset via build_full_trainset(); individual ratings can then be
    predicted directly with predict().

    :param data: surprise Dataset
    :return: list of Prediction objects for the anti-testset
    """
    trainset = data.build_full_trainset()
    svg = SVD()
    svg.fit(trainset)
    predictions = svg.test(trainset.build_anti_testset())
    # NOTE(review): the original also fit a KNNBasic model and built uid/iid
    # strings (user 196, item 302, true rating 4) that were never used —
    # dead code, removed. A single prediction would look like:
    # svg.predict(str(196), str(302), r_ui=4, verbose=True)
    return predictions
def create_KNNmodel(trainset, k=50, min_k=5, user_based=True, random_state=12345):
    """Train a KNNBasic model given a training set and model parameters.

    Arguments:
        trainset {surprise.trainset.Trainset} -- training set, output from build_trainset
        k {int} -- max number of neighbors (default: {50})
        min_k {int} -- minimum neighbors for a prediction (default: {5})
        user_based {bool} -- user-based (True) or item-based (False) similarity
        random_state {int} -- kept only for interface compatibility; KNNBasic
            is deterministic and accepts no seed, so it is unused.

    Returns:
        model {surprise.prediction_algorithms.knns.KNNBasic} -- trained model object
    """
    # BUG FIX: KNNBasic has no `user_based` or `random_state` keyword
    # arguments; the original passed them as loose kwargs, which surprise
    # silently ignored, so `user_based` never had any effect. The setting
    # belongs inside sim_options.
    model = KNNBasic(k=k, min_k=min_k, sim_options={'user_based': user_based})
    model.fit(trainset)
    return model
def train(self, df, model_path=''):
    '''Train the collaborative-filtering model.

    :param df: DataFrame with three columns -> userid, itemid, rating
    :param model_path: persistence path; empty string (default) skips persisting
    :return: the trained model

    Uses instance attributes: k (cluster/neighbor count), min_k (minimum
    neighbors), sim_name (similarity measure, cosine by default) and
    user_based (CF basis, item-based by default).
    '''
    print('begin to train')
    # Convert the DataFrame into the format surprise requires.
    full_trainset = Dataset.load_from_df(df, self.reader).build_full_trainset()
    # Train the (item-based by default) KNN model, then optionally persist it.
    algo_knnbasic = KNNBasic(k=self.k, min_k=self.min_k,
                             sim_options={'name': self.sim_name,
                                          'user_based': self.user_based},
                             verbose=True)
    algo_knnbasic.fit(full_trainset)
    if model_path:
        surprise.dump.dump(model_path, algo=algo_knnbasic, verbose=1)
    return algo_knnbasic
def Basic_CF(self):
    """Evaluate user-based cosine KNNBasic on every (user, item) rating of the
    users in self.list, storing estimates in self.df_est and NDCG in
    self.CF_ndcg_.

    BUG FIX: the original inner loop ran `for i in range(1, len(a))` over
    slices `[i-1:i]`, which silently skipped each user's *last* rating row
    (an off-by-one). All of a user's rows are now included.
    """
    u_id, I_id = [], []
    r_ui_ = np.array([])
    _est = np.array([])
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBasic(k=40, min_k=1, sim_options=sim_options)
    algo.fit(self.trainset)
    for uid in self.list:
        user_rows = self.data[self.data.uid == uid]
        for row in user_rows.itertuples(index=False):
            pred = algo.predict(uid, row.lid, row.rate, verbose=True)
            u_id.append(int(pred.uid))
            I_id.append(int(pred.iid))
            r_ui_ = np.append(r_ui_, pred.r_ui)
            _est = np.append(_est, pred.est)
    self.df_est = pd.DataFrame({
        'uid': u_id,
        'Iid': I_id,
        'r_ui': r_ui_,
        'est': _est
    })
    self.arr = self.df_est['uid'].unique()
    self.CF_ndcg_ = self.Calculate_NDCG()
def use_pearson_baseline():
    """Train KNN with Pearson-baseline similarity on ml-100k.

    Returns:
        list: [rmse, mae, elapsed_seconds].
    """
    start = time.time()
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    print('Using Pearson baseline')
    # shrinkage=0 disables shrinkage in the baseline similarity.
    sim_options = {'name': 'pearson_baseline', 'shrinkage': 0}
    algo_pearson = KNNBasic(sim_options=sim_options)
    algo_pearson.fit(trainset)
    predictions_KNN = algo_pearson.test(trainset.build_anti_testset())
    performance = [accuracy.rmse(predictions_KNN), accuracy.mae(predictions_KNN)]
    performance.append(time.time() - start)
    return performance
def rodar_modelo(data, teste_tamanho, sim_opcoes, k):
    """Split the data, fit a KNNBasic model, print its RMSE, and return it.

    Args:
        data: surprise Dataset.
        teste_tamanho: test-split fraction/size passed to train_test_split.
        sim_opcoes: surprise sim_options dict.
        k: number of neighbors.
    """
    conjunto_treino, conjunto_teste = train_test_split(data, teste_tamanho)
    modelo = KNNBasic(k=k, sim_options=sim_opcoes)
    modelo.fit(conjunto_treino)
    accuracy.rmse(modelo.test(conjunto_teste))
    return modelo
def detail(request, post_id):
    """Restaurant detail view with a KNN-predicted rating for the current user."""
    # Train a KNN model on the stars CSV to predict this user's rating.
    file_path = os.path.expanduser('stars.csv')
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()
    algo = KNNBasic()
    algo.fit(trainset)
    # BUG FIX: the original used str(request.user.is_authenticated) — i.e. the
    # literal "True"/"False" — as the user id (its own comment flagged that the
    # real user id was needed). Use the actual id; anonymous users get id None
    # and fall through to surprise's unknown-user default prediction.
    uid = str(request.user.id)
    iid = str(post_id)  # raw item id (as in the ratings file). They are **strings**!
    pred = algo.predict(uid, iid, r_ui=4, verbose=True)  # predicted rating
    group = Matzip_list.objects.get(id=post_id)
    if not request.user.is_anonymous:
        existing = request.user.star_set.all().filter(matzip_id=post_id).first()
        if existing:
            my_rate = existing.rate
            is_rated = 1
        else:
            # NOTE(review): this passes the whole Prediction object as the
            # rate; pred.est may be intended — confirm against the template.
            my_rate = pred
            is_rated = 0
    else:
        my_rate = "로그인을 해주세요"
        is_rated = 2
    images = re.sub("]|\[|'", "", group.images_url_preprocess).strip().split(',')
    context = {
        'group': group,
        'images': images,
        'my_rate': my_rate,
        'is_rated': is_rated,
        'pred': pred,
    }
    return render(request, 'posts/detail.html', context)
def simpleItemCFGive(id):
    """Item-based CF recommendations for one user, rewritten into a results file.

    Rebuilds the record for `id` in SimpleItemCFBase.txt: lines for other
    users are kept, any previous line for this user is dropped, and a fresh
    "id,movie1,movie2,..." line is appended.
    """
    testSubject = str(id)
    k = 10
    ml = MovieLens()
    data = ml.loadMovieLensLatestSmall()
    trainSet = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': False}
    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()
    testUserInnerID = trainSet.to_inner_uid(testSubject)
    # The top K items this user rated seed the candidate scores.
    testUserRatings = trainSet.ur[testUserInnerID]
    kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])
    # Score similar items, weighted by the user's rating of the seed item.
    candidates = defaultdict(float)
    for itemID, rating in kNeighbors:
        for innerID, score in enumerate(simsMatrix[itemID]):
            candidates[innerID] += score * (rating / 5.0)
    # Items the user has already seen are excluded from the output.
    watched = {itemID: 1 for itemID, rating in trainSet.ur[testUserInnerID]}
    # Assemble the "id,title,title,..." line from the top unseen candidates.
    s = "\n" + str(id)
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1),
                                    reverse=True):
        if not itemID in watched:
            movieID = trainSet.to_raw_iid(itemID)
            s += "," + ml.getMovieName(int(movieID))
            pos += 1
            if (pos > 10):
                break
    # BUG FIX (robustness): file handles were opened and closed manually; an
    # exception mid-write leaked the handle. Use context managers instead.
    with open("E:\\Neeraj\\SimpleItemCFBase.txt", "r") as file:
        alld = file.readlines()
    with open("E:\\Neeraj\\SimpleItemCFBase.txt", "w") as file1:
        for r1 in alld:
            print(r1)
            u = r1.find(",")
            # Drop any stale line belonging to this user.
            if (r1[0:u] == str(id)):
                pass
            else:
                file1.write(r1)
        file1.write(s)
    print("\nDone")
def get_accuracy(df, genre, neighbors=30, min_neighbors=5, seed=12345,
                 kfolds=5, k=5, threshold=4):
    """Cross-validated precision and recall of the KNN model for one genre.

    Args:
        df (pandas.DataFrame): the dataset of actual ratings.
        genre (str): the genre for the model.
        neighbors (int): neighbors considered when training. Default 30.
        min_neighbors (int): neighbors a user must have to get a prediction.
            Default 5.
        seed (int): random state. Default 12345.
        kfolds (int): folds for cross-validation. Default 5.
        k (int): recommendations per user. Default 5.
        threshold (int): cutoff rating at which an item counts as 'enjoyed'.

    Returns:
        prec (float): precision averaged over the kfolds folds.
        rec (float): recall averaged over the kfolds folds.
    """
    genre_ratings = df[df['genre'] == genre][['user_id', 'book_id', 'rating']]
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(
        genre_ratings[['user_id', 'book_id', 'rating']], reader)
    algo_KNNbasic = KNNBasic(k=neighbors, min_k=min_neighbors,
                             random_state=seed)
    kf = KFold(n_splits=kfolds, random_state=seed)
    prec_list, recalls_list = [], []
    for trainset, testset in kf.split(data):
        algo_KNNbasic.fit(trainset)
        predictions = algo_KNNbasic.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=k,
                                                    threshold=threshold)
        # Precision and recall averaged over all users in this fold.
        precision = sum(prec for prec in precisions.values()) / len(precisions)
        recall = sum(rec for rec in recalls.values()) / len(recalls)
        logger.info("Precision:")
        logger.info(precision)
        logger.info("Recall")
        logger.info(recall)
        prec_list.append(precision)
        recalls_list.append(recall)
    return (sum(prec_list) / len(prec_list),
            sum(recalls_list) / len(recalls_list))
class KNN_ensemble:
    """Ensemble recommender: personal-KNN candidates re-ranked by item-item
    similarity to an input movie.

    mode selects the similarity model: 0 -> KNNBaseline, 1 -> KNNWithMeans,
    2 -> KNNBasic.
    """

    def __init__(self, mode=0):
        # self.movie = Movie_KNN_recommender()
        self.user = Personal_KNN_recommender()
        self.index = pd.read_csv('../data/personal/movies.csv')
        self.reader = Reader()
        self.ratings = pd.read_csv('../data/personal/ratings.csv')
        data = Dataset.load_from_df(
            self.ratings[['userId', 'movieId', 'rating']], self.reader)
        trainset = data.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        if mode == 0:
            self.algo = KNNBaseline(sim_options=sim_options)
        elif mode == 1:
            self.algo = KNNWithMeans(sim_options=sim_options)
        elif mode == 2:
            self.algo = KNNBasic(sim_options=sim_options)
        else:
            # BUG FIX: the original called exit(0) here, terminating the whole
            # process with a *success* code on an invalid argument.
            raise ValueError(f'invalid mode: {mode}')
        self.algo.fit(trainset)
        self.sim = self.algo.compute_similarities()

    def cal_similarity(self, movieID, waitingID):
        """Similarity between two movies identified by raw ids."""
        movie_inner_id = self.algo.trainset.to_inner_iid(movieID)
        waiting_inner_id = self.algo.trainset.to_inner_iid(waitingID)
        return self.sim[movie_inner_id, waiting_inner_id]

    def showSeenMovies(self, usrID):
        """Print the titles of every movie this user has rated."""
        print("\n\nThe user has seen movies below: ")
        movies = []
        for i in range(len(self.ratings['userId'])):
            if self.ratings['userId'][i] == usrID:
                movies.append(self.index[self.index.movieId ==
                                         self.ratings['movieId'][i]]['title'])
        for i in movies:
            print(i.values[0])

    def showInputMovie(self, movieID):
        """Print the title of the query movie."""
        print("\n\nThe user's input movie is: ")
        print(self.index[self.index.movieId == movieID]['title'])
        print('\n\n')

    def recommend(self, usrID, movieID, num=10):
        """Return up to `num` titles: personal candidates sorted by their
        similarity to movieID."""
        self.showSeenMovies(usrID)
        self.showInputMovie(movieID)
        _, first_ids = self.user.recommend(usrID, 50)
        similarity = {}
        for i in first_ids:
            similarity[i] = self.cal_similarity(movieID, i)
        # Sort the candidates by similarity, descending.
        result = sorted(similarity.items(), key=lambda x: x[1], reverse=True)
        result = result[:num]
        movie = []
        for i in result:
            movie.append(self.index[self.index.movieId == i[0]]['title'])
        return movie
def item_based_rec_loader(data, testUser, no_recs):
    """Item-item CF recommendations for testUser, returned as a DataFrame.

    NOTE(review): relies on a module-level `ml` for getItemName — confirm it
    is defined wherever this function is called.
    """
    trainSet = data.build_full_trainset()
    # user_based=False tells KNN to build an item-item similarity matrix.
    model = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    model.fit(trainSet)
    similarity_matrix = model.compute_similarities()
    testUserInnerID = trainSet.to_inner_uid(testUser)
    # Seed with items the user rated above 4.0 (a fixed top-k, as in the
    # user-based variant, is the alternative).
    testUserRatings = trainSet.ur[testUserInnerID]
    kNeighbors = [rating for rating in testUserRatings if rating[1] > 4.0]
    # Score similar items, weighted by the user's rating of the seed item.
    candidates = defaultdict(float)
    for itemID, rating in kNeighbors:
        for innerID, score in enumerate(similarity_matrix[itemID]):
            candidates[innerID] += score * (rating / 5.0)
    # Items the user has already rated are excluded from the output.
    excluded = {itemID: 1 for itemID, rating in trainSet.ur[testUserInnerID]}
    results = {'book': [], 'rating_sum': []}
    # Collect the top-scoring unseen items.
    print('\n')
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1),
                                    reverse=True):
        if not itemID in excluded:
            bookID = trainSet.to_raw_iid(itemID)
            results['book'].append(ml.getItemName(int(bookID)))
            results['rating_sum'].append(ratingSum)
            pos += 1
            if (pos > no_recs - 1):
                break
    return pd.DataFrame(results)
def main():
    """Train KNNBasic on a 75/25 split of ml-100k and report RMSE."""
    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, test_size=.25)
    algo = KNNBasic()
    # Fit on the training split, then predict the held-out ratings.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Root-mean-squared error on the test split.
    score = accuracy.rmse(predictions)
    print('rmse: ', score)
def f_rs_cr_sim_matrix(train_set, xlsx_file, user_base, mtx_measure, df_idx):
    """Build a KNN similarity matrix, label its columns, and export it to Excel.

    Args:
        train_set: surprise trainset to fit on.
        xlsx_file: output .xlsx path.
        user_base (bool): user-based (True) or item-based (False) similarity.
        mtx_measure (str): similarity name (cosine/msd/pearson/...).
        df_idx: column labels for the resulting DataFrame.

    Returns:
        pandas.DataFrame of pairwise similarities.
    """
    sim_opt = {'name': mtx_measure, 'user_based': user_base}
    model = KNNBasic(sim_options=sim_opt, verbose=False)
    model.fit(train_set)
    simsMatrix = model.compute_similarities()
    # BUG FIX: the original called `Print(...)` (capital P), a NameError at
    # runtime. Also removed the dead `df_index = df_idx` local assignment.
    print("Complete Matrix -", mtx_measure)
    df = pd.DataFrame(simsMatrix)
    df.columns = df_idx
    df.to_excel(xlsx_file)
    return df
def train_model():
    """Train a user-based KNNBasic model from user_item_rate.csv and return it."""
    csv_path = os.path.expanduser('user_item_rate.csv')
    rating_reader = Reader(line_format='user item rating', sep=',')
    dataset = Dataset.load_from_file(csv_path, reader=rating_reader)
    full_trainset = dataset.build_full_trainset()
    # sim_options={'name': 'cosine','user_based': True} cosine/msd/pearson/pearson_baseline
    algo = KNNBasic(k=40, min_k=3, sim_options={'user_based': True})
    algo.fit(full_trainset)
    return algo
def fit(self):
    """Load the rating matrix, train a user-based cosine KNN model, and cache
    the trainset and similarity matrix on the instance."""
    self.dl = DataLoader()
    rating_data = self.dl.load_rating_matrix()
    self.train_set = rating_data.build_full_trainset()
    knn_model = KNNBasic(sim_options={'name': 'cosine', 'user_based': True})
    knn_model.fit(self.train_set)
    self.sim_matrix = knn_model.compute_similarities()
def run_collaborative_filtering():
    """Train a user-based Pearson-baseline KNN on ml-100k and cache the model
    and its top recommendations in module-level globals."""
    global top_recommendations
    global knn
    data = Dataset.load_builtin("ml-100k")
    training_set = data.build_full_trainset()
    knn = KNNBasic(sim_options={'name': 'pearson_baseline', 'user_based': True})
    knn.fit(training_set)
    # Predict every (user, item) pair missing from the trainset.
    predictions = knn.test(training_set.build_anti_testset())
    top_recommendations = get_top_recommendations(predictions)
    return 'OK'
def test_nearest_neighbors():
    """Ensure the nearest neighbors are different when using user-user
    similarity vs item-item."""
    # BUG FIX: Dataset.load_from_file() takes no rating_scale argument (the
    # original call raised TypeError); the rating scale belongs on the Reader.
    reader = Reader(line_format='user item rating', sep=' ',
                    rating_scale=(1, 5), skip_lines=3)
    data_file = os.path.dirname(os.path.realpath(__file__)) + '/custom_train'
    data = Dataset.load_from_file(data_file, reader)
    trainset = data.build_full_trainset()
    algo_ub = KNNBasic(sim_options={'user_based': True})
    algo_ub.fit(trainset)
    algo_ib = KNNBasic(sim_options={'user_based': False})
    algo_ib.fit(trainset)
    assert algo_ub.get_neighbors(0, k=10) != algo_ib.get_neighbors(0, k=10)