import heapq
from collections import defaultdict
from operator import itemgetter

from surprise import Dataset, KNNBasic, Reader

# Project-local helpers assumed by this snippet.
from EvaluationData import EvaluationData
from RecommenderMetrics import RecommenderMetrics


class SimpleCF:

    def __init__(self, df, user_based=False):
        self.df = df
        self.user_based = user_based
        # Only the Reader's rating_scale is used by load_from_df.
        reader = Reader(line_format='user item rating')
        data = Dataset.load_from_df(df=self.df, reader=reader)
        self.eval_data = EvaluationData(data)
        sim_options = {'name': 'cosine', 'user_based': self.user_based}
        self.model = KNNBasic(sim_options=sim_options)

    def item_based_cf(self, k=10, eval=False):
        # Expects the class to be built with user_based=False (the default),
        # so that simsMatrix is item-item. Note `eval` shadows the builtin;
        # kept to preserve the original signature.
        topN = defaultdict(list)
        testSet = self.eval_data.GetLOOCVTestSet()
        if not eval:
            trainSet = self.eval_data.GetFullTrainSet()
        else:
            trainSet = self.eval_data.GetLOOCVTrainSet()
        self.model.fit(trainSet)
        simsMatrix = self.model.compute_similarities()
        for uiid in range(trainSet.n_users):  # n_users is an attribute, not a method
            # Start from the user's k highest-rated items...
            testUserRatings = trainSet.ur[uiid]
            KNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])
            # ...and score candidates by item-item similarity, weighted by the
            # user's rating (normalized to 0-1).
            candidates = defaultdict(float)
            for itemID, rating in KNeighbors:
                similarityRow = simsMatrix[itemID]
                for item_innerID, item_score in enumerate(similarityRow):
                    candidates[item_innerID] += item_score * (rating / 5.0)
            # Exclude items the user has already rated.
            watched = {}
            for itemID, rating in trainSet.ur[uiid]:
                watched[itemID] = 1
            pos = 0
            for itemID, ratingSum in sorted(candidates.items(),
                                            key=itemgetter(1), reverse=True):
                if itemID not in watched:
                    topN[trainSet.to_raw_uid(uiid)].append(
                        (trainSet.to_raw_iid(itemID), ratingSum))
                    pos += 1
                    if pos > 10:
                        break
        if not eval:
            return topN
        return RecommenderMetrics.HitRate(topN, testSet)

    def user_based_cf(self, k=10, eval=True):
        # Expects the class to be built with user_based=True, so that
        # simsMatrix is user-user.
        topN = defaultdict(list)
        testSet = self.eval_data.GetLOOCVTestSet()
        if not eval:
            trainSet = self.eval_data.GetFullTrainSet()
        else:
            trainSet = self.eval_data.GetLOOCVTrainSet()
        self.model.fit(trainSet)
        simsMatrix = self.model.compute_similarities()
        for uiid in range(trainSet.n_users):
            # Collect every other user with their similarity to this user.
            similarityRow = simsMatrix[uiid]
            similarUsers = []
            for innerID, score in enumerate(similarityRow):
                if innerID != uiid:
                    similarUsers.append((innerID, score))
            KNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
            # Weight each neighbor's ratings by the neighbor's similarity.
            candidates = defaultdict(float)
            for similarUser in KNeighbors:
                innerID = similarUser[0]
                userSimilarityScore = similarUser[1]
                theirRatings = trainSet.ur[innerID]
                for rating in theirRatings:
                    candidates[rating[0]] += userSimilarityScore * (rating[1] / 5.0)
            watched = {}
            for itemID, rating in trainSet.ur[uiid]:
                watched[itemID] = 1
            pos = 0
            for itemID, ratingSum in sorted(candidates.items(),
                                            key=itemgetter(1), reverse=True):
                if itemID not in watched:
                    topN[trainSet.to_raw_uid(uiid)].append(
                        (trainSet.to_raw_iid(itemID), ratingSum))
                    pos += 1
                    if pos > 10:
                        break
        if not eval:
            return topN
        return RecommenderMetrics.HitRate(topN, testSet)
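A minimal usage sketch for SimpleCF, assuming a ratings DataFrame with columns ordered as user, item, rating; the column names and values below are hypothetical, and the project-local EvaluationData / RecommenderMetrics helpers must be importable.

import pandas as pd

# Hypothetical ratings frame; load_from_df reads columns by position
# (user, item, rating).
ratings = pd.DataFrame({
    'userID': ['1', '1', '2', '2', '3'],
    'itemID': ['a', 'b', 'a', 'c', 'b'],
    'rating': [5.0, 3.0, 4.0, 2.0, 5.0],
})

cf = SimpleCF(ratings, user_based=True)
top_n = cf.user_based_cf(k=10, eval=False)    # raw top-N lists per user
hit_rate = cf.user_based_cf(k=10, eval=True)  # leave-one-out hit rate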
import heapq
from collections import defaultdict
from operator import itemgetter

from surprise import KNNBasic

from MovieLens import MovieLens  # project-local data loader (assumed)

testSubject = '85'
k = 10

# Load our data set and compute the user similarity matrix
ml = MovieLens()
data = ml.loadMovieLensLatestSmall()
trainSet = data.build_full_trainset()

sim_options = {'name': 'cosine', 'user_based': True}
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
simsMatrix = model.compute_similarities()

# Get top N similar users to our test subject
# (An alternate approach would be to select users up to some similarity threshold - try it!)
testUserInnerID = trainSet.to_inner_uid(testSubject)
similarityRow = simsMatrix[testUserInnerID]

similarUsers = []
for innerID, score in enumerate(similarityRow):
    if innerID != testUserInnerID:
        similarUsers.append((innerID, score))

kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

# Get the stuff they rated, and add up ratings for each item, weighted by user similarity
candidates = defaultdict(float)
# (The snippet is truncated here in the source.)
import heapq
from collections import defaultdict
from operator import itemgetter

from surprise import KNNBasic

from MovieLens import MovieLens  # project-local data loader (assumed)


def runUserColaborativeFiltering(testSubject="85", k=14):
    # Load our data set and compute the user similarity matrix
    ml = MovieLens()
    data = ml.loadMovieLensLatestSmall()
    # The data is a surprise.dataset.DatasetAutoFolds.
    # To get the raw ratings use data.raw_ratings; the format is:
    #   userID movieID rating timestamp
    # Trainsets are different from Datasets. You can think of a Dataset as the
    # raw data, and a Trainset as higher-level data where useful methods are
    # defined. build_full_trainset() builds a trainset over the entire dataset.
    trainSet = data.build_full_trainset()

    # Options for similarity calculations
    sim_options = {'name': 'cosine', 'user_based': True}
    model = KNNBasic(sim_options=sim_options)
    # fit() must be called on a Trainset, not directly on the raw Dataset.
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()

    # Get top N similar users to our test subject
    # (An alternate approach would be to select users up to some similarity threshold)
    testUserInnerID = trainSet.to_inner_uid(testSubject)
    similarityRow = simsMatrix[testUserInnerID]

    similarUsers = []
    for innerID, score in enumerate(similarityRow):
        if innerID != testUserInnerID:
            similarUsers.append((innerID, score))

    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

    # Get the stuff they rated, and add up ratings for each item,
    # weighted by user similarity
    candidates = defaultdict(float)
    for similarUser in kNeighbors:
        innerID = similarUser[0]
        userSimilarityScore = similarUser[1]
        theirRatings = trainSet.ur[innerID]
        for rating in theirRatings:
            candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

    # Build a dictionary of stuff the user has already seen
    watched = {}
    for itemID, rating in trainSet.ur[testUserInnerID]:
        watched[itemID] = 1

    # Get top-rated items from similar users:
    recommendations = []
    pos = 0
    print("\n\n-------------------<><><><>--------------------")
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID not in watched:
            movieID = int(float(trainSet.to_raw_iid(itemID)))
            recommendations.append(movieID)
            print(ml.getMovieName(movieID), ratingSum)
            pos += 1
            if pos > 20:
                break
    print("-------------------<><><><>--------------------")

    # These are the IDs in the MovieLens dataset.
    return recommendations
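A hypothetical invocation of the function above; user "85" matches the test subject used elsewhere in these snippets.

# Example call: print and return collaborative-filtering picks for user "85",
# scored from that user's 14 most similar neighbors.
recs = runUserColaborativeFiltering(testSubject="85", k=14)
print(recs)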
def getRecc(self, testSubject):
    print("Making Recommendation for user:", testSubject)

    # [Elided in the source: the neighbor-selection and candidate-scoring code
    #  that builds `trainSet`, `candidates`, `watched`, and `watchedList`;
    #  it follows the same pattern as the other snippets in this collection.]
    print("\nWatched:", sorted(watchedList))

    # Get top-rated items from similar users:
    print("\nCollab Filt Recc:")
    pos = 0
    finalReccs = []
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID not in watched:
            movieID = trainSet.to_raw_iid(itemID)
            finalReccs.append(movieID)
            print(movieID)
            pos += 1
            if pos > 8:
                break
    return finalReccs
import heapq
from collections import defaultdict
from operator import itemgetter

import numpy as np
from django.contrib import messages
from django.shortcuts import redirect, render
from surprise import KNNBasic, SVD

# Project-local imports (assumed): the Django models and the data loader.
from .models import Book, Member
from .BooksData import BooksData


def recommendations(request):
    if Member.objects.filter(user=request.user).first() is None:
        messages.warning(request, 'You need to first update your profile.')
        return redirect('profile')

    testSubject = str(request.user.id)
    k = 10
    try:
        bk = BooksData('data/')
        data = bk.loadBooksData()
        trainSet = data.build_full_trainset()

        sim_options = {'name': 'cosine', 'user_based': True}
        model = KNNBasic(sim_options=sim_options)
        model.fit(trainSet)
        simsMatrix = model.compute_similarities()
        simsMatrix = np.nan_to_num(simsMatrix)

        # Get top N similar users to our test subject
        testUserInnerID = trainSet.to_inner_uid(testSubject)

        if sim_options['user_based']:
            # User-based: score candidate books from the k most similar users,
            # weighting each neighbor's ratings by their similarity
            # (ratings are on a 1-10 scale, hence the / 10.0).
            similarityRow = simsMatrix[testUserInnerID]
            similarUsers = []
            for innerID, score in enumerate(similarityRow):
                if innerID != testUserInnerID:
                    similarUsers.append((innerID, score))
            kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

            candidates = defaultdict(float)
            for similarUser in kNeighbors:
                innerID = similarUser[0]
                userSimilarityScore = similarUser[1]
                theirRatings = trainSet.ur[innerID]
                for rating in theirRatings:
                    candidates[rating[0]] += (rating[1] / 10.0) * userSimilarityScore
        else:
            # Item-based: start from the user's k highest-rated books and
            # score candidates by item-item similarity.
            testUserRatings = trainSet.ur[testUserInnerID]
            kNeighbors = heapq.nlargest(k, testUserRatings, key=lambda t: t[1])

            candidates = defaultdict(float)
            for itemID, rating in kNeighbors:
                similarityRow = simsMatrix[itemID]
                for innerID, score in enumerate(similarityRow):
                    candidates[innerID] += score * (rating / 10.0)

        # Build a dictionary of books the user has already read
        read = {}
        for itemID, rating in trainSet.ur[testUserInnerID]:
            read[itemID] = 1

        # Get top-rated items from similar users:
        pos = 0
        bks2 = []
        for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
            if itemID not in read:
                bookID = trainSet.to_raw_iid(itemID)
                bks2.append(bookID)
                pos += 1
                if pos > 10:
                    break

        UCB = [Book.objects.get(ISBN=isbn) for isbn in bks2]

        # SVD recommendations: predict ratings for everything the user has
        # not rated yet, then keep the k highest estimates.
        def GetAntiTestSetForUser(testSubject, trainSet):
            fill = trainSet.global_mean
            anti_testset = []
            u = trainSet.to_inner_uid(str(testSubject))
            user_items = set(j for (j, _) in trainSet.ur[u])
            anti_testset += [(trainSet.to_raw_uid(u), trainSet.to_raw_iid(i), fill)
                             for i in trainSet.all_items() if i not in user_items]
            return anti_testset

        model = SVD()
        model.fit(trainSet)
        testSet = GetAntiTestSetForUser(testSubject, trainSet)
        predictions = model.test(testSet)

        recommendations = []
        for userID, ISBN, actualRating, estimatedRating, _ in predictions:
            recommendations.append((ISBN, estimatedRating))
        recommendations.sort(key=lambda x: x[1], reverse=True)

        SVDB = [Book.objects.get(ISBN=isbn) for isbn, _ in recommendations[:k]]
    except Exception:
        UCB = []
        SVDB = []

    return render(request, 'LibraryMS/recommendations.html', {
        'UCB': UCB,
        'SVDB': SVDB,
    })
import pickle

import numpy as np
import pandas as pd
from surprise import KNNBasic

from resources.RunDataLoader import run_data_loader

ml = run_data_loader()

no_ratings = len(ml.ratings_df)
no_ratings = 5000000  # override: cap the number of ratings to load
print('Number of ratings: ' + str(no_ratings))

data = ml.loadData(no_ratings)
trainSet = data.build_full_trainset()

sim_options = {'name': 'cosine', 'user_based': False}
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)
similarity_matrix = model.compute_similarities()

# mlPath = 'data/ml.pkl'
# with open(mlPath, 'wb') as file:
#     pickle.dump(ml, file)

np.save('data/similarity_matrix', similarity_matrix)

trainSetPath = 'data/trainSet.pkl'
with open(trainSetPath, 'wb') as file:
    pickle.dump(trainSet, file)
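A companion loading sketch, assuming the paths written by the script above (np.save appends the .npy extension automatically):

import pickle

import numpy as np

similarity_matrix = np.load('data/similarity_matrix.npy')
with open('data/trainSet.pkl', 'rb') as file:
    trainSet = pickle.load(file)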
import heapq
from collections import defaultdict
from operator import itemgetter

import pandas as pd
from surprise import KNNBasic

# NOTE: this function also relies on a module-level `ml` object that maps raw
# item IDs to titles via ml.getItemName() (assumed to be defined elsewhere).


def user_based_rec_loader(data, testUser, no_recs):
    trainSet = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': True}
    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)
    similarity_matrix = model.compute_similarities()

    testUserInnerID = trainSet.to_inner_uid(testUser)
    similarity_row = similarity_matrix[testUserInnerID]

    # Removing the testUser from the similarity row
    similarUsers = []
    for innerID, score in enumerate(similarity_row):
        if innerID != testUserInnerID:
            similarUsers.append((innerID, score))

    # Find the k users with the largest similarities
    k = 10
    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])
    # ...or tune for similarities above a threshold instead:
    # kNeighbors = []
    # for rating in similarUsers:
    #     if rating[1] > 0.75:
    #         kNeighbors.append(rating)

    # Get the stuff the k users rated, and add up ratings for each item,
    # weighted by user similarity. `candidates` holds every possible item
    # (book) and its combined rating over all k users.
    candidates = defaultdict(float)
    for similarUser in kNeighbors:
        innerID = similarUser[0]
        userSimilarityScore = similarUser[1]
        # All the items this neighbor has rated, with their ratings
        theirRatings = trainSet.ur[innerID]
        for rating in theirRatings:
            candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

    # Build a dictionary of stuff the user has already seen
    excluded = {}
    for itemID, rating in trainSet.ur[testUserInnerID]:
        excluded[itemID] = 1

    # Build a dictionary for results
    results = {'book_title': [], 'rating_sum': []}

    # Get top-rated items from similar users:
    print('\n')
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID not in excluded:
            bookID = trainSet.to_raw_iid(itemID)
            results['book_title'].append(ml.getItemName(int(bookID)))
            results['rating_sum'].append(ratingSum)
            pos += 1
            if pos > no_recs - 1:
                break

    return pd.DataFrame(results)
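A hypothetical call of the loader above; it assumes `data` is a Surprise Dataset built from book ratings and that the module-level `ml` helper noted above is in scope. The user ID is illustrative.

# Illustrative usage; `data` and `ml` must already be set up as noted above.
top_df = user_based_rec_loader(data, testUser='42', no_recs=10)
print(top_df)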
import heapq
from collections import defaultdict
from operator import itemgetter

from surprise import KNNBasic

from MovieLens import MovieLens  # project-local data loader (assumed)


def simpleUserCFGive(id):
    testSubject = str(id)
    k = 10

    # Load our data set and compute the user similarity matrix
    ml = MovieLens()
    data = ml.loadMovieLensLatestSmall()
    trainSet = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': True}
    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()

    # Get top N similar users to our test subject
    # (An alternate approach would be to select users up to some similarity threshold - try it!)
    testUserInnerID = trainSet.to_inner_uid(testSubject)
    similarityRow = simsMatrix[testUserInnerID]

    similarUsers = []
    for innerID, score in enumerate(similarityRow):
        if innerID != testUserInnerID:
            similarUsers.append((innerID, score))

    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

    # Get the stuff they rated, and add up ratings for each item,
    # weighted by user similarity
    candidates = defaultdict(float)
    for similarUser in kNeighbors:
        innerID = similarUser[0]
        userSimilarityScore = similarUser[1]
        theirRatings = trainSet.ur[innerID]
        for rating in theirRatings:
            candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

    # Build a dictionary of stuff the user has already seen
    watched = {}
    for itemID, rating in trainSet.ur[testUserInnerID]:
        watched[itemID] = 1

    # Get top-rated items from similar users:
    s = "\n" + str(id)
    pos = 0
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID not in watched:
            movieID = trainSet.to_raw_iid(itemID)
            s += "," + ml.getMovieName(int(movieID))
            pos += 1
            if pos > 10:
                break

    # Rewrite the cache file, dropping any previous line for this user.
    with open("E:\\Neeraj\\SimpleUserCFBase.txt", "r") as file:
        alld = file.readlines()

    with open("E:\\Neeraj\\SimpleUserCFBase.txt", "w") as file1:
        for r1 in alld:
            print(r1)
            u = r1.find(",")
            if r1[0:u] == str(id):
                pass
            else:
                file1.write(r1)
        file1.write(s)

    print("\nDone")
def __calc_sim_matrix(self):
    algo = KNNBasic(sim_options=self.sim_options)
    algo.fit(self.trainset)
    self.similarity_matrix = algo.compute_similarities()
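For context, a sketch of a minimal enclosing class this private helper could belong to; the class name and constructor here are assumptions, not from the source.

from surprise import KNNBasic


# Hypothetical enclosing class; the name and constructor are assumptions.
class SimilarityStore:
    def __init__(self, trainset, sim_options):
        self.trainset = trainset
        self.sim_options = sim_options
        self.similarity_matrix = None
        self.__calc_sim_matrix()

    def __calc_sim_matrix(self):
        algo = KNNBasic(sim_options=self.sim_options)
        algo.fit(self.trainset)
        self.similarity_matrix = algo.compute_similarities()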
import heapq
from collections import defaultdict
from operator import itemgetter

from surprise import KNNBasic

from NovelLens import NovelLens  # project-local data loader (assumed)


def computeNovelCf(userid):
    testSubject = userid
    k = 10

    # Load our data set and compute the user similarity matrix
    ml = NovelLens()
    data = ml.loadNovelLensLatestSmall()
    trainSet = data.build_full_trainset()

    sim_options = {'name': 'cosine', 'user_based': True}
    model = KNNBasic(sim_options=sim_options)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()

    # Get top N similar users to our test subject
    # (An alternate approach would be to select users up to some similarity threshold - try it!)
    testUserInnerID = trainSet.to_inner_uid(testSubject)
    similarityRow = simsMatrix[testUserInnerID]

    similarUsers = []
    for innerID, score in enumerate(similarityRow):
        if innerID != testUserInnerID:
            similarUsers.append((innerID, score))

    kNeighbors = heapq.nlargest(k, similarUsers, key=lambda t: t[1])

    # Get the stuff they rated, and add up ratings for each item,
    # weighted by user similarity
    candidates = defaultdict(float)
    for similarUser in kNeighbors:
        innerID = similarUser[0]
        userSimilarityScore = similarUser[1]
        theirRatings = trainSet.ur[innerID]
        for rating in theirRatings:
            candidates[rating[0]] += (rating[1] / 5.0) * userSimilarityScore

    # Build a dictionary of stuff the user has already seen
    watched = {}
    for itemID, rating in trainSet.ur[testUserInnerID]:
        watched[itemID] = 1

    # Get top-rated items from similar users:
    pos = 0
    noveldatapro = []
    novels = []
    for itemID, ratingSum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if itemID not in watched:
            novelID = trainSet.to_raw_iid(itemID)
            noveldatapro.append(novelID)
            print(ml.getNovelName(int(novelID)), ratingSum)
            novels.append(ml.getNovelName(int(novelID)))
            pos += 1
            if pos > 9:
                print("The top 10 novels for the user: " + testSubject)
                print(noveldatapro)
                break

    return novels
from surprise import KNNBasic


def generate_sim_matrix(trainSet, sim_metric, is_user=True):
    sim_options = {'name': sim_metric, 'user_based': is_user}
    model = KNNBasic(sim_options=sim_options, verbose=False)
    model.fit(trainSet)
    simsMatrix = model.compute_similarities()
    return simsMatrix
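A short usage sketch for the helper above; the dataset choice (Surprise's built-in ml-100k) is illustrative.

from surprise import Dataset

# Illustrative usage with Surprise's built-in ml-100k dataset.
data = Dataset.load_builtin('ml-100k')
trainSet = data.build_full_trainset()

# Item-item cosine similarities (pass is_user=True for user-user instead).
item_sims = generate_sim_matrix(trainSet, 'cosine', is_user=False)
print(item_sims.shape)  # (n_items, n_items)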