class RecommendationSystem(object):
    """Movie recommender backed by a pre-computed SVD model and a Spark ALS model.

    The SVD model (python-recsys) is loaded from disk and fed the small
    ratings file; the ALS model and the movie/detail/rating tables are
    loaded through Spark from the same data folder.
    """

    # To run on your own machine, you need to initialize with your datapath
    # to the frontend folder.
    def __init__(self,
                 sc,
                 datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/',
                 rating_file='ratings_small.csv',
                 complete_rating_file='ratings.csv',
                 movie_file='movies.csv',
                 detail_file='modified.csv',
                 model='movielens_small'):
        """Load the SVD model, the ALS model and the Spark tables.

        :param sc: SparkContext used for every Spark operation.
        :param datapath: folder prefix (must end with a path separator, it is
            concatenated as-is) holding the CSV files, models and tables.
        :param rating_file: ratings CSV used to build the SVD matrix.
        :param complete_rating_file: ratings CSV parsed into ``ratings_data``.
        :param movie_file: movies CSV (first column is the movie id).
        :param detail_file: movie detail CSV.
        :param model: file name of the pre-computed SVD model.
        """
        self.sc = sc
        self.start = True
        self.rating_file = datapath + rating_file
        self.complete_rating_file = datapath + complete_rating_file
        self.movie_file = datapath + movie_file
        self.detail_file = datapath + detail_file
        self.integration_folder = datapath

        self.svd = SVD(filename=datapath + model)
        self.svd.load_data(filename=self.rating_file,
                           sep=',',
                           format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

        # ALS / Spark state
        self.sqlContext = SQLContext(self.sc)
        # RDD of raw text lines of the movies CSV (not parsed).
        self.movie_data = self.sc.textFile(self.movie_file)
        # RDD of (userId, movieId, rating) tuples.
        self.ratings_data = self.sc.textFile(
            self.complete_rating_file).map(lambda line: line.split(",")).map(
                lambda x: (int(x[0]), int(x[1]), float(x[2])))
        self.als_model_path = datapath + 'Model_Collaborative_Filtering'
        self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)
        self.movie_df = self.sqlContext.read.load(datapath + 'tables/movies')
        self.detail_df = self.sqlContext.read.load(datapath + 'tables/detail')
        self.rating_df = self.sqlContext.read.load(datapath + 'tables/ratings')

    @staticmethod
    def _csv_has_id(path, wanted):
        """Return True if any row of the CSV at ``path`` has ``wanted`` as its
        integer first column.

        Rows whose first field is not an integer (header, blank lines) are
        skipped instead of raising ValueError.
        """
        with open(path, 'r') as rows:
            for row in rows:
                try:
                    if int(row.split(',')[0]) == wanted:
                        return True
                except ValueError:
                    continue
        return False

    # call this function to get all recommendations
    def get_all_recomm(self, userid, moviename):
        """Run all three recommenders and return their brief movie infos.

        :param userid: user id for the SVD and ALS user-based recommenders.
        :param moviename: title prefix used to pick the movie for the
            similar-movie recommender.
        :return: ``[svd_user_infos, svd_similar_infos, als_infos]``, each a
            list of info dicts as built by ``get_brief``.
        """
        movieid = self.get_movie_id(moviename)

        # all recommendation algorithms return a list of movie ids
        # (the SVD ones return None when the user/movie is unknown)
        recom1 = self.svd_recomm(userid, only_unknown=True)
        recom2 = self.svd_similar(movieid)
        recom3 = self.als_new(userid)

        # get info about the movies based on movie ids
        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)

        # print to terminal
        for brief_info in (brief_info1, brief_info2, brief_info3):
            for entry in brief_info:
                print(entry)

        return [brief_info1, brief_info2, brief_info3]

    # get movie id based on movie name input
    def get_movie_id(self, moviename):
        """Return the id of the first movie whose name starts with
        ``moviename``; falls back to movie id 1 when nothing matches."""
        r = self.movie_df.where(
            self.movie_df['name'].startswith(moviename)).first()
        # return movie id 1 if not found
        if r is None:
            return 1
        return r['movieId']

    # svd recommendation algorithm based on the user's rating history
    def svd_recomm(self, userid, only_unknown):
        """Return up to 10 movie ids recommended by the SVD model.

        :param userid: user id; must appear in the first column of the
            ratings file.
        :param only_unknown: when true, ask the model only for movies the
            user has not rated.
        :return: list of movie ids, or None if the user is not in the
            ratings file.
        """
        if not self._csv_has_id(self.rating_file, userid):
            return None

        # output format of recommend(): (movieid, similarity value)
        if only_unknown:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=True,
                                              is_row=True)
        else:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=False,
                                              is_row=False)
        return self.get_id_list(similar_list)

    # svd recommendation algorithm based on similar movie
    def svd_similar(self, movieid):
        """Return ids of movies the SVD model considers similar to
        ``movieid``, or None if the movie is not in the movies file."""
        if not self._csv_has_id(self.movie_file, movieid):
            return None
        return self.get_id_list(self.svd.similar(movieid))

    # this ALS recommendation algorithm did not get to present to front end
    # future work is needed to improve this algorithm
    def als_recomm(self, userid):
        """Predict ALS ratings for candidate movies and return the best ones.

        :param userid: user id to predict for.
        :return: up to 20 ``(movieid, predicted_rating, rating_count)``
            tuples with at least 20 ratings, highest prediction first.
        """
        # Placeholder for the movies the user has already rated.
        user_movie_ratings = set([
            16, 24, 32, 47, 50, 110, 150, 161, 165, 204, 223, 256, 260, 261,
            277
        ])

        # movie_data holds raw CSV lines, so the id has to be parsed out of
        # the first field; non-numeric first fields (header) are dropped.
        candidate_ids = self.movie_data.map(
            lambda line: line.split(',')[0].strip()).filter(
                lambda field: field.isdigit()).map(int).filter(
                    lambda movie_id: movie_id not in user_movie_ratings)
        unrated_movies = candidate_ids.map(lambda movie_id: (userid, movie_id))
        recommended_movies_rdd = self.als_model.predictAll(unrated_movies)

        # Now we have predictions for all candidate movies; keep the top 20
        # among movies that have at least 20 ratings.
        user_recommended_ratings_rdd = recommended_movies_rdd.map(
            lambda x: (x.product, x.rating))
        movie_ID_with_ratings_RDD = self.ratings_data.map(
            lambda x: (x[1], x[2])).groupByKey()
        movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(
            get_counts_and_averages)
        movie_rating_counts_rdd = movie_ID_with_avg_ratings_RDD.map(
            lambda x: (x[0], x[1][0]))
        user_recommended_movies_ratings_count_rdd = (
            user_recommended_ratings_rdd.join(movie_rating_counts_rdd)
        ).map(lambda l: (l[0], l[1][0], l[1][1]))
        recommended_movies_list = user_recommended_movies_ratings_count_rdd.filter(
            lambda l: l[2] >= 20).takeOrdered(20, key=lambda x: -x[1])

        return recommended_movies_list

    # an ALS recommendation algorithm based on user rating history
    def als_new(self, userid):
        """Return the movie ids of the ALS model's top 10 products for
        ``userid``."""
        recommended_movies = self.als_model.recommendProducts(userid, 10)
        # element 1 of each recommendation is the product (movie) id
        return [movie[1] for movie in recommended_movies]

    # return a list of movie id
    def get_id_list(self, l):
        """Return the first element of every item in ``l``."""
        return [s[0] for s in l]

    # this function connects to imdb database to get info (including cover image)
    # did not make it to front end due to performance and latency issue
    # need future work for improvement
    def get_detail(self, movieid, imdb_id):
        """Fetch a movie from IMDb and, if it has a cover, download it to
        ``<integration_folder>Images/<movieid>.jpg``.

        :return: the movie object returned by the IMDb client.
        """
        m = self.ia.get_movie(str(imdb_id))

        cover = m.get('cover url')
        if cover:
            path = self.integration_folder + "Images/" + str(movieid) + ".jpg"
            urllib.urlretrieve(cover, path)
        return m

    # get a list of movie info given a list of movie ids
    def get_brief_list(self, movieList):
        """Return the info dicts of the first 5 known movies in ``movieList``.

        Movies whose title resolves to ``'unknown'`` are skipped. ``None``
        (what the SVD recommenders return for an unknown user/movie) is
        treated as an empty list.
        """
        info_list = []
        for m in movieList or []:
            info = self.get_brief(m)
            if info['title'] != 'unknown':
                info_list.append(info)
                if len(info_list) == 5:
                    break
        return info_list

    # get movie info (title, direction, genres, rating, cast) from our Spark tables
    def get_brief(self, movieid):
        """Build an info dict for ``movieid`` from the movie, detail and
        rating tables.

        :return: dict with keys ``movieid``, ``title``, ``genres`` (at most
            the first 3 entries), ``rating`` (average rating, 4.6 when the
            movie has none), ``director`` and ``cast``; missing data stays
            ``'unknown'``.
        """
        info = {
            'movieid': movieid,
            'title': 'unknown',
            'genres': 'unknown',
            'rating': 0,
            'director': 'unknown',
            'cast': 'unknown',
        }

        m = self.movie_df.where(self.movie_df['movieId'] == movieid).first()
        if m is not None:
            info['title'] = m['name']
            # keep at most the first three genres
            info['genres'] = m['genres'][0:3]

        d = self.detail_df.where(self.detail_df['movieId'] == movieid).first()
        if d is not None:
            info['director'] = d['director']
            info['cast'] = d['cast']

        r = self.rating_df.where(self.rating_df['movieId'] == movieid)
        # One aggregation job instead of count + map/reduce + count; the
        # average is None when the movie has no ratings.
        avg = r.agg({'rating': 'avg'}).first()[0]
        # default rating to be 4.6
        info['rating'] = 4.6 if avg is None else avg

        return info
SVD recommendation only for unknown movies ''' # Lets make things Verbose recsys.algorithm.VERBOSE = True # Loading the computed model svd = SVD(filename='movielens_small') svd.load_data(filename='ratings_small.csv', sep=',', format={ 'col': 0, 'row': 1, 'value': 2, 'ids': int }) svd.create_matrix() # Loading the movielens file of movies which has a mapping of movies to movie-id loop = True while (loop): ratings_file = open('ratings_small.csv', 'r+') movie_lens = open('movies.csv', 'r+') user_found = False movie_found = False USERID = int(input("Enter user id: ")) # Check if the user_id exists. Since currently we are using the small database, we need to check each and every field. # If using the complete database, just check if the number lies in the range. for rating_row in ratings_file: rating_item = rating_row.split(',') if (int(rating_item[0]) == USERID): user_found = True
class RecommendationSystem(object):
    """Movie recommender backed by a pre-computed SVD model and a Spark ALS model.

    The SVD model (python-recsys) is loaded from disk and fed the small
    ratings file; the ALS model and the movie/detail/rating tables are
    loaded through Spark from the same data folder.
    """

    # To run on your own machine, you need to initialize with your datapath
    # to the frontend folder.
    def __init__(self,
                 sc,
                 datapath='/media/psf/Home/CS/GIT_HUB/Movie-Recommendation-Project/frontend/',
                 rating_file='ratings_small.csv',
                 complete_rating_file='ratings.csv',
                 movie_file='movies.csv',
                 detail_file='modified.csv',
                 model='movielens_small'):
        """Load the SVD model, the ALS model and the Spark tables.

        :param sc: SparkContext used for every Spark operation.
        :param datapath: folder prefix (must end with a path separator, it is
            concatenated as-is) holding the CSV files, models and tables.
        :param rating_file: ratings CSV used to build the SVD matrix.
        :param complete_rating_file: ratings CSV parsed into ``ratings_data``.
        :param movie_file: movies CSV (first column is the movie id).
        :param detail_file: movie detail CSV.
        :param model: file name of the pre-computed SVD model.
        """
        self.sc = sc
        self.start = True
        self.rating_file = datapath + rating_file
        self.complete_rating_file = datapath + complete_rating_file
        self.movie_file = datapath + movie_file
        self.detail_file = datapath + detail_file
        self.integration_folder = datapath

        self.svd = SVD(filename=datapath + model)
        self.svd.load_data(filename=self.rating_file,
                           sep=',',
                           format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

        # ALS / Spark state
        self.sqlContext = SQLContext(self.sc)
        # RDD of raw text lines of the movies CSV (not parsed).
        self.movie_data = self.sc.textFile(self.movie_file)
        # RDD of (userId, movieId, rating) tuples.
        self.ratings_data = self.sc.textFile(
            self.complete_rating_file).map(lambda line: line.split(",")).map(
                lambda x: (int(x[0]), int(x[1]), float(x[2])))
        self.als_model_path = datapath + 'Model_Collaborative_Filtering'
        self.als_model = MatrixFactorizationModel.load(sc, self.als_model_path)
        self.movie_df = self.sqlContext.read.load(datapath + 'tables/movies')
        self.detail_df = self.sqlContext.read.load(datapath + 'tables/detail')
        self.rating_df = self.sqlContext.read.load(datapath + 'tables/ratings')

    @staticmethod
    def _csv_has_id(path, wanted):
        """Return True if any row of the CSV at ``path`` has ``wanted`` as its
        integer first column.

        Rows whose first field is not an integer (header, blank lines) are
        skipped instead of raising ValueError.
        """
        with open(path, 'r') as rows:
            for row in rows:
                try:
                    if int(row.split(',')[0]) == wanted:
                        return True
                except ValueError:
                    continue
        return False

    # call this function to get all recommendations
    def get_all_recomm(self, userid, moviename):
        """Run all three recommenders and return their brief movie infos.

        :param userid: user id for the SVD and ALS user-based recommenders.
        :param moviename: title prefix used to pick the movie for the
            similar-movie recommender.
        :return: ``[svd_user_infos, svd_similar_infos, als_infos]``, each a
            list of info dicts as built by ``get_brief``.
        """
        movieid = self.get_movie_id(moviename)

        # all recommendation algorithms return a list of movie ids
        # (the SVD ones return None when the user/movie is unknown)
        recom1 = self.svd_recomm(userid, only_unknown=True)
        recom2 = self.svd_similar(movieid)
        recom3 = self.als_new(userid)

        # get info about the movies based on movie ids
        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)

        # print to terminal
        for brief_info in (brief_info1, brief_info2, brief_info3):
            for entry in brief_info:
                print(entry)

        return [brief_info1, brief_info2, brief_info3]

    # get movie id based on movie name input
    def get_movie_id(self, moviename):
        """Return the id of the first movie whose name starts with
        ``moviename``; falls back to movie id 1 when nothing matches."""
        r = self.movie_df.where(
            self.movie_df['name'].startswith(moviename)).first()
        # return movie id 1 if not found
        if r is None:
            return 1
        return r['movieId']

    # svd recommendation algorithm based on the user's rating history
    def svd_recomm(self, userid, only_unknown):
        """Return up to 10 movie ids recommended by the SVD model.

        :param userid: user id; must appear in the first column of the
            ratings file.
        :param only_unknown: when true, ask the model only for movies the
            user has not rated.
        :return: list of movie ids, or None if the user is not in the
            ratings file.
        """
        if not self._csv_has_id(self.rating_file, userid):
            return None

        # output format of recommend(): (movieid, similarity value)
        if only_unknown:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=True,
                                              is_row=True)
        else:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=False,
                                              is_row=False)
        return self.get_id_list(similar_list)

    # svd recommendation algorithm based on similar movie
    def svd_similar(self, movieid):
        """Return ids of movies the SVD model considers similar to
        ``movieid``, or None if the movie is not in the movies file."""
        if not self._csv_has_id(self.movie_file, movieid):
            return None
        return self.get_id_list(self.svd.similar(movieid))

    # this ALS recommendation algorithm did not get to present to front end
    # future work is needed to improve this algorithm
    def als_recomm(self, userid):
        """Predict ALS ratings for candidate movies and return the best ones.

        :param userid: user id to predict for.
        :return: up to 20 ``(movieid, predicted_rating, rating_count)``
            tuples with at least 20 ratings, highest prediction first.
        """
        # Placeholder for the movies the user has already rated.
        user_movie_ratings = set([
            16, 24, 32, 47, 50, 110, 150, 161, 165, 204, 223, 256, 260, 261,
            277
        ])

        # movie_data holds raw CSV lines, so the id has to be parsed out of
        # the first field; non-numeric first fields (header) are dropped.
        candidate_ids = self.movie_data.map(
            lambda line: line.split(',')[0].strip()).filter(
                lambda field: field.isdigit()).map(int).filter(
                    lambda movie_id: movie_id not in user_movie_ratings)
        unrated_movies = candidate_ids.map(lambda movie_id: (userid, movie_id))
        recommended_movies_rdd = self.als_model.predictAll(unrated_movies)

        # Now we have predictions for all candidate movies; keep the top 20
        # among movies that have at least 20 ratings.
        user_recommended_ratings_rdd = recommended_movies_rdd.map(
            lambda x: (x.product, x.rating))
        movie_ID_with_ratings_RDD = self.ratings_data.map(
            lambda x: (x[1], x[2])).groupByKey()
        movie_ID_with_avg_ratings_RDD = movie_ID_with_ratings_RDD.map(
            get_counts_and_averages)
        movie_rating_counts_rdd = movie_ID_with_avg_ratings_RDD.map(
            lambda x: (x[0], x[1][0]))
        user_recommended_movies_ratings_count_rdd = (
            user_recommended_ratings_rdd.join(movie_rating_counts_rdd)
        ).map(lambda l: (l[0], l[1][0], l[1][1]))
        recommended_movies_list = user_recommended_movies_ratings_count_rdd.filter(
            lambda l: l[2] >= 20).takeOrdered(20, key=lambda x: -x[1])

        return recommended_movies_list

    # an ALS recommendation algorithm based on user rating history
    def als_new(self, userid):
        """Return the movie ids of the ALS model's top 10 products for
        ``userid``."""
        recommended_movies = self.als_model.recommendProducts(userid, 10)
        # element 1 of each recommendation is the product (movie) id
        return [movie[1] for movie in recommended_movies]

    # return a list of movie id
    def get_id_list(self, l):
        """Return the first element of every item in ``l``."""
        return [s[0] for s in l]

    # this function connects to imdb database to get info (including cover image)
    # did not make it to front end due to performance and latency issue
    # need future work for improvement
    def get_detail(self, movieid, imdb_id):
        """Fetch a movie from IMDb and, if it has a cover, download it to
        ``<integration_folder>Images/<movieid>.jpg``.

        :return: the movie object returned by the IMDb client.
        """
        m = self.ia.get_movie(str(imdb_id))

        cover = m.get('cover url')
        if cover:
            path = self.integration_folder + "Images/" + str(movieid) + ".jpg"
            urllib.urlretrieve(cover, path)
        return m

    # get a list of movie info given a list of movie ids
    def get_brief_list(self, movieList):
        """Return the info dicts of the first 5 known movies in ``movieList``.

        Movies whose title resolves to ``'unknown'`` are skipped. ``None``
        (what the SVD recommenders return for an unknown user/movie) is
        treated as an empty list.
        """
        info_list = []
        for m in movieList or []:
            info = self.get_brief(m)
            if info['title'] != 'unknown':
                info_list.append(info)
                if len(info_list) == 5:
                    break
        return info_list

    # get movie info (title, direction, genres, rating, cast) from our Spark tables
    def get_brief(self, movieid):
        """Build an info dict for ``movieid`` from the movie, detail and
        rating tables.

        :return: dict with keys ``movieid``, ``title``, ``genres`` (at most
            the first 3 entries), ``rating`` (average rating, 4.6 when the
            movie has none), ``director`` and ``cast``; missing data stays
            ``'unknown'``.
        """
        info = {
            'movieid': movieid,
            'title': 'unknown',
            'genres': 'unknown',
            'rating': 0,
            'director': 'unknown',
            'cast': 'unknown',
        }

        m = self.movie_df.where(self.movie_df['movieId'] == movieid).first()
        if m is not None:
            info['title'] = m['name']
            # keep at most the first three genres
            info['genres'] = m['genres'][0:3]

        d = self.detail_df.where(self.detail_df['movieId'] == movieid).first()
        if d is not None:
            info['director'] = d['director']
            info['cast'] = d['cast']

        r = self.rating_df.where(self.rating_df['movieId'] == movieid)
        # One aggregation job instead of count + map/reduce + count; the
        # average is None when the movie has no ratings.
        avg = r.agg({'rating': 'avg'}).first()[0]
        # default rating to be 4.6
        info['rating'] = 4.6 if avg is None else avg

        return info
#This is the recommendation algorithm based on the SVD #This code can be run in real time but the model has to be pre-computed import recsys.algorithm from recsys.algorithm.factorize import SVD #Lets make things Verbose recsys.algorithm.VERBOSE = True #Loading the computed model svd = SVD(filename='movielens_small') svd.load_data(filename='ratings_small.csv', sep=',', format={'col':0, 'row':1, 'value':2, 'ids':int}) svd.create_matrix() #Loading the movielens file of movies which has a mapping of movies to movie-id loop = True while (loop): ratings_file = open('ratings_small.csv', 'r+') movie_lens = open('movies.csv', 'r+') user_found = False movie_found = False USERID = int(input("Enter user id: ")) #Check if the user_id exists. Since currently we are using the small database, we need to check each and every field. #If using the complete database, just check if the number lies in the range. for rating_row in ratings_file: rating_item = rating_row.split(',') if (int(rating_item[0]) == USERID): user_found = True break if (movie_found): for movie_row in movie_lens:
class RecommendationSystem(object):
    """File-based movie recommender built on a pre-computed SVD model.

    Recommendations come from the python-recsys SVD model; movie metadata is
    looked up in the movies, ratings and detail CSV files.
    """

    def __init__(self,
                 rating_file='ratings_small.csv',
                 movie_file='movies.csv',
                 detail_file='modified.csv',
                 model='movielens_small'):
        """Load the SVD model and remember the CSV locations.

        :param rating_file: ratings CSV (user id, movie id, rating, ...).
        :param movie_file: movies CSV (movie id, title, '|'-joined genres).
        :param detail_file: detail CSV (movie id, imdb id, <unused>,
            director, '|'-joined cast).
        :param model: file name of the pre-computed SVD model.
        """
        self.start = True
        self.rating_file = rating_file
        self.movie_file = movie_file
        self.detail_file = detail_file
        self.svd = SVD(filename=model)
        self.svd.load_data(filename=rating_file,
                           sep=',',
                           format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

    @staticmethod
    def _find_row(path, id_column, wanted):
        """Return the first CSV row of ``path`` whose ``id_column`` field
        equals the integer ``wanted``, or None when there is none.

        The csv module is used so quoted fields containing commas (e.g.
        movie titles) stay in one piece. Rows whose id field is missing or
        not an integer (header, blank lines) are skipped.
        """
        import csv

        with open(path, 'r') as handle:
            for row in csv.reader(handle):
                try:
                    if int(row[id_column]) == wanted:
                        return row
                except (ValueError, IndexError):
                    continue
        return None

    def get_all_recomm(self, userid, movieid):
        """Return ``[all_infos, unseen_infos, similar_infos]``: brief infos
        for the general SVD recommendations, the SVD recommendations limited
        to unrated movies, and the movies similar to ``movieid``."""
        recom1 = self.svd_recomm(userid, only_unknown=False)
        recom2 = self.svd_recomm(userid, only_unknown=True)
        recom3 = self.svd_similar(movieid)
        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)
        return [brief_info1, brief_info2, brief_info3]

    def svd_recomm(self, userid, only_unknown):
        """Return up to 10 movie ids recommended by the SVD model.

        :param userid: user id; must appear in the first column of the
            ratings file.
        :param only_unknown: when true, ask the model only for movies the
            user has not rated.
        :return: list of movie ids, or None if the user is not in the
            ratings file.
        """
        if self._find_row(self.rating_file, 0, userid) is None:
            return None

        # output format of recommend(): (movieid, similarity value)
        if only_unknown:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=True,
                                              is_row=True)
        else:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=False,
                                              is_row=False)
        return self.get_id_list(similar_list)

    def svd_similar(self, movieid):
        """Return ids of movies the SVD model considers similar to
        ``movieid``, or None if the movie is not in the movies file."""
        if self._find_row(self.movie_file, 0, movieid) is None:
            return None
        return self.get_id_list(self.svd.similar(movieid))

    def get_id_list(self, l):
        """Return the first element of every item in ``l``."""
        return [s[0] for s in l]

    def get_detail(self, imdb_id):
        """Fetch a movie from IMDb and, if it has a cover, download it to
        ``Images/<imdb_id>.jpg``.

        :return: the movie object returned by the IMDb client.
        """
        m = self.ia.get_movie(str(imdb_id))
        cover = m.get('cover url')
        if cover:
            path = "Images/" + str(imdb_id) + ".jpg"
            urllib.urlretrieve(cover, path)
        return m

    def get_brief_list(self, movieList):
        """Return one info dict (see ``get_brief``) per movie id.

        ``None`` (what the SVD recommenders return for an unknown user or
        movie) is treated as an empty list.
        """
        return [self.get_brief(m) for m in movieList or []]

    def get_brief(self, movieid):
        """Collect title, genres, rating, imdb id, director and cast of
        ``movieid`` from the CSV files.

        :return: dict with keys ``title``, ``genre`` (list), ``rating``
            (the first rating found for the movie), ``imdb_id``,
            ``director`` and ``cast`` (list); fields without data keep
            their defaults (``'unknown'``, 0 and 1 respectively).
        """
        info = {
            'title': 'unknown',
            'genre': 'unknown',
            'rating': 0,
            'imdb_id': 1,
            'director': 'unknown',
            'cast': 'unknown',
        }

        movie = self._find_row(self.movie_file, 0, movieid)
        if movie is not None:
            info['title'] = str(movie[1].strip())
            info['genre'] = str(movie[2].strip()).split('|')

        # the movie id is the second column of the ratings file
        rating = self._find_row(self.rating_file, 1, movieid)
        if rating is not None:
            info['rating'] = float(rating[2].strip())

        detail = self._find_row(self.detail_file, 0, movieid)
        if detail is not None:
            info['imdb_id'] = int(detail[1].strip())
            info['director'] = str(detail[3].strip())
            info['cast'] = str(detail[4].strip()).split('|')

        return info
class RecommendationSystem(object):
    """File-based movie recommender built on a pre-computed SVD model.

    Recommendations come from the python-recsys SVD model; movie metadata is
    looked up in the movies, ratings and detail CSV files.
    """

    def __init__(self,
                 rating_file='ratings_small.csv',
                 movie_file='movies.csv',
                 detail_file='modified.csv',
                 model='movielens_small'):
        """Load the SVD model and remember the CSV locations.

        :param rating_file: ratings CSV (user id, movie id, rating, ...).
        :param movie_file: movies CSV (movie id, title, '|'-joined genres).
        :param detail_file: detail CSV (movie id, imdb id, <unused>,
            director, '|'-joined cast).
        :param model: file name of the pre-computed SVD model.
        """
        self.start = True
        self.rating_file = rating_file
        self.movie_file = movie_file
        self.detail_file = detail_file
        self.svd = SVD(filename=model)
        self.svd.load_data(filename=rating_file,
                           sep=',',
                           format={'col': 0, 'row': 1, 'value': 2, 'ids': int})
        self.svd.create_matrix()
        self.ia = imdb.IMDb(accessSystem='http')

    @staticmethod
    def _find_row(path, id_column, wanted):
        """Return the first CSV row of ``path`` whose ``id_column`` field
        equals the integer ``wanted``, or None when there is none.

        The csv module is used so quoted fields containing commas (e.g.
        movie titles) stay in one piece. Rows whose id field is missing or
        not an integer (header, blank lines) are skipped.
        """
        import csv

        with open(path, 'r') as handle:
            for row in csv.reader(handle):
                try:
                    if int(row[id_column]) == wanted:
                        return row
                except (ValueError, IndexError):
                    continue
        return None

    def get_all_recomm(self, userid, movieid):
        """Return ``[all_infos, unseen_infos, similar_infos]``: brief infos
        for the general SVD recommendations, the SVD recommendations limited
        to unrated movies, and the movies similar to ``movieid``."""
        recom1 = self.svd_recomm(userid, only_unknown=False)
        recom2 = self.svd_recomm(userid, only_unknown=True)
        recom3 = self.svd_similar(movieid)
        brief_info1 = self.get_brief_list(recom1)
        brief_info2 = self.get_brief_list(recom2)
        brief_info3 = self.get_brief_list(recom3)
        return [brief_info1, brief_info2, brief_info3]

    def svd_recomm(self, userid, only_unknown):
        """Return up to 10 movie ids recommended by the SVD model.

        :param userid: user id; must appear in the first column of the
            ratings file.
        :param only_unknown: when true, ask the model only for movies the
            user has not rated.
        :return: list of movie ids, or None if the user is not in the
            ratings file.
        """
        if self._find_row(self.rating_file, 0, userid) is None:
            return None

        # output format of recommend(): (movieid, similarity value)
        if only_unknown:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=True,
                                              is_row=True)
        else:
            similar_list = self.svd.recommend(userid,
                                              n=10,
                                              only_unknowns=False,
                                              is_row=False)
        return self.get_id_list(similar_list)

    def svd_similar(self, movieid):
        """Return ids of movies the SVD model considers similar to
        ``movieid``, or None if the movie is not in the movies file."""
        if self._find_row(self.movie_file, 0, movieid) is None:
            return None
        return self.get_id_list(self.svd.similar(movieid))

    def get_id_list(self, l):
        """Return the first element of every item in ``l``."""
        return [s[0] for s in l]

    def get_detail(self, imdb_id):
        """Fetch a movie from IMDb and, if it has a cover, download it to
        ``Images/<imdb_id>.jpg``.

        :return: the movie object returned by the IMDb client.
        """
        m = self.ia.get_movie(str(imdb_id))
        cover = m.get('cover url')
        if cover:
            path = "Images/" + str(imdb_id) + ".jpg"
            urllib.urlretrieve(cover, path)
        return m

    def get_brief_list(self, movieList):
        """Return one info dict (see ``get_brief``) per movie id.

        ``None`` (what the SVD recommenders return for an unknown user or
        movie) is treated as an empty list.
        """
        return [self.get_brief(m) for m in movieList or []]

    def get_brief(self, movieid):
        """Collect title, genres, rating, imdb id, director and cast of
        ``movieid`` from the CSV files.

        :return: dict with keys ``title``, ``genre`` (list), ``rating``
            (the first rating found for the movie), ``imdb_id``,
            ``director`` and ``cast`` (list); fields without data keep
            their defaults (``'unknown'``, 0 and 1 respectively).
        """
        info = {
            'title': 'unknown',
            'genre': 'unknown',
            'rating': 0,
            'imdb_id': 1,
            'director': 'unknown',
            'cast': 'unknown',
        }

        movie = self._find_row(self.movie_file, 0, movieid)
        if movie is not None:
            info['title'] = str(movie[1].strip())
            info['genre'] = str(movie[2].strip()).split('|')

        # the movie id is the second column of the ratings file
        rating = self._find_row(self.rating_file, 1, movieid)
        if rating is not None:
            info['rating'] = float(rating[2].strip())

        detail = self._find_row(self.detail_file, 0, movieid)
        if detail is not None:
            info['imdb_id'] = int(detail[1].strip())
            info['director'] = str(detail[3].strip())
            info['cast'] = str(detail[4].strip()).split('|')

        return info