def get_movies_with_similar_genres(movie_id: int, n: int = 5, popularity_bias: bool = False,
                                   user_bias: bool = False, movies: pd.DataFrame = None):
    """Score all other movies by their genre overlap with ``movie_id``.

    :param movie_id: index label of the base movie in the movie frame
    :param n: size hint; only used when ``user_bias`` is set, where the
        ``n * 10`` movies with the largest genre overlap are kept.
        ``None`` is treated as ``5``.
    :param popularity_bias: only evaluated when ``user_bias`` is set; then the
        score is ``(mean ** 3) * count * genres`` instead of ``mean * genres``
    :param user_bias: weight the genre overlap with the movies' ratings
    :param movies: movie frame to use instead of ``Data.movie_meta()``; it
        must contain the genres column and be indexed like ``movie_id``
    :return: a Series indexed by movie. Without ``user_bias`` it holds the
        genre-overlap value of every movie whose value is greater than zero;
        with ``user_bias`` it holds the rating-weighted score of the
        remaining candidates that also appear in the ratings.
    :raises KeyError: if ``movie_id`` is not in the index of the movie frame
    """
    if n is None:
        n = 5

    # Use the preferred movie frame, falling back to the full meta data
    source = Data.movie_meta() if movies is None else movies
    all_movies = source[Column.genres.value]

    # Split the genres column into the base movie and the rest.
    # NOTE(review): eval() executes whatever string is stored in the genres
    # column; this is only acceptable while the meta data is trusted. If the
    # column holds plain literals, ast.literal_eval would be a safer parser -
    # confirm the stored format before switching.
    base_genres = eval(all_movies.loc[movie_id])
    all_movies = all_movies.drop(movie_id)

    # Count similar genres (semantics are those of count_elements_in_set)
    genre_overlap = all_movies.apply(
        lambda row: count_elements_in_set(row, base_genres)
    )

    # Remove all movies which have no genre in common
    filtered_movies_sum = genre_overlap[genre_overlap > 0]

    if not user_bias:
        return filtered_movies_sum

    # Reduce the amount of movies to the n * 10 best genre matches
    top_n_mul_ten = filtered_movies_sum.nlargest(n * 10)

    # Mean rating and number of ratings per movie. The rating column is
    # selected *before* aggregating so that no other column of the ratings
    # frame is aggregated needlessly (or fails on non-numeric data).
    ratings = Data.ratings()
    measures: pd.DataFrame = (
        ratings.groupby(str(Column.movie_id))[str(Column.rating)].agg(['mean', 'count'])
    )

    # Merge mean, count and genre overlap into one frame (inner join on index)
    measures_movies = pd.merge(measures, pd.DataFrame(top_n_mul_ten),
                               left_index=True, right_index=True)

    if popularity_bias:
        # Give more weight to the number of ratings (~popularity) by raising
        # the avg rating to some power (to preserve some notion of good vs.
        # bad ratings) and multiplying the count back in. The genre overlap
        # is multiplied in as well so that well rated movies with little
        # genre correlation do not dominate.
        return measures_movies.eval('(mean ** 3) * count * genres')

    # Multiply by the genre overlap so that well rated movies with little
    # genre correlation do not dominate
    return measures_movies.eval('mean * genres')
def recommend_movies(movie_id: int, n: int = 5, filter_below_avg_ratings: bool = False,
                     popularity_bias: bool = False) -> "pd.Series":
    """Score other movies by the ratings of users who rated ``movie_id``.

    :param movie_id: id of the base movie
    :param n: accepted for interface compatibility; not used in this function
    :param filter_below_avg_ratings: only consider users whose rating of the
        base movie is at least the base movie's average rating
    :param popularity_bias: score with ``(mean ** 3) * count`` instead of the
        plain mean rating
    :return: a Series indexed by ``movie_id`` holding one score per movie
        (the result is neither sorted nor truncated)
    :raises MissingDataException: if the base movie has no ratings, or if the
        selected users have not rated any other movie
    """
    ratings = Data.ratings()

    # First get the ratings for the base movie
    ratings_of_base_movie = ratings.query('movie_id == %s' % movie_id)
    if ratings_of_base_movie.empty:
        raise MissingDataException('no ratings for movie_id %s' % movie_id)

    if filter_below_avg_ratings:
        # Of those, keep the ratings that are at least average. The average
        # is passed to query() as a local variable instead of being formatted
        # with '%f', which would round it to six decimals and could flip the
        # comparison for ratings equal to the average.
        avg_rating = ratings_of_base_movie['rating'].mean()
        ratings_of_base_movie = ratings_of_base_movie.query('rating >= @avg_rating')

    # To get the ratings from all users that have rated/liked the base movie,
    # perform a (left outer) join with all ratings on user_id.
    # NOTE(review): DataFrame.join matches 'user_id' of the left frame against
    # the *index* of `ratings` - presumably Data.ratings() is indexed by
    # user_id; confirm.
    relevant_movies = ratings_of_base_movie.join(ratings, on='user_id', lsuffix='_L')

    # Keep only the joined-in columns; the base movie's own columns carry the
    # '_L' suffix after the join
    relevant_movies = relevant_movies[['movie_id', 'rating']]

    # Remove the base movie from the results
    relevant_movies = relevant_movies.query('movie_id != %s' % movie_id)
    if relevant_movies.empty:
        raise MissingDataException(
            'no other ratings from users that rated movie_id %s' % movie_id)

    # Mean rating and number of ratings for each movie
    measures: pd.DataFrame = (
        relevant_movies.groupby('movie_id')['rating'].agg(['mean', 'count'])
    )

    if popularity_bias:
        # Give more weight to the number of ratings (~popularity) by raising
        # the avg rating to some power (to preserve some notion of good vs.
        # bad ratings) and multiplying the count back in
        return measures.eval('(mean ** 3) * count')

    return measures['mean']
def recommend_movie_meta(movie_id: int, n: int = 5, popularity_bias: bool = False,
                         user_bias: bool = False):
    """Score movies by the similarity of their meta data to ``movie_id``.

    Candidates are the movies that share the base movie's ``tmdb_adult`` and
    ``imdb_color`` values and are returned by
    ``genre_filter.get_movies_with_similar_genres``. Their actors, directors,
    keywords, production countries and release year are compared to the base
    movie via ``calculate_column`` and condensed by ``compute_score``.

    :param movie_id: index label of the base movie in ``Data.movie_meta()``
    :param n: forwarded to ``genre_filter.get_movies_with_similar_genres``
    :param popularity_bias: only evaluated when ``user_bias`` is set; then the
        result is ``((mean * score) ** 3) * count`` instead of ``mean * score``
    :param user_bias: weight the meta-data score with the movies' ratings
    :return: the result of ``compute_score`` without ``user_bias``; otherwise
        a Series with the rating-weighted score of every candidate that has
        ratings
    :raises KeyError: if ``movie_id`` is not in the movie meta data
    """
    movies_meta = Data.movie_meta()

    # Meta data of the base movie
    base_movie_meta = movies_meta.loc[movie_id, :]

    # Filter the movies on adult flag and color. The base values are handed
    # to query() as local variables rather than spliced into the expression,
    # so a value containing quotes cannot break the query string.
    base_adult = base_movie_meta['tmdb_adult']
    base_color = base_movie_meta['imdb_color']
    filtered_movies = movies_meta.query('tmdb_adult == @base_adult')
    filtered_movies = filtered_movies.query('imdb_color == @base_color')

    # Filter the movies on genre
    movies = genre_filter.get_movies_with_similar_genres(
        movie_id, n, movies=filtered_movies)

    # Merge the number of similar genres back into the main frame; the merge
    # suffixes the clashing genres columns, the '_x' one is the left-hand
    # (genre filter) result and gets the plain genres name back
    merged_movies = pd.merge(pd.DataFrame(movies), filtered_movies,
                             left_index=True, right_index=True)
    merged_movies = merged_movies.rename(
        columns={"{}_x".format(Column.genres.value): Column.genres.value})

    # Prepare the data for the score calculation: count similar items in the
    # columns or calculate the difference
    for column in ('actors', 'directors', 'tmdb_keywords', 'tmdb_production_countries'):
        merged_movies = calculate_column(merged_movies, base_movie_meta, column)
    merged_movies = calculate_column(merged_movies, base_movie_meta,
                                     'release_year', year=True)

    score = compute_score(merged_movies)

    if not user_bias:
        return score

    # Ratings of all candidate movies (the inner merge drops ratings of
    # movies that are not candidates)
    ratings = Data.ratings().query('movie_id != %s' % movie_id)
    merged_ratings = pd.merge(ratings, merged_movies,
                              left_on='movie_id', right_index=True)

    # Mean rating and number of ratings for each movie. Only the rating
    # column is aggregated: the merged frame also carries the meta-data
    # columns, and taking their mean is wasted work and fails on non-numeric
    # columns in recent pandas versions.
    measures: pd.DataFrame = (
        merged_ratings.groupby('movie_id')['rating'].agg(['mean', 'count'])
    )

    # Merge mean, count and score into one frame.
    # NOTE(review): the rename assumes compute_score returns an unnamed
    # Series (column label 0) - confirm.
    measures_movies = pd.merge(measures, pd.DataFrame(score),
                               left_index=True, right_index=True)
    measures_movies = measures_movies.rename(columns={0: 'score'})

    if popularity_bias:
        # Give more weight to the number of ratings (~popularity) by raising
        # the score-weighted avg rating to some power (to preserve some
        # notion of good vs. bad ratings) and multiplying the count back in
        return measures_movies.eval('((mean * score) ** 3) * count')

    # Multiply by the score so that well rated movies with little similarity
    # to the base movie do not dominate
    return measures_movies.eval('mean * score')