Beispiel #1
0
def get_movies_with_similar_genres(movie_id: int, n: int = 5, popularity_bias: bool = False,
                                   user_bias: bool = False, movies: pd.DataFrame = None):
    """Score all other movies by their genre overlap with ``movie_id``.

    The genre column of the movie frame is compared against the genres of the
    base movie via ``count_elements_in_set``; movies whose result is not
    greater than 0 are dropped.

    :param movie_id: index label of the base movie in the movie frame.
    :param n: only used when ``user_bias`` is set, where the candidate pool is
        limited to the ``n * 10`` best genre matches. ``None`` falls back to 5.
    :param popularity_bias: only evaluated when ``user_bias`` is set; weights
        the result with ``(mean ** 3) * count`` instead of the plain mean.
    :param user_bias: combine the genre overlap with the rating statistics
        from ``Data.ratings()``.
    :param movies: movie frame to use; ``Data.movie_meta()`` when ``None``.
    :return: a pandas Series indexed like the movie frame. The result is
        neither sorted nor truncated to ``n`` entries.
    """
    if n is None:
        n = 5

    # Use the preferred movie df
    genre_column = Column.genres.value
    source = Data.movie_meta() if movies is None else movies
    genres = source[genre_column]

    # Split the genre column into the base movie and all remaining movies.
    # NOTE(review): eval() executes whatever expression is stored in the
    # genres cell. If the column only ever holds plain literals,
    # ast.literal_eval would be the safe replacement - confirm against the
    # data source and count_elements_in_set before switching.
    base_genres = eval(genres.loc[movie_id])
    candidates = genres.drop(movie_id)

    # count similar genres and drop every movie with nothing in common
    overlap = candidates.apply(
        lambda row: count_elements_in_set(row, base_genres)
    )
    overlap = overlap[overlap > 0]

    if not user_bias:
        return overlap

    # reduce the amount of movies to n * 10 movies
    shortlist = overlap.nlargest(n * 10)
    ratings = Data.ratings()

    # Mean rating and number of ratings per movie. Only the rating column is
    # aggregated: the other columns were thrown away afterwards anyway, and
    # aggregating them needlessly fails on non-numeric data in recent pandas.
    ratings_per_movie = ratings.groupby(str(Column.movie_id))[str(Column.rating)]
    measures: pd.DataFrame = ratings_per_movie.agg(['mean', 'count'])

    # merging mean, count and genre overlap into one DataFrame (inner join on the index)
    measures_movies = pd.merge(measures, pd.DataFrame(shortlist), left_index=True, right_index=True)

    if popularity_bias:
        # give more weight to the number of ratings (~popularity)
        # by raising the avg rating to some power (to preserve some notion of good vs. bad ratings)
        # and multiplying the count back in; the genre overlap is multiplied in as well
        # to prevent good rated movies with little correlation to the genres
        return measures_movies.eval('(mean ** 3) * count * genres')

    # multiply genre overlap to prevent good rated movies with little correlation to the genres
    return measures_movies.eval('mean * genres')
def recommend_movies(movie_id: int, n: int = 5, filter_below_avg_ratings: bool = False, popularity_bias: bool = False) \
        -> List[int]:
    """Score movies that were rated by the users who rated ``movie_id``.

    :param movie_id: id of the base movie.
    :param n: currently unused; the full, unsorted result is returned.
    :param filter_below_avg_ratings: only consider ratings of the base movie
        that are at least as high as its average rating.
    :param popularity_bias: score with ``(mean ** 3) * count`` instead of the
        plain mean rating.
    :return: NOTE(review): despite the ``List[int]`` annotation this returns
        a pandas Series of scores indexed by ``movie_id``.
    :raises MissingDataException: if the base movie has no ratings, or if no
        other movie remains after the join.
    """
    ratings = Data.ratings()

    # first get the ratings for the base movie
    ratings_of_base_movie = ratings.query('movie_id == %s' % movie_id)

    # check if there are reviews for this movie
    if ratings_of_base_movie.empty:
        raise MissingDataException('no ratings for movie_id %s' % movie_id)

    if filter_below_avg_ratings:
        # of those, select the above average ratings
        avg_rating = ratings_of_base_movie['rating'].mean()
        # Reference the local through '@' instead of formatting it with '%f':
        # '%f' rounds to six decimals, and a threshold that was rounded up
        # excludes ratings that are exactly equal to the true average.
        # (query is kept because it is faster than boolean-mask indexing)
        ratings_of_base_movie = ratings_of_base_movie.query('rating >= @avg_rating')

    # to get ratings from all the users that have rated/liked the base movie,
    # perform a (left outer) join on all the ratings on user_id
    # NOTE(review): DataFrame.join matches `on` against the *index* of
    # `ratings`, so this is only a per-user join if Data.ratings() is indexed
    # by user_id - confirm.
    relevant_movies = ratings_of_base_movie.join(ratings,
                                                 on='user_id',
                                                 lsuffix='_L')
    # keep only the un-suffixed columns, i.e. the ones coming from the joined ratings
    relevant_movies = relevant_movies[['movie_id', 'rating']]
    # remove the base movie from the results
    relevant_movies = relevant_movies.query('movie_id != %s' % movie_id)

    if relevant_movies.empty:
        raise MissingDataException(
            'no other ratings from users that rated movie_id %s' % movie_id)

    # mean rating and number of ratings for each movie -> columns (mean, count)
    measures: pd.DataFrame = relevant_movies.groupby('movie_id')['rating'].agg(
        ['mean', 'count'])

    if popularity_bias:
        # give more weight to the number of ratings (~popularity)
        # by raising the avg ratings to some power (to preserve some notion of good vs. bad ratings)
        # and multiplying the count back in
        return measures.eval('(mean ** 3) * count')

    return measures['mean']
def recommend_movie_meta(movie_id: int,
                         n: int = 5,
                         popularity_bias: bool = False,
                         user_bias: bool = False):
    """Score movies by metadata similarity to ``movie_id``.

    Candidates are first restricted to movies with the same ``tmdb_adult``
    and ``imdb_color`` values as the base movie, then to movies returned by
    ``genre_filter.get_movies_with_similar_genres``. The remaining movies are
    run through ``calculate_column`` for several metadata columns and scored
    with ``compute_score``.

    :param movie_id: index label of the base movie in ``Data.movie_meta()``.
    :param n: forwarded to the genre filter.
    :param popularity_bias: only evaluated when ``user_bias`` is set; uses
        ``((mean * score) ** 3) * count`` instead of ``mean * score``.
    :param user_bias: weight the metadata score with the rating statistics
        from ``Data.ratings()``. Neither bias flag is forwarded to the genre
        filter.
    :return: the result of ``compute_score``, or, with ``user_bias``, a
        pandas Series of weighted scores indexed by ``movie_id``.
    """
    # Get movie_meta data; the base movie is looked up by index label
    movies_meta = Data.movie_meta()
    # Get the meta data from the base movie
    base_movie_meta = movies_meta.loc[movie_id, :]

    # filtered movies based on color and adult
    filtered_movies = movies_meta.query('tmdb_adult == {}'.format(
        base_movie_meta['tmdb_adult']))
    filtered_movies = filtered_movies.query('imdb_color == "{}"'.format(
        base_movie_meta['imdb_color']))

    # filtered movies based on genre
    movies = genre_filter.get_movies_with_similar_genres(
        movie_id, n, movies=filtered_movies)

    # merge the genre result back to the main df; the genre column then exists
    # twice, and the "_x" variant (from the genre result) takes the plain name
    merged_movies = pd.merge(pd.DataFrame(movies),
                             filtered_movies,
                             left_index=True,
                             right_index=True)
    merged_movies = merged_movies.rename(
        columns={"{}_x".format(Column.genres.value): Column.genres.value})

    # preparing data for the score calculation
    # count similar items in the columns or calculate the difference
    for column in ('actors', 'directors', 'tmdb_keywords',
                   'tmdb_production_countries'):
        merged_movies = calculate_column(merged_movies, base_movie_meta,
                                         column)
    merged_movies = calculate_column(merged_movies,
                                     base_movie_meta,
                                     'release_year',
                                     year=True)

    # score calculation
    score = compute_score(merged_movies)

    if not user_bias:
        return score

    # calculate the ranking with the avg user rating:
    # take all ratings except those of the base movie and keep only the ones
    # that belong to a remaining candidate (inner merge)
    ratings = Data.ratings().query('movie_id != %s' % movie_id)
    merged_ratings = pd.merge(ratings,
                              merged_movies,
                              left_on='movie_id',
                              right_index=True)

    # Mean rating and number of ratings for each movie -> columns (mean, count).
    # Only the rating column is aggregated: merged_ratings also carries the
    # metadata columns, and taking the mean of non-numeric columns raises a
    # TypeError in recent pandas versions.
    measures: pd.DataFrame = merged_ratings.groupby('movie_id')['rating'].agg(
        ['mean', 'count'])

    # merging mean, count and score into one DataFrame
    measures_movies = pd.merge(measures,
                               pd.DataFrame(score),
                               left_index=True,
                               right_index=True)
    # an unnamed score Series ends up as column 0
    measures_movies = measures_movies.rename(columns={0: 'score'})

    # additionally calculate it with the popularity of the movies
    if popularity_bias:
        # give more weight to the number of ratings (~popularity)
        # by raising the score-weighted avg rating to some power
        # (to preserve some notion of good vs. bad ratings)
        # and multiplying the count back in
        return measures_movies.eval('((mean * score) ** 3) * count')

    # multiply the score in to prevent good rated movies with little similarity
    return measures_movies.eval('mean * score')