def get_movie_meta_for(movie_ids: List[int]) -> List[Dict]:
    """
    Fetch the metadata records for the given movies.

    :param movie_ids: ids to look up in the index of ``Data.movie_meta()``;
        a single int is accepted as well, ``None`` entries are skipped
    :return: one dictionary per movie (``orient='records'``)
    :raises MovieNotFoundException: if a lookup in the metadata index fails
    """
    # if single movie, pack into list
    if isinstance(movie_ids, int):
        movie_ids = [movie_ids]

    # Build a real list instead of a lazy ``filter`` object: a filter object
    # can only be consumed once and is not a proper list-like indexer.
    movie_ids = [movie_id for movie_id in movie_ids if movie_id is not None]

    meta: pd.DataFrame = Data.movie_meta()
    try:
        # restrict the metadata to the requested movies
        meta = meta.loc[movie_ids]
    except KeyError as e:
        # keep the original KeyError attached as the explicit cause
        raise MovieNotFoundException(e.args) from e

    # convert to dictionaries; orient='records' results in
    # [{'col1': 'val1', 'col2': 'val2'}, {'col1': 'val1', ..}]
    meta_dict: List[Dict] = meta.to_dict(orient='records')

    list_columns = [
        Column.actors,
        Column.genres,
        Column.keywords,
        Column.directors,
    ]
    for item in meta_dict:
        for col in list_columns:
            if not pd.isnull(item[col.value]):
                # NOTE(review): eval() executes whatever is stored in the
                # metadata. If these cells only hold literal lists,
                # ast.literal_eval would be the safer choice - confirm the
                # data format before switching.
                item[col.value] = eval(item[col.value])

    # presumably enriches the records in place with poster URLs; the return
    # value of add_poster_urls is not used here
    add_poster_urls(meta_dict)
    return meta_dict
def get_imdb_id(movielens_id: int) -> int:
    """
    Look up the value of the IMDb id column for a MovieLens movie id.

    :param movielens_id: id to look up in the index of ``Data.movie_meta()``
    :return: the content of the ``Column.imdb_id`` column for that movie
    :raises MovieNotFoundException: if the id is not in the metadata index
    """
    meta = Data.movie_meta()

    is_known = movielens_id in meta.index
    if not is_known:
        raise MovieNotFoundException()

    row = meta.loc[movielens_id]
    return row[Column.imdb_id.value]
def build_index(cls):
    """
    Write one document per movie of ``Data.movie_meta()`` into ``cls.ix``.

    ``cls.init()`` is called first if ``cls.ix`` has not been set up yet.
    Fields without a usable value are left out of the document. All updates
    are committed at the end with ``optimize=True``.
    """
    if cls.ix is None:
        cls.init()

    def is_missing(val) -> bool:
        # None, NaN and the empty string count as "no value".
        if val is None:
            return True
        # A real NaN test: ``val is not np.nan`` only recognises the np.nan
        # singleton and lets other NaN objects (e.g. np.float64 NaN) through.
        if isinstance(val, (float, np.floating)) and np.isnan(val):
            return True
        return isinstance(val, str) and val == ''

    iw = cls.ix.writer()
    try:
        for movie_id, movie in Data.movie_meta().iterrows():
            # extract fields
            fields: Dict = {
                'movie_id': movie_id,
                'title': movie[Column.title.value],
                # currently not indexed:
                # 'tagline': movie[Column.tagline.value],
                # 'summary': movie[Column.summary.value],
                # 'keywords': movie[Column.keywords.value],
                # 'popularity': movie[Column.num_ratings.value],
                # 'genres': movie[Column.genres.value],
            }

            # filter empty values (inserting fails for np.nan values)
            fields = {
                key: val
                for key, val in fields.items()
                if not is_missing(val)
            }

            # insert into index
            iw.update_document(**fields)
    except BaseException:
        # Do not leave a half-written writer open when indexing fails.
        # NOTE(review): assumes a Whoosh-style writer offering cancel().
        iw.cancel()
        raise

    iw.commit(optimize=True)
def get_year_relevance(movie_id: int, n: int = 0):
    """
    Score all other movies by how close their release year is to the base movie.

    The absolute year distance is divided by the largest distance and
    subtracted from 1, so the movie furthest away in time gets 0 and a movie
    from the same year gets 1.

    :param movie_id: the base movie (label in the metadata index)
    :param n: not used by this function
    :return: a Series of scores without the entry for ``movie_id``
    """
    years = Data.movie_meta()[Column.release_year.value]
    base_year = years.loc[movie_id]

    # absolute distance (in years) of every movie to the base movie
    distance = years.subtract(base_year).abs()

    # scale by the largest distance and invert: close -> high score
    relevance = 1 - distance.div(distance.max())
    return relevance.drop(movie_id)
def get_similarities_for(cls, movie_id: int, colname: str):
    """
    Return the similarity of every other movie to ``movie_id`` for one column.

    :param movie_id: the base movie (label in the metadata index)
    :param colname: metadata column the similarity matrix is built from
    :return: a Series indexed like the metadata, without ``movie_id`` itself
    """
    # similarity matrix for the column (calculated on demand)
    matrix = cls.calculate_similarities(colname)

    # translate the movie label into its absolute row position
    row_position = Data.movie_meta().index.get_loc(movie_id)

    # .toarray() turns the sparse row into a dense one-row matrix,
    # [0] flattens that into a one-dimensional array
    dense_row = matrix[row_position].toarray()[0]

    # re-attach the original movie index to the raw values
    similarities = pd.Series(index=Data.movie_meta().index, data=dense_row)
    return similarities.drop(movie_id)
def __call__(self, movie_id: int, n: int = 5):
    """
    Run the wrapped method and remove movies of the base movie's collection.

    :param movie_id: the base movie
    :param n: requested number of results; ``self.method`` is called with
        ``n + 10``
    :return: the result of ``self.method`` without the entries selected by
        ``get_collection_mask``
    """
    meta = Data.movie_meta()

    # ids of all movies selected by the collection mask for the base movie
    in_collection = get_collection_mask(movie_id, meta)
    collection_ids = meta[in_collection].index.values

    candidates: pd.Series = self.method(movie_id, n + 10)

    # errors='ignore': not every collection movie has to be among the candidates
    return candidates.drop(collection_ids, errors='ignore')
def get_normalized_popularity():
    """
    Return a dampened, normalized popularity score per movie.

    Used for the popularity bias. The number of ratings is raised to the
    power of 1/10 and then divided by the maximum, so the largest value is 1.

    :return: a new Series; the metadata returned by ``Data.movie_meta()`` is
        not modified
    """
    num_ratings = Data.movie_meta()[Column.num_ratings.value]

    # apply a root to reduce linearity
    # (if movie A has double the ratings of movie B, its popularity should
    # only be slightly higher)
    # Deliberately NOT ``**=`` / ``/=``: the in-place operators work on the
    # Series taken from the shared metadata and may write the dampened values
    # back into it, distorting the ratings column a bit more on every call.
    popularity = num_ratings ** (1 / 10)

    # normalize
    return popularity / popularity.max()
def get_movies_with_similar_genres(movie_id: int, n: int = 5,
                                   popularity_bias: bool = False,
                                   user_bias: bool = False,
                                   movies: pd.DataFrame = None):
    """
    Score movies by the genres they share with a base movie.

    :param movie_id: the base movie
    :param n: number of requested movies (``None`` is treated as 5); only used
        with ``user_bias`` to cut the candidates down to ``n * 10``
    :param popularity_bias: only evaluated when ``user_bias`` is true; also
        weights the score with the number of ratings
    :param user_bias: weight the genre overlap with the mean user rating
    :param movies: DataFrame to use instead of ``Data.movie_meta()``
    :return: scores for all movies that share at least one genre with the
        base movie (without ``user_bias``: the plain overlap values)
    """
    n = 5 if n is None else n

    # use the preferred movie df
    source = Data.movie_meta() if movies is None else movies
    genre_column = source[Column.genres.value]

    # take the base movie out and keep the rest
    # NOTE(review): eval() executes the stored string - fine only as long as
    # the metadata is trusted.
    base_genres = eval(genre_column.loc[movie_id])
    others = genre_column.drop(movie_id)

    # count genres in common, then drop movies without any genre in common
    overlap = others.apply(
        lambda genres: count_elements_in_set(genres, base_genres)
    )
    overlap = overlap[overlap > 0]

    if not user_bias:
        return overlap

    # reduce the amount of movies to n * 10 movies
    shortlist = overlap.nlargest(n * 10)

    # mean rating and number of ratings per movie
    # (selecting the rating column removes the first level of the column
    # index. before: (rating: (mean, count)), after: (mean, count))
    per_movie = Data.ratings().groupby(str(Column.movie_id))
    measures: pd.DataFrame = per_movie.agg(['mean', 'count'])[str(Column.rating)]

    # mean, count and genre overlap side by side in one DataFrame
    measures_movies = pd.merge(measures, pd.DataFrame(shortlist),
                               left_index=True, right_index=True)

    if popularity_bias:
        # give more weight to the number of ratings (~popularity) by raising
        # the avg rating to some power (to preserve some notion of good vs.
        # bad ratings) and multiplying the count back in; the genre overlap
        # keeps well rated but barely related movies down
        return measures_movies.eval('(mean ** 3) * count * genres')

    # the genre overlap keeps well rated but barely related movies down
    return measures_movies.eval('mean * genres')
def calculate_similarities(cls, colname: str, overwrite_existing: bool = False):
    """
    Return the movie-to-movie similarity matrix for a metadata column.

    The result is stored in ``cls.similarity_matrices`` and reused on later
    calls.

    :param colname: metadata column to build the matrix from
    :param overwrite_existing: recalculate even if a matrix is already stored
    :return: the (sparse) similarity matrix for ``colname``
    """
    already_stored = colname in cls.similarity_matrices
    if already_stored and not overwrite_existing:
        return cls.similarity_matrices[colname]

    # tf-idf over the column, missing values count as empty text
    documents = Data.movie_meta()[colname].fillna('')
    tfidf_matrix = cls.tf_idf.fit_transform(documents)

    # similarities between all movies;
    # dense_output=False keeps the result sparse to reduce memory usage
    cls.similarity_matrices[colname] = linear_kernel(
        tfidf_matrix, tfidf_matrix, dense_output=False)

    return cls.similarity_matrices[colname]
def tmdb_reference(movie_id: int, n: int = 5):
    """
    Return the movies listed in the ``tmdb_similar`` column of a movie.

    :param movie_id: the base movie (label in the metadata index)
    :param n: not used by this function
    :return: an integer Series mapping movie id -> artificial score; the
        score is the negated position in the stored list, so earlier entries
        rank higher. Entries without a known movie id are skipped. The Series
        is empty if nothing is stored or nothing could be mapped.
    """
    movie = Data.movie_meta().loc[movie_id]
    raw_similar = movie[Column.tmdb_similar.value]

    # A missing cell (None / NaN) means "no similar movies"; passing it to
    # eval() would raise a TypeError.
    is_missing = raw_similar is None or (
        isinstance(raw_similar, float) and raw_similar != raw_similar)
    if is_missing:
        similar_tmdb = []
    else:
        # get list from string representation
        # NOTE(review): eval() executes the stored string - fine only as long
        # as the metadata is trusted.
        similar_tmdb = eval(raw_similar)

    # translate the tmdb ids; unknown ids become None
    similar = (get_movielens_id(tmdb_id=tmdb_id) for tmdb_id in similar_tmdb)

    # artificial decreasing score
    scores = {
        item: -index
        for index, item in enumerate(similar)
        if item is not None
    }

    # Explicit dtype: an empty dict would otherwise give an object-dtype
    # Series on recent pandas versions, on which numeric operations such as
    # nlargest() fail.
    return pd.Series(scores, dtype='int64')
def get_movielens_id(tmdb_id: int = None, imdb_id: int = None) -> int:
    """
    Find the index label of the movie with the given TMDb or IMDb id.

    :param tmdb_id: id to search in the ``Column.tmdb_id`` column; takes
        precedence over ``imdb_id`` when both are given
    :param imdb_id: id to search in the ``Column.imdb_id`` column
    :return: the index label of the first match, or ``None`` if neither id
        was given or nothing matched
    """
    movies: pd.DataFrame = Data.movie_meta()

    # decide which column to search
    if tmdb_id is not None:
        condition = f'{Column.tmdb_id.value} == {tmdb_id}'
    elif imdb_id is not None:
        condition = f'{Column.imdb_id.value} == {imdb_id}'
    else:
        return None

    matches = movies.query(condition)
    return None if matches.empty else matches.index[0]
def _recommend_movies(movie_id: int, n: int, method: Method) -> List[Dict]:
    """
    Recommend movies for a base movie and return their metadata.

    :param movie_id: the base movie
    :param n: number of recommendations; replaced by 20 for
        ``Method.reference`` and ``Method.sequels``
    :param method: callable that scores movies for ``movie_id``
    :return: metadata of the base movie followed by the recommendations
    :raises MovieNotFoundException: if ``movie_id`` is not in the metadata index
    """
    if movie_id not in Data.movie_meta().index:
        raise MovieNotFoundException

    # score the movies, then let the history filter the scores
    # (meant to remove movies that were recommended recently)
    scores: Series = History.filter(method(movie_id))

    if method in (Method.reference, Method.sequels):
        n = 20

    # movies = [base_movie, ...recommendations]
    movies: List[int] = [movie_id, *scores.nlargest(n).index]

    # record the base movie and its recommendations in the history
    History.append(movies)

    return get_movie_meta_for(movies)
def get_collection(movie_id: int,
                   df: pd.DataFrame = None,
                   include_base_movie: bool = True,
                   start_from_base_movie: bool = False,
                   wrap_to_start: bool = False) -> pd.DataFrame:
    """
    Get movies from a collection.

    :param movie_id: a movie that is in a collection
    :param df: the pandas DataFrame to search
    :param include_base_movie: whether to include movie_id itself in the result
    :param start_from_base_movie: whether to split the result and start at movie_id
    :param wrap_to_start: if start_from_base_movie: at the end of the collection,
        wrap over to the start and include the prequels
    :return: a DataFrame containing the movies in the collection
    """
    if df is None:
        df = Data.movie_meta()

    # select movies that are in collection
    m = df[get_collection_mask(movie_id, df)]

    # sort by release date
    m = m.sort_values(by=Column.release_date.value)

    if start_from_base_movie:
        # Split by POSITION, not by label: after sorting by release date the
        # index is no longer ordered by id, so label slices like
        # ``m.loc[:movie_id - 1]`` select the wrong rows or raise a KeyError
        # when that label does not exist.
        # (assumes movie ids are unique in the index)
        position = m.index.get_loc(movie_id)
        prequels = m.iloc[:position]
        sequels = m.iloc[position:]

        if wrap_to_start:
            # sequels first, then wrap around to the prequels
            m = pd.concat([sequels, prequels])
        else:
            # just return the movies starting with the base movie
            m = sequels

    # Drop the base movie only after splitting, because the split needs it
    # as its anchor.
    if not include_base_movie:
        m = m.drop(movie_id)

    return m
def search(cls, query_text: str, n: int, add_posters: bool = True):
    """
    Search movies and re-rank the hits with a popularity bias.

    :param query_text: the search query, passed on to ``cls._search``
    :param n: number of results to return
    :param add_posters: not used in this method
    :return: the result of ``get_movie_meta_for`` for the ``n`` best movies
    """
    # Because the hits are re-sorted, more of them are fetched than needed,
    # to be able to recover popular results that have rather low scores.
    raw_hits = cls._search(query_text, n + 25)

    # encapsulate in pandas.Series for further operations
    scores = pd.Series(raw_hits, name='score')

    # (right outer) join: one row per search result, enriched with metadata
    ranked = Data.movie_meta().join(scores, how='right')

    # weighted score: the search score is raised to some power so that the
    # popularity (number of ratings) does not overpower it completely
    ranked.eval(f'weighted = score**16 * {Column.num_ratings.value}', inplace=True)

    # the n best results as plain list of ids
    best = ranked.nlargest(n, 'weighted')
    movie_ids = list(best.index)

    return get_movie_meta_for(movie_ids)
def get_poster_omdb_ml(cls, movielens_id: int) -> str:
    """
    Resolve a MovieLens id and delegate to ``get_poster_omdb_imdb``.

    :param movielens_id: label of the movie in the metadata index
    :return: whatever ``cls.get_poster_omdb_imdb`` returns for the value of
        the movie's ``Column.imdb_id`` cell
    """
    # imported inside the function, as in the original code
    # (presumably to avoid a circular import - verify before moving it)
    from util.data import Data, Column

    meta = Data.movie_meta()
    imdb_movie_id = meta.at[movielens_id, Column.imdb_id.value]
    return cls.get_poster_omdb_imdb(imdb_movie_id=imdb_movie_id)
def sample(movie_id: int, n: int = 5) -> pd.Series:
    """
    Score movies by the negated ``movielens_id`` column of the metadata.

    Both parameters are ignored. Negating the ids means that taking the
    largest scores returns the movies in their default (ascending id) order.

    :param movie_id: not used
    :param n: not used
    :return: the negated ``movielens_id`` column
    """
    movielens_ids = Data.movie_meta()['movielens_id']
    return -movielens_ids
def recommend_movie_meta(movie_id: int, n: int = 5,
                         popularity_bias: bool = False,
                         user_bias: bool = False):
    """
    Score movies against a base movie using several metadata columns.

    Candidates must have the same ``tmdb_adult`` and ``imdb_color`` value as
    the base movie and pass the genre filter. The score is then derived from
    actors, directors, keywords, production countries and release year.

    :param movie_id: the base movie (label in the metadata index)
    :param n: passed on to the genre filter
    :param popularity_bias: only evaluated when ``user_bias`` is true; also
        weights the score with the number of ratings
    :param user_bias: weight the score with the mean user rating
    :return: the scores (without ``user_bias``: the plain result of
        ``compute_score``)
    """
    movies_meta = Data.movie_meta()

    # metadata of the base movie
    base_movie_meta = movies_meta.loc[movie_id, :]

    # keep only movies with the same adult flag and the same color
    same_adult = movies_meta.query(
        'tmdb_adult == {}'.format(base_movie_meta['tmdb_adult']))
    filtered_movies = same_adult.query(
        'imdb_color == "{}"'.format(base_movie_meta['imdb_color']))

    # filter by genre
    genre_overlap = genre_filter.get_movies_with_similar_genres(
        movie_id, n, movies=filtered_movies)

    # merge the number of similar genres back into the main df; the merge
    # suffixes the overlapping genre column, so rename it back
    merged_movies = pd.merge(pd.DataFrame(genre_overlap), filtered_movies,
                             left_index=True, right_index=True)
    merged_movies = merged_movies.rename(
        columns={"{}_x".format(Column.genres.value): Column.genres.value})

    # prepare the data for the score calculation:
    # count similar items in the columns ...
    for column in ('actors', 'directors', 'tmdb_keywords',
                   'tmdb_production_countries'):
        merged_movies = calculate_column(merged_movies, base_movie_meta, column)
    # ... or calculate the difference (release year)
    merged_movies = calculate_column(merged_movies, base_movie_meta,
                                     'release_year', year=True)

    score = compute_score(merged_movies)

    if not user_bias:
        return score

    # ratings of all candidate movies, excluding the base movie
    ratings = Data.ratings().query('movie_id != %s' % movie_id)
    merged_ratings = pd.merge(ratings, merged_movies,
                              left_on='movie_id', right_index=True)

    # mean rating and number of ratings per movie
    # (selecting 'rating' removes the first level of the column index.
    # before: (rating: (mean, count)), after: (mean, count))
    per_movie = merged_ratings.groupby('movie_id')
    measures: pd.DataFrame = per_movie.agg(['mean', 'count'])['rating']

    # mean, count and score side by side in one DataFrame
    measures_movies = pd.merge(measures, pd.DataFrame(score),
                               left_index=True, right_index=True)
    measures_movies = measures_movies.rename(columns={0: 'score'})

    if popularity_bias:
        # give more weight to the number of ratings (~popularity) by raising
        # the rating-weighted score to some power (to preserve some notion of
        # good vs. bad ratings) and multiplying the count back in
        return measures_movies.eval('((mean * score) ** 3) * count')

    # multiply with the score to keep well rated but barely related movies down
    return measures_movies.eval('mean * score')
def get_genre_as_lists():
    """
    Return the genre column of the metadata with every cell evaluated.

    :return: a Series holding the result of ``eval`` for every cell
        (presumably lists parsed from their string representation)
    """
    # NOTE(review): eval() executes the stored strings - fine only as long as
    # the metadata is trusted.
    genre_strings = Data.movie_meta()[Column.genres.value]
    return genre_strings.map(eval)
def directors_as_lists():
    """
    Return the directors column of the metadata with every cell evaluated.

    :return: a Series holding the result of ``eval`` for every cell
        (presumably lists parsed from their string representation)
    """
    # NOTE(review): eval() executes the stored strings - fine only as long as
    # the metadata is trusted.
    director_strings = Data.movie_meta()[Column.directors.value]
    return director_strings.map(eval)
def actors_as_lists():
    """
    Return the actors column of the metadata with every cell evaluated.

    Evaluating (string representation -> object) is costly time-wise, so the
    result is worth caching.
    NOTE(review): no caching is visible in this function itself - confirm
    that a cache is applied where it is defined or called.

    :return: a Series holding the result of ``eval`` for every cell
        (presumably lists parsed from their string representation)
    """
    # NOTE(review): eval() executes the stored strings - fine only as long as
    # the metadata is trusted.
    actor_strings = Data.movie_meta()[Column.actors.value]
    return actor_strings.map(eval)
def drop_collection(movie_id: int, df: pd.DataFrame = None) -> pd.DataFrame:
    """
    Remove the movies selected by the collection mask of ``movie_id``.

    :param movie_id: the movie whose collection should be removed
    :param df: the DataFrame to filter; defaults to ``Data.movie_meta()``
    :return: the rows of ``df`` for which ``get_collection_mask`` is false
    """
    frame = Data.movie_meta() if df is None else df
    in_collection = get_collection_mask(movie_id, frame)
    return frame[~in_collection]