def data_initialization(self):
    """
    Builds the item-rating model from the data frames returned by read_files

    The ratings are pivoted into a user x title matrix (missing ratings are
    filled with 0), the transposed matrix is reduced with a truncated SVD
    and the reduced rows are turned into a correlation matrix
    @return: a tuple of (the titles forming the pivot columns, the
        correlation matrix between those titles, the movies data frame)
    """
    logging.debug(
        f'[{MovieRecommendationItemRating.data_initialization.__name__}] - start of function'
    )
    df_movies, df_ratings = self.read_files()

    # Attach the movie columns to every rating and discard rows without title
    ratings_with_titles = pd.merge(df_ratings, df_movies, on='movieId')
    ratings_with_titles = ratings_with_titles.dropna(axis=0, subset=['title'])

    # Number of ratings each title received
    counts_per_title = ratings_with_titles.groupby(by=['title'])['rating'].count()
    counts_per_title = counts_per_title.reset_index()
    counts_per_title = counts_per_title.rename(
        columns={'rating': 'totalRatingCount'})
    counts_per_title = counts_per_title[['title', 'totalRatingCount']]

    ratings_with_counts = ratings_with_titles.merge(counts_per_title,
                                                    left_on='title',
                                                    right_on='title',
                                                    how='left')

    # Keep a single row per user and title before pivoting
    unique_user_ratings = ratings_with_counts.drop_duplicates(
        ['userId', 'title'])
    pivot = pd.pivot_table(unique_user_ratings,
                           index='userId',
                           columns='title',
                           values='rating').fillna(0)

    # After transposing, every row stands for one title
    title_rows = pivot.values.T

    # calculating correlation matrix i.e. the model
    svd = TruncatedSVD(n_components=12, random_state=17)
    reduced_rows = svd.fit_transform(title_rows)
    correlation = np.corrcoef(reduced_rows)

    return pivot.columns, correlation, df_movies
def map_string_to_movie(selection_query):
    """
    Looks up the movie rows whose title equals the given string, so the id
    can be used for further provision of information
    @param selection_query: the title of an existing movie
    @return: a data frame holding the id and title columns of every movie
        whose title is equal to the selection query (empty if none matches)
    """
    logging.debug(
        f'[{map_string_to_movie.__name__}] - start of function with selection query: {selection_query}'
    )
    movies_csv = os.path.join(MOVIELENS_ROOT, 'movies.csv')
    column_types = {MOVIE_ID: 'int32', TITLE: 'str'}
    df_movies: pd.DataFrame = pd.read_csv(movies_csv,
                                          encoding="UTF-8",
                                          usecols=[MOVIE_ID, TITLE],
                                          dtype=column_types)

    # Map the title back to the original movie to use its id
    title_matches = df_movies[TITLE] == selection_query
    movie_object = df_movies.loc[title_matches]
    logging.debug(
        f'[{map_string_to_movie.__name__}] - Movie object: {movie_object}')
    return movie_object
def get_similar_movies_based_on_genre(self, input_movie_title):
    """
    Returns a collection of similar movie titles based on the genre
    @param input_movie_title: the reference movie title for the recommendation
    @return: the result of DataFrame.to_dict() for a frame with the columns
        'index' and 'title', holding at most 5 similar movies
    """
    logging.debug(
        f'[{self.get_similar_movies_based_on_genre.__name__}] - '
        f'start of function with movie title: {input_movie_title}')
    similarity, movies = self.data_initialization('resources/movies.csv')

    # Lookup from movie title to the row label of the movie
    title_to_row = pd.Series(movies.index, movies['title'])
    reference_row = title_to_row[input_movie_title]

    # Rank every row by its similarity to the reference row, best first
    ranked = sorted(enumerate(similarity[reference_row]),
                    key=lambda pair: pair[1],
                    reverse=True)

    # The best ranked entry is skipped, the following 5 entries are kept
    top_rows = [row for row, _ in ranked[1:6]]

    similar_movies = pd.DataFrame(movies[['title']].iloc[top_rows])
    return similar_movies.reset_index().to_dict()
def metadata_recommender_with_keywords(self, movie_id):
    """
    Metadata recommender based on keywords and genres

    :param movie_id: the id of the movie
    :return: list of movie ids ordered by descending jaccard similarity,
        may be [], maximum length is 5
    """
    logging.debug(
        f'[{self.metadata_recommender_with_keywords.__name__}] - start function with movie id: {movie_id}'
    )
    if movie_id not in self.movie_metadata:
        return []

    reference = self.movie_metadata[movie_id]
    genres = reference[GENRES_COL]
    keywords = reference[KEYWORDS_COL]

    # Score list of the reference movie, genres first, keywords second
    reference_scores = []
    Recommender.add_score_to_list(genres, 2, reference_scores)
    Recommender.add_score_to_list(keywords, 10, reference_scores)

    similarity_by_movie = {}
    for candidate_id, candidate in self.movie_metadata.items():
        if candidate_id != movie_id:
            candidate_scores = []
            Recommender.match_with_bias(candidate[GENRES_COL], genres, 2, 0,
                                        candidate_scores)
            Recommender.match_with_bias(candidate[KEYWORDS_COL], keywords, 10,
                                        5, candidate_scores)
            similarity_by_movie[candidate_id] = float(
                sm.jaccard_similarity(reference_scores, candidate_scores))

    ranked_ids = sorted(similarity_by_movie,
                        key=similarity_by_movie.get,
                        reverse=True)
    return ranked_ids[:5]
def get_similar_movies_based_on_tags(self, input_movie_title):
    """
    Matches the given movie title with the most similar movies based on the
    content tags
    @param input_movie_title: the movie title for the recommendation
    @return: the result of DataFrame.to_dict() for a frame with the columns
        'index' and 'title', holding at most 5 similar movies
    """
    logging.debug(
        f'[{self.get_similar_movies_based_on_tags.__name__}] - start of function with title <{input_movie_title}>'
    )
    similarity_matrix, content_frame = self.read_model_content_data()

    # Series that maps a movie title to the row label of that movie
    row_by_title = pd.Series(content_frame.index, content_frame['title'])
    selected_row = row_by_title[input_movie_title]

    # Pair every row number with its score and order the pairs by score
    scored_rows = list(enumerate(similarity_matrix[selected_row]))
    scored_rows.sort(key=lambda entry: entry[1], reverse=True)

    # Drop the first pair and keep the 5 pairs that follow it
    best_rows = []
    for row_number, _score in scored_rows[1:6]:
        best_rows.append(row_number)

    titles = pd.DataFrame(content_frame[['title']].iloc[best_rows])
    titles = titles.reset_index()
    return titles.to_dict()
def _get_views_dict(movie_collection: list, movie_dict: dict) -> dict: logging.debug( f'[{_get_views_dict.__name__}] - start to transform movie collection to dictionary' ) if type(movie_collection) is list: _get_movie_dict(movie_collection, movie_dict) elif type(movie_collection) is dict: _df_to_movie_dict(movie_collection, movie_dict) return movie_dict
def offline_initialization(self, movies_data, tags_data, max_tag_features=500):
    """
    Builds the tag based similarity model and writes it to disk

    Every movie gets one 0/1 feature column per selected tag. The matrix of
    all feature columns is fed to linear_kernel; the result is written to
    'model.txt' and the enriched movie frame to 'movie_content.csv'
    @param movies_data: path or buffer of the movies csv, needs the columns
        'movieId', 'title' and 'genres'
    @param tags_data: path or buffer of the tags csv, needs the columns
        'movieId' and 'tag'
    @param max_tag_features: maximum number of tag feature columns to create
    @return: None, the results are written to the two files named above
    """
    logging.debug(
        f'[{self.offline_initialization.__name__}] - '
        f'start of function with movies_data <{movies_data}> and tags_data <{tags_data}>'
    )
    # reading the movies and tags datasets
    movie_list = pd.read_csv(movies_data, encoding="Latin1")
    tag_list = pd.read_csv(tags_data, encoding="Latin1")

    # Rows without a tag carry no information and would break the string
    # handling below
    tag_rows = tag_list.dropna(subset=['tag'])
    tag_values = tag_rows['tag'].astype(str)

    # De-duplicated tag values without the blank value. Sorted, because
    # slicing an unordered set would select different features on every run
    distinct_tags = sorted({
        part
        for tag in tag_values for part in tag.split('|') if part
    })

    # One '|'-terminated tag string per movie, built in a single pass instead
    # of appending one data frame row per movie
    tags_per_movie = (tag_values + '|').groupby(tag_rows['movieId'],
                                                sort=False).agg(''.join)
    combine_movie_tags = movie_list.copy()
    combine_movie_tags['tags'] = combine_movie_tags['movieId'].map(
        tags_per_movie).fillna('')

    # A tag that is named like an existing column (e.g. 'title') must not
    # overwrite that column
    reserved_names = set(combine_movie_tags.columns)
    selected_tags = [
        tag for tag in distinct_tags if tag not in reserved_names
    ][:max_tag_features]

    # NOTE(review): this is a substring test on the joined tag string, so a
    # tag also matches movies whose tags merely contain it. Kept as is to
    # preserve the existing model
    movie_tag_strings = combine_movie_tags['tags']
    tag_features = pd.DataFrame(
        {
            tag: movie_tag_strings.str.contains(tag, regex=False).astype(int)
            for tag in selected_tags
        },
        index=combine_movie_tags.index)

    # The positional index is kept on purpose: row i of the model belongs to
    # row i of this frame
    movie_content_df_temp = pd.concat([combine_movie_tags, tag_features],
                                      axis=1)
    movie_content_df = movie_content_df_temp.drop(
        columns=['movieId', 'title', 'genres', 'tags']).values
    logging.debug(
        f'[{self.offline_initialization.__name__}] - movie content dataframe: {movie_content_df}'
    )

    # Compute the similarity matrix
    cosine_sim = linear_kernel(movie_content_df, movie_content_df)

    # write model for offline initialization
    np.savetxt('model.txt', cosine_sim)

    # write Movie contents for runtime recommendations
    movie_content_df_temp.to_csv('movie_content.csv', index=True, header=True)
def _remove_year_from_title(movie_title: str) -> str:
    """
    Strips everything that matches YEAR_PATTERN from the given movie title
    @param movie_title: the movie title to be cleaned
    @return: the movie title with all matches of YEAR_PATTERN removed
    """
    function_name = _remove_year_from_title.__name__
    logging.debug(f'[{function_name}] - start of function')
    return_string: str = re.sub(YEAR_PATTERN, '', movie_title)
    logging.debug(f'[{function_name}] - return string: {return_string}')
    return return_string
def _get_json_response(image_url: str) -> str:
    """
    Requests the given url and extracts the poster entry of its json response
    @param image_url: the url to be requested
    @return: the value stored under POSTER in the json response. None if the
        request fails, the body is not a json object or holds no POSTER entry
    """
    logging.debug(f'[{_get_json_response.__name__}] - start of function')
    try:
        # Without a timeout a stalled remote host blocks the caller forever
        response = requests.get(image_url, timeout=10)
        json_response = response.json()
    except (requests.RequestException, ValueError) as request_error:
        # Only the error type is logged, the url may carry an api key
        logging.error(
            f'[{_get_json_response.__name__}] - request failed: {type(request_error).__name__}'
        )
        return None

    # poster url is in the json_response
    if isinstance(json_response, dict) and POSTER in json_response:
        return json_response[POSTER]
    return None
def metadata_recommender(self, movie_id: int, bias=15):
    """
    Metadata recommender based on genres, language, actors, directors and keywords

    :param movie_id: the id of the movie
    :param bias: the scoring bias
    :return: list of movie ids ordered by descending cosine similarity,
        may be [], maximum length is 5
    """
    logging.debug(
        f'[{self.metadata_recommender.__name__}] - start function with movie id: {movie_id}'
    )
    if movie_id not in self.movie_metadata:
        return []

    # The columns are always processed in this order
    columns = (GENRES_COL, LANGUAGES_COL, ACTORS_COL, DIRECTORS_COL,
               KEYWORDS_COL)
    reference = self.movie_metadata[movie_id]
    reference_values = [reference[column] for column in columns]

    reference_scores = []
    for values in reference_values:
        Recommender.add_score_to_list(values, bias, reference_scores)

    similarity_by_movie = {}
    for candidate_id, candidate in self.movie_metadata.items():
        if candidate_id != movie_id:
            candidate_scores = []
            for column, values in zip(columns, reference_values):
                Recommender.match_with_bias(candidate[column], values, bias,
                                            1, candidate_scores)
            similarity_by_movie[candidate_id] = float(
                sm.cosine_similarity(reference_scores, candidate_scores))

    ranked_ids = sorted(similarity_by_movie,
                        key=similarity_by_movie.get,
                        reverse=True)
    return ranked_ids[:5]
def get_similar_movies_based_on_itemRating(self, input_movie_title):
    """
    Returns similar movie titles based on the item-rating model
    @param input_movie_title: the reference movie title for the recommendation
    @return: the result of DataFrame.to_dict() for a frame with the column
        'title', holding at most 5 similar movie titles
    """
    logging.debug(
        f'[{MovieRecommendationItemRating.get_similar_movies_based_on_itemRating.__name__}] - '
        f'start of function with movie title <{input_movie_title}>')
    movie_title, model, _ = self.data_initialization(self)

    # Position of the reference title inside the model
    reference_position = list(movie_title).index(input_movie_title)

    # Order all positions by their score against the reference, best first
    ranked = sorted(enumerate(model[reference_position]),
                    key=lambda pair: pair[1],
                    reverse=True)

    # The best ranked entry is skipped, the following 5 entries are kept
    movie_indices = [position for position, _score in ranked[1:6]]

    similar_movies = pd.DataFrame()
    similar_movies['title'] = movie_title[movie_indices]
    logging.debug(
        f'[{MovieRecommendationItemRating.get_similar_movies_based_on_itemRating.__name__}] - item-rating executed'
    )
    return similar_movies.to_dict()
def _title_to_image_url(movie_title: str, comma_check: bool = False) -> str:
    """
    Transforms the given movie title into an image url. Removes year and if comma_check is True,
    also transforms the movie title
    @param movie_title: movie title to be transformed into an image url
    @param comma_check: if True, the comma gets removed and the str part behind the comma appended to the
    beginning of the movie title
    @return: one randomly chosen entry of API_KEY_PREFIX followed by the
        url-encoded movie title
    """
    # Local import, so the block does not depend on the file's import header
    from urllib.parse import quote

    logging.debug(f'[{_title_to_image_url.__name__}] - start of function')
    temp_string: str = _remove_year_from_title(movie_title)
    if comma_check:
        temp_string = _comma_check(temp_string)
    temp_string = temp_string.replace(BLANK, PLUS)
    # Titles may hold characters such as '&' or '#' that would otherwise end
    # the query value early. PLUS was just inserted as separator and stays
    temp_string = quote(temp_string, safe=PLUS)
    image_url: str = random.choice(API_KEY_PREFIX) + temp_string
    logging.debug(f'[{_title_to_image_url.__name__}] - image url: {image_url}')
    return image_url
def data_initialization(self, movie_dataset):
    """
    Builds the genre based similarity model for the given movie dataset

    Every movie gets one 0/1 column per genre value found in the
    '|'-separated 'genres' column; the matrix of these columns is fed to
    linear_kernel
    @param movie_dataset: path or buffer of the movies csv, needs the
        columns 'movieId', 'title' and 'genres'
    @return: a tuple of (the similarity matrix, the movie data frame
        enriched with the genre columns)
    """
    logging.debug(
        f'[{self.data_initialization.__name__}] - start of function with movie dataset: {movie_dataset}'
    )
    # reading the movies dataset
    movie_list = pd.read_csv(movie_dataset, encoding="Latin1")
    genre_strings = movie_list['genres']

    # De-duplicated genre values without the blank value. Sorted, so that the
    # order of the genre columns is the same on every run
    genres = sorted({
        genre
        for entry in genre_strings for genre in entry.split('|') if genre
    })

    # Enriching the movies dataset by adding the various genres columns.
    # NOTE(review): substring test on the genre string, kept as is to
    # preserve the existing model
    movie_content_df_temp = movie_list.copy()
    for genre in genres:
        movie_content_df_temp[genre] = genre_strings.str.contains(
            genre, regex=False).astype(int)

    # The positional index is kept on purpose: row i of the similarity matrix
    # belongs to row i of the returned frame
    movie_content_df = movie_content_df_temp.drop(
        columns=['movieId', 'title', 'genres']).values
    logging.debug(
        f'[{self.data_initialization.__name__}] - movie content dataframe: {movie_content_df}'
    )

    # Compute the similarity matrix
    cosine_sim = linear_kernel(movie_content_df, movie_content_df)
    return cosine_sim, movie_content_df_temp
def _comma_check(movie_title: str) -> str:
    """
    Moves the part behind the last COMMA of the movie title to its beginning
    @param movie_title: will be checked for a comma in the string
    @return: the parts of the title, stripped of surrounding blanks, with the
        last part moved to the front and joined by BLANK; the unchanged title
        if it holds no COMMA
    """
    logging.debug(f'[{_comma_check.__name__}] - start of function')
    if COMMA not in movie_title:
        logging.debug(f'[{_comma_check.__name__}] - comma not found in {movie_title}')
        return movie_title

    logging.debug(f'[{_comma_check.__name__}] - comma found in {movie_title}')
    # remove all blanks before and after a word
    parts = [part.strip() for part in movie_title.split(COMMA)]
    # the last part goes first, the remaining parts keep their order
    reordered = parts[-1:] + parts[:-1]
    return BLANK.join(reordered)
def home(request):
    """
    Renders the index page, with search results if a query was posted
    @param request: the incoming request
    @return: the rendered "index.html"; for a POST request the context holds
        "results", mapping every matched key to the value of
        mp.get_image_url for that key
    """
    if request.method != 'POST':
        # No search query yet entered
        logging.debug('No search query entered at the moment')
        return render(request, "index.html", {})

    logging.debug(f'[{home.__name__}] - start with POST request: {request}')
    # The search query is saved in the text field of the posted form
    search_query = request.POST.copy().get('movieTextField')
    logging.debug(f'[{home.__name__}] - search query: {search_query}')

    # Match the search_query and attach an image url to every matched key
    results = match_strings(search_query)
    images_by_key = {key: mp.get_image_url(key) for key in dict(results)}
    return render(request, "index.html", {"results": images_by_key})
def get_image_url(movie_title: str) -> str:
    """
    Retrieves the poster url for the given movie title

    The title is first requested with comma correction; if that yields
    nothing, the request is repeated without comma correction
    @param movie_title: the movie title for which an image url will be retrieved
    @return: the image url, where the poster of the movie can be found.
        Note: Can be of type None, also if movie_title is not a str.
    """
    if not isinstance(movie_title, str):
        logging.error(f'[{get_image_url.__name__}] - movie_title in parameter is not of type str')
        return None

    logging.debug(f'[{get_image_url.__name__}] - start function with movie title: {movie_title}')
    image_url: str = _title_to_image_url(movie_title=movie_title, comma_check=True)
    json_response = _get_json_response(image_url)

    # comma corrected movie_title did receive a response
    if json_response is not None:
        logging.debug(f'[{get_image_url.__name__}] - json response received with comma correction')
        return json_response

    # try without comma correction
    logging.debug(
        f'[{get_image_url.__name__}] - no json response with comma correction, retrying without comma correction'
    )
    image_url = _title_to_image_url(movie_title=movie_title, comma_check=False)
    return _get_json_response(image_url)
def read_model_content_data(self):
    """
    Reads the movie content file that belongs to the tag model
    @return: a tuple of (model_data, the data frame read from
        'resources/movie_content.csv')
    """
    logging.debug(
        f'[{self.read_model_content_data.__name__}] - start of function')
    content_frame = pd.read_csv('resources/movie_content.csv',
                                encoding="utf-8")
    # NOTE(review): model_data is not assigned inside this function, it has
    # to exist in an enclosing scope - confirm where it is loaded
    return model_data, content_frame
def read_files(self):
    """
    Reads the movies and the ratings csv files from the resources folder
    @return: a tuple of (movies data frame, ratings data frame limited to
        the columns 'userId', 'movieId' and 'rating')
    """
    logging.debug(f'[{self.read_files.__name__}] - reading csv files')
    rating_columns = ['userId', 'movieId', 'rating']
    df_movies = pd.read_csv('resources/movies.csv', encoding="Latin1")
    df_ratings = pd.read_csv('resources/ratings.csv', usecols=rating_columns)
    return df_movies, df_ratings
def recommendation(request):
    """
    Renders the recommendations of all algorithms for the selected movie

    The movie title is taken from the 'submit' value of the posted form,
    mapped back to its movie row and handed to the five recommenders. The
    results are transformed concurrently by _get_views_dict
    @param request: the incoming request
    @return: the rendered "recommendations.html", or the rendered
        "error.html" if any step raised an exception. None for requests
        that are not POST requests
    """
    if request.method != 'POST':
        # NOTE(review): unchanged behavior - non-POST requests get no
        # response object; confirm whether a redirect is wanted here
        return None

    logging.debug(
        f'[{recommendation.__name__}] - start function with request: {request}'
    )
    # The whole process is guarded, so that an unknown title or a failing
    # recommender ends on the error page as well
    try:
        # The movie title is the value of the selected submit button in the form
        selection_query = request.POST['submit']
        # Again needs to be mapped to the actual movie object as only the string is provided
        selection = map_string_to_movie(selection_query)
        selected_row = selection.iloc[0]
        # ID for different algorithms to work
        selection_id = selected_row[MOVIE_ID]
        # Title to show the user as the selected movie
        selection_title = selected_row[TITLE]
        logging.debug(
            f'[{recommendation.__name__}] - selection id {selection_id}, selection title {selection_title}'
        )

        # Results of different algorithms
        rec = recommender.Recommender()
        movies_metadata: list = rec.metadata_recommender(selection_id)
        movies_keywords: list = rec.metadata_recommender_with_keywords(
            selection_id)
        rec_obj = MovieRecommendationItemRating()
        movies_item_rating = rec_obj.get_similar_movies_based_on_itemRating(
            rec_obj, selection_title)
        obj_rec = MovieRecommendationByGenre()
        movies_genres = obj_rec.get_similar_movies_based_on_genre(
            selection_title)
        obj = MovieRecommendationByTags()
        movies_tags = obj.get_similar_movies_based_on_tags(selection_title)
        selection_tuple: tuple = (selection_title,
                                  mp.get_image_url(selection_title))

        movie_lists = [
            movies_metadata, movies_keywords, movies_item_rating,
            movies_genres, movies_tags
        ]
        # One result dictionary per algorithm, filled in place by the workers
        alg_list = [dict() for _ in movie_lists]
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=5) as executor:
            futures = [
                executor.submit(_get_views_dict, movies, target)
                for movies, target in zip(movie_lists, alg_list)
            ]

        # A failed worker must not vanish silently. The page is still
        # rendered with whatever the other workers produced
        for number, future in enumerate(futures, start=1):
            worker_error = future.exception()
            if worker_error is not None:
                logging.error(
                    f'[{recommendation.__name__}] - transformation for alg{number} failed with error: {worker_error}'
                )

        alg1, alg2, alg3, alg4, alg5 = alg_list
        return render(
            request, "recommendations.html", {
                "selection_title": selection_tuple,
                "alg1": alg1,
                "alg2": alg2,
                "alg3": alg3,
                "alg4": alg4,
                "alg5": alg5
            })
    except Exception as excError:
        logging.error(
            f'An error occurred during the recommendation process with error: {excError}'
        )
        return render(request, "error.html", {"error": excError})