Exemple #1
0
def recommendations():
    """
    Render the recommendations page.

    On a POST request the ``tconst`` form field is read, the matching primaryTitle is looked up in
    the title_basics table and up to 20 recommendations are requested from the Recommender.
    For any other request method the page is rendered with an empty movie list and message.
    :return: the rendered "recommendations.html" template
    """
    movies_enhanched = []
    msg = ''
    if request.method == "POST":
        # Imported locally so this view does not rely on a module-level import being present
        from sqlalchemy import text

        tconst_id = request.form['tconst']
        dbhandler = DbHandler()
        dbhandler.connect()
        # tconst_id comes straight from the client, so it is passed as a bound parameter
        # instead of being interpolated into the SQL string
        movie_title = dbhandler.conn.execute(
            text("SELECT primaryTitle FROM title_basics WHERE tconst = :tconst"),
            {"tconst": tconst_id}).fetchall()
        if not movie_title:
            # Unknown id: avoid indexing into an empty result
            msg = "Movie not found, please try again"
        else:
            msg = f"Recommendations for '{movie_title[0][0]}'"
            print(msg)
            # The recommender (DB query + pickle load) is only needed once a movie is found
            rec = Recommender()
            rec.import_cosine_sim_from_pkl()
            movies = rec.get_recommendation_titles_from_tconst(
                tconst=tconst_id, limit=20)
            # The first entry is skipped - presumably it is the queried movie itself,
            # TODO confirm against the recommender ordering
            for movie in movies[1:]:
                movies_enhanched.append({
                    "tconst": movie[0],
                    "title": movie[1],
                    "imdburl": f"https://www.imdb.com/title/{movie[0]}/",
                })
    return render_template("recommendations.html",
                           name="Apostolis",
                           movies=movies_enhanched,
                           result_message=msg)
 def test_get_titles_from_tconst(self):
     """
     Check get_titles_from_tconst_list against the title_basics table.
     A random movie is picked from the cosine_sim frame, its recommendations are resolved to titles
     and the number of titles is compared with the number of matching rows found in the db
     :return:
     """
     recommender = Recommender()
     recommender.import_cosine_sim_from_pkl()
     # Pick a random movie to test with
     known_tconsts = list(recommender.cosine_sim['tconst'].values)
     picked_tconst = random.choice(known_tconsts)
     recommended_tconsts = recommender.get_recommendation_from_tconst(
         picked_tconst)
     titles_from_helper = [
         pair[1]
         for pair in get_titles_from_tconst_list(recommended_tconsts)
     ]
     # Fetch the reference titles; the join limits the rows to the movies that interest us
     db = DbHandler()
     db.connect()
     query = text(
         "SELECT tconst,primaryTitle from title_basics NATURAL JOIN title_keywords"
     )
     db_rows = db.conn.execute(query)
     # Load the query result in a df
     db_titles_df = pd.DataFrame(data=list(db_rows),
                                 columns=['tconst', 'primaryTitle'])
     # Titles whose tconst is among the recommended ones
     is_recommended = db_titles_df['tconst'].isin(recommended_tconsts)
     titles_from_db = db_titles_df.loc[is_recommended]['primaryTitle']
     # Every recommended movie must have been found
     assert len(titles_from_helper) == titles_from_db.size
 def insert_to_db(self):
     """
     Append the rows of self.filtered_df to the title_basics table (the df index is not written)
     :return:
     """
     db = DbHandler()
     db.connect()
     self.filtered_df.to_sql(name="title_basics",
                             con=db.conn,
                             index=False,
                             if_exists='append')
 def test_execute_select_from_sql(self):
     """
     Run the sql/select_actors.sql file through the handler and print the first two tconst values.
     Any exception raised while doing so fails the test
     :return:
     """
     db = DbHandler()
     db.connect()
     try:
         sql_file = os.path.join(DATA_PATH, "sql/select_actors.sql")
         rows = db.exec_select_sql_from_file(sql_file)
         tconst_values = [row['tconst'] for row in rows]
         print(tconst_values[:2])
     except Exception as err:
         assert False, err
Exemple #5
0
 def __init__(self):
     """
     Connect to the db, load the result of sql/select_soup.sql in self.df (columns: tconst, soup)
     and initialise the cosine similarity attributes
     """
     self.dbhandler = DbHandler()
     self.dbhandler.connect()
     soup_sql_path = os.path.join(DATA_PATH, "sql/select_soup.sql")
     soup_rows = self.dbhandler.exec_select_sql_from_file(soup_sql_path)
     self.df = pd.DataFrame(data=soup_rows, columns=['tconst', 'soup'])
     # No similarity matrix yet, it has to be created or imported later on
     self.cosine_sim = None
     self.cosine_sim_csv_path = os.path.join(DATA_PATH,
                                             'cosine_sim/cosine_sim.csv')
 def filter_foreign_keys(self):
     """
     Keep only the rows of self.filtered_df whose nconst appears in the title_principals table
     :return:
     """
     myDbHandler = DbHandler()
     myDbHandler.connect()
     # Adjacent string literals are concatenated by the parser, so no backslash is needed here;
     # a "\ " inside the quotes is an invalid escape sequence and would put a literal backslash
     # in the SQL statement
     nconst_ids = [
         row["nconst"] for row in myDbHandler.conn.execute(
             text("SELECT DISTINCT nconst "
                  "FROM title_principals"))
     ]
     self.filtered_df = self.filtered_df[self.filtered_df.nconst.isin(
         nconst_ids)]
     return
 def test_execute_query(self):
     """
     Run a sample query to make sure the connection is indeed correct and the schema is created.
     The query selects the table names of the 'movie_recommender' schema from information_schema;
     the returned rows are not inspected, only the absence of an exception is checked
     :return:
     """
     handler = DbHandler()
     handler.connect()
     try:
         # The backslash continues the string literal on the next line, so the leading whitespace
         # of that line is part of the SQL text
         handler.conn.execute(text("SELECT table_name FROM information_schema.tables \
         WHERE table_schema = 'movie_recommender';"))
     except Exception as e:
         # Any failure becomes a test failure with the original exception as its message
         assert False, e
 def insert_to_db(self):
     """
     Append the rows of self.filtered_df to the title_ratings table (the df index is not written)
     :return:
     """
     db = DbHandler()
     db.connect()
     self.filtered_df.to_sql(name="title_ratings",
                             con=db.conn,
                             index=False,
                             if_exists='append')
 def __init__(self):
     """
     Load the genres/keywords and the actors query results in two dataframes, print a preview of
     each one and prepare an empty soup dataframe (columns: tconst, soup)
     """
     db = DbHandler()
     # Genres and keywords per title
     genres_keywords_sql = os.path.join(DATA_PATH,
                                        'sql/select_genres_keywords.sql')
     genres_keywords_rows = db.exec_select_sql_from_file(genres_keywords_sql)
     self.keywords_df = pd.DataFrame(
         genres_keywords_rows, columns=['tconst', 'genres', 'keywords'])
     print(self.keywords_df.head())
     # Actors per title
     actors_sql = os.path.join(DATA_PATH, 'sql/select_actors.sql')
     actors_rows = db.exec_select_sql_from_file(actors_sql)
     self.actors_df = pd.DataFrame(actors_rows,
                                   columns=['tconst', 'PrimaryName'])
     print(self.actors_df.head())
     # Filled later on
     self.soup_df = pd.DataFrame(columns=['tconst', 'soup'])
 def filter_foreign_keys(self):
     """
     Drop from self.filtered_df every row whose tconst is not present in the title_basics table
     :return:
     """
     db = DbHandler()
     db.connect()
     title_rows = db.conn.execute(text("SELECT tconst FROM title_basics"))
     known_tconsts = [title_row["tconst"] for title_row in title_rows]
     has_known_tconst = self.filtered_df.tconst.isin(known_tconsts)
     self.filtered_df = self.filtered_df[has_known_tconst]
 def filter_foreign_keys(self):
     """
     Drop the rows of self.filtered_df whose tconst is not in the title_basics table, then group
     by (tconst, nconst) and keep only those two columns
     :return:
     """
     db = DbHandler()
     db.connect()
     title_rows = db.conn.execute(text("SELECT tconst FROM title_basics"))
     known_tconsts = [title_row["tconst"] for title_row in title_rows]
     has_known_tconst = self.filtered_df.tconst.isin(known_tconsts)
     self.filtered_df = self.filtered_df[has_known_tconst]
     # One row per (tconst, nconst) pair; the size column added by the grouping is dropped
     pair_columns = ['tconst', 'nconst']
     grouped = self.filtered_df.groupby(['tconst', 'nconst']).size()
     self.filtered_df = grouped.reset_index()
     self.filtered_df = self.filtered_df[pair_columns]
 def insert_to_db(self) -> None:
     """
     Append the rows of self.soup_df to the title_soup table (the df index is not written) and
     print a confirmation message. self.soup_df is expected to be filled before this is called
     :return:
     """
     db = DbHandler()
     db.connect()
     self.soup_df.to_sql(name="title_soup",
                         con=db.conn,
                         index=False,
                         if_exists='append')
     print("Successfully inserted values in the db")
Exemple #13
0
def get_titles_from_tconst_list(tconst_list: list) -> list:
    """
    Resolve every tconst of tconst_list to its primaryTitle.
    The titles are read from the natural join of the title_basics and title_soup tables

    :param tconst_list: list of tconst ids, normally generated via get_recommendation_from_tconst
    :return: list of tuples, where first field is the tconst and second field is the title from the title_basics table,
             in the same order as tconst_list
    :raises IndexError: if a tconst of tconst_list is not returned by the query
    """
    if not tconst_list:
        # Nothing to resolve, no need to hit the db
        return []
    dbhandler = DbHandler()
    dbhandler.connect()
    rows = dbhandler.conn.execute(
        sql_text("SELECT tconst, primaryTitle FROM "
                 "title_basics NATURAL JOIN title_soup"))
    # Build the lookup once instead of scanning all the rows for every requested tconst.
    # setdefault keeps the first title seen for a tconst, should the join return duplicates
    title_by_tconst = {}
    for row in rows:
        title_by_tconst.setdefault(row[0], row[1])
    titles = []
    for tconst in tconst_list:
        if tconst not in title_by_tconst:
            # IndexError is what a missing tconst has always raised here; keep the type,
            # but say which id is missing
            raise IndexError(
                f"tconst {tconst!r} not found in title_basics NATURAL JOIN title_soup"
            )
        titles.append((tconst, title_by_tconst[tconst]))
    return titles
Exemple #14
0
def index():
    """
    Render the search page.

    On a POST request the ``title`` form field is used to search title_basics (natural join
    title_keywords) for movies whose primaryTitle contains it, limited to 20 results. Every result
    row is converted to a dict and enriched with an ``imdburl`` entry.
    :return: the rendered "index.html" template
    """
    enhanced_movies = []
    # Default message; set up front so msg is bound for every request method (e.g. HEAD),
    # not only for GET and POST
    msg = "Provide a title to search for a movie"
    if request.method == "POST":
        movie_param = request.form["title"]
        msg = f"Results for '{movie_param}'"
        if movie_param.strip():  # if movie_param is not whitespace
            # Imported locally so this view does not rely on a module-level import being present
            from sqlalchemy import text

            dbhandler = DbHandler()
            dbhandler.connect()
            # movie_param is user input: it is passed as a bound parameter (wildcards included
            # in the value) instead of being interpolated into the SQL string
            sql_cmd = text(
                "select tconst, startYear,primaryTitle, genres FROM title_basics "
                "natural join title_keywords WHERE primaryTitle LIKE :pattern LIMIT 20"
            )
            movies = dbhandler.conn.execute(sql_cmd,
                                            {"pattern": f"%{movie_param}%"})
            for movie in movies:
                movie_dict = {field: movie[field] for field in movie._fields}
                movie_dict[
                    'imdburl'] = f"https://www.imdb.com/title/{movie['tconst']}/"
                enhanced_movies.append(movie_dict)
        if not enhanced_movies:
            msg = "No results where found, please try again"
    return render_template("index.html",
                           movies=enhanced_movies,
                           result_message=msg)
 def test_handler_init(self):
     """
     A freshly created handler must have its host, password, username and db attributes set
     :return:
     """
     db = DbHandler()
     # Checked one by one, in the same order, so the first missing setting fails the test
     assert db.host is not None
     assert db.password is not None
     assert db.username is not None
     assert db.db is not None
Exemple #16
0
class Recommender:
    """
    Content based recommender built on the cosine similarity of the movies' "soup" text.

    self.df holds the (tconst, soup) rows read from the db. self.cosine_sim is a dataframe whose
    first column is 'tconst' followed by one integer-labelled similarity column per movie; it is
    None until create_cosine_sim or import_cosine_sim_from_pkl is called
    """

    def __init__(self):
        """
        Connect to the db and load the result of sql/select_soup.sql in self.df
        """
        self.dbhandler = DbHandler()
        self.dbhandler.connect()
        data = self.dbhandler.exec_select_sql_from_file(
            os.path.join(DATA_PATH, "sql/select_soup.sql"))
        self.df = pd.DataFrame(data=data, columns=['tconst', 'soup'])
        self.cosine_sim = None
        self.cosine_sim_csv_path = os.path.join(DATA_PATH,
                                                'cosine_sim/cosine_sim.csv')

    def create_cosine_sim(self) -> None:
        """
        Creates the cosine_sim matrix from the self.df dataframe
        This is the method that does all the raw calculation, and this only needs to be done
        when the dataset changes or the exported cosine_sim file is missing for some reason
        Otherwise it is better to just import the file that is generated by an earlier
        export (see export_cosine_sim_to_pkl / import_cosine_sim_from_pkl)
        :return:
        """
        cv = CountVectorizer()
        count_matrix = cv.fit_transform(self.df['soup'])
        cosine_sim = cosine_similarity(count_matrix)
        cosine_sim_df = pd.DataFrame(data=cosine_sim)
        # Keep the tconst next to its similarity row, so the frame is self-describing
        cosine_sim_df.insert(0, column='tconst', value=self.df['tconst'].values)
        self.cosine_sim = cosine_sim_df

    def export_cosine_sim_to_pkl(self,
                                 pkl_path=os.path.join(
                                     DATA_PATH, 'cosine_sim/cosine_sim.pkl')):
        """
        (Re)creates the self.cosine_sim dataframe and exports it to the specified path for
        later imports. The elapsed writing time is printed
        :param pkl_path: the path of the pkl file to export the dataframe
        :return:
        """
        # Initialize the self.cosine_sim df
        self.create_cosine_sim()
        print(self.cosine_sim.head())
        print("Starting to write...")
        start_time = time.time()
        self.cosine_sim.to_pickle(pkl_path)
        print("Finished writing...")
        end_time = time.time()
        print(f"Elapsed {end_time - start_time} s")

    def import_cosine_sim_from_pkl(self,
                                   pkl_path=os.path.join(
                                       DATA_PATH, 'cosine_sim/cosine_sim.pkl'),
                                   auto_create=False):
        """
        Loads self.cosine_sim from the pkl file in pkl_path
        :param pkl_path: the path of the pkl file to import the dataframe from
        :param auto_create: when True and the file is missing, the matrix is calculated and
            exported to pkl_path first; when False a missing file is an error
        :raises FileNotFoundError: if the file is missing and auto_create is False
        :return:
        """
        if not os.path.exists(pkl_path):
            if not auto_create:  # Throw exception, we shouldn't create the file that doesn't exist
                print(f"Can't find file with path {pkl_path}, exiting")
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT), pkl_path)
            # export_cosine_sim_to_pkl calculates the matrix itself; calling
            # create_cosine_sim here as well would do the expensive calculation twice
            self.export_cosine_sim_to_pkl(pkl_path=pkl_path)
        # NOTE(security): unpickling executes arbitrary code, pkl_path must point to a trusted file
        self.cosine_sim = pd.read_pickle(pkl_path)

    def get_tconst_from_idx(self, idx: int) -> str:
        """
        Returns the tconst of the row of self.df whose index equals idx
        :param idx: int the index to look for in self.df
        :return: str the tconst found
        """
        return self.df[self.df.index == idx]["tconst"].values[0]

    def get_index_from_tconst(self, tconst) -> int:
        """
        Returns the index of the row the tconst param corresponds to in the self.cosine_sim dataframe
        Throws an Exception if the tconst value is not found
        :param tconst: str the tconst to look for in the df
        :return: int the index of the row found
        """
        if tconst not in self.cosine_sim['tconst'].values:
            raise Exception(
                f"tconst can't be found in tconst values of self.df, {tconst}")
        return self.cosine_sim[self.cosine_sim['tconst'] ==
                               tconst].index.values[0]

    def get_recommendation_from_tconst(self, tconst: str, limit=10) -> list:
        """
        Assumes the self.cosine_sim is set (imported via import_cosine_sim_from_pkl or generated via
        create_cosine_sim)
        :param tconst: str the tconst of the value we are looking for
        :param limit: int optional the number of results to be included, default is 10
        :return: sorted_tconst: list of tconst id recommendations in a sorted order, starting from most similar
        """
        tconst_idx = self.get_index_from_tconst(
            tconst)  # get the index of the movie
        movie_recommendations = enumerate(self.cosine_sim[tconst_idx])
        # sort the values by the similarity in desc order
        sorted_movie_recommendations = sorted(movie_recommendations,
                                              key=lambda x: x[1],
                                              reverse=True)
        # Map the positions back through the tconst column stored with the matrix itself, so the
        # result stays consistent even if self.df (read from the db) differs from an imported pkl
        tconst_values = self.cosine_sim['tconst'].values
        return [
            tconst_values[position]
            for position, _ in sorted_movie_recommendations[:limit]
        ]

    def get_recommendation_titles_from_tconst(self, tconst: str, limit=10):
        """
        Same as get_recommendation_from_tconst, with the results resolved via
        get_titles_from_tconst_list
        :param tconst: str the tconst of the value we are looking for
        :param limit: int optional the number of results to be included, default is 10
        :return: whatever get_titles_from_tconst_list returns for the recommended tconst ids
        """
        return get_titles_from_tconst_list(
            self.get_recommendation_from_tconst(tconst, limit))
 def test_handler_connection(self):
     """
     After connect() the handler must expose a connection object in its conn attribute
     :return:
     """
     db = DbHandler()
     db.connect()
     connection = db.conn
     assert connection is not None