def __init__(self):
     """Load the genre/keyword and actor query results into dataframes.

     Runs two SQL files through a fresh ``DbHandler``, wraps each result in a
     dataframe (printing its first rows), and prepares an empty ``soup_df``
     with ``tconst`` and ``soup`` columns.
     """
     db = DbHandler()
     keywords_sql = os.path.join(DATA_PATH, 'sql/select_genres_keywords.sql')
     actors_sql = os.path.join(DATA_PATH, 'sql/select_actors.sql')

     # Genres and keywords per title.
     keyword_rows = db.exec_select_sql_from_file(keywords_sql)
     keyword_columns = ['tconst', 'genres', 'keywords']
     self.keywords_df = pd.DataFrame(keyword_rows, columns=keyword_columns)
     print(self.keywords_df.head())

     # Actor names per title.
     actor_rows = db.exec_select_sql_from_file(actors_sql)
     actor_columns = ['tconst', 'PrimaryName']
     self.actors_df = pd.DataFrame(actor_rows, columns=actor_columns)
     print(self.actors_df.head())

     # Starts empty; nothing in this method fills it.
     self.soup_df = pd.DataFrame(columns=['tconst', 'soup'])
 def test_execute_select_from_sql(self):
     """Smoke-test running the actors SELECT file through ``DbHandler``.

     Connects, executes ``sql/select_actors.sql`` and prints the ``tconst``
     of the first two rows. Any error raised while querying or reading the
     rows is reported as a test failure.

     :raises AssertionError: if the query or the row access raises
     """
     handler = DbHandler()
     handler.connect()
     try:
         results = handler.exec_select_sql_from_file(
             os.path.join(DATA_PATH, "sql/select_actors.sql"))
         print([r['tconst'] for r in results][:2])
     except Exception as e:
         # Raise explicitly instead of `assert False`: assert statements are
         # removed under `python -O`, which would let a failure pass silently.
         # Chaining keeps the original traceback visible in the report.
         raise AssertionError(e) from e
Beispiel #3
0
class Recommender:
    """Content-based recommender using cosine similarity between "soup" strings.

    On construction the ``tconst``/``soup`` rows are loaded through
    ``DbHandler`` into ``self.df``. The similarity matrix lives in
    ``self.cosine_sim`` and stays ``None`` until it is computed with
    ``create_cosine_sim`` or loaded with ``import_cosine_sim_from_pkl``.
    """

    def __init__(self):
        """Connect to the database and load the soup rows into ``self.df``."""
        self.dbhandler = DbHandler()
        self.dbhandler.connect()
        data = self.dbhandler.exec_select_sql_from_file(
            os.path.join(DATA_PATH, "sql/select_soup.sql"))
        self.df = pd.DataFrame(data=data, columns=['tconst', 'soup'])
        # Dataframe with a leading 'tconst' column followed by one similarity
        # column per row; None until created or imported.
        self.cosine_sim = None
        # Not read anywhere in this class; kept so existing attribute access
        # from outside keeps working.
        self.cosine_sim_csv_path = os.path.join(DATA_PATH,
                                                'cosine_sim/cosine_sim.csv')

    def create_cosine_sim(self) -> None:
        """
        Creates the cosine_sim matrix from the self.df dataframe and stores it
        in self.cosine_sim (first column 'tconst', then one column per row).
        This is the method that does all the raw calculation, so it only needs
        to run when the dataset changes or no exported pickle is available.
        Otherwise it is cheaper to load a previous export with
        import_cosine_sim_from_pkl
        :return:
        """
        cv = CountVectorizer()
        count_matrix = cv.fit_transform(self.df['soup'])
        cosine_sim = cosine_similarity(count_matrix)
        tconst_array = self.df['tconst'].values
        cosine_sim_df = pd.DataFrame(data=cosine_sim)
        cosine_sim_df.insert(0, column='tconst', value=tconst_array)
        self.cosine_sim = cosine_sim_df

    def export_cosine_sim_to_pkl(self,
                                 pkl_path=os.path.join(
                                     DATA_PATH, 'cosine_sim/cosine_sim.pkl')):
        """
        Recomputes self.cosine_sim from self.df and exports it to the specified
        path for later imports. The parent directory is created when missing.
        :param pkl_path: the path of the pkl file to export the dataframe
        :return:
        """
        # Always rebuild, so the exported file reflects the currently loaded data
        self.create_cosine_sim()
        print(self.cosine_sim.head())
        # A first export usually targets a directory that does not exist yet.
        # Only touch the filesystem for real paths (not file-like objects).
        if isinstance(pkl_path, (str, os.PathLike)):
            target_dir = os.path.dirname(pkl_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
        print("Starting to write...")
        start_time = time.perf_counter()
        self.cosine_sim.to_pickle(pkl_path)
        elapsed = time.perf_counter() - start_time
        print("Finished writing...")
        print(f"Elapsed {elapsed} s")

    def import_cosine_sim_from_pkl(self,
                                   pkl_path=os.path.join(
                                       DATA_PATH, 'cosine_sim/cosine_sim.pkl'),
                                   auto_create=False):
        """
        Loads self.cosine_sim from a pickle written by export_cosine_sim_to_pkl
        :param pkl_path: the path of the pkl file to read
        :param auto_create: when the file is missing, compute the matrix and
            export it to pkl_path instead of raising
        :return:
        :raises FileNotFoundError: if the file is missing and auto_create is False
        """
        if os.path.exists(pkl_path):
            # NOTE: unpickling can execute arbitrary code; only point this at
            # files produced by export_cosine_sim_to_pkl or otherwise trusted.
            self.cosine_sim = pd.read_pickle(pkl_path)
            return
        if not auto_create:  # Throw exception, we shouldn't create the file that doesn't exist
            print(f"Can't find file with path {pkl_path}, exiting")
            raise FileNotFoundError(errno.ENOENT,
                                    os.strerror(errno.ENOENT), pkl_path)
        # The export computes the matrix, stores it in self.cosine_sim and
        # writes the file, so neither a separate create_cosine_sim() call nor
        # reading the fresh file back is needed.
        self.export_cosine_sim_to_pkl(pkl_path=pkl_path)

    def get_tconst_from_idx(self, idx: int) -> str:
        """
        Returns the tconst stored in the self.df row whose index equals idx
        :param idx: int the index label of the row in self.df
        :return: str the tconst of that row
        :raises IndexError: if no row of self.df has that index
        """
        return self.df[self.df.index == idx]["tconst"].values[0]

    def get_index_from_tconst(self, tconst) -> int:
        """
        Returns the index of the first row the tconst param corresponds to in
        the self.cosine_sim dataframe (which must already be set)
        Throws an Exception if the tconst value is not found
        :param tconst: str the tconst to look for in the 'tconst' column
        :return: int the index of the row found
        """
        tconst_column = self.cosine_sim['tconst']
        # One scan serves both the existence check and the lookup
        matches = tconst_column.index[(tconst_column == tconst).values]
        if len(matches) == 0:
            raise Exception(
                f"tconst can't be found in tconst values of self.df, {tconst}")
        return matches.values[0]

    def get_recommendation_from_tconst(self, tconst: str, limit=10) -> list:
        """
        Assumes the self.cosine_sim is set (imported via import_cosine_sim_from_pkl or generated via
        create_cosine_sim)
        The queried title is not filtered out, so it is typically part of the
        result, since a title is maximally similar to itself.
        :param tconst: str the tconst of the value we are looking for
        :param limit: int optional the number of results to be included, default is 10
        :return: sorted_tconst: list of tconst id recommendations in a sorted order, starting from most similar
        """
        tconst_idx = self.get_index_from_tconst(
            tconst)  # get the index of the movie
        # A matrix built by create_cosine_sim is symmetric, so the column
        # labelled with the row index holds this title's score against every row
        scores = self.cosine_sim[tconst_idx]
        # sort the (position, score) pairs by the similarity in desc order;
        # the sort is stable, so ties keep their original row order
        ranked = sorted(enumerate(scores),
                        key=lambda pair: pair[1],
                        reverse=True)
        # Resolve ids from the matrix's own 'tconst' column so scores and ids
        # always describe the same rows, even if self.df was loaded from a
        # different dataset than the imported matrix.
        tconst_values = self.cosine_sim['tconst'].values
        return [tconst_values[position] for position, _ in ranked[:limit]]

    def get_recommendation_titles_from_tconst(self, tconst: str, limit=10):
        """
        Gets the recommended tconst ids for tconst and passes them to
        get_titles_from_tconst_list (defined outside this class)
        :param tconst: str the tconst of the value we are looking for
        :param limit: int optional the number of results to be included, default is 10
        :return: whatever get_titles_from_tconst_list returns for those ids
        """
        return get_titles_from_tconst_list(
            self.get_recommendation_from_tconst(tconst, limit))