Exemple #1
0
class CoactorCoactorMatrix(object):
    """
    Class to compute the coactor matrix, which holds for every pair of actors
    the number of movies they have acted in together.
    """
    def __init__(self):
        """
        Builds the data set location from the directory of this file and the
        "data_set_loc" entry of the "filePath" config section, and creates the
        data extractor that reads from it.
        """
        self.conf = ParseConfig()
        self.data_set_loc = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            self.conf.config_section_mapper("filePath").get("data_set_loc"))
        self.data_extractor = DataExtractor(self.data_set_loc)

    def fetchCoactorCoactorSimilarityMatrix(self):
        """
        Creates the coactor matrix with all the actors in the movie-actor data
        and saves it to "coactor_coactor_matrix.csv" in the working directory.

        Entry [i][j] is the number of movies shared by the i-th and j-th actor
        (actors ordered as in the returned id array); the diagonal is 0.
        :return: tuple (coactor matrix as list of lists, array of actor ids)
        """
        movie_actor_df = self.data_extractor.get_movie_actor_data()
        movie_actor_set_df = movie_actor_df.groupby(
            ['actorid'])["movieid"].apply(set).reset_index()

        # Size the matrix from the grouped frame so that its rows always line
        # up with the actor ids returned below.
        movie_sets = list(movie_actor_set_df.movieid)
        num_of_actors = len(movie_sets)
        coactor_matrix = [[0] * num_of_actors for _ in range(num_of_actors)]

        # The matrix is symmetric, so each pair is intersected only once and
        # mirrored across the diagonal.
        for row, row_movies in enumerate(movie_sets):
            for col in range(row + 1, num_of_actors):
                shared_movies = len(row_movies.intersection(movie_sets[col]))
                coactor_matrix[row][col] = shared_movies
                coactor_matrix[col][row] = shared_movies

        numpy.savetxt("coactor_coactor_matrix.csv",
                      coactor_matrix,
                      delimiter=",")
        return coactor_matrix, movie_actor_set_df.actorid.unique()
class LdaGenreActor(GenreTag):
    """
    Class to run LDA over the actors of the movies of a given genre.
    """
    def __init__(self):
        """
        Initializes the parent class and the data extractor object used to
        get data from the csv files.
        """
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def get_lda_data(self, genre):
        """
        Does LDA on movie-actor counts and outputs movies in terms of latent semantics as U
        and actor in terms of latent semantics as Vh.
        Also writes the per-movie actor lists to "movie_actor_lda.csv" and prints
        every entry of Vh.
        :param genre: genre whose movies are used as documents
        :return: returns U and Vh
        """

        # Getting movie_genre_data
        # NOTE(review): split_genres is not defined in this class; presumably
        # inherited from GenreTag - confirm.
        movie_genre_data_frame = self.data_extractor.get_mlmovies_data()
        movie_genre_data_frame = self.split_genres(movie_genre_data_frame)

        # Getting actor_movie_data
        movie_actor_data_frame = self.data_extractor.get_movie_actor_data()

        genre_actor_frame = movie_genre_data_frame.merge(movie_actor_data_frame, how="left", left_on="movieid",
                                                         right_on="movieid")
        # Copy the column selection so the new column below is added to an
        # independent frame rather than to a slice of the merged one.
        genre_actor_frame = genre_actor_frame[["movieid", "year", "genre", "actorid", "actor_movie_rank"]].copy()

        genre_actor_frame["actorid_string"] = pd.Series(
            [str(actor_id) for actor_id in genre_actor_frame.actorid],
            index=genre_actor_frame.index)

        # One "document" per movie of the genre: the list of its actor ids.
        genre_data_frame = genre_actor_frame[genre_actor_frame["genre"] == genre]
        actor_df = genre_data_frame.groupby(['movieid'])['actorid_string'].apply(list).reset_index()
        actor_df = actor_df.sort_values('movieid')
        actor_df.to_csv('movie_actor_lda.csv', index=True, encoding='utf-8')

        actor_documents = list(actor_df.iloc[:, 1])

        (U, Vh) = util.LDA(actor_documents, num_topics=4, num_features=1000)

        for latent in Vh:
            print ("\n")
            print(latent)

        # The docstring always promised these values; previously nothing was returned.
        return (U, Vh)
class SvdGenreActor(GenreTag):
    """
    Class to relate Genre and Actor. Inherits from GenreTag and defines the
    genre splitting and the tf / idf / rank weighing functions used to build
    the movie-actor matrix that is decomposed with SVD.
    """

    def __init__(self):
        """
        Initializing the data extractor object to get data from the csv files
        """
        # NOTE(review): unlike LdaGenreActor this does not call
        # super().__init__() - confirm that is intentional.
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def split_genres(self, data_frame):
        """
        This function extracts the '|'-separated genres from each row and converts
        them into independent rows
        :param data_frame: data frame with a "genres" column
        :return: data frame with multiple genres split into different rows
        """
        genre_data_frame = data_frame['genres'].str.split('|', expand=True).stack()
        genre_data_frame.name = "genre"
        # Drop the per-genre level so every genre keeps the index of its source row.
        genre_data_frame.index = genre_data_frame.index.droplevel(-1)
        genre_data_frame = genre_data_frame.reset_index()
        data_frame = data_frame.drop("genres", axis=1)
        data_frame = data_frame.reset_index()
        data_frame = genre_data_frame.merge(data_frame, how="left", on="index")
        return data_frame

    def assign_rank_weight(self, data_frame):
        """
        This function assigns a value for all the actors in a movie on a scale of 100,
        based on their rank in the movie: (max_rank - rank + 1) / max_rank * 100.
        :param data_frame: data frame with "movieid" and "actor_movie_rank" columns
        :return: dictionary of (movieid, actor_rank) to the computed rank_weight
        """
        movie_rank_weight_dict = {}
        for movieid, info_df in data_frame.groupby("movieid"):
            max_rank = info_df.actor_movie_rank.max()
            for rank in info_df.actor_movie_rank.unique():
                movie_rank_weight_dict[(movieid, rank)] = (max_rank - rank + 1) / max_rank * 100
        return movie_rank_weight_dict

    def assign_idf_weight(self, data_frame, unique_actors):
        """
        This function computes the idf weight for all actors in a data frame,
        considering each movie (row) as a document. The input frame is not modified.
        :param data_frame: data frame whose "actorid_string" column holds the
                           comma separated actor ids of each movie
        :param unique_actors: all actor id strings that may occur in data_frame
        :return: dictionary of actor id strings and idf weights; an actor that
                 occurs in no movie gets weight 0.0
        """
        idf_counter = {actorid_string: 0 for actorid_string in unique_actors}
        for actors in data_frame.actorid_string:
            # A set so that an actor is counted once per movie.
            for actorid_string in set(actors.split(',')):
                idf_counter[actorid_string] += 1
        num_of_movies = len(data_frame.index)
        return {
            actorid_string: (math.log(num_of_movies / count) if count else 0.0)
            for actorid_string, count in idf_counter.items()
        }

    def assign_tf_weight(self, actor_series):
        """
        This function computes the tf weight for all actors of a movie
        :param actor_series: iterable of actor id strings of one movie
        :return: dictionary of actor id strings and tf weights (count / total)
        """
        counter = Counter()
        for each in actor_series:
            counter[each] += 1
        total = sum(counter.values())
        return {actorid_string: count / total for actorid_string, count in counter.items()}

    def get_model_weight(self, tf_weight_dict, idf_weight_dict, rank_weight_dict, actor_df, model):
        """
        This function combines the year weight, the tf weight (tf-idf weight for any
        model other than "TF") multiplied by 100 and the rank weight of every row.
        Missing dictionary entries contribute 0.
        :param tf_weight_dict: movieid -> {actorid_string -> tf weight}
        :param idf_weight_dict: actorid_string -> idf weight
        :param rank_weight_dict: (movieid, rank) -> rank weight
        :param actor_df: data frame with year_weight, actorid_string, movieid and
                         actor_movie_rank columns; a "value" column is added in place
        :param model: "TF" to ignore the idf weights, anything else for tf-idf
        :return: data_frame with column of the combined weight
        """
        use_idf = model != "TF"
        values = []
        for ts_weight, actorid_string, movieid, rank in zip(
                actor_df.year_weight, actor_df.actorid_string, actor_df.movieid, actor_df.actor_movie_rank):
            # Default to an empty dict: a movie without tf entry must weigh 0, not crash.
            term_weight = tf_weight_dict.get(movieid, {}).get(actorid_string, 0)
            if use_idf:
                term_weight = term_weight * idf_weight_dict.get(actorid_string, 0)
            values.append(ts_weight + term_weight * 100 + rank_weight_dict.get((movieid, rank), 0))
        actor_df["value"] = pd.Series(values, index=actor_df.index)
        return actor_df

    def combine_computed_weights(self, data_frame, rank_weight_dict, model, genre):
        """
        Triggers the weighing process and sums up all the calculated weights for each actor
        of the given genre. idf weights are computed over all movies, tf weights only over
        the movies of the genre.
        :param data_frame: genre-actor data frame
        :param rank_weight_dict: (movieid, rank) -> rank weight
        :param model: "TF" or any other value for tf-idf
        :param genre: genre to restrict the result to
        :return: data frame with one row per actor, sorted by descending "total"
        """
        actor_df = data_frame.reset_index()
        # Copy the filtered rows: columns are added to this frame further down.
        genre_df = data_frame[data_frame["genre"] == genre].copy()
        unique_actors = actor_df.actorid_string.unique()
        idf_data = actor_df.groupby(['movieid'])['actorid_string'].apply(lambda x: ','.join(x)).reset_index()
        actors_per_movie = genre_df.groupby(['movieid'])['actorid_string'].apply(list)
        tf_weight_dict = {movie: self.assign_tf_weight(actors) for movie, actors in actors_per_movie.items()}
        idf_weight_dict = {}
        if model != 'TF':
            idf_weight_dict = self.assign_idf_weight(idf_data, unique_actors)
        actor_df = self.get_model_weight(tf_weight_dict, idf_weight_dict, rank_weight_dict, genre_df, model)
        actor_df["total"] = actor_df.groupby(['actorid_string'])['value'].transform('sum')
        actor_df = actor_df.drop_duplicates("actorid_string").sort_values("total", ascending=False)
        return actor_df

    def get_genre_actor_data_frame(self):
        """
        Function to merge multiple tables and get the required dataframe for tf-idf calculation
        :return: dataframe with movieid, year, genre, actorid, actor_movie_rank,
                 year_weight (scale of 10, growing with the year) and actorid_string columns
        """
        # Getting movie_genre_data
        movie_genre_data_frame = self.data_extractor.get_mlmovies_data()
        movie_genre_data_frame = self.split_genres(movie_genre_data_frame)

        # Getting actor_movie_data
        movie_actor_data_frame = self.data_extractor.get_movie_actor_data()

        genre_actor_frame = movie_genre_data_frame.merge(movie_actor_data_frame, how="left", left_on="movieid", right_on="movieid")
        genre_actor_frame = genre_actor_frame[["movieid", "year", "genre", "actorid", "actor_movie_rank"]]
        # Reset the index after sorting so that the index used for the weight
        # below is the position in year order, not the pre-sort label.
        genre_actor_frame = genre_actor_frame.sort_values("year", ascending=True).reset_index(drop=True)

        data_frame_len = len(genre_actor_frame.index)
        genre_actor_frame["year_weight"] = pd.Series(
            [(position + 1) / data_frame_len * 10 for position in genre_actor_frame.index],
            index=genre_actor_frame.index)

        genre_actor_frame["actorid_string"] = pd.Series(
            [str(actor_id) for actor_id in genre_actor_frame.actorid],
            index=genre_actor_frame.index)

        return genre_actor_frame

    def svd_genre_actor(self, genre):
        """
        Does SVD on movie-actor matrix and outputs movies in terms of latent semantics as U
        and actors in terms of latent semantics as Vh.
        Writes the matrix to "genre_actor_matrix.csv" and the first four latent
        semantics to "u_1b_svd.csv" / "vh_1b_svd.csv", and prints the latent semantics.
        :param genre: genre to analyse
        :return: returns (U frame, Vh frame, s)
        """
        genre_actor_frame = self.get_genre_actor_data_frame()
        rank_weight_dict = self.assign_rank_weight(genre_actor_frame[['movieid', 'actor_movie_rank']])
        genre_actor_frame = self.combine_computed_weights(genre_actor_frame, rank_weight_dict, "TFIDF", genre)
        temp_df = genre_actor_frame[["movieid", "actorid_string", "total"]].drop_duplicates()
        genre_actor_tfidf_df = temp_df.pivot(index='movieid', columns='actorid_string', values='total')
        genre_actor_tfidf_df = genre_actor_tfidf_df.fillna(0)

        genre_actor_tfidf_df.to_csv('genre_actor_matrix.csv', index=True, encoding='utf-8')

        # Row / column labels come straight from the pivot; there is no need to
        # read the csv file back in.
        matrix = genre_actor_tfidf_df.values
        row_headers = list(genre_actor_tfidf_df.index)
        column_headers = list(genre_actor_tfidf_df.columns)

        column_headers_names = [util.get_actor_name_for_id(int(col_head)) for col_head in column_headers]

        (U, s, Vh) = util.SVD(matrix)

        # To print latent semantics
        latents = util.get_latent_semantics(4, Vh)
        util.print_latent_semantics(latents, column_headers_names)

        u_frame = pd.DataFrame(U[:, :4], index=row_headers)
        v_frame = pd.DataFrame(Vh[:4, :], columns=column_headers)
        u_frame.to_csv('u_1b_svd.csv', index=True, encoding='utf-8')
        v_frame.to_csv('vh_1b_svd.csv', index=True, encoding='utf-8')
        return (u_frame, v_frame, s)
Exemple #4
0
class SimilarActorsFromDiffMovies(ActorActorMatrix):
    """
    Class to find actors of movies that are similar, in the space of tf-idf
    weighted tags, to a given movie.
    """

    def __init__(self):
        """
        Initializing the data extractor object to get data from the csv files
        """
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        # NOTE(review): the result of this call is discarded; presumably it is
        # run for its side effects - confirm it is still needed.
        actor_actor_matrix_obj.fetchActorActorSimilarityMatrix()

    def get_actors_of_movie(self, moviename):
        """
        Function to return the actors of a given movie
        :param moviename:
        :return: list(actorids)
        """
        actor_movie_table = self.data_extractor.get_movie_actor_data()
        movieid = util.get_movie_id(moviename)
        rows_of_movie = actor_movie_table[actor_movie_table['movieid'] == movieid]
        return rows_of_movie["actorid"].tolist()

    def get_movie_tag_matrix(self):
        """
        Function to get the movie-tag matrix holding the summed tf-idf based weight
        of each tag in each movie. The matrix is also written to "movie_tag_matrix1d.csv".
        :return: movie_tag_matrix (data frame indexed by movie name, one column per tag)
        """
        data_frame = genre_tag.get_genre_data()
        tag_df = data_frame.reset_index()
        unique_tags = tag_df.tag.unique()
        tags_by_movie = tag_df.groupby(['movieid'])['tag']
        idf_data = tags_by_movie.apply(set)
        # Group the tags as lists instead of joining / splitting on ',' so that
        # tags containing a comma stay intact.
        tf_weight_dict = {movie: genre_tag.assign_tf_weight(tags)
                          for movie, tags in tags_by_movie.apply(list).items()}
        idf_weight_dict = genre_tag.assign_idf_weight(idf_data, unique_tags)
        tag_df = genre_tag.get_model_weight(tf_weight_dict, idf_weight_dict, tag_df, 'tfidf')
        tag_df["total"] = tag_df.groupby(['movieid', 'tag'])['value'].transform('sum')
        temp_df = tag_df[["moviename", "tag", "total"]].drop_duplicates().reset_index()

        genre_tag_tfidf_df = temp_df.pivot_table('total', 'moviename', 'tag')
        genre_tag_tfidf_df = genre_tag_tfidf_df.fillna(0)
        genre_tag_tfidf_df.to_csv('movie_tag_matrix1d.csv', index=True, encoding='utf-8')
        return genre_tag_tfidf_df

    def get_movie_movie_vector(self, moviename):
        """
        Function which finds the similarity of the given movie to every other movie
        in the space of tags using tf-idf (dot product of the tag vectors).
        :param moviename:
        :return: list of (movie name, similarity) sorted by descending similarity,
                 without the given movie; None if the movie is not in the matrix
        """
        movie_tag_frame = self.get_movie_tag_matrix()
        movies = list(movie_tag_frame.index.values)

        try:
            index_movie = movies.index(moviename)
        except ValueError:
            print ("Movie Id not found.")
            return None

        # Only the row of the requested movie is needed, so multiply the matrix
        # with that single tag vector instead of building the full
        # movie x movie product.
        movie_tag_matrix = movie_tag_frame.values
        movie_row = numpy.dot(movie_tag_matrix, movie_tag_matrix[index_movie]).tolist()

        movie_movie_dict = dict(zip(movies, movie_row))
        del movie_movie_dict[moviename]
        return sorted(movie_movie_dict.items(), key=operator.itemgetter(1), reverse=True)

    def most_similar_actors(self, moviename):
        """
        Function to find similar actors from similar movies. Movies are visited in
        order of decreasing similarity until at least 10 actors are collected or the
        similarity drops to 0; actors of the given movie itself are left out.
        :param moviename:
        :return: list of distinct actor names; None if the movie is not found
        """
        movie_movie_dict = self.get_movie_movie_vector(moviename)
        if movie_movie_dict is None:
            return None

        actors = []
        for movie, val in movie_movie_dict:
            if val <= 0:
                break
            actors.extend(self.get_actors_of_movie(movie))
            if len(actors) >= 10:
                break

        actors_of_given_movie = set(self.get_actors_of_movie(moviename))

        # dict.fromkeys removes actors collected from more than one movie while
        # keeping the similarity order.
        actors_final = [actorid for actorid in dict.fromkeys(actors)
                        if actorid not in actors_of_given_movie]

        return [util.get_actor_name_for_id(actorid) for actorid in actors_final]
class ActorMovieYearTensor(object):
    """
    Class to build the year x movie x actor tensor, decompose it with CP
    decomposition and report latent semantics / partitions of its factors.
    """

    def __init__(self):
        """
        Sets up config, data extractor and util, then builds the tensor and
        its CP decomposition (second argument 5).
        """
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        # Labels of the three tensor axes, in axis order.
        self.ordered_years = []
        self.ordered_movie_names = []
        self.ordered_actor_names = []
        self.print_list = ["\n\nFor Years:", "\n\nFor Movies:", "\n\nFor Actors:"]
        self.util = Util()
        self.tensor = self.fetchActorMovieYearTensor()
        self.factors = self.util.CPDecomposition(self.tensor, 5)

    @staticmethod
    def _index_by_first_appearance(values):
        """
        Number the distinct values in order of first appearance
        :param values: iterable of hashable values
        :return: tuple (dictionary value -> position, list of distinct values in that order)
        """
        positions = {}
        ordered_values = []
        for value in values:
            if value not in positions:
                positions[value] = len(ordered_values)
                ordered_values.append(value)
        return positions, ordered_values

    def fetchActorMovieYearTensor(self):
        """
        Create actor movie year tensor. Entry [year][movie][actor] is 1 when the actor
        played in the movie released that year, 0 otherwise. Also (re)builds the ordered
        year / movie name / actor name lists that label the three axes.
        :return: tensor
        """
        movies_df = self.data_extractor.get_mlmovies_data()
        actor_df = self.data_extractor.get_movie_actor_data()

        movie_actor_df = actor_df.merge(movies_df, how="left", on="movieid")

        year_dict, ordered_years = self._index_by_first_appearance(movie_actor_df["year"])
        movieid_dict, ordered_movieids = self._index_by_first_appearance(movie_actor_df["movieid"])
        actorid_dict, ordered_actorids = self._index_by_first_appearance(movie_actor_df["actorid"])

        # Assign fresh lists so that calling this method again does not append
        # a second copy of every label.
        self.ordered_years = ordered_years
        self.ordered_movie_names = [self.util.get_movie_name_for_id(movieid)
                                    for movieid in ordered_movieids]
        self.ordered_actor_names = [self.util.get_actor_name_for_id(actorid)
                                    for actorid in ordered_actorids]

        tensor = np.zeros((len(year_dict), len(movieid_dict), len(actorid_dict)))

        for year, movieid, actorid in zip(movie_actor_df["year"],
                                          movie_actor_df["movieid"],
                                          movie_actor_df["actorid"]):
            tensor[year_dict[year], movieid_dict[movieid], actorid_dict[actorid]] = 1

        return tensor

    def print_latent_semantics(self, r):
        """
        Pretty print latent semantics of every factor matrix
        :param r: passed to util.get_latent_semantics as its first argument
        """
        for i, factor in enumerate(self.factors):
            print(self.print_list[i])
            latent_semantics = self.util.get_latent_semantics(r, factor.transpose())
            self.util.print_latent_semantics(latent_semantics, self.get_factor_names(i))

    def get_factor_names(self, i):
        """
        Obtain factor names
        :param i: 0 for years, 1 for movies, 2 for actors
        :return: factor names, None for any other value of i
        """
        if i == 0:
            return self.ordered_years
        elif i == 1:
            return self.ordered_movie_names
        elif i == 2:
            return self.ordered_actor_names
        return None

    def get_partitions(self, no_of_partitions):
        """
        Partition factor matrices
        :param no_of_partitions:
        :return: list of groupings, one per factor matrix
        """
        return [
            self.util.partition_factor_matrix(factor, no_of_partitions, self.get_factor_names(i))
            for i, factor in enumerate(self.factors)
        ]

    def print_partitioned_entities(self, no_of_partitions):
        """
        Pretty print groupings
        :param no_of_partitions:
        """
        groupings_list = self.get_partitions(no_of_partitions)
        for i, groupings in enumerate(groupings_list):
            print(self.print_list[i])
            self.util.print_partitioned_entities(groupings)
Exemple #6
0
class ActorTag(object):
    """
    Class to relate actors and tags.
    """
    def __init__(self):
        """
        Initializing the data extractor object to get data from the csv files
        """
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def assign_idf_weight(self, data_series, unique_tags):
        """
        This function computes the idf weight (log base 2) for all tags,
        considering each entry of the series as a document
        :param data_series: series holding the collection of tags of each movie
        :param unique_tags: all tags that may occur in data_series
        :return: dictionary of tags and idf weights; a tag that occurs in no
                 document gets weight 0.0
        """
        idf_counter = {tag: 0 for tag in unique_tags}
        for tag_list in data_series:
            for tag in tag_list:
                idf_counter[tag] += 1
        num_of_documents = len(data_series.index)
        return {
            tag: (math.log(num_of_documents / count, 2) if count else 0.0)
            for tag, count in idf_counter.items()
        }

    def assign_tf_weight(self, tag_series):
        """
        This function computes the tf weight for all tags for a movie
        :param tag_series: iterable of the tags of one movie
        :return: dictionary of tags and tf weights (count / total)
        """
        # iter() makes Counter count the elements whatever the container type is.
        counter = Counter(iter(tag_series))
        total = sum(counter.values())
        return {tag: count / total for tag, count in counter.items()}

    def assign_rank_weight(self, data_frame):
        """
        This function assigns a value for all the actors in a movie on a scale of 100,
        based on their rank in the movie: (max_rank - rank + 1) / max_rank * 100.
        :param data_frame: data frame with "movieid" and "actor_movie_rank" columns
        :return: dictionary of (movieid, actor_rank) to the computed rank_weight
        """
        movie_rank_weight_dict = {}
        for movieid, info_df in data_frame.groupby("movieid"):
            max_rank = info_df.actor_movie_rank.max()
            for rank in info_df.actor_movie_rank.unique():
                movie_rank_weight_dict[(movieid, rank)] = (
                    max_rank - rank + 1) / max_rank * 100
        return movie_rank_weight_dict

    def get_model_weight(self, tf_weight_dict, idf_weight_dict,
                         rank_weight_dict, tag_df, model):
        """
        This function computes the combined weight of every row, based on the model:
        "TF" gives tf * 100 + rank weight, any other model gives
        timestamp weight + tf * idf * 100 + rank weight.
        Missing dictionary entries contribute 0.
        :param tf_weight_dict: movieid -> {tag -> tf weight}
        :param idf_weight_dict: tag -> idf weight
        :param rank_weight_dict: (movieid, rank) -> rank weight
        :param tag_df: data frame with timestamp_weight, tag, movieid and
                       actor_movie_rank columns; a "value" column is added in place
        :param model: "TF" or any other value for tf-idf
        :return: data_frame with column of the combined weight
        """
        values = []
        for ts_weight, tag, movieid, rank in zip(
                tag_df.timestamp_weight, tag_df.tag, tag_df.movieid,
                tag_df.actor_movie_rank):
            # Default to an empty dict: a movie without tf entry must weigh 0, not crash.
            tf_weight = tf_weight_dict.get(movieid, {}).get(tag, 0)
            rank_weight = rank_weight_dict.get((movieid, rank), 0)
            if model == "TF":
                values.append((tf_weight * 100) + rank_weight)
            else:
                values.append(ts_weight +
                              (tf_weight * idf_weight_dict.get(tag, 0) * 100) +
                              rank_weight)
        tag_df["value"] = pd.Series(values, index=tag_df.index)
        return tag_df

    def combine_computed_weights(self, data_frame, rank_weight_dict,
                                 idf_weight_dict, model):
        """
        Triggers the weighing process and sums up all the calculated weights for each tag
        :param data_frame: rows to weigh (movieid, tag, timestamp_weight, actor_movie_rank)
        :param rank_weight_dict: (movieid, rank) -> rank weight
        :param idf_weight_dict: tag -> idf weight
        :param model: "TF" or any other value for tf-idf
        :return: dictionary of tags and weights, in order of descending weight
        """
        tag_df = data_frame.reset_index()
        # Group the tags as lists instead of joining / splitting on ',' so that
        # tags containing a comma stay intact.
        tags_by_movie = tag_df.groupby(['movieid'])['tag'].apply(list)
        tf_weight_dict = {
            movie: self.assign_tf_weight(tags)
            for movie, tags in tags_by_movie.items()
        }
        tag_df = self.get_model_weight(tf_weight_dict, idf_weight_dict,
                                       rank_weight_dict, tag_df, model)
        tag_df["total"] = tag_df.groupby(['tag'])['value'].transform('sum')
        tag_df = tag_df.drop_duplicates("tag").sort_values("total",
                                                           ascending=False)
        return dict(zip(tag_df.tag, tag_df.total))

    def merge_movie_actor_and_tag(self, actorid, model):
        """
        Merges data from different csv files necessary to compute the tag weights for an actor,
        assigns weights to timestamp (scale of 10, growing with the timestamp).
        Rank and idf weights are computed over all actors, the final weights only
        over the rows of the given actor.
        :param actorid:
        :param model: 'TFIDF' to use idf weights, anything else to leave them empty
        :return: returns a dictionary of tags and weights for the actor.
        """
        mov_act = self.data_extractor.get_movie_actor_data()
        ml_tag = self.data_extractor.get_mltags_data()
        genome_tag = self.data_extractor.get_genome_tags_data()
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        actor_movie_info = mov_act.merge(actor_info,
                                         how="left",
                                         left_on="actorid",
                                         right_on="id")
        tag_data_frame = ml_tag.merge(genome_tag,
                                      how="left",
                                      left_on="tagid",
                                      right_on="tagId")
        merged_data_frame = actor_movie_info.merge(tag_data_frame,
                                                   how="left",
                                                   on="movieid")
        # Rows without timestamp are movies that were never tagged.
        merged_data_frame = merged_data_frame[
            merged_data_frame['timestamp'].notnull()]
        merged_data_frame = merged_data_frame.drop(["userid"], axis=1)
        rank_weight_dict = self.assign_rank_weight(
            merged_data_frame[['movieid', 'actor_movie_rank']])
        # After the reset the index is the position in timestamp order.
        merged_data_frame = merged_data_frame.sort_values(
            "timestamp", ascending=True).reset_index()
        data_frame_len = len(merged_data_frame.index)
        merged_data_frame["timestamp_weight"] = pd.Series(
            [(index + 1) / data_frame_len * 10
             for index in merged_data_frame.index],
            index=merged_data_frame.index)

        idf_weight_dict = {}
        if model == 'TFIDF':
            idf_weight_dict = self.assign_idf_weight(
                merged_data_frame.groupby('movieid')['tag'].apply(set),
                merged_data_frame.tag.unique())

        return self.combine_computed_weights(
            merged_data_frame[merged_data_frame['actorid'] == actorid],
            rank_weight_dict, idf_weight_dict, model)
Exemple #7
0
class LdaActorTag(object):
    """
    Class to find actors related to a given actor using LDA over the tags of
    the movies the actors played in.
    """
    def __init__(self):
        """
        Initializing the data extractor object to get data from the csv files
        and the util object providing the LDA helpers
        """
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.util = Util()

    def get_related_actors_lda(self, actorid):
        """
        Function to find similarity between actors using actor-actor similarity vector in tag space using lda.
        The actor-actor matrix is saved to "actor_actor_matrix_with_svd_latent_values.csv"
        and the result is printed as well as returned.
        :param actorid:
        :return: list of up to 10 (actor name, similarity) tuples sorted by descending
                 similarity; None if the actor id is not found
        """
        mov_act = self.data_extractor.get_movie_actor_data()
        ml_tag = self.data_extractor.get_mltags_data()
        genome_tag = self.data_extractor.get_genome_tags_data()
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        actor_movie_info = mov_act.merge(actor_info,
                                         how="left",
                                         left_on="actorid",
                                         right_on="id")
        tag_data_frame = ml_tag.merge(genome_tag,
                                      how="left",
                                      left_on="tagid",
                                      right_on="tagId")
        merged_data_frame = tag_data_frame.merge(actor_movie_info,
                                                 how="left",
                                                 on="movieid")

        merged_data_frame = merged_data_frame.fillna('')
        # One "document" per actor: the list of tags of the actor's movies.
        tag_df = merged_data_frame.groupby(
            ['actorid'])['tag'].apply(list).reset_index()

        tag_df = tag_df.sort_values('actorid')
        actorid_list = tag_df.actorid.tolist()
        tag_documents = list(tag_df.iloc[:, 1])

        (U, Vh) = self.util.LDA(tag_documents, num_topics=5, num_features=100000)

        actor_topic_matrix = self.util.get_doc_topic_matrix(
            U, num_docs=len(actorid_list), num_topics=5)
        topic_actor_matrix = actor_topic_matrix.transpose()
        actor_actor_matrix = numpy.dot(actor_topic_matrix, topic_actor_matrix)

        # The file is kept as an output; the in-memory matrix is used below
        # instead of reading the file back in.
        numpy.savetxt("actor_actor_matrix_with_svd_latent_values.csv",
                      actor_actor_matrix,
                      delimiter=",")

        # NOTE(review): rows of the matrix follow actorid_list, while the lookup
        # below uses util.get_sorted_actor_ids(); presumably both list the same
        # actors in the same order - confirm.
        actorids = self.util.get_sorted_actor_ids()

        index_actor = None
        for position, candidate_id in enumerate(actorids):
            if candidate_id == actorid:
                index_actor = position
                break

        if index_actor is None:
            print("Actor Id not found.")
            return None

        actor_names = [self.util.get_actor_name_for_id(int(actor_id))
                       for actor_id in actorids]

        actor_row = actor_actor_matrix[index_actor].tolist()
        actor_actor_dict = dict(zip(actor_names, actor_row))
        del actor_actor_dict[self.util.get_actor_name_for_id(int(actorid))]

        most_similar = sorted(actor_actor_dict.items(),
                              key=operator.itemgetter(1),
                              reverse=True)[0:10]
        print(most_similar)
        return most_similar