class LdaGenreActor(GenreTag):
    """
    Class to relate genres and actors through LDA latent semantics.
    Inherits GenreTag to reuse the common genre-splitting helpers.
    """

    def __init__(self):
        """
        Initialize the data extractor object to get data from the csv files.
        """
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def get_lda_data(self, genre, num_topics=4, num_features=1000):
        """
        Does LDA on movie-actor counts and outputs movies in terms of latent semantics as U
        and actor in terms of latent semantics as Vh.
        :param genre: genre string used to filter the movie-actor rows
        :param num_topics: number of latent topics for LDA (default 4, as before)
        :param num_features: number of features reported per topic (default 1000, as before)
        :return: None; prints the latent semantics and writes movie_actor_lda.csv
        """
        # Getting movie_genre_data, one row per (movie, genre) pair.
        movie_genre_data_frame = self.data_extractor.get_mlmovies_data()
        movie_genre_data_frame = self.split_genres(movie_genre_data_frame)
        # Getting actor_movie_data
        movie_actor_data_frame = self.data_extractor.get_movie_actor_data()
        genre_actor_frame = movie_genre_data_frame.merge(movie_actor_data_frame, how="left",
                                                         left_on="movieid", right_on="movieid")
        genre_actor_frame = genre_actor_frame[["movieid", "year", "genre", "actorid", "actor_movie_rank"]]
        # LDA consumes token lists, so represent each actor id as a string token.
        # (renamed loop variable: the original shadowed the builtin `id`)
        genre_actor_frame["actorid_string"] = pd.Series(
            [str(actor_id) for actor_id in genre_actor_frame.actorid],
            index=genre_actor_frame.index)
        genre_data_frame = genre_actor_frame[genre_actor_frame["genre"] == genre]
        # One "document" per movie: the list of its actor-id tokens.
        actor_df = genre_data_frame.groupby(['movieid'])['actorid_string'].apply(list).reset_index()
        actor_df = actor_df.sort_values('movieid')
        actor_df.to_csv('movie_actor_lda.csv', index=True, encoding='utf-8')
        actor_df = list(actor_df.iloc[:, 1])
        (U, Vh) = util.LDA(actor_df, num_topics=num_topics, num_features=num_features)
        for latent in Vh:
            print("\n")
            print(latent)
class Util(object):
    """
    Class containing all the common utilities used across the entire code base.
    """

    def __init__(self):
        self.conf = ParseConfig()
        # Data set location is resolved relative to this source file.
        self.data_set_loc = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            self.conf.config_section_mapper("filePath").get("data_set_loc"))
        self.data_extractor = DataExtractor(self.data_set_loc)
        # Frequently used data frames, loaded once and cached on the instance.
        self.mlratings = self.data_extractor.get_mlratings_data()
        self.mlmovies = self.data_extractor.get_mlmovies_data()
        self.imdb_actor_info = self.data_extractor.get_imdb_actor_info_data()
        self.genome_tags = self.data_extractor.get_genome_tags_data()

    def get_sorted_actor_ids(self):
        """
        Obtain sorted actor ids
        :return: sorted series of actor ids
        """
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        return actor_info.id.sort_values()

    def get_movie_id(self, movie):
        """
        Obtain movie ID for the movie name passed as input
        :param movie: movie name
        :return: movie id (first match)
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['moviename'] == movie]
        movie_id = movie_data['movieid'].unique()
        return movie_id[0]

    def get_average_ratings_for_movie(self, movie_id):
        """
        Obtain average rating for movie
        :param movie_id:
        :return: average movie rating; 0.0 when the movie has no ratings
        """
        all_ratings = self.mlratings
        movie_ratings = all_ratings[all_ratings['movieid'] == movie_id]
        ratings_count = len(movie_ratings.index)
        # Robustness fix: the original manual loop divided by a zero count
        # (ZeroDivisionError) for movies without any ratings.
        if ratings_count == 0:
            return 0.0
        return float(movie_ratings['rating'].sum()) / float(ratings_count)

    def get_actor_name_for_id(self, actor_id):
        """
        actor name for id
        :param actor_id:
        :return: actor name for the actor id
        """
        actor_data = self.imdb_actor_info[self.imdb_actor_info['id'] == actor_id]
        name = actor_data['name'].unique()
        return name[0]

    def get_movie_name_for_id(self, movieid):
        """
        movie name for movie id
        :param movieid:
        :return: movie name
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['movieid'] == movieid]
        movie_name = movie_data['moviename'].unique()
        return movie_name[0]

    def get_tag_name_for_id(self, tag_id):
        """
        tag name for tag id
        :param tag_id:
        :return: tag name
        """
        tag_data = self.genome_tags[self.genome_tags['tagId'] == tag_id]
        name = tag_data['tag'].unique()
        return name[0]

    def partition_factor_matrix(self, matrix, no_of_partitions, entity_names):
        """
        Function to partition the factor matrix into groups as per 2-norm distance
        :param matrix:
        :param no_of_partitions:
        :param entity_names:
        :return: dictionary containing the groups
        """
        # 2-norm of each row, keyed by the corresponding entity name.
        entity_dict = {}
        for i in range(0, len(matrix)):
            entity_dict[entity_names[i]] = math.sqrt(
                sum(abs(latent_semantic) ** 2 for latent_semantic in matrix[i]))
        max_length = float(max(entity_dict.values()))
        min_length = float(min(entity_dict.values()))
        length_of_group = (float(max_length) - float(min_length)) / float(no_of_partitions)
        # Pre-create every group so empty groups still appear in the output.
        groups = {}
        for i in range(0, no_of_partitions):
            groups["Group " + str(i + 1) + " ( " + str(min_length + float(i * length_of_group)) + " , " + str(
                min_length + float((i + 1) * length_of_group)) + " )"] = []
        for key in entity_dict.keys():
            entity_length = entity_dict[key]
            # Robustness fix: when every vector has the same length the group
            # width is 0 and the original division raised ZeroDivisionError.
            if length_of_group == 0:
                group_no = 1
            else:
                group_no = math.ceil(float(entity_length - min_length) / float(length_of_group))
            if group_no == 0:
                group_no = 1
            groups["Group " + str(group_no) + " ( " + str(
                min_length + float((group_no - 1) * length_of_group)) + " , " + str(
                min_length + float(group_no * length_of_group)) + " )"].append(key)
        return groups

    def get_latent_semantics(self, r, matrix):
        """
        Function to obtain the latent semantics for the factor matrix
        :param r:
        :param matrix:
        :return: top 'r' latent semantics (fewer if the matrix has fewer rows)
        """
        # Idiom fix: a slice replaces the manual append-until-r loop.
        return list(matrix[:r])

    def print_partitioned_entities(self, groupings):
        """
        Pretty print groupings
        :param groupings:
        """
        for key in groupings.keys():
            print(key)
            if len(groupings[key]) == 0:
                print("NO ELEMENTS IN THIS GROUP\n")
                continue
            for entity in groupings[key]:
                print(entity, end="|")
            print("\n")

    def print_latent_semantics(self, latent_semantics, entity_names_list):
        """
        Pretty print latent semantics
        :param latent_semantics:
        :param entity_names_list:
        """
        for latent_semantic in latent_semantics:
            print("Latent Semantic:")
            dict1 = {name: float(value)
                     for name, value in zip(entity_names_list, latent_semantic)}
            for s in sorted(dict1, key=dict1.get, reverse=True):  # value-based sorting
                print(str(s) + "*(" + str(dict1[s]) + ")", end="")
                print(" + ", end="")
            print("\n")

    def CPDecomposition(self, tensor, rank):
        """
        Perform CP Decomposition
        :param tensor:
        :param rank:
        :return: factor matrices obtained after decomposition
        """
        factors = decomp.parafac(tensor, rank)
        return factors

    def SVD(self, matrix):
        """
        Perform SVD
        :param matrix:
        :return: factor matrices and the core matrix
        """
        U, s, Vh = linalg.svd(matrix, full_matrices=False)
        return (U, s, Vh)

    def PCA(self, matrix):
        """
        Perform PCA
        :param matrix:
        :return: factor matrices and the core matrix
        """
        # Computing covariance matrix, then SVD of the covariance.
        cov_df = numpy.cov(matrix, rowvar=False)
        U, s, Vh = linalg.svd(cov_df)
        return (U, s, Vh)

    def LDA(self, input_compound_list, num_topics, num_features):
        """
        Perform LDA
        :param input_compound_list: list of token lists (one per document)
        :param num_topics:
        :param num_features:
        :return: object-topic distribution and topics
        """
        # turn our tokenized documents into an id <-> term dictionary
        dictionary = corpora.Dictionary(input_compound_list)
        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in input_compound_list]
        # generate LDA model
        lda = gensim.models.ldamodel.LdaModel(corpus, num_topics, id2word=dictionary, passes=20)
        latent_semantics = lda.print_topics(num_topics, num_features)
        corpus = lda[corpus]
        return corpus, latent_semantics

    def get_doc_topic_matrix(self, u, num_docs, num_topics):
        """
        Reconstructing data
        :param u: per-document list of (topic_no, probability) pairs
        :param num_docs:
        :param num_topics:
        :return: reconstructed dense document-topic matrix
        """
        u_matrix = numpy.zeros(shape=(num_docs, num_topics))
        for i in range(0, len(u)):
            doc = u[i]
            for j in range(0, len(doc)):
                (topic_no, prob) = doc[j]
                u_matrix[i, topic_no] = prob
        return u_matrix
class SvdGenreActor(GenreTag):
    """
    Class to relate Genre and Actor, inherits the GenreTag to use the common
    weighing functions.
    """

    def __init__(self):
        """
        Initializing the data extractor object to get data from the csv files
        """
        # Consistency fix: LdaGenreActor (same base class) initializes the
        # parent; this class previously skipped super().__init__().
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def split_genres(self, data_frame):
        """
        This function extracts genres from each row and converts them into independent rows
        :param data_frame:
        :return: data frame with multiple genres split into different rows
        """
        genre_data_frame = data_frame['genres'].str.split('|', expand=True).stack()
        genre_data_frame.name = "genre"
        genre_data_frame.index = genre_data_frame.index.droplevel(-1)
        genre_data_frame = genre_data_frame.reset_index()
        data_frame = data_frame.drop("genres", axis=1)
        data_frame = data_frame.reset_index()
        data_frame = genre_data_frame.merge(data_frame, how="left", on="index")
        return data_frame

    def assign_rank_weight(self, data_frame):
        """
        This function assigns a value for all the actors in a movie on a scale of 100,
        based on their rank in the movie.
        :param data_frame: frame with movieid and actor_movie_rank columns
        :return: dictionary of (movieid, actor_rank) to the computed rank_weight
        """
        groupby_movies = data_frame.groupby("movieid")
        movie_rank_weight_dict = {}
        for movieid, info_df in groupby_movies:
            max_rank = info_df.actor_movie_rank.max()
            # Top-ranked actor gets 100, lowest gets 100/max_rank.
            for rank in info_df.actor_movie_rank.unique():
                movie_rank_weight_dict[(movieid, rank)] = (max_rank - rank + 1) / max_rank * 100
        return movie_rank_weight_dict

    def assign_idf_weight(self, data_frame, unique_actors):
        """
        This function computes the idf weight for all actors in a data frame,
        considering each movie as a document
        :param data_frame:
        :param unique_actors:
        :return: dictionary of actor ids and idf weights
        """
        idf_counter = {actorid_string: 0 for actorid_string in unique_actors}
        # Each row holds a comma-joined actor list; de-duplicate per movie.
        data_frame.actorid_string = pd.Series(
            [set(actors.split(',')) for actors in data_frame.actorid_string],
            index=data_frame.index)
        for actor_list in data_frame.actorid_string:
            for actorid_string in actor_list:
                idf_counter[actorid_string] += 1
        for actorid_string, count in list(idf_counter.items()):
            idf_counter[actorid_string] = math.log(len(data_frame.index) / count)
        return idf_counter

    def assign_tf_weight(self, actor_series):
        """
        This function computes the tf weight for all actors of a movie
        :param actor_series:
        :return: dictionary of actor ids and tf weights
        """
        counter = Counter()
        for each in actor_series:
            counter[each] += 1
        total = sum(counter.values())
        for each in counter:
            counter[each] = (counter[each] / total)
        return dict(counter)

    def get_model_weight(self, tf_weight_dict, idf_weight_dict, rank_weight_dict, actor_df, model):
        """
        This function combines tf_weight on a scale of 100, idf_weight on a scale of 100,
        and timestamp_weight on a scale of 10, based on the model.
        :param tf_weight_dict:
        :param idf_weight_dict:
        :param rank_weight_dict:
        :param actor_df:
        :param model: "TF" or anything else for TF-IDF
        :return: data_frame with column of the combined weight
        """
        # Bug fix (both branches): the fallback for a movie missing from
        # tf_weight_dict was the int 0, so `.get(...)` on it raised
        # AttributeError; an empty dict keeps the lookup valid.
        if model == "TF":
            actor_df["value"] = pd.Series(
                [(ts_weight + (tf_weight_dict.get(movieid, {}).get(actorid_string, 0) * 100) +
                  rank_weight_dict.get((movieid, rank), 0))
                 for index, ts_weight, actorid_string, movieid, rank in
                 zip(actor_df.index, actor_df.year_weight, actor_df.actorid_string,
                     actor_df.movieid, actor_df.actor_movie_rank)],
                index=actor_df.index)
        else:
            actor_df["value"] = pd.Series(
                [(ts_weight + (tf_weight_dict.get(movieid, {}).get(actorid_string, 0) *
                               (idf_weight_dict.get(actorid_string, 0)) * 100) +
                  rank_weight_dict.get((movieid, rank), 0))
                 for index, ts_weight, actorid_string, movieid, rank in
                 zip(actor_df.index, actor_df.year_weight, actor_df.actorid_string,
                     actor_df.movieid, actor_df.actor_movie_rank)],
                index=actor_df.index)
        return actor_df

    def combine_computed_weights(self, data_frame, rank_weight_dict, model, genre):
        """
        Triggers the weighing process and sums up all the calculated weights for each actor
        :param data_frame:
        :param rank_weight_dict:
        :param model:
        :param genre:
        :return: data frame of actors with their total weights, sorted descending
        """
        actor_df = data_frame.reset_index()
        temp_df = data_frame[data_frame["genre"] == genre]
        unique_actors = actor_df.actorid_string.unique()
        # IDF is computed over ALL movies; TF only over the genre subset.
        idf_data = actor_df.groupby(['movieid'])['actorid_string'].apply(lambda x: ','.join(x)).reset_index()
        tf_df = temp_df.groupby(['movieid'])['actorid_string'].apply(lambda x: ','.join(x)).reset_index()
        movie_actor_dict = dict(zip(tf_df.movieid, tf_df.actorid_string))
        tf_weight_dict = {movie: self.assign_tf_weight(actorid_string.split(','))
                          for movie, actorid_string in list(movie_actor_dict.items())}
        idf_weight_dict = {}
        if model != 'TF':
            idf_weight_dict = self.assign_idf_weight(idf_data, unique_actors)
        actor_df = self.get_model_weight(tf_weight_dict, idf_weight_dict, rank_weight_dict, temp_df, model)
        actor_df["total"] = actor_df.groupby(['actorid_string'])['value'].transform('sum')
        actor_df = actor_df.drop_duplicates("actorid_string").sort_values("total", ascending=False)
        return actor_df

    def get_genre_actor_data_frame(self):
        """
        Function to merge multiple tables and get the required dataframe for tf-idf calculation
        :return: dataframe
        """
        # Getting movie_genre_data
        movie_genre_data_frame = self.data_extractor.get_mlmovies_data()
        movie_genre_data_frame = self.split_genres(movie_genre_data_frame)
        # Getting actor_movie_data
        movie_actor_data_frame = self.data_extractor.get_movie_actor_data()
        genre_actor_frame = movie_genre_data_frame.merge(movie_actor_data_frame, how="left",
                                                         left_on="movieid", right_on="movieid")
        genre_actor_frame = genre_actor_frame[["movieid", "year", "genre", "actorid", "actor_movie_rank"]]
        # Newer movies get a larger year_weight (linear on a 0-10 scale).
        genre_actor_frame = genre_actor_frame.sort_values("year", ascending=True)
        data_frame_len = len(genre_actor_frame.index)
        genre_actor_frame["year_weight"] = pd.Series(
            [(index + 1) / data_frame_len * 10 for index in genre_actor_frame.index],
            index=genre_actor_frame.index)
        genre_actor_frame["actorid_string"] = pd.Series(
            [str(actor_id) for actor_id in genre_actor_frame.actorid],
            index=genre_actor_frame.index)
        return genre_actor_frame

    def svd_genre_actor(self, genre):
        """
        Does SVD on movie-actor matrix and outputs movies in terms of latent semantics as U
        and actors in terms of latent semantics as Vh
        :param genre:
        :return: returns U and Vh data frames and the singular values s
        """
        genre_actor_frame = self.get_genre_actor_data_frame()
        rank_weight_dict = self.assign_rank_weight(genre_actor_frame[['movieid', 'actor_movie_rank']])
        genre_actor_frame = self.combine_computed_weights(genre_actor_frame, rank_weight_dict, "TFIDF", genre)
        temp_df = genre_actor_frame[["movieid", "actorid_string", "total"]].drop_duplicates()
        genre_actor_tfidf_df = temp_df.pivot(index='movieid', columns='actorid_string', values='total')
        genre_actor_tfidf_df = genre_actor_tfidf_df.fillna(0)
        genre_actor_tfidf_df.to_csv('genre_actor_matrix.csv', index=True, encoding='utf-8')
        # Re-read the csv to recover movieid as a plain column for row headers.
        df = pd.DataFrame(pd.read_csv('genre_actor_matrix.csv'))
        df1 = genre_actor_tfidf_df.values[:, :]
        row_headers = list(df["movieid"])
        column_headers = list(df)
        del column_headers[0]
        column_headers_names = []
        for col_head in column_headers:
            column_headers_names.append(util.get_actor_name_for_id(int(col_head)))
        (U, s, Vh) = util.SVD(df1)
        # To print latent semantics
        latents = util.get_latent_semantics(4, Vh)
        util.print_latent_semantics(latents, column_headers_names)
        u_frame = pd.DataFrame(U[:, :4], index=row_headers)
        v_frame = pd.DataFrame(Vh[:4, :], columns=column_headers)
        u_frame.to_csv('u_1b_svd.csv', index=True, encoding='utf-8')
        v_frame.to_csv('vh_1b_svd.csv', index=True, encoding='utf-8')
        return (u_frame, v_frame, s)
class SimilarActorsFromDiffMoviesLda(object):
    """
    Class to find actors similar to the actors of a given movie, using a
    movie-movie similarity matrix derived from LDA latent semantics.
    """

    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.util = Util()
        self.sim_act_diff_mov_tf = SimilarActorsFromDiffMovies()

    def most_similar_actors_lda(self, moviename):
        """
        Function to find related actors from related movies
        (movie_movie_similarity_matrix using lda) corresponding to the given movie
        :param moviename:
        :return: list of actor names, or None when the movie id is not found
        """
        data_frame = self.data_extractor.get_mlmovies_data()
        tag_data_frame = self.data_extractor.get_genome_tags_data()
        movie_data_frame = self.data_extractor.get_mltags_data()
        movie_tag_data_frame = movie_data_frame.merge(tag_data_frame, how="left",
                                                      left_on="tagid", right_on="tagId")
        movie_tag_data_frame = movie_tag_data_frame.merge(data_frame, how="left",
                                                          left_on="movieid", right_on="movieid")
        # One tag-list "document" per movie, ordered by movieid.
        tag_df = movie_tag_data_frame.groupby(['movieid'])['tag'].apply(list).reset_index()
        tag_df = tag_df.sort_values('movieid')
        movies = tag_df.movieid.tolist()
        tag_df = list(tag_df.iloc[:, 1])
        input_movieid = self.util.get_movie_id(moviename)
        (U, Vh) = self.util.LDA(tag_df, num_topics=5, num_features=1000)
        movie_topic_matrix = self.util.get_doc_topic_matrix(U, num_docs=len(movies), num_topics=5)
        topic_movie_matrix = movie_topic_matrix.transpose()
        # Movie-movie similarity in topic space.
        movie_movie_matrix = numpy.dot(movie_topic_matrix, topic_movie_matrix)
        index_movie = None
        for i, j in enumerate(movies):
            if j == input_movieid:
                index_movie = i
                break
        # Idiom fix: identity comparison for None instead of `== None`.
        if index_movie is None:
            print("Movie Id not found.")
            return None
        movie_row = movie_movie_matrix[index_movie].tolist()
        movie_movie_dict = dict(zip(movies, movie_row))
        del movie_movie_dict[input_movieid]
        for key in movie_movie_dict.keys():
            movie_movie_dict[key] = abs(movie_movie_dict[key])
        # sorted() always returns a list, so the original `== None` check
        # after it was dead code and has been removed.
        movie_movie_dict = sorted(movie_movie_dict.items(), key=operator.itemgetter(1), reverse=True)
        # Collect actors of the most similar movies until we have at least 10.
        actors = []
        for (movie, val) in movie_movie_dict:
            if val <= 0:
                break
            actors = actors + self.sim_act_diff_mov_tf.get_actors_of_movie(
                self.util.get_movie_name_for_id(movie))
            if len(actors) >= 10:
                break
        # Exclude actors who already appear in the input movie itself.
        actors_of_given_movie = self.sim_act_diff_mov_tf.get_actors_of_movie(moviename)
        actorsFinal = [x for x in actors if x not in actors_of_given_movie]
        actornames = []
        for actorid in actorsFinal:
            actornames.append(self.util.get_actor_name_for_id(actorid))
        return actornames
class ActorMovieYearTensor(object):
    """
    Builds a binary year-movie-actor tensor and exposes CP-decomposition
    based latent semantics and partitions over its three modes.
    """

    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        # Human-readable labels for each tensor mode, in dense-index order.
        self.ordered_years = []
        self.ordered_movie_names = []
        self.ordered_actor_names = []
        self.print_list = ["\n\nFor Years:", "\n\nFor Movies:", "\n\nFor Actors:"]
        self.util = Util()
        self.tensor = self.fetchActorMovieYearTensor()
        self.factors = self.util.CPDecomposition(self.tensor, 5)

    def _build_index(self, values, out_names, name_fn):
        """
        Map each distinct value (in first-seen order) to a dense index and
        record its display name. Replaces three copy-pasted loops.
        :param values: iterable of raw values (may contain duplicates)
        :param out_names: list to append display names to, in index order
        :param name_fn: maps a raw value to its display name
        :return: dict of value -> dense index
        """
        index = {}
        for element in values:
            if element in index:
                continue
            index[element] = len(index)
            out_names.append(name_fn(element))
        return index

    def fetchActorMovieYearTensor(self):
        """
        Create actor movie year tensor
        :return: tensor
        """
        movies_df = self.data_extractor.get_mlmovies_data()
        actor_df = self.data_extractor.get_movie_actor_data()
        movie_actor_df = actor_df.merge(movies_df, how="left", on="movieid")
        year_dict = self._build_index(movie_actor_df["year"], self.ordered_years,
                                      lambda year: year)
        movieid_dict = self._build_index(movie_actor_df["movieid"], self.ordered_movie_names,
                                         self.util.get_movie_name_for_id)
        actorid_dict = self._build_index(movie_actor_df["actorid"], self.ordered_actor_names,
                                         self.util.get_actor_name_for_id)
        tensor = np.zeros((len(year_dict), len(movieid_dict), len(actorid_dict)))
        # Mark every observed (year, movie, actor) combination.
        for index, row in movie_actor_df.iterrows():
            year_id = year_dict[row["year"]]
            movieid_id = movieid_dict[row["movieid"]]
            actorid_id = actorid_dict[row["actorid"]]
            tensor[year_id][movieid_id][actorid_id] = 1
        return tensor

    def print_latent_semantics(self, r):
        """
        Pretty print latent semantics
        :param r:
        """
        for i, factor in enumerate(self.factors):
            print(self.print_list[i])
            latent_semantics = self.util.get_latent_semantics(r, factor.transpose())
            self.util.print_latent_semantics(latent_semantics, self.get_factor_names(i))

    def get_factor_names(self, i):
        """
        Obtain factor names for tensor mode i
        :param i: 0 = years, 1 = movies, 2 = actors
        :return: factor names
        """
        if i == 0:
            return self.ordered_years
        elif i == 1:
            return self.ordered_movie_names
        elif i == 2:
            return self.ordered_actor_names

    def get_partitions(self, no_of_partitions):
        """
        Partition factor matrices
        :param no_of_partitions:
        :return: list of groupings
        """
        groupings_list = []
        for i, factor in enumerate(self.factors):
            groupings = self.util.partition_factor_matrix(factor, no_of_partitions,
                                                          self.get_factor_names(i))
            groupings_list.append(groupings)
        return groupings_list

    def print_partitioned_entities(self, no_of_partitions):
        """
        Pretty print groupings
        :param no_of_partitions:
        """
        groupings_list = self.get_partitions(no_of_partitions)
        for i, groupings in enumerate(groupings_list):
            print(self.print_list[i])
            self.util.print_partitioned_entities(groupings)
class Util(object):
    """
    Class containing all the common utilities used across the entire code base
    """

    def __init__(self):
        # Configuration and data-access objects shared by every helper below.
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.mlmovies = self.data_extractor.get_mlmovies_data()
        self.genre_tag = GenreTag()
        self.genre_data = self.genre_tag.get_genre_data()

    def get_movie_id(self, movie):
        """
        Obtain movie ID for the movie name passed as input
        :param movie: movie name
        :return: movie id (first match)
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['moviename'] == movie]
        movie_id = movie_data['movieid'].unique()
        return movie_id[0]

    def CPDecomposition(self, tensor, rank):
        """
        Perform CP Decomposition
        :param tensor: 3-way tensor of shape (movies, genres, tags)
        :param rank: requested rank; clamped below so decomposition stays valid
        :return: factor matrices obtained after decomposition
        """
        (movie_count, genre_count, tag_count) = tensor.shape
        # Clamp the rank so it never exceeds any tensor dimension minus one.
        rank = min(rank, movie_count - 1, genre_count - 1, tag_count - 1)
        factors = decomp.parafac(tensor, rank)
        return factors

    def SVD(self, matrix):
        """
        Perform SVD
        :param matrix:
        :return: factor matrices and the core matrix
        """
        U, s, Vh = numpy.linalg.svd(matrix, full_matrices=False)
        return U, s, Vh

    def PCA(self, matrix):
        """
        Perform PCA
        :param matrix:
        :return: factor matrices and the core matrix
        """
        # Covariance of the columns, then SVD of the covariance matrix.
        cov_df = numpy.cov(matrix, rowvar=False)
        U, s, Vh = numpy.linalg.svd(cov_df)
        return U, s, Vh

    def LDA(self, input_compound_list, num_topics, num_features):
        """
        Perform LDA
        :param input_compound_list: list of token lists (one per document)
        :param num_topics:
        :param num_features:
        :return: object topic distribution and topics
        """
        # id <-> term dictionary, then bag-of-words corpus, then the model.
        dictionary = gensim.corpora.Dictionary(input_compound_list)
        corpus = [dictionary.doc2bow(text) for text in input_compound_list]
        lda = gensim.models.ldamodel.LdaModel(corpus, num_topics, id2word=dictionary, passes=20)
        latent_semantics = lda.print_topics(num_topics, num_features)
        corpus = lda[corpus]
        return corpus, latent_semantics

    def get_doc_topic_matrix(self, u, num_docs, num_topics):
        """
        Reconstructing data
        :param u: per-document list of (topic_no, probability) pairs
        :param num_docs:
        :param num_topics:
        :return: reconstructed dense document-topic matrix
        """
        u_matrix = numpy.zeros(shape=(num_docs, num_topics))
        for i in range(0, len(u)):
            doc = u[i]
            for j in range(0, len(doc)):
                (topic_no, prob) = doc[j]
                u_matrix[i, topic_no] = prob
        return u_matrix

    def get_transition_dataframe(self, data_frame):
        """
        Function to get the transition matrix for Random walk
        :param data_frame: square adjacency frame; columns are stringified node indices
        :return: transition matrix
        """
        # Zero out self-transitions (diagonal entries).
        for column in data_frame:
            data_frame[column] = pd.Series([
                0 if ind == int(column) else each
                for ind, each
                in zip(data_frame.index, data_frame[column])
            ], index=data_frame.index)
        data_frame["row_sum"] = data_frame.sum(axis=1)
        # Normalize each row so outgoing probabilities sum to 1.
        # NOTE(review): the comprehension variable `sum` shadows the builtin
        # within this expression only.
        for column in data_frame:
            data_frame[column] = pd.Series([
                each / sum
                if (column != "row_sum" and each > 0 and ind != int(column)
                    and sum != 0) else each for ind, each, sum in zip(
                    data_frame.index, data_frame[column],
                    data_frame.row_sum)
            ], index=data_frame.index)
        data_frame = data_frame.drop(["row_sum"], axis=1)
        # Dangling nodes (all-zero rows) get a uniform distribution.
        data_frame.loc[(data_frame.T == 0).all()] = float(
            1 / (len(data_frame.columns)))
        data_frame = data_frame.transpose()
        return data_frame

    def get_seed_matrix(self, transition_df, seed_nodes, nodes):
        """
        Function to get the Restart matrix for entries in the seed list
        :param transition_df:
        :param seed_nodes:
        :param nodes:
        :return: seed_matrix
        """
        seed_matrix = [0.0 for each in range(len(transition_df.columns))]
        # Seed importance is distributed by order of occurrence (see distribute).
        seed_value_list = self.distribute(seed_nodes, num_of_seeds_to_recommend=1)
        for each in seed_nodes:
            seed_matrix[list(nodes).index(each)] = seed_value_list[list(
                seed_nodes).index(each)]
        return seed_matrix

    def compute_pagerank(self, seed_nodes, node_matrix, nodes):
        """
        Function to compute the Personalised Pagerank for the given input
        :param seed_nodes:
        :param node_matrix: adjacency/similarity matrix between nodes
        :param nodes: node identifiers, aligned with node_matrix order
        :return: top (len(seed_nodes) + 5) nodes ranked by pagerank score
        """
        data_frame = pd.DataFrame(node_matrix)
        transition_df = self.get_transition_dataframe(data_frame)
        seed_matrix = self.get_seed_matrix(transition_df, seed_nodes, nodes)
        result_list = seed_matrix
        temp_list = []
        num_of_iter = 0
        # Power iteration with damping 0.85 / restart 0.15, until the vector
        # stops changing (exact list equality) or 1000 iterations elapse.
        while temp_list != result_list and num_of_iter <= 1000:
            num_of_iter += 1
            temp_list = result_list
            result_list = list(0.85 * numpy.matmul(
                numpy.array(transition_df.values),
                numpy.array(result_list)) + 0.15 * numpy.array(seed_matrix))
        page_rank_dict = {i: j for i, j in zip(nodes, result_list)}
        sorted_rank = sorted(page_rank_dict.items(), key=operator.itemgetter(1),
                             reverse=True)
        return sorted_rank[0:len(seed_nodes) + 5]

    def print_movie_recommendations_and_collect_feedback(
            self, movie_ids, task_no, user_id):
        """
        Interface to obtain relevance feedback
        :param movie_ids: List of movies
        :param task_no: Task from which the interface is called
        :param user_id: user for which the movies are displayed
        """
        if len(movie_ids) == 0:
            print("No movies found.")
            exit(1)
        if task_no in [1, 2]:
            print("Movie recommendations: ")
        elif task_no in [3, 4]:
            print("Nearest movies: ")
        else:
            # NOTE(review): if task_no is an int this concatenation raises
            # TypeError before the message prints — confirm callers' types.
            print("Incorrect task number - " + task_no + "\nAborting...")
            exit(1)
        # Display movies with a 1-based selection number.
        count = 1
        movie_dict = {}
        for movie_id in movie_ids:
            movie_name = self.get_movie_name_for_id(movie_id)
            print(str(count) + ". " + str(movie_name) + " - " + str(movie_id))
            movie_dict[count] = (movie_name, movie_id)
            count += 1
        done = False
        rel_movies = []
        irrel_movies = []
        # Loop until the user confirms both a relevant and an irrelevant set.
        while not done:
            movies_list = input(
                "\nPlease enter comma separated ids of the relevant movies: ")
            rel_ids = set(
                movies_list.strip(" ").strip(",").replace(" ", "").split(","))
            while '' in rel_ids:
                rel_ids.remove('')
            incorrect = False
            for item in rel_ids:
                if int(item) not in [
                    num for num in range(1, len(movie_ids) + 1)
                ]:
                    print("Incorrect movie ID selected.")
                    incorrect = True
                    break
            if incorrect:
                continue
            confirmation = input(
                "Are you sure these are the relevant movies? " + str(list(rel_ids)) + " (y/Y/n/N): ")
            if confirmation != "y" and confirmation != "Y":
                continue
            movies_list = input(
                "\nPlease enter comma separated ids of the irrelevant movies: "
            )
            irrel_ids = set(
                movies_list.strip(" ").strip(",").replace(" ", "").split(","))
            while '' in irrel_ids:
                irrel_ids.remove('')
            incorrect = False
            # Irrelevant ids must be valid AND not already marked relevant.
            for item in irrel_ids:
                if int(item) not in list(
                        set(list([num for num in range(1, len(movie_ids) + 1)])) -
                        set(int(num) for num in rel_ids)):
                    print("Incorrect movie ID selected.")
                    incorrect = True
                    break
            if incorrect:
                continue
            confirmation = input(
                "Are you sure these are the irrelevant movies? " +
                str(list(irrel_ids)) + " (y/Y/n/N): ")
            if confirmation != "y" and confirmation != "Y":
                continue
            done = True
        for item in rel_ids:
            rel_movies.append(movie_dict[int(item)])
        for item in irrel_ids:
            irrel_movies.append(movie_dict[int(item)])
        # Persist the feedback to a task-specific csv, appending to any
        # previously collected rows.
        # NOTE(review): DataFrame.append was removed in pandas 2.0 — this
        # persistence code requires pandas < 2.0 or a port to pd.concat.
        if task_no == 1 or task_no == 2:
            if not os.path.isfile(self.data_set_loc + "/task2-feedback.csv"):
                df = pd.DataFrame(
                    columns=['movie-name', 'movie-id', 'relevancy', 'user-id'])
            else:
                df = self.data_extractor.get_task2_feedback_data()
            for movie in rel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'relevant',
                        'user-id': user_id
                    },
                    ignore_index=True)
            for movie in irrel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'irrelevant',
                        'user-id': user_id
                    },
                    ignore_index=True)
            df.to_csv(self.data_set_loc + "/task2-feedback.csv", index=False)
        elif task_no == 3 or task_no == 4:
            if not os.path.isfile(self.data_set_loc + "/task4-feedback.csv"):
                df = pd.DataFrame(
                    columns=['movie-name', 'movie-id', 'relevancy'])
            else:
                df = self.data_extractor.get_task4_feedback_data()
            for movie in rel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'relevant'
                    },
                    ignore_index=True)
            for movie in irrel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'irrelevant'
                    },
                    ignore_index=True)
            df.to_csv(self.data_set_loc + "/task4-feedback.csv", index=False)

    def get_distribution_count(self, seed_nodes, num_of_seeds_to_recommend):
        """
        Given the number of seeds to be recommended and the seed_nodes,
        returns the distribution for each seed_node considering order
        :param seed_nodes:
        :param num_of_seeds_to_recommend:
        :return: distribution_list
        """
        seed_value_list = self.distribute(seed_nodes, num_of_seeds_to_recommend)
        seed_value_list = [round(each) for each in seed_value_list]
        total_count = sum(seed_value_list)
        # Rounding may leave the total off-target; redistribute the difference.
        difference = num_of_seeds_to_recommend - total_count
        if difference > 0:
            # Too few: first top up zero entries, then add 1 from the front.
            for i in range(0, len(seed_value_list)):
                if seed_value_list[i] == 0:
                    seed_value_list[i] = 1
                    difference -= 1
                    if difference == 0:
                        return seed_value_list
            for i in range(0, len(seed_value_list)):
                seed_value_list[i] += 1
                difference -= 1
                if difference == 0:
                    return seed_value_list
        elif difference < 0:
            # Too many: remove 1 from the back, skipping zero entries.
            for i in range(0, len(seed_value_list)):
                if seed_value_list[len(seed_value_list) - 1 - i] != 0:
                    seed_value_list[len(seed_value_list) - 1 - i] -= 1
                    difference += 1
                    if difference == 0:
                        return seed_value_list
        return seed_value_list

    def get_movie_tag_matrix(self):
        """
        Function to get movie_tag matrix containing list of tags in each movie
        :return: movie_tag_matrix
        """
        tag_df = self.genre_data
        unique_tags = tag_df.tag_string.unique()
        # IDF treats each movie as one document.
        idf_data = tag_df.groupby(['movieid'])['tag_string'].apply(set)
        tf_df = tag_df.groupby(['movieid'
                                ])['tag_string'].apply(list).reset_index()
        movie_tag_dict = dict(zip(tf_df.movieid, tf_df.tag_string))
        tf_weight_dict = {
            movie: self.genre_tag.assign_tf_weight(tags)
            for movie, tags in list(movie_tag_dict.items())
        }
        idf_weight_dict = self.genre_tag.assign_idf_weight(
            idf_data, unique_tags)
        tag_df = self.genre_tag.get_model_weight(tf_weight_dict,
                                                 idf_weight_dict, tag_df,
                                                 'tfidf')
        tag_df["total"] = tag_df.groupby(['movieid', 'tag_string'
                                          ])['value'].transform('sum')
        temp_df = tag_df[["movieid", "tag_string",
                          "total"]].drop_duplicates().reset_index()
        # Pivot to a movie x tag matrix of tf-idf totals; missing pairs -> 0.
        genre_tag_tfidf_df = temp_df.pivot_table('total', 'movieid', 'tag_string')
        genre_tag_tfidf_df = genre_tag_tfidf_df.fillna(0)
        return genre_tag_tfidf_df

    def distribute(self, seed_nodes, num_of_seeds_to_recommend):
        """
        Distributes importance among seed_nodes based on order of occurrence
        :param seed_nodes:
        :param num_of_seeds_to_recommend:
        :return: list of size len(seed_nodes) with distributed values
        """
        # Start uniform, then shift `delta` from later seeds to earlier ones
        # so earlier seeds carry more weight; the total is preserved.
        seed_value = float(num_of_seeds_to_recommend) / len(seed_nodes)
        seed_value_list = [seed_value for seed in seed_nodes]
        delta = seed_value / len(seed_nodes)
        for i in range(0, len(seed_nodes) - 1):
            seed_value_list[i] = seed_value_list[i] + (len(seed_nodes) - 1 - i) * delta
            for j in range(i + 1, len(seed_nodes)):
                seed_value_list[j] = seed_value_list[j] - delta
        return seed_value_list

    def get_movie_name_for_id(self, movieid):
        """
        movie name for movie id
        :param movieid:
        :return: movie name
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['movieid'] == movieid]
        movie_name = movie_data['moviename'].unique()
        return movie_name[0]

    def get_tag_list_for_movie(self, movie):
        """
        Get a tag list for the movie
        :param movie: movie id
        :return: list of tags
        """
        movie_specific_data = self.genre_data[self.genre_data["movieid"] ==
                                              movie]
        tags_list = movie_specific_data["tag_string"].unique()
        return tags_list

    def get_movies_for_tag(self, tag):
        """
        Get the list of movies containing the tag
        :param tag: tag string
        :return: list of movies
        """
        tag_specific_data = self.genre_data[self.genre_data["tag_string"] ==
                                            tag]
        movies_list = tag_specific_data["movieid"].unique()
        return movies_list

    def get_all_movies_for_user(self, user_id):
        """
        Obtain all movies watched by the user
        :param user_id:
        :return: list of movies watched by the user, most recent first
        """
        user_data = self.genre_data[self.genre_data['userid'] == user_id]
        user_data = user_data.sort_values('timestamp', ascending=False)
        movies = user_data['movieid'].unique()
        return movies

    def get_movies_after_year(self, year):
        """
        Obtain all movies released in or after the given year
        :param year:
        :return: list of movie ids
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['year'] >= year]
        movie_id_list = movie_data['movieid'].unique()
        return movie_id_list

    def get_vector_magnitude(self, vector):
        """
        Calculate the magnitude of the vector
        :param vector:
        :return: length of the vector
        """
        result = 0
        for i in vector:
            result += (i * i)
        return math.sqrt(result)