def __init__(self):
     self.conf = ParseConfig()
     self.data_set_loc = os.path.join(os.path.abspath(os.path.dirname(__file__)), self.conf.config_section_mapper("filePath").get("data_set_loc"))
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.mlratings = self.data_extractor.get_mlratings_data()
     self.mlmovies = self.data_extractor.get_mlmovies_data()
     self.imdb_actor_info = self.data_extractor.get_imdb_actor_info_data()
     self.genome_tags = self.data_extractor.get_genome_tags_data()
 def __init__(self):
     self.conf = ParseConfig()
     self.data_set_loc = self.conf.config_section_mapper("filePath").get(
         "data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.mlmovies = self.data_extractor.get_mlmovies_data()
     self.genre_tag = GenreTag()
     self.genre_data = self.genre_tag.get_genre_data()
 def __init__(self):
     self.conf = ParseConfig()
     self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.ordered_years = []
     self.ordered_movie_names = []
     self.ordered_actor_names = []
     self.print_list = ["\n\nFor Years:", "\n\nFor Movies:", "\n\nFor Actors:"]
     self.util = Util()
     self.tensor = self.fetchActorMovieYearTensor()
     self.factors = self.util.CPDecomposition(self.tensor, 5)
 def __init__(self):
     self.conf = ParseConfig()
     self.data_set_loc = self.conf.config_section_mapper("filePath").get(
         "data_set_loc")
     self.data_extractor = DataExtractor(self.data_set_loc)
     self.max_ratings = 5
     self.ordered_ratings = [0, 1, 2, 3, 4, 5]
     self.ordered_movie_names = []
     self.ordered_tag_names = []
     self.print_list = [
         "\n\nFor Tags:", "\n\nFor Movies:", "\n\nFor Ratings:"
     ]
     self.util = Util()
     self.tensor = self.fetchTagMovieRatingTensor()
     self.factors = self.util.CPDecomposition(self.tensor, 5)
Beispiel #5
0
class CoactorCoactorMatrix(object):
    """
    Class to compute the Coactor Matrix which represents the number of movies each pair of actors have acted in, together
    """
    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            self.conf.config_section_mapper("filePath").get("data_set_loc"))
        self.data_extractor = DataExtractor(self.data_set_loc)

    def fetchCoactorCoactorSimilarityMatrix(self):
        """
        Creates the coactor matrix with all the actors in a given set
        :return: coactor matrix
        """
        movie_actor_df = self.data_extractor.get_movie_actor_data()
        movie_actor_set_df = movie_actor_df.groupby(
            ['actorid'])["movieid"].apply(set).reset_index()
        num_of_actors = len(movie_actor_df.actorid.unique())
        coactor_matrix = [[0] * num_of_actors for i in range(num_of_actors)]
        for index, movie_set in zip(movie_actor_set_df.index,
                                    movie_actor_set_df.movieid):
            for index_2, movie_set_2 in zip(movie_actor_set_df.index,
                                            movie_actor_set_df.movieid):
                if index != index_2:
                    coactor_matrix[index][index_2] = len(
                        movie_set.intersection(movie_set_2))

        numpy.savetxt("coactor_coactor_matrix.csv",
                      coactor_matrix,
                      delimiter=",")
        return coactor_matrix, movie_actor_set_df.actorid.unique()
class Util(object):
    """
    Class containing all the common utilities used across the entire code base
    """
    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = os.path.join(os.path.abspath(os.path.dirname(__file__)), self.conf.config_section_mapper("filePath").get("data_set_loc"))
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.mlratings = self.data_extractor.get_mlratings_data()
        self.mlmovies = self.data_extractor.get_mlmovies_data()
        self.imdb_actor_info = self.data_extractor.get_imdb_actor_info_data()
        self.genome_tags = self.data_extractor.get_genome_tags_data()

    def get_sorted_actor_ids(self):
        """
        Obtain sorted actor ids
        :return: list of sorted actor ids
        """
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        actorids = actor_info.id
        actorids = actorids.sort_values()
        return actorids

    def get_movie_id(self, movie):
        """
        Obtain name ID for the name passed as input
        :param movie:
        :return: movie id
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['moviename'] == movie]
        movie_id = movie_data['movieid'].unique()

        return movie_id[0]

    def get_average_ratings_for_movie(self, movie_id):
        """
        Obtain average rating for movie
        :param movie_id:
        :return: average movie rating
        """
        all_ratings = self.mlratings
        movie_ratings = all_ratings[all_ratings['movieid'] == movie_id]

        ratings_sum = 0
        ratings_count = 0
        for index, row in movie_ratings.iterrows():
            ratings_count += 1
            ratings_sum += row['rating']

        return ratings_sum / float(ratings_count)

    def get_actor_name_for_id(self, actor_id):
        """
        actor name for id
        :param actor_id:
        :return: actor name for the actor id
        """
        actor_data = self.imdb_actor_info[self.imdb_actor_info['id'] == actor_id]
        name = actor_data['name'].unique()

        return name[0]

    def get_movie_name_for_id(self, movieid):
        """
        movie name for movie id
        :param movieid:
        :return: movie name
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['movieid'] == movieid]
        movie_name = movie_data['moviename'].unique()

        return movie_name[0]

    def get_tag_name_for_id(self, tag_id):
        """
        tag name for tag id
        :param tag_id:
        :return: tag name
        """
        tag_data = self.genome_tags[self.genome_tags['tagId'] == tag_id]
        name = tag_data['tag'].unique()

        return name[0]

    def partition_factor_matrix(self, matrix, no_of_partitions, entity_names):
        """
        Function to partition the factor matrix into groups as per 2-norm distance
        :param matrix:
        :param no_of_partitions:
        :param entity_names:
        :return: dictionary containing the groups
        """
        entity_dict = {}
        for i in range(0, len(matrix)):
            length = 0
            for latent_semantic in matrix[i]:
                length += abs(latent_semantic) ** 2
            entity_dict[entity_names[i]] = math.sqrt(length)

        max_length = float(max(entity_dict.values()))
        min_length = float(min(entity_dict.values()))
        length_of_group = (float(max_length) - float(min_length)) / float(no_of_partitions)

        groups = {}
        for i in range(0, no_of_partitions):
            groups["Group " + str(i + 1) + " ( " + str(min_length + float(i * length_of_group)) + " , " + str(
                min_length + float((i + 1) * length_of_group)) + " )"] = []

        for key in entity_dict.keys():
            entity_length = entity_dict[key]
            group_no = math.ceil(float(entity_length - min_length) / float(length_of_group))
            if group_no == 0:
                group_no = 1
            groups["Group " + str(group_no) + " ( " + str(
                min_length + float((group_no - 1) * length_of_group)) + " , " + str(
                min_length + float(group_no * length_of_group)) + " )"].append(key)

        return groups

    def get_latent_semantics(self, r, matrix):
        """
        Function to obtain the latent semantics for the factor matrix
        :param r:
        :param matrix:
        :return: top 'r' latent semantics
        """
        latent_semantics = []
        for latent_semantic in matrix:
            if len(latent_semantics) == r:
                break
            latent_semantics.append(latent_semantic)

        return latent_semantics

    def print_partitioned_entities(self, groupings):
        """
        Pretty print groupings
        :param groupings:
        """
        for key in groupings.keys():
            print(key)
            if len(groupings[key]) == 0:
                print("NO ELEMENTS IN THIS GROUP\n")
                continue
            for entity in groupings[key]:
                print(entity, end="|")
            print("\n")

    def print_latent_semantics(self, latent_semantics, entity_names_list):
        """
        Pretty print latent semantics
        :param latent_semantics:
        :param entity_names_list:
        """
        for latent_semantic in latent_semantics:
            print("Latent Semantic:")
            dict1 = {}
            for i in range(0, len(entity_names_list)):
                dict1[entity_names_list[i]] = float(latent_semantic[i])
            for s in sorted(dict1, key=dict1.get, reverse=True):  # value-based sorting
                print(str(s) + "*(" + str(dict1[s]) + ")", end="")
                print(" + ", end="")
            print("\n")

    def CPDecomposition(self, tensor, rank):
        """
        Perform CP Decomposition
        :param tensor:
        :param rank:
        :return: factor matrices obtained after decomposition
        """
        factors = decomp.parafac(tensor, rank)
        return factors

    def SVD(self, matrix):
        """
        Perform SVD
        :param matrix:
        :return: factor matrices and the core matrix
        """

        U, s, Vh = linalg.svd(matrix, full_matrices=False)
        return (U, s, Vh)

    def PCA(self, matrix):
        """
        Perform PCA
        :param matrix:
        :return: factor matrices and the core matrix
        """

        # Computng covariance matrix
        cov_df = numpy.cov(matrix, rowvar=False)

        # Calculating PCA
        U, s, Vh = linalg.svd(cov_df)
        return (U, s, Vh)

    def LDA(self, input_compound_list, num_topics, num_features):
        """
        Perform LDA
        :param input_compound_list:
        :param num_topics:
        :param num_features:
        :return: topics and object topic distribution
        """
        # turn our tokenized documents into a id <-> term dictionary
        dictionary = corpora.Dictionary(input_compound_list)

        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in input_compound_list]

        # generate LDA model
        lda = gensim.models.ldamodel.LdaModel(corpus, num_topics, id2word=dictionary, passes=20)

        latent_semantics = lda.print_topics(num_topics, num_features)
        # for latent in latent_semantics:
        #     print(latent)

        corpus = lda[corpus]

        # for i in corpus:
        #     print(i)

        return corpus, latent_semantics

    def get_doc_topic_matrix(self, u, num_docs, num_topics):
        """
        Reconstructing data
        :param u:
        :param num_docs:
        :param num_topics:
        :return: reconstructed data
        """
        u_matrix = numpy.zeros(shape=(num_docs, num_topics))

        for i in range(0, len(u)):
            doc = u[i]
            for j in range(0, len(doc)):
                (topic_no, prob) = doc[j]
                u_matrix[i, topic_no] = prob

        return u_matrix
import argparse
import logging
import operator
from util import Util
import numpy
import pandas as pd
from actor_actor_similarity_matrix import ActorActorMatrix
from coactor_coactor_matrix import CoactorCoactorMatrix
from config_parser import ParseConfig
from data_extractor import DataExtractor

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)
log.disabled = True

conf = ParseConfig()


class PageRankActor(ActorActorMatrix):
    """Class to calculate Personalised PageRank"""
    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.actor_matrix, self.actorids = self.fetchActorActorSimilarityMatrix(
        )
        self.coactor_obj = CoactorCoactorMatrix()
        self.coactor_matrix, self.coactorids = self.coactor_obj.fetchCoactorCoactorSimilarityMatrix(
        )
        self.util = Util()
class TagMovieRatingTensor(object):
    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.max_ratings = 5
        self.ordered_ratings = [0, 1, 2, 3, 4, 5]
        self.ordered_movie_names = []
        self.ordered_tag_names = []
        self.print_list = [
            "\n\nFor Tags:", "\n\nFor Movies:", "\n\nFor Ratings:"
        ]
        self.util = Util()
        self.tensor = self.fetchTagMovieRatingTensor()
        self.factors = self.util.CPDecomposition(self.tensor, 5)

    def fetchTagMovieRatingTensor(self):
        """
        Create tag movie rating tensor
        :return: tensor
        """
        mltags_df = self.data_extractor.get_mltags_data()

        tag_id_list = mltags_df["tagid"]
        tag_id_count = 0
        tag_id_dict = {}
        for element in tag_id_list:
            if element in tag_id_dict.keys():
                continue
            tag_id_dict[element] = tag_id_count
            tag_id_count += 1
            name = self.util.get_tag_name_for_id(element)
            self.ordered_tag_names.append(name)

        movieid_list = mltags_df["movieid"]
        movieid_count = 0
        movieid_dict = {}
        for element in movieid_list:
            if element in movieid_dict.keys():
                continue
            movieid_dict[element] = movieid_count
            movieid_count += 1
            name = self.util.get_movie_name_for_id(element)
            self.ordered_movie_names.append(name)

        tensor = np.zeros((tag_id_count, movieid_count, self.max_ratings + 1))

        for index, row in mltags_df.iterrows():
            tagid = row["tagid"]
            movieid = row["movieid"]
            avg_movie_rating = self.util.get_average_ratings_for_movie(movieid)
            for rating in range(0, int(avg_movie_rating) + 1):
                tagid_id = tag_id_dict[tagid]
                movieid_id = movieid_dict[movieid]
                tensor[tagid_id][movieid_id][rating] = 1

        return tensor

    def print_latent_semantics(self, r):
        """
                Pretty print latent semantics
                :param r:
        """
        i = 0
        for factor in self.factors:
            print(self.print_list[i])
            latent_semantics = self.util.get_latent_semantics(
                r, factor.transpose())
            self.util.print_latent_semantics(latent_semantics,
                                             self.get_factor_names(i))
            i += 1

    def get_factor_names(self, i):
        """
                Obtain factor names
                :param i:
                :return: factor names
        """
        if i == 0:
            return self.ordered_tag_names
        elif i == 1:
            return self.ordered_movie_names
        elif i == 2:
            return self.ordered_ratings

    def get_partitions(self, no_of_partitions):
        """
                Partition factor matrices
                :param no_of_partitions:
                :return: list of groupings
        """
        i = 0
        groupings_list = []
        for factor in self.factors:
            groupings = self.util.partition_factor_matrix(
                factor, no_of_partitions, self.get_factor_names(i))
            groupings_list.append(groupings)
            i += 1

        return groupings_list

    def print_partitioned_entities(self, no_of_partitions):
        """
                Pretty print groupings
                :param no_of_partitions:
        """
        groupings_list = self.get_partitions(no_of_partitions)
        i = 0
        for groupings in groupings_list:
            print(self.print_list[i])
            self.util.print_partitioned_entities(groupings)
            i += 1
Beispiel #9
0
        return self.data_extractor("task2-feedback.csv")

    def get_task4_feedback_data(self):
        return self.data_extractor("task4-feedback.csv")

    def get_movie_latent_semantics_data(self):
        return self.data_extractor("movie_latent_semantic.csv")

    def get_json(self):
        file_loc = os.path.join(self.file_path, "label_movies.json")
        json_movie_label_dict = json.load(open(file_loc))

        return json_movie_label_dict

    def get_relevance_feedback_query_vector(self):
        return self.data_extractor("relevance-feedback-query-vector.csv")

    def get_lsh_details(self):
        return json.load(
            open(os.path.join(self.file_path, 'task_3_details.json')))


if __name__ == "__main__":
    conf = ParseConfig()
    data_set_location = conf.config_section_mapper("filePath").get(
        "data_set_loc")
    extract_data = DataExtractor(data_set_location)
    data_frame = extract_data.data_extractor("mlmovies.csv")
    print("File columns for mlmovies.csv")
    print("Columns = %s" % (data_frame.columns.values))
class ActorMovieYearTensor(object):

    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.ordered_years = []
        self.ordered_movie_names = []
        self.ordered_actor_names = []
        self.print_list = ["\n\nFor Years:", "\n\nFor Movies:", "\n\nFor Actors:"]
        self.util = Util()
        self.tensor = self.fetchActorMovieYearTensor()
        self.factors = self.util.CPDecomposition(self.tensor, 5)

    def fetchActorMovieYearTensor(self):
        """
        Create actor movie year tensor
        :return: tensor
        """
        movies_df = self.data_extractor.get_mlmovies_data()
        actor_df = self.data_extractor.get_movie_actor_data()

        movie_actor_df = actor_df.merge(movies_df, how="left", on="movieid")
        year_list = movie_actor_df["year"]
        year_count = 0
        year_dict = {}
        for element in year_list:
            if element in year_dict.keys():
                continue
            year_dict[element] = year_count
            year_count += 1
            self.ordered_years.append(element)

        movieid_list = movie_actor_df["movieid"]
        movieid_count = 0
        movieid_dict = {}
        for element in movieid_list:
            if element in movieid_dict.keys():
                continue
            movieid_dict[element] = movieid_count
            movieid_count += 1
            name = self.util.get_movie_name_for_id(element)
            self.ordered_movie_names.append(name)

        actorid_list = movie_actor_df["actorid"]
        actorid_count = 0
        actorid_dict = {}
        for element in actorid_list:
            if element in actorid_dict.keys():
                continue
            actorid_dict[element] = actorid_count
            actorid_count += 1
            name = self.util.get_actor_name_for_id(element)
            self.ordered_actor_names.append(name)

        tensor = np.zeros((year_count, movieid_count, actorid_count))

        for index, row in movie_actor_df.iterrows():
            year = row["year"]
            movieid = row["movieid"]
            actorid = row["actorid"]
            year_id = year_dict[year]
            movieid_id = movieid_dict[movieid]
            actorid_id = actorid_dict[actorid]
            tensor[year_id][movieid_id][actorid_id] = 1

        return tensor

    def print_latent_semantics(self, r):
        """
        Pretty print latent semantics
        :param r:
        """
        i = 0
        for factor in self.factors:
            print(self.print_list[i])
            latent_semantics = self.util.get_latent_semantics(r, factor.transpose())
            self.util.print_latent_semantics(latent_semantics, self.get_factor_names(i))
            i += 1

    def get_factor_names(self, i):
        """
        Obtain factor names
        :param i:
        :return: factor names
        """
        if i == 0:
            return self.ordered_years
        elif i == 1:
            return self.ordered_movie_names
        elif i == 2:
            return self.ordered_actor_names

    def get_partitions(self, no_of_partitions):
        """
        Partition factor matrices
        :param no_of_partitions:
        :return: list of groupings
        """
        i = 0
        groupings_list = []
        for factor in self.factors:
            groupings = self.util.partition_factor_matrix(factor, no_of_partitions, self.get_factor_names(i))
            groupings_list.append(groupings)
            i += 1

        return groupings_list

    def print_partitioned_entities(self, no_of_partitions):
        """
        Pretty print groupings
        :param no_of_partitions:
        """
        groupings_list = self.get_partitions(no_of_partitions)
        i = 0
        for groupings in groupings_list:
            print(self.print_list[i])
            self.util.print_partitioned_entities(groupings)
            i += 1
class Util(object):
    """
    Class containing all the common utilities used across the entire code base
    """
    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.mlmovies = self.data_extractor.get_mlmovies_data()
        self.genre_tag = GenreTag()
        self.genre_data = self.genre_tag.get_genre_data()

    def get_movie_id(self, movie):
        """
        Obtain name ID for the name passed as input
        :param movie:
        :return: movie id
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['moviename'] == movie]
        movie_id = movie_data['movieid'].unique()

        return movie_id[0]

    def CPDecomposition(self, tensor, rank):
        """
        Perform CP Decomposition
        :param tensor:
        :param rank:
        :return: factor matrices obtained after decomposition
        """
        (movie_count, genre_count, tag_count) = tensor.shape
        rank = min(rank, movie_count - 1, genre_count - 1, tag_count - 1)
        factors = decomp.parafac(tensor, rank)

        return factors

    def SVD(self, matrix):
        """
        Perform SVD
        :param matrix:
        :return: factor matrices and the core matrix
        """
        U, s, Vh = numpy.linalg.svd(matrix, full_matrices=False)

        return U, s, Vh

    def PCA(self, matrix):
        """
        Perform PCA
        :param matrix:
        :return: factor matrices and the core matrix
        """
        cov_df = numpy.cov(matrix, rowvar=False)
        U, s, Vh = numpy.linalg.svd(cov_df)

        return U, s, Vh

    def LDA(self, input_compound_list, num_topics, num_features):
        """
        Perform LDA
        :param input_compound_list:
        :param num_topics:
        :param num_features:
        :return: topics and object topic distribution
        """
        dictionary = gensim.corpora.Dictionary(input_compound_list)
        corpus = [dictionary.doc2bow(text) for text in input_compound_list]
        lda = gensim.models.ldamodel.LdaModel(corpus,
                                              num_topics,
                                              id2word=dictionary,
                                              passes=20)
        latent_semantics = lda.print_topics(num_topics, num_features)
        corpus = lda[corpus]

        return corpus, latent_semantics

    def get_doc_topic_matrix(self, u, num_docs, num_topics):
        """
        Reconstructing data
        :param u:
        :param num_docs:
        :param num_topics:
        :return: reconstructed data
        """
        u_matrix = numpy.zeros(shape=(num_docs, num_topics))

        for i in range(0, len(u)):
            doc = u[i]
            for j in range(0, len(doc)):
                (topic_no, prob) = doc[j]
                u_matrix[i, topic_no] = prob

        return u_matrix

    def get_transition_dataframe(self, data_frame):
        """
        Function to get the transition matrix for Random walk
        :param data_frame:
        :return: transition matrix
        """
        for column in data_frame:
            data_frame[column] = pd.Series([
                0 if ind == int(column) else each
                for ind, each in zip(data_frame.index, data_frame[column])
            ],
                                           index=data_frame.index)
        data_frame["row_sum"] = data_frame.sum(axis=1)
        for column in data_frame:
            data_frame[column] = pd.Series([
                each / sum if
                (column != "row_sum" and each > 0 and ind != int(column)
                 and sum != 0) else each for ind, each, sum in zip(
                     data_frame.index, data_frame[column], data_frame.row_sum)
            ],
                                           index=data_frame.index)
        data_frame = data_frame.drop(["row_sum"], axis=1)
        data_frame.loc[(data_frame.T == 0).all()] = float(
            1 / (len(data_frame.columns)))
        data_frame = data_frame.transpose()

        return data_frame

    def get_seed_matrix(self, transition_df, seed_nodes, nodes):
        """
        Function to get the Restart matrix for entries in the seed list
        :param transition_df:
        :param seed_nodes:
        :param nodeids:
        :return: seed_matrix
        """
        seed_matrix = [0.0 for each in range(len(transition_df.columns))]
        seed_value_list = self.distribute(seed_nodes,
                                          num_of_seeds_to_recommend=1)
        for each in seed_nodes:
            seed_matrix[list(nodes).index(each)] = seed_value_list[list(
                seed_nodes).index(each)]

        return seed_matrix

    def compute_pagerank(self, seed_nodes, node_matrix, nodes):
        """
        Function to compute the Personalised Pagerank for the given input
        :param seed_actors:
        :param actor_matrix:
        :param actorids:
        :return:
        """
        data_frame = pd.DataFrame(node_matrix)
        transition_df = self.get_transition_dataframe(data_frame)
        seed_matrix = self.get_seed_matrix(transition_df, seed_nodes, nodes)
        result_list = seed_matrix
        temp_list = []
        num_of_iter = 0
        while temp_list != result_list and num_of_iter <= 1000:
            num_of_iter += 1
            temp_list = result_list
            result_list = list(0.85 * numpy.matmul(
                numpy.array(transition_df.values), numpy.array(result_list)) +
                               0.15 * numpy.array(seed_matrix))
        page_rank_dict = {i: j for i, j in zip(nodes, result_list)}
        sorted_rank = sorted(page_rank_dict.items(),
                             key=operator.itemgetter(1),
                             reverse=True)

        return sorted_rank[0:len(seed_nodes) + 5]

    def print_movie_recommendations_and_collect_feedback(
            self, movie_ids, task_no, user_id):
        """
        Interface to obtain relevance feedback
        :param movie_ids: List of movies
        :param task_no: Task from which the interface is called
        :param user_id: user for which the movies are displayed
        """
        if len(movie_ids) == 0:
            print("No movies found.")
            exit(1)

        if task_no in [1, 2]:
            print("Movie recommendations: ")
        elif task_no in [3, 4]:
            print("Nearest movies: ")
        else:
            print("Incorrect task number - " + task_no + "\nAborting...")
            exit(1)

        count = 1
        movie_dict = {}
        for movie_id in movie_ids:
            movie_name = self.get_movie_name_for_id(movie_id)
            print(str(count) + ". " + str(movie_name) + " - " + str(movie_id))
            movie_dict[count] = (movie_name, movie_id)
            count += 1

        done = False
        rel_movies = []
        irrel_movies = []
        while not done:
            movies_list = input(
                "\nPlease enter comma separated ids of the relevant movies: ")
            rel_ids = set(
                movies_list.strip(" ").strip(",").replace(" ", "").split(","))
            while '' in rel_ids:
                rel_ids.remove('')

            incorrect = False
            for item in rel_ids:
                if int(item) not in [
                        num for num in range(1,
                                             len(movie_ids) + 1)
                ]:
                    print("Incorrect movie ID selected.")
                    incorrect = True
                    break
            if incorrect:
                continue

            confirmation = input(
                "Are you sure these are the relevant movies? " +
                str(list(rel_ids)) + " (y/Y/n/N): ")
            if confirmation != "y" and confirmation != "Y":
                continue

            movies_list = input(
                "\nPlease enter comma separated ids of the irrelevant movies: "
            )
            irrel_ids = set(
                movies_list.strip(" ").strip(",").replace(" ", "").split(","))
            while '' in irrel_ids:
                irrel_ids.remove('')

            incorrect = False
            for item in irrel_ids:
                if int(item) not in list(
                        set(list([num for num in range(1,
                                                       len(movie_ids) + 1)])) -
                        set(int(num) for num in rel_ids)):
                    print("Incorrect movie ID selected.")
                    incorrect = True
                    break
            if incorrect:
                continue

            confirmation = input(
                "Are you sure these are the irrelevant movies? " +
                str(list(irrel_ids)) + " (y/Y/n/N): ")
            if confirmation != "y" and confirmation != "Y":
                continue

            done = True
            for item in rel_ids:
                rel_movies.append(movie_dict[int(item)])
            for item in irrel_ids:
                irrel_movies.append(movie_dict[int(item)])

        if task_no == 1 or task_no == 2:
            if not os.path.isfile(self.data_set_loc + "/task2-feedback.csv"):
                df = pd.DataFrame(
                    columns=['movie-name', 'movie-id', 'relevancy', 'user-id'])
            else:
                df = self.data_extractor.get_task2_feedback_data()

            for movie in rel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'relevant',
                        'user-id': user_id
                    },
                    ignore_index=True)
            for movie in irrel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'irrelevant',
                        'user-id': user_id
                    },
                    ignore_index=True)

            df.to_csv(self.data_set_loc + "/task2-feedback.csv", index=False)
        elif task_no == 3 or task_no == 4:
            if not os.path.isfile(self.data_set_loc + "/task4-feedback.csv"):
                df = pd.DataFrame(
                    columns=['movie-name', 'movie-id', 'relevancy'])
            else:
                df = self.data_extractor.get_task4_feedback_data()

            for movie in rel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'relevant'
                    },
                    ignore_index=True)
            for movie in irrel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'irrelevant'
                    },
                    ignore_index=True)

            df.to_csv(self.data_set_loc + "/task4-feedback.csv", index=False)

    def get_distribution_count(self, seed_nodes, num_of_seeds_to_recommend):
        """
        Given the number of seeds to be recommended and the seed_nodes,
        returns the distribution for each seed_node considering order
        :param seed_nodes:
        :param num_of_seeds_to_recommend:
        :return: distribution_list
        """
        seed_value_list = self.distribute(seed_nodes,
                                          num_of_seeds_to_recommend)
        seed_value_list = [round(each) for each in seed_value_list]
        total_count = sum(seed_value_list)
        difference = num_of_seeds_to_recommend - total_count
        if difference > 0:
            for i in range(0, len(seed_value_list)):
                if seed_value_list[i] == 0:
                    seed_value_list[i] = 1
                    difference -= 1
                    if difference == 0:
                        return seed_value_list
            for i in range(0, len(seed_value_list)):
                seed_value_list[i] += 1
                difference -= 1
                if difference == 0:
                    return seed_value_list
        elif difference < 0:
            for i in range(0, len(seed_value_list)):
                if seed_value_list[len(seed_value_list) - 1 - i] != 0:
                    seed_value_list[len(seed_value_list) - 1 - i] -= 1
                    difference += 1
                if difference == 0:
                    return seed_value_list

        return seed_value_list

    def get_movie_tag_matrix(self):
        """
        Function to get movie_tag matrix containing list of tags in each movie
        :return: movie_tag_matrix
        """
        tag_df = self.genre_data
        unique_tags = tag_df.tag_string.unique()
        idf_data = tag_df.groupby(['movieid'])['tag_string'].apply(set)
        tf_df = tag_df.groupby(['movieid'
                                ])['tag_string'].apply(list).reset_index()
        movie_tag_dict = dict(zip(tf_df.movieid, tf_df.tag_string))
        tf_weight_dict = {
            movie: self.genre_tag.assign_tf_weight(tags)
            for movie, tags in list(movie_tag_dict.items())
        }
        idf_weight_dict = self.genre_tag.assign_idf_weight(
            idf_data, unique_tags)
        tag_df = self.genre_tag.get_model_weight(tf_weight_dict,
                                                 idf_weight_dict, tag_df,
                                                 'tfidf')
        tag_df["total"] = tag_df.groupby(['movieid', 'tag_string'
                                          ])['value'].transform('sum')
        temp_df = tag_df[["movieid", "tag_string",
                          "total"]].drop_duplicates().reset_index()
        genre_tag_tfidf_df = temp_df.pivot_table('total', 'movieid',
                                                 'tag_string')
        genre_tag_tfidf_df = genre_tag_tfidf_df.fillna(0)

        return genre_tag_tfidf_df

    def distribute(self, seed_nodes, num_of_seeds_to_recommend):
        """
        Distributes importance among seed_nodes based on order of occurrence
        :param seed_nodes:
        :param num_of_seeds_to_recommend:
        :return: list of size num_of_seeds_to_recommend with distributed values
        """
        seed_value = float(num_of_seeds_to_recommend) / len(seed_nodes)
        seed_value_list = [seed_value for seed in seed_nodes]
        delta = seed_value / len(seed_nodes)
        for i in range(0, len(seed_nodes) - 1):
            seed_value_list[i] = seed_value_list[i] + (len(seed_nodes) - 1 -
                                                       i) * delta
            for j in range(i + 1, len(seed_nodes)):
                seed_value_list[j] = seed_value_list[j] - delta

        return seed_value_list

    def get_movie_name_for_id(self, movieid):
        """
        movie name for movie id
        :param movieid:
        :return: movie name
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['movieid'] == movieid]
        movie_name = movie_data['moviename'].unique()

        return movie_name[0]

    def get_tag_list_for_movie(self, movie):
        """
        Get a tag list for the movie
        :param movie: movie id
        :return: list of tags
        """
        movie_specific_data = self.genre_data[self.genre_data["movieid"] ==
                                              movie]
        tags_list = movie_specific_data["tag_string"].unique()

        return tags_list

    def get_movies_for_tag(self, tag):
        """
        Get the list of movies containing the tag
        :param tag: tag string
        :return: list of movies
        """
        tag_specific_data = self.genre_data[self.genre_data["tag_string"] ==
                                            tag]
        movies_list = tag_specific_data["movieid"].unique()

        return movies_list

    def get_all_movies_for_user(self, user_id):
        """
        Obtain all movies watched by the user
        :param user_id:
        :return: list of movies watched by the user
        """
        user_data = self.genre_data[self.genre_data['userid'] == user_id]
        user_data = user_data.sort_values('timestamp', ascending=False)
        movies = user_data['movieid'].unique()

        return movies

    def get_movies_after_year(self, year):
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['year'] >= year]
        movie_id_list = movie_data['movieid'].unique()

        return movie_id_list

    def get_vector_magnitude(self, vector):
        """
        Calculate the magnitude of the vector
        :param vector:
        :return: length of the vector
        """
        result = 0
        for i in vector:
            result += (i * i)

        return math.sqrt(result)
Beispiel #12
0
 def __init__(self):
     self.conf = ParseConfig()
     self.data_set_loc = os.path.join(
         os.path.abspath(os.path.dirname(__file__)),
         self.conf.config_section_mapper("filePath").get("data_set_loc"))
     self.data_extractor = DataExtractor(self.data_set_loc)