class LdaGenreActor(GenreTag):
    """
    Class to relate genres and actors through LDA latent semantics.
    Inherits GenreTag to reuse the common genre-splitting helpers.
    """

    def __init__(self):
        """
        Initialize the data extractor object to get data from the csv files.
        """
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def get_lda_data(self, genre, num_topics=4, num_features=1000):
        """
        Does LDA on movie-actor counts and outputs movies in terms of latent semantics as U
        and actor in terms of latent semantics as Vh.
        :param genre: genre string used to filter the movie-actor rows
        :param num_topics: number of latent topics for LDA (default 4, as before)
        :param num_features: number of features reported per topic (default 1000, as before)
        :return: None; prints the latent semantics and writes movie_actor_lda.csv
        """
        # Getting movie_genre_data, one row per (movie, genre) pair.
        movie_genre_data_frame = self.data_extractor.get_mlmovies_data()
        movie_genre_data_frame = self.split_genres(movie_genre_data_frame)
        # Getting actor_movie_data
        movie_actor_data_frame = self.data_extractor.get_movie_actor_data()
        genre_actor_frame = movie_genre_data_frame.merge(movie_actor_data_frame, how="left",
                                                         left_on="movieid", right_on="movieid")
        genre_actor_frame = genre_actor_frame[["movieid", "year", "genre", "actorid", "actor_movie_rank"]]
        # LDA consumes token lists, so represent each actor id as a string token.
        # (renamed loop variable: the original shadowed the builtin `id`)
        genre_actor_frame["actorid_string"] = pd.Series(
            [str(actor_id) for actor_id in genre_actor_frame.actorid],
            index=genre_actor_frame.index)
        genre_data_frame = genre_actor_frame[genre_actor_frame["genre"] == genre]
        # One "document" per movie: the list of its actor-id tokens.
        actor_df = genre_data_frame.groupby(['movieid'])['actorid_string'].apply(list).reset_index()
        actor_df = actor_df.sort_values('movieid')
        actor_df.to_csv('movie_actor_lda.csv', index=True, encoding='utf-8')
        actor_df = list(actor_df.iloc[:, 1])
        (U, Vh) = util.LDA(actor_df, num_topics=num_topics, num_features=num_features)
        for latent in Vh:
            print("\n")
            print(latent)
class Util(object):
    """
    Class containing all the common utilities used across the entire code base.
    """

    def __init__(self):
        self.conf = ParseConfig()
        # Data set location is resolved relative to this source file.
        self.data_set_loc = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            self.conf.config_section_mapper("filePath").get("data_set_loc"))
        self.data_extractor = DataExtractor(self.data_set_loc)
        # Frequently used data frames, loaded once and cached on the instance.
        self.mlratings = self.data_extractor.get_mlratings_data()
        self.mlmovies = self.data_extractor.get_mlmovies_data()
        self.imdb_actor_info = self.data_extractor.get_imdb_actor_info_data()
        self.genome_tags = self.data_extractor.get_genome_tags_data()

    def get_sorted_actor_ids(self):
        """
        Obtain sorted actor ids
        :return: sorted series of actor ids
        """
        actor_info = self.data_extractor.get_imdb_actor_info_data()
        return actor_info.id.sort_values()

    def get_movie_id(self, movie):
        """
        Obtain movie ID for the movie name passed as input
        :param movie: movie name
        :return: movie id (first match)
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['moviename'] == movie]
        movie_id = movie_data['movieid'].unique()
        return movie_id[0]

    def get_average_ratings_for_movie(self, movie_id):
        """
        Obtain average rating for movie
        :param movie_id:
        :return: average movie rating; 0.0 when the movie has no ratings
        """
        all_ratings = self.mlratings
        movie_ratings = all_ratings[all_ratings['movieid'] == movie_id]
        ratings_count = len(movie_ratings.index)
        # Robustness fix: the original manual loop divided by a zero count
        # (ZeroDivisionError) for movies without any ratings.
        if ratings_count == 0:
            return 0.0
        return float(movie_ratings['rating'].sum()) / float(ratings_count)

    def get_actor_name_for_id(self, actor_id):
        """
        actor name for id
        :param actor_id:
        :return: actor name for the actor id
        """
        actor_data = self.imdb_actor_info[self.imdb_actor_info['id'] == actor_id]
        name = actor_data['name'].unique()
        return name[0]

    def get_movie_name_for_id(self, movieid):
        """
        movie name for movie id
        :param movieid:
        :return: movie name
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['movieid'] == movieid]
        movie_name = movie_data['moviename'].unique()
        return movie_name[0]

    def get_tag_name_for_id(self, tag_id):
        """
        tag name for tag id
        :param tag_id:
        :return: tag name
        """
        tag_data = self.genome_tags[self.genome_tags['tagId'] == tag_id]
        name = tag_data['tag'].unique()
        return name[0]

    def partition_factor_matrix(self, matrix, no_of_partitions, entity_names):
        """
        Function to partition the factor matrix into groups as per 2-norm distance
        :param matrix:
        :param no_of_partitions:
        :param entity_names:
        :return: dictionary containing the groups
        """
        # 2-norm of each row, keyed by the corresponding entity name.
        entity_dict = {}
        for i in range(0, len(matrix)):
            entity_dict[entity_names[i]] = math.sqrt(
                sum(abs(latent_semantic) ** 2 for latent_semantic in matrix[i]))
        max_length = float(max(entity_dict.values()))
        min_length = float(min(entity_dict.values()))
        length_of_group = (float(max_length) - float(min_length)) / float(no_of_partitions)
        # Pre-create every group so empty groups still appear in the output.
        groups = {}
        for i in range(0, no_of_partitions):
            groups["Group " + str(i + 1) + " ( " + str(min_length + float(i * length_of_group)) + " , " + str(
                min_length + float((i + 1) * length_of_group)) + " )"] = []
        for key in entity_dict.keys():
            entity_length = entity_dict[key]
            # Robustness fix: when every vector has the same length the group
            # width is 0 and the original division raised ZeroDivisionError.
            if length_of_group == 0:
                group_no = 1
            else:
                group_no = math.ceil(float(entity_length - min_length) / float(length_of_group))
            if group_no == 0:
                group_no = 1
            groups["Group " + str(group_no) + " ( " + str(
                min_length + float((group_no - 1) * length_of_group)) + " , " + str(
                min_length + float(group_no * length_of_group)) + " )"].append(key)
        return groups

    def get_latent_semantics(self, r, matrix):
        """
        Function to obtain the latent semantics for the factor matrix
        :param r:
        :param matrix:
        :return: top 'r' latent semantics (fewer if the matrix has fewer rows)
        """
        # Idiom fix: a slice replaces the manual append-until-r loop.
        return list(matrix[:r])

    def print_partitioned_entities(self, groupings):
        """
        Pretty print groupings
        :param groupings:
        """
        for key in groupings.keys():
            print(key)
            if len(groupings[key]) == 0:
                print("NO ELEMENTS IN THIS GROUP\n")
                continue
            for entity in groupings[key]:
                print(entity, end="|")
            print("\n")

    def print_latent_semantics(self, latent_semantics, entity_names_list):
        """
        Pretty print latent semantics
        :param latent_semantics:
        :param entity_names_list:
        """
        for latent_semantic in latent_semantics:
            print("Latent Semantic:")
            dict1 = {name: float(value)
                     for name, value in zip(entity_names_list, latent_semantic)}
            for s in sorted(dict1, key=dict1.get, reverse=True):  # value-based sorting
                print(str(s) + "*(" + str(dict1[s]) + ")", end="")
                print(" + ", end="")
            print("\n")

    def CPDecomposition(self, tensor, rank):
        """
        Perform CP Decomposition
        :param tensor:
        :param rank:
        :return: factor matrices obtained after decomposition
        """
        factors = decomp.parafac(tensor, rank)
        return factors

    def SVD(self, matrix):
        """
        Perform SVD
        :param matrix:
        :return: factor matrices and the core matrix
        """
        U, s, Vh = linalg.svd(matrix, full_matrices=False)
        return (U, s, Vh)

    def PCA(self, matrix):
        """
        Perform PCA
        :param matrix:
        :return: factor matrices and the core matrix
        """
        # Computing covariance matrix, then SVD of the covariance.
        cov_df = numpy.cov(matrix, rowvar=False)
        U, s, Vh = linalg.svd(cov_df)
        return (U, s, Vh)

    def LDA(self, input_compound_list, num_topics, num_features):
        """
        Perform LDA
        :param input_compound_list: list of token lists (one per document)
        :param num_topics:
        :param num_features:
        :return: object-topic distribution and topics
        """
        # turn our tokenized documents into an id <-> term dictionary
        dictionary = corpora.Dictionary(input_compound_list)
        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in input_compound_list]
        # generate LDA model
        lda = gensim.models.ldamodel.LdaModel(corpus, num_topics, id2word=dictionary, passes=20)
        latent_semantics = lda.print_topics(num_topics, num_features)
        corpus = lda[corpus]
        return corpus, latent_semantics

    def get_doc_topic_matrix(self, u, num_docs, num_topics):
        """
        Reconstructing data
        :param u: per-document list of (topic_no, probability) pairs
        :param num_docs:
        :param num_topics:
        :return: reconstructed dense document-topic matrix
        """
        u_matrix = numpy.zeros(shape=(num_docs, num_topics))
        for i in range(0, len(u)):
            doc = u[i]
            for j in range(0, len(doc)):
                (topic_no, prob) = doc[j]
                u_matrix[i, topic_no] = prob
        return u_matrix
class SvdGenreActor(GenreTag):
    """
    Class to relate Genre and Actor, inherits the GenreTag to use the common
    weighing functions.
    """

    def __init__(self):
        """
        Initializing the data extractor object to get data from the csv files
        """
        # Consistency fix: LdaGenreActor (same base class) initializes the
        # parent; this class previously skipped super().__init__().
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)

    def split_genres(self, data_frame):
        """
        This function extracts genres from each row and converts them into independent rows
        :param data_frame:
        :return: data frame with multiple genres split into different rows
        """
        genre_data_frame = data_frame['genres'].str.split('|', expand=True).stack()
        genre_data_frame.name = "genre"
        genre_data_frame.index = genre_data_frame.index.droplevel(-1)
        genre_data_frame = genre_data_frame.reset_index()
        data_frame = data_frame.drop("genres", axis=1)
        data_frame = data_frame.reset_index()
        data_frame = genre_data_frame.merge(data_frame, how="left", on="index")
        return data_frame

    def assign_rank_weight(self, data_frame):
        """
        This function assigns a value for all the actors in a movie on a scale of 100,
        based on their rank in the movie.
        :param data_frame: frame with movieid and actor_movie_rank columns
        :return: dictionary of (movieid, actor_rank) to the computed rank_weight
        """
        groupby_movies = data_frame.groupby("movieid")
        movie_rank_weight_dict = {}
        for movieid, info_df in groupby_movies:
            max_rank = info_df.actor_movie_rank.max()
            # Top-ranked actor gets 100, lowest gets 100/max_rank.
            for rank in info_df.actor_movie_rank.unique():
                movie_rank_weight_dict[(movieid, rank)] = (max_rank - rank + 1) / max_rank * 100
        return movie_rank_weight_dict

    def assign_idf_weight(self, data_frame, unique_actors):
        """
        This function computes the idf weight for all actors in a data frame,
        considering each movie as a document
        :param data_frame:
        :param unique_actors:
        :return: dictionary of actor ids and idf weights
        """
        idf_counter = {actorid_string: 0 for actorid_string in unique_actors}
        # Each row holds a comma-joined actor list; de-duplicate per movie.
        data_frame.actorid_string = pd.Series(
            [set(actors.split(',')) for actors in data_frame.actorid_string],
            index=data_frame.index)
        for actor_list in data_frame.actorid_string:
            for actorid_string in actor_list:
                idf_counter[actorid_string] += 1
        for actorid_string, count in list(idf_counter.items()):
            idf_counter[actorid_string] = math.log(len(data_frame.index) / count)
        return idf_counter

    def assign_tf_weight(self, actor_series):
        """
        This function computes the tf weight for all actors of a movie
        :param actor_series:
        :return: dictionary of actor ids and tf weights
        """
        counter = Counter()
        for each in actor_series:
            counter[each] += 1
        total = sum(counter.values())
        for each in counter:
            counter[each] = (counter[each] / total)
        return dict(counter)

    def get_model_weight(self, tf_weight_dict, idf_weight_dict, rank_weight_dict, actor_df, model):
        """
        This function combines tf_weight on a scale of 100, idf_weight on a scale of 100,
        and timestamp_weight on a scale of 10, based on the model.
        :param tf_weight_dict:
        :param idf_weight_dict:
        :param rank_weight_dict:
        :param actor_df:
        :param model: "TF" or anything else for TF-IDF
        :return: data_frame with column of the combined weight
        """
        # Bug fix (both branches): the fallback for a movie missing from
        # tf_weight_dict was the int 0, so `.get(...)` on it raised
        # AttributeError; an empty dict keeps the lookup valid.
        if model == "TF":
            actor_df["value"] = pd.Series(
                [(ts_weight + (tf_weight_dict.get(movieid, {}).get(actorid_string, 0) * 100) +
                  rank_weight_dict.get((movieid, rank), 0))
                 for index, ts_weight, actorid_string, movieid, rank in
                 zip(actor_df.index, actor_df.year_weight, actor_df.actorid_string,
                     actor_df.movieid, actor_df.actor_movie_rank)],
                index=actor_df.index)
        else:
            actor_df["value"] = pd.Series(
                [(ts_weight + (tf_weight_dict.get(movieid, {}).get(actorid_string, 0) *
                               (idf_weight_dict.get(actorid_string, 0)) * 100) +
                  rank_weight_dict.get((movieid, rank), 0))
                 for index, ts_weight, actorid_string, movieid, rank in
                 zip(actor_df.index, actor_df.year_weight, actor_df.actorid_string,
                     actor_df.movieid, actor_df.actor_movie_rank)],
                index=actor_df.index)
        return actor_df

    def combine_computed_weights(self, data_frame, rank_weight_dict, model, genre):
        """
        Triggers the weighing process and sums up all the calculated weights for each actor
        :param data_frame:
        :param rank_weight_dict:
        :param model:
        :param genre:
        :return: data frame of actors with their total weights, sorted descending
        """
        actor_df = data_frame.reset_index()
        temp_df = data_frame[data_frame["genre"] == genre]
        unique_actors = actor_df.actorid_string.unique()
        # IDF is computed over ALL movies; TF only over the genre subset.
        idf_data = actor_df.groupby(['movieid'])['actorid_string'].apply(lambda x: ','.join(x)).reset_index()
        tf_df = temp_df.groupby(['movieid'])['actorid_string'].apply(lambda x: ','.join(x)).reset_index()
        movie_actor_dict = dict(zip(tf_df.movieid, tf_df.actorid_string))
        tf_weight_dict = {movie: self.assign_tf_weight(actorid_string.split(','))
                          for movie, actorid_string in list(movie_actor_dict.items())}
        idf_weight_dict = {}
        if model != 'TF':
            idf_weight_dict = self.assign_idf_weight(idf_data, unique_actors)
        actor_df = self.get_model_weight(tf_weight_dict, idf_weight_dict, rank_weight_dict, temp_df, model)
        actor_df["total"] = actor_df.groupby(['actorid_string'])['value'].transform('sum')
        actor_df = actor_df.drop_duplicates("actorid_string").sort_values("total", ascending=False)
        return actor_df

    def get_genre_actor_data_frame(self):
        """
        Function to merge multiple tables and get the required dataframe for tf-idf calculation
        :return: dataframe
        """
        # Getting movie_genre_data
        movie_genre_data_frame = self.data_extractor.get_mlmovies_data()
        movie_genre_data_frame = self.split_genres(movie_genre_data_frame)
        # Getting actor_movie_data
        movie_actor_data_frame = self.data_extractor.get_movie_actor_data()
        genre_actor_frame = movie_genre_data_frame.merge(movie_actor_data_frame, how="left",
                                                         left_on="movieid", right_on="movieid")
        genre_actor_frame = genre_actor_frame[["movieid", "year", "genre", "actorid", "actor_movie_rank"]]
        # Newer movies get a larger year_weight (linear on a 0-10 scale).
        genre_actor_frame = genre_actor_frame.sort_values("year", ascending=True)
        data_frame_len = len(genre_actor_frame.index)
        genre_actor_frame["year_weight"] = pd.Series(
            [(index + 1) / data_frame_len * 10 for index in genre_actor_frame.index],
            index=genre_actor_frame.index)
        genre_actor_frame["actorid_string"] = pd.Series(
            [str(actor_id) for actor_id in genre_actor_frame.actorid],
            index=genre_actor_frame.index)
        return genre_actor_frame

    def svd_genre_actor(self, genre):
        """
        Does SVD on movie-actor matrix and outputs movies in terms of latent semantics as U
        and actors in terms of latent semantics as Vh
        :param genre:
        :return: returns U and Vh data frames and the singular values s
        """
        genre_actor_frame = self.get_genre_actor_data_frame()
        rank_weight_dict = self.assign_rank_weight(genre_actor_frame[['movieid', 'actor_movie_rank']])
        genre_actor_frame = self.combine_computed_weights(genre_actor_frame, rank_weight_dict, "TFIDF", genre)
        temp_df = genre_actor_frame[["movieid", "actorid_string", "total"]].drop_duplicates()
        genre_actor_tfidf_df = temp_df.pivot(index='movieid', columns='actorid_string', values='total')
        genre_actor_tfidf_df = genre_actor_tfidf_df.fillna(0)
        genre_actor_tfidf_df.to_csv('genre_actor_matrix.csv', index=True, encoding='utf-8')
        # Re-read the csv to recover movieid as a plain column for row headers.
        df = pd.DataFrame(pd.read_csv('genre_actor_matrix.csv'))
        df1 = genre_actor_tfidf_df.values[:, :]
        row_headers = list(df["movieid"])
        column_headers = list(df)
        del column_headers[0]
        column_headers_names = []
        for col_head in column_headers:
            column_headers_names.append(util.get_actor_name_for_id(int(col_head)))
        (U, s, Vh) = util.SVD(df1)
        # To print latent semantics
        latents = util.get_latent_semantics(4, Vh)
        util.print_latent_semantics(latents, column_headers_names)
        u_frame = pd.DataFrame(U[:, :4], index=row_headers)
        v_frame = pd.DataFrame(Vh[:4, :], columns=column_headers)
        u_frame.to_csv('u_1b_svd.csv', index=True, encoding='utf-8')
        v_frame.to_csv('vh_1b_svd.csv', index=True, encoding='utf-8')
        return (u_frame, v_frame, s)
class SimilarActorsFromDiffMoviesLda(object):
    """
    Class to find actors similar to the actors of a given movie, using a
    movie-movie similarity matrix derived from LDA latent semantics.
    """

    def __init__(self):
        super().__init__()
        self.data_set_loc = conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.util = Util()
        self.sim_act_diff_mov_tf = SimilarActorsFromDiffMovies()

    def most_similar_actors_lda(self, moviename):
        """
        Function to find related actors from related movies
        (movie_movie_similarity_matrix using lda) corresponding to the given movie
        :param moviename:
        :return: list of actor names, or None when the movie id is not found
        """
        data_frame = self.data_extractor.get_mlmovies_data()
        tag_data_frame = self.data_extractor.get_genome_tags_data()
        movie_data_frame = self.data_extractor.get_mltags_data()
        movie_tag_data_frame = movie_data_frame.merge(tag_data_frame, how="left",
                                                      left_on="tagid", right_on="tagId")
        movie_tag_data_frame = movie_tag_data_frame.merge(data_frame, how="left",
                                                          left_on="movieid", right_on="movieid")
        # One tag-list "document" per movie, ordered by movieid.
        tag_df = movie_tag_data_frame.groupby(['movieid'])['tag'].apply(list).reset_index()
        tag_df = tag_df.sort_values('movieid')
        movies = tag_df.movieid.tolist()
        tag_df = list(tag_df.iloc[:, 1])
        input_movieid = self.util.get_movie_id(moviename)
        (U, Vh) = self.util.LDA(tag_df, num_topics=5, num_features=1000)
        movie_topic_matrix = self.util.get_doc_topic_matrix(U, num_docs=len(movies), num_topics=5)
        topic_movie_matrix = movie_topic_matrix.transpose()
        # Movie-movie similarity in topic space.
        movie_movie_matrix = numpy.dot(movie_topic_matrix, topic_movie_matrix)
        index_movie = None
        for i, j in enumerate(movies):
            if j == input_movieid:
                index_movie = i
                break
        # Idiom fix: identity comparison for None instead of `== None`.
        if index_movie is None:
            print("Movie Id not found.")
            return None
        movie_row = movie_movie_matrix[index_movie].tolist()
        movie_movie_dict = dict(zip(movies, movie_row))
        del movie_movie_dict[input_movieid]
        for key in movie_movie_dict.keys():
            movie_movie_dict[key] = abs(movie_movie_dict[key])
        # sorted() always returns a list, so the original `== None` check
        # after it was dead code and has been removed.
        movie_movie_dict = sorted(movie_movie_dict.items(), key=operator.itemgetter(1), reverse=True)
        # Collect actors of the most similar movies until we have at least 10.
        actors = []
        for (movie, val) in movie_movie_dict:
            if val <= 0:
                break
            actors = actors + self.sim_act_diff_mov_tf.get_actors_of_movie(
                self.util.get_movie_name_for_id(movie))
            if len(actors) >= 10:
                break
        # Exclude actors who already appear in the input movie itself.
        actors_of_given_movie = self.sim_act_diff_mov_tf.get_actors_of_movie(moviename)
        actorsFinal = [x for x in actors if x not in actors_of_given_movie]
        actornames = []
        for actorid in actorsFinal:
            actornames.append(self.util.get_actor_name_for_id(actorid))
        return actornames
class ActorMovieYearTensor(object):
    """
    Builds a binary year-movie-actor tensor and exposes CP-decomposition
    based latent semantics and partitions over its three modes.
    """

    def __init__(self):
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get("data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        # Human-readable labels for each tensor mode, in dense-index order.
        self.ordered_years = []
        self.ordered_movie_names = []
        self.ordered_actor_names = []
        self.print_list = ["\n\nFor Years:", "\n\nFor Movies:", "\n\nFor Actors:"]
        self.util = Util()
        self.tensor = self.fetchActorMovieYearTensor()
        self.factors = self.util.CPDecomposition(self.tensor, 5)

    def _build_index(self, values, out_names, name_fn):
        """
        Map each distinct value (in first-seen order) to a dense index and
        record its display name. Replaces three copy-pasted loops.
        :param values: iterable of raw values (may contain duplicates)
        :param out_names: list to append display names to, in index order
        :param name_fn: maps a raw value to its display name
        :return: dict of value -> dense index
        """
        index = {}
        for element in values:
            if element in index:
                continue
            index[element] = len(index)
            out_names.append(name_fn(element))
        return index

    def fetchActorMovieYearTensor(self):
        """
        Create actor movie year tensor
        :return: tensor
        """
        movies_df = self.data_extractor.get_mlmovies_data()
        actor_df = self.data_extractor.get_movie_actor_data()
        movie_actor_df = actor_df.merge(movies_df, how="left", on="movieid")
        year_dict = self._build_index(movie_actor_df["year"], self.ordered_years,
                                      lambda year: year)
        movieid_dict = self._build_index(movie_actor_df["movieid"], self.ordered_movie_names,
                                         self.util.get_movie_name_for_id)
        actorid_dict = self._build_index(movie_actor_df["actorid"], self.ordered_actor_names,
                                         self.util.get_actor_name_for_id)
        tensor = np.zeros((len(year_dict), len(movieid_dict), len(actorid_dict)))
        # Mark every observed (year, movie, actor) combination.
        for index, row in movie_actor_df.iterrows():
            year_id = year_dict[row["year"]]
            movieid_id = movieid_dict[row["movieid"]]
            actorid_id = actorid_dict[row["actorid"]]
            tensor[year_id][movieid_id][actorid_id] = 1
        return tensor

    def print_latent_semantics(self, r):
        """
        Pretty print latent semantics
        :param r:
        """
        for i, factor in enumerate(self.factors):
            print(self.print_list[i])
            latent_semantics = self.util.get_latent_semantics(r, factor.transpose())
            self.util.print_latent_semantics(latent_semantics, self.get_factor_names(i))

    def get_factor_names(self, i):
        """
        Obtain factor names for tensor mode i
        :param i: 0 = years, 1 = movies, 2 = actors
        :return: factor names
        """
        if i == 0:
            return self.ordered_years
        elif i == 1:
            return self.ordered_movie_names
        elif i == 2:
            return self.ordered_actor_names

    def get_partitions(self, no_of_partitions):
        """
        Partition factor matrices
        :param no_of_partitions:
        :return: list of groupings
        """
        groupings_list = []
        for i, factor in enumerate(self.factors):
            groupings = self.util.partition_factor_matrix(factor, no_of_partitions,
                                                          self.get_factor_names(i))
            groupings_list.append(groupings)
        return groupings_list

    def print_partitioned_entities(self, no_of_partitions):
        """
        Pretty print groupings
        :param no_of_partitions:
        """
        groupings_list = self.get_partitions(no_of_partitions)
        for i, groupings in enumerate(groupings_list):
            print(self.print_list[i])
            self.util.print_partitioned_entities(groupings)
class Util(object):
    """
    Class containing all the common utilities used across the entire code base
    """

    def __init__(self):
        # Configuration and data-access objects shared by every helper below.
        self.conf = ParseConfig()
        self.data_set_loc = self.conf.config_section_mapper("filePath").get(
            "data_set_loc")
        self.data_extractor = DataExtractor(self.data_set_loc)
        self.mlmovies = self.data_extractor.get_mlmovies_data()
        self.genre_tag = GenreTag()
        self.genre_data = self.genre_tag.get_genre_data()

    def get_movie_id(self, movie):
        """
        Obtain movie ID for the movie name passed as input
        :param movie: movie name
        :return: movie id (first match)
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['moviename'] == movie]
        movie_id = movie_data['movieid'].unique()
        return movie_id[0]

    def CPDecomposition(self, tensor, rank):
        """
        Perform CP Decomposition
        :param tensor: 3-way tensor of shape (movies, genres, tags)
        :param rank: requested rank; clamped below so decomposition stays valid
        :return: factor matrices obtained after decomposition
        """
        (movie_count, genre_count, tag_count) = tensor.shape
        # Clamp the rank so it never exceeds any tensor dimension minus one.
        rank = min(rank, movie_count - 1, genre_count - 1, tag_count - 1)
        factors = decomp.parafac(tensor, rank)
        return factors

    def SVD(self, matrix):
        """
        Perform SVD
        :param matrix:
        :return: factor matrices and the core matrix
        """
        U, s, Vh = numpy.linalg.svd(matrix, full_matrices=False)
        return U, s, Vh

    def PCA(self, matrix):
        """
        Perform PCA
        :param matrix:
        :return: factor matrices and the core matrix
        """
        # Covariance of the columns, then SVD of the covariance matrix.
        cov_df = numpy.cov(matrix, rowvar=False)
        U, s, Vh = numpy.linalg.svd(cov_df)
        return U, s, Vh

    def LDA(self, input_compound_list, num_topics, num_features):
        """
        Perform LDA
        :param input_compound_list: list of token lists (one per document)
        :param num_topics:
        :param num_features:
        :return: object topic distribution and topics
        """
        # id <-> term dictionary, then bag-of-words corpus, then the model.
        dictionary = gensim.corpora.Dictionary(input_compound_list)
        corpus = [dictionary.doc2bow(text) for text in input_compound_list]
        lda = gensim.models.ldamodel.LdaModel(corpus, num_topics, id2word=dictionary, passes=20)
        latent_semantics = lda.print_topics(num_topics, num_features)
        corpus = lda[corpus]
        return corpus, latent_semantics

    def get_doc_topic_matrix(self, u, num_docs, num_topics):
        """
        Reconstructing data
        :param u: per-document list of (topic_no, probability) pairs
        :param num_docs:
        :param num_topics:
        :return: reconstructed dense document-topic matrix
        """
        u_matrix = numpy.zeros(shape=(num_docs, num_topics))
        for i in range(0, len(u)):
            doc = u[i]
            for j in range(0, len(doc)):
                (topic_no, prob) = doc[j]
                u_matrix[i, topic_no] = prob
        return u_matrix

    def get_transition_dataframe(self, data_frame):
        """
        Function to get the transition matrix for Random walk
        :param data_frame: square adjacency frame; columns are stringified node indices
        :return: transition matrix
        """
        # Zero out self-transitions (diagonal entries).
        for column in data_frame:
            data_frame[column] = pd.Series([
                0 if ind == int(column) else each
                for ind, each
                in zip(data_frame.index, data_frame[column])
            ], index=data_frame.index)
        data_frame["row_sum"] = data_frame.sum(axis=1)
        # Normalize each row so outgoing probabilities sum to 1.
        # NOTE(review): the comprehension variable `sum` shadows the builtin
        # within this expression only.
        for column in data_frame:
            data_frame[column] = pd.Series([
                each / sum
                if (column != "row_sum" and each > 0 and ind != int(column)
                    and sum != 0) else each for ind, each, sum in zip(
                    data_frame.index, data_frame[column],
                    data_frame.row_sum)
            ], index=data_frame.index)
        data_frame = data_frame.drop(["row_sum"], axis=1)
        # Dangling nodes (all-zero rows) get a uniform distribution.
        data_frame.loc[(data_frame.T == 0).all()] = float(
            1 / (len(data_frame.columns)))
        data_frame = data_frame.transpose()
        return data_frame

    def get_seed_matrix(self, transition_df, seed_nodes, nodes):
        """
        Function to get the Restart matrix for entries in the seed list
        :param transition_df:
        :param seed_nodes:
        :param nodes:
        :return: seed_matrix
        """
        seed_matrix = [0.0 for each in range(len(transition_df.columns))]
        # Seed importance is distributed by order of occurrence (see distribute).
        seed_value_list = self.distribute(seed_nodes, num_of_seeds_to_recommend=1)
        for each in seed_nodes:
            seed_matrix[list(nodes).index(each)] = seed_value_list[list(
                seed_nodes).index(each)]
        return seed_matrix

    def compute_pagerank(self, seed_nodes, node_matrix, nodes):
        """
        Function to compute the Personalised Pagerank for the given input
        :param seed_nodes:
        :param node_matrix: adjacency/similarity matrix between nodes
        :param nodes: node identifiers, aligned with node_matrix order
        :return: top (len(seed_nodes) + 5) nodes ranked by pagerank score
        """
        data_frame = pd.DataFrame(node_matrix)
        transition_df = self.get_transition_dataframe(data_frame)
        seed_matrix = self.get_seed_matrix(transition_df, seed_nodes, nodes)
        result_list = seed_matrix
        temp_list = []
        num_of_iter = 0
        # Power iteration with damping 0.85 / restart 0.15, until the vector
        # stops changing (exact list equality) or 1000 iterations elapse.
        while temp_list != result_list and num_of_iter <= 1000:
            num_of_iter += 1
            temp_list = result_list
            result_list = list(0.85 * numpy.matmul(
                numpy.array(transition_df.values),
                numpy.array(result_list)) + 0.15 * numpy.array(seed_matrix))
        page_rank_dict = {i: j for i, j in zip(nodes, result_list)}
        sorted_rank = sorted(page_rank_dict.items(), key=operator.itemgetter(1),
                             reverse=True)
        return sorted_rank[0:len(seed_nodes) + 5]

    def print_movie_recommendations_and_collect_feedback(
            self, movie_ids, task_no, user_id):
        """
        Interface to obtain relevance feedback
        :param movie_ids: List of movies
        :param task_no: Task from which the interface is called
        :param user_id: user for which the movies are displayed
        """
        if len(movie_ids) == 0:
            print("No movies found.")
            exit(1)
        if task_no in [1, 2]:
            print("Movie recommendations: ")
        elif task_no in [3, 4]:
            print("Nearest movies: ")
        else:
            # NOTE(review): if task_no is an int this concatenation raises
            # TypeError before the message prints — confirm callers' types.
            print("Incorrect task number - " + task_no + "\nAborting...")
            exit(1)
        # Display movies with a 1-based selection number.
        count = 1
        movie_dict = {}
        for movie_id in movie_ids:
            movie_name = self.get_movie_name_for_id(movie_id)
            print(str(count) + ". " + str(movie_name) + " - " + str(movie_id))
            movie_dict[count] = (movie_name, movie_id)
            count += 1
        done = False
        rel_movies = []
        irrel_movies = []
        # Loop until the user confirms both a relevant and an irrelevant set.
        while not done:
            movies_list = input(
                "\nPlease enter comma separated ids of the relevant movies: ")
            rel_ids = set(
                movies_list.strip(" ").strip(",").replace(" ", "").split(","))
            while '' in rel_ids:
                rel_ids.remove('')
            incorrect = False
            for item in rel_ids:
                if int(item) not in [
                    num for num in range(1, len(movie_ids) + 1)
                ]:
                    print("Incorrect movie ID selected.")
                    incorrect = True
                    break
            if incorrect:
                continue
            confirmation = input(
                "Are you sure these are the relevant movies? " + str(list(rel_ids)) + " (y/Y/n/N): ")
            if confirmation != "y" and confirmation != "Y":
                continue
            movies_list = input(
                "\nPlease enter comma separated ids of the irrelevant movies: "
            )
            irrel_ids = set(
                movies_list.strip(" ").strip(",").replace(" ", "").split(","))
            while '' in irrel_ids:
                irrel_ids.remove('')
            incorrect = False
            # Irrelevant ids must be valid AND not already marked relevant.
            for item in irrel_ids:
                if int(item) not in list(
                        set(list([num for num in range(1, len(movie_ids) + 1)])) -
                        set(int(num) for num in rel_ids)):
                    print("Incorrect movie ID selected.")
                    incorrect = True
                    break
            if incorrect:
                continue
            confirmation = input(
                "Are you sure these are the irrelevant movies? " +
                str(list(irrel_ids)) + " (y/Y/n/N): ")
            if confirmation != "y" and confirmation != "Y":
                continue
            done = True
        for item in rel_ids:
            rel_movies.append(movie_dict[int(item)])
        for item in irrel_ids:
            irrel_movies.append(movie_dict[int(item)])
        # Persist the feedback to a task-specific csv, appending to any
        # previously collected rows.
        # NOTE(review): DataFrame.append was removed in pandas 2.0 — this
        # persistence code requires pandas < 2.0 or a port to pd.concat.
        if task_no == 1 or task_no == 2:
            if not os.path.isfile(self.data_set_loc + "/task2-feedback.csv"):
                df = pd.DataFrame(
                    columns=['movie-name', 'movie-id', 'relevancy', 'user-id'])
            else:
                df = self.data_extractor.get_task2_feedback_data()
            for movie in rel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'relevant',
                        'user-id': user_id
                    },
                    ignore_index=True)
            for movie in irrel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'irrelevant',
                        'user-id': user_id
                    },
                    ignore_index=True)
            df.to_csv(self.data_set_loc + "/task2-feedback.csv", index=False)
        elif task_no == 3 or task_no == 4:
            if not os.path.isfile(self.data_set_loc + "/task4-feedback.csv"):
                df = pd.DataFrame(
                    columns=['movie-name', 'movie-id', 'relevancy'])
            else:
                df = self.data_extractor.get_task4_feedback_data()
            for movie in rel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'relevant'
                    },
                    ignore_index=True)
            for movie in irrel_movies:
                df = df.append(
                    {
                        'movie-name': movie[0],
                        'movie-id': movie[1],
                        'relevancy': 'irrelevant'
                    },
                    ignore_index=True)
            df.to_csv(self.data_set_loc + "/task4-feedback.csv", index=False)

    def get_distribution_count(self, seed_nodes, num_of_seeds_to_recommend):
        """
        Given the number of seeds to be recommended and the seed_nodes,
        returns the distribution for each seed_node considering order
        :param seed_nodes:
        :param num_of_seeds_to_recommend:
        :return: distribution_list
        """
        seed_value_list = self.distribute(seed_nodes, num_of_seeds_to_recommend)
        seed_value_list = [round(each) for each in seed_value_list]
        total_count = sum(seed_value_list)
        # Rounding may leave the total off-target; redistribute the difference.
        difference = num_of_seeds_to_recommend - total_count
        if difference > 0:
            # Too few: first top up zero entries, then add 1 from the front.
            for i in range(0, len(seed_value_list)):
                if seed_value_list[i] == 0:
                    seed_value_list[i] = 1
                    difference -= 1
                    if difference == 0:
                        return seed_value_list
            for i in range(0, len(seed_value_list)):
                seed_value_list[i] += 1
                difference -= 1
                if difference == 0:
                    return seed_value_list
        elif difference < 0:
            # Too many: remove 1 from the back, skipping zero entries.
            for i in range(0, len(seed_value_list)):
                if seed_value_list[len(seed_value_list) - 1 - i] != 0:
                    seed_value_list[len(seed_value_list) - 1 - i] -= 1
                    difference += 1
                    if difference == 0:
                        return seed_value_list
        return seed_value_list

    def get_movie_tag_matrix(self):
        """
        Function to get movie_tag matrix containing list of tags in each movie
        :return: movie_tag_matrix
        """
        tag_df = self.genre_data
        unique_tags = tag_df.tag_string.unique()
        # IDF treats each movie as one document.
        idf_data = tag_df.groupby(['movieid'])['tag_string'].apply(set)
        tf_df = tag_df.groupby(['movieid'
                                ])['tag_string'].apply(list).reset_index()
        movie_tag_dict = dict(zip(tf_df.movieid, tf_df.tag_string))
        tf_weight_dict = {
            movie: self.genre_tag.assign_tf_weight(tags)
            for movie, tags in list(movie_tag_dict.items())
        }
        idf_weight_dict = self.genre_tag.assign_idf_weight(
            idf_data, unique_tags)
        tag_df = self.genre_tag.get_model_weight(tf_weight_dict,
                                                 idf_weight_dict, tag_df,
                                                 'tfidf')
        tag_df["total"] = tag_df.groupby(['movieid', 'tag_string'
                                          ])['value'].transform('sum')
        temp_df = tag_df[["movieid", "tag_string",
                          "total"]].drop_duplicates().reset_index()
        # Pivot to a movie x tag matrix of tf-idf totals; missing pairs -> 0.
        genre_tag_tfidf_df = temp_df.pivot_table('total', 'movieid', 'tag_string')
        genre_tag_tfidf_df = genre_tag_tfidf_df.fillna(0)
        return genre_tag_tfidf_df

    def distribute(self, seed_nodes, num_of_seeds_to_recommend):
        """
        Distributes importance among seed_nodes based on order of occurrence
        :param seed_nodes:
        :param num_of_seeds_to_recommend:
        :return: list of size len(seed_nodes) with distributed values
        """
        # Start uniform, then shift `delta` from later seeds to earlier ones
        # so earlier seeds carry more weight; the total is preserved.
        seed_value = float(num_of_seeds_to_recommend) / len(seed_nodes)
        seed_value_list = [seed_value for seed in seed_nodes]
        delta = seed_value / len(seed_nodes)
        for i in range(0, len(seed_nodes) - 1):
            seed_value_list[i] = seed_value_list[i] + (len(seed_nodes) - 1 - i) * delta
            for j in range(i + 1, len(seed_nodes)):
                seed_value_list[j] = seed_value_list[j] - delta
        return seed_value_list

    def get_movie_name_for_id(self, movieid):
        """
        movie name for movie id
        :param movieid:
        :return: movie name
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['movieid'] == movieid]
        movie_name = movie_data['moviename'].unique()
        return movie_name[0]

    def get_tag_list_for_movie(self, movie):
        """
        Get a tag list for the movie
        :param movie: movie id
        :return: list of tags
        """
        movie_specific_data = self.genre_data[self.genre_data["movieid"] ==
                                              movie]
        tags_list = movie_specific_data["tag_string"].unique()
        return tags_list

    def get_movies_for_tag(self, tag):
        """
        Get the list of movies containing the tag
        :param tag: tag string
        :return: list of movies
        """
        tag_specific_data = self.genre_data[self.genre_data["tag_string"] ==
                                            tag]
        movies_list = tag_specific_data["movieid"].unique()
        return movies_list

    def get_all_movies_for_user(self, user_id):
        """
        Obtain all movies watched by the user
        :param user_id:
        :return: list of movies watched by the user, most recent first
        """
        user_data = self.genre_data[self.genre_data['userid'] == user_id]
        user_data = user_data.sort_values('timestamp', ascending=False)
        movies = user_data['movieid'].unique()
        return movies

    def get_movies_after_year(self, year):
        """
        Obtain all movies released in or after the given year
        :param year:
        :return: list of movie ids
        """
        all_movie_data = self.mlmovies
        movie_data = all_movie_data[all_movie_data['year'] >= year]
        movie_id_list = movie_data['movieid'].unique()
        return movie_id_list

    def get_vector_magnitude(self, vector):
        """
        Calculate the magnitude of the vector
        :param vector:
        :return: length of the vector
        """
        result = 0
        for i in vector:
            result += (i * i)
        return math.sqrt(result)