def similarActors_LDA_tf(givenActor):
    """Rank the other actors by closeness to ``givenActor`` in an LDA space.

    An actor x tag matrix is built from the TF tag vectors returned by
    ``actor_tagVector_tf()``, a 4-topic LDA model is trained on it, and
    every other actor is compared with ``givenActor`` through
    ``metrics.simlarity_kullback_leibler`` on their topic representations.

    Args:
        givenActor: Key of the query actor; must be a key of
            ``actor_movie_rank_map``.

    Returns:
        A list of ``(actor, score)`` tuples sorted by ascending score
        (lowest divergence first), truncated with the slice ``[0:11]``.

    Raises:
        ValueError: If ``givenActor`` is not among the known actors.
    """
    createDictionaries1()
    vectors()
    actor_weight_vector_tf = actor_tagVector_tf()
    tagList = sorted(tag_movie_map.keys())
    actorList = sorted(actor_movie_rank_map.keys())

    # Fail before the expensive LDA training rather than after it.
    if givenActor not in actorList:
        raise ValueError("{!r} is not in the actor list".format(givenActor))
    given_position = actorList.index(givenActor)

    # One row per actor (in actorList order), one column per tag.
    rows = []
    for actor in actorList:
        actor_tag_dict = dict.fromkeys(tagList, 0.0)
        for tag, weight in actor_weight_vector_tf[actor]:
            actor_tag_dict[tag] = weight
        rows.append(actor_tag_dict)
    # Built in one step: DataFrame.append is deprecated/removed in newer
    # pandas and copies the frame on every call.
    df = pd.DataFrame(rows, columns=tagList)

    t = time.time()
    ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
        df, 4, constants.actorTagsSpacePasses)
    print('Query : ', time.time() - t)

    # The query actor's representation does not depend on the loop variable,
    # so it is computed once instead of once per candidate.
    given_topics = representDocInLDATopics(df, given_position, ldaModel)

    givenActor_similarity = defaultdict(float)
    # Row positions in df coincide with positions in actorList, so
    # enumerate() replaces the per-iteration list.index() scan.
    for position, otherActor in enumerate(actorList):
        if otherActor == givenActor:
            continue
        other_topics = representDocInLDATopics(df, position, ldaModel)
        givenActor_similarity[otherActor] = (
            metrics.simlarity_kullback_leibler(given_topics, other_topics))

    # NOTE(review): the slice keeps up to 11 entries although the result is
    # called "top10"; left unchanged so callers see the same shape - confirm.
    top10 = sorted(givenActor_similarity.items(), key=itemgetter(1),
                   reverse=False)[0:11]
    return top10
def buildDF_LDASpace():
    """Project every movie of the movie-tag TF frame into a 50-topic LDA space.

    The LDA model, document-term matrix and id-to-term map are read from
    pickle files under ``constants.DIRECTORY``. If any of them cannot be
    opened, the decomposition is recomputed with
    ``decompositions.LDADecomposition`` and all three files are rewritten.

    Returns:
        A tuple ``(dfList, moviel)``: ``dfList`` holds one 50-element list
        of topic probabilities per movie (0 for topics the model does not
        report), and ``moviel`` is the list of movie ids (the frame's index)
        in the same order.
    """
    num_topics = 50
    df = load_movie_tag_tf_df()

    model_path = constants.DIRECTORY + "ldaModel.pickle"
    matrix_path = constants.DIRECTORY + "doc_term_matrix.pickle"
    term_map_path = constants.DIRECTORY + "id_Term_map.pickle"

    try:
        # NOTE: pickle must only be used on trusted files; these are assumed
        # to be caches written by this project itself.
        # All three files are loaded (even though only the model is used
        # below) so that a partially written cache triggers a rebuild.
        with open(model_path, "rb") as handle:
            ldaModel = pickle.load(handle)
        with open(matrix_path, "rb") as handle:
            doc_term_matrix = pickle.load(handle)
        with open(term_map_path, "rb") as handle:
            id_Term_map = pickle.load(handle)
    except (OSError, IOError):
        ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
            df, num_topics, constants.genreTagsSpacePasses)
        # Context managers guarantee the cache files are flushed and closed.
        for path, payload in ((model_path, ldaModel),
                              (matrix_path, doc_term_matrix),
                              (term_map_path, id_Term_map)):
            with open(path, "wb") as handle:
                pickle.dump(payload, handle)

    moviel = list(df.index)
    dfList = list()
    for mid in moviel:
        # Topics missing from the model's output keep probability 0.
        latentSpace = [0] * num_topics
        for tup in ldaModel[docSpecificCorpus(df, mid)]:
            latentSpace[tup[0]] = tup[1]
        dfList.append(latentSpace)
    return dfList, moviel
def genre_spaceActors_LDA_tf(genre):
    """Print the four LDA latent semantics of ``genre`` in terms of actors.

    A 4-topic LDA model is trained on the matrix returned by
    ``DataHandler.load_genre_actor_matrix_tf(genre)``. For every topic, the
    ``(actor, weight)`` pairs are printed in descending weight order, where
    the actor value is looked up in ``DataHandler.actor_actorid_map``.

    Args:
        genre: Genre name; if it is not a key of
            ``DataHandler.genre_movie_map`` a message is printed and the
            function returns early.

    Returns:
        None. Results are written to stdout.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    # None of the four returned dictionaries is used in this function.
    _movie_tags, _tag_ids, _actor_ranks, _movie_ranks = DataHandler.get_dicts()
    DataHandler.create_actor_actorid_map()
    actor_lookup = DataHandler.actor_actorid_map
    genre_actor_df = DataHandler.load_genre_actor_matrix_tf(genre)
    known_genres = DataHandler.genre_movie_map
    if genre not in known_genres:
        print("genre " + genre + " not in data")
        return

    num_topics = 4
    lda, _corpus, id_to_term = decompositions.LDADecomposition(
        genre_actor_df, num_topics, constants.genreActorSpacePasses)

    # Ask for as many terms as there are actors so that no term is cut off
    # by get_topic_terms' default top-n limit.
    term_budget = len(actor_lookup)
    semantics = defaultdict(set)
    for topic in range(num_topics):
        for term_id, weight in lda.get_topic_terms(topic, topn=term_budget):
            actor = actor_lookup.get(id_to_term.get(term_id))
            semantics[topic].add((actor, weight))

    for topic in range(num_topics):
        ranked = sorted(semantics.get(topic), key=itemgetter(1), reverse=True)
        print("Semantic {} {}".format(topic + 1, ranked))
        print('\n')
def genre_spaceActors_LDA(genre):
    """Print five LDA topics for ``genre`` over its genre-actor matrix.

    A 5-topic LDA model is trained on the matrix returned by
    ``DataHandler.load_genre_actor_matrix(genre)``. For each topic, the
    terms reported by ``get_topic_terms`` (its default top-n) are printed as
    ``(term, weight)`` pairs in descending weight order.

    Args:
        genre: Genre passed to ``DataHandler.load_genre_actor_matrix``.

    Returns:
        None. Results are written to stdout.
    """
    # None of the four returned dictionaries is used in this function.
    _movie_tags, _tag_ids, _actor_ranks, _movie_ranks = DataHandler.get_dicts()
    genre_actor_df = DataHandler.load_genre_actor_matrix(genre)

    num_topics = 5
    lda, _corpus, id_to_term = decompositions.LDADecomposition(
        genre_actor_df, num_topics, constants.genreActorSpacePasses)

    semantics = defaultdict(set)
    for topic in range(num_topics):
        # get_topic_terms returns the top n (default 10) terms of the topic.
        for term_id, weight in lda.get_topic_terms(topic):
            semantics[topic].add((id_to_term.get(term_id), weight))

    for topic in range(num_topics):
        ranked = sorted(semantics.get(topic), key=itemgetter(1), reverse=True)
        print(ranked)
        print('\n')
def LDA_SIM(userid):
    """Score unwatched movies against a user's movies in the LDA topic space.

    The user's movies are read from
    ``DataHandler.user_rated_or_tagged_date_map``. A 50-topic LDA model over
    the movie-tag TF frame is loaded from pickle caches under
    ``constants.DIRECTORY`` (or trained and cached when a cache file cannot
    be opened). Each of the user's movies is compared with every movie the
    user has not rated/tagged.

    Args:
        userid: Key into ``DataHandler.user_rated_or_tagged_date_map``.

    Returns:
        A ``pandas.DataFrame`` indexed by the user's movies with one column
        per unwatched movie. Each cell is
        ``1 / (KL-based score + 1e-8)``, so larger values mean more similar.

    Raises:
        TypeError: If the map has no entry for ``userid`` (the lookup then
            yields ``None``, which cannot be iterated).
    """
    epsilon = 0.00000001  # keeps the reciprocal finite when the score is 0

    movie_date_List = DataHandler.user_rated_or_tagged_date_map.get(userid)
    movieList = sorted(entry[0] for entry in movie_date_List)
    movie_tag_df = DataHandler.load_movie_tag_tf_df()

    model_path = constants.DIRECTORY + "ldaModel.pickle"
    matrix_path = constants.DIRECTORY + "doc_term_matrix.pickle"
    term_map_path = constants.DIRECTORY + "id_Term_map.pickle"

    try:
        # NOTE: pickle must only be used on trusted files; these are assumed
        # to be caches written by this project itself.
        # All three files are loaded (only the model is used below) so that
        # an incomplete cache triggers a rebuild.
        with open(model_path, "rb") as handle:
            ldaModel = pickle.load(handle)
        with open(matrix_path, "rb") as handle:
            doc_term_matrix = pickle.load(handle)
        with open(term_map_path, "rb") as handle:
            id_Term_map = pickle.load(handle)
    except (OSError, IOError):
        print("1")
        ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
            movie_tag_df, 50, constants.genreTagsSpacePasses)
        # Context managers guarantee the cache files are flushed and closed.
        for path, payload in ((model_path, ldaModel),
                              (matrix_path, doc_term_matrix),
                              (term_map_path, id_Term_map)):
            with open(path, "wb") as handle:
                pickle.dump(payload, handle)

    all_movie_butWatched_list = sorted(
        set(movie_tag_df.index.values) - set(movieList))

    # A candidate's topic representation does not depend on which of the
    # user's movies it is compared with, so infer it once per candidate
    # instead of once per (user movie, candidate) pair.
    candidate_topics = {}
    if movieList:
        candidate_topics = {
            candidate: DataHandler.representDocInLDATopics(
                movie_tag_df, candidate, ldaModel)
            for candidate in all_movie_butWatched_list
        }

    givenMovie_similarity_DFlist = list()
    for movie in movieList:
        m1 = DataHandler.representDocInLDATopics(movie_tag_df, movie, ldaModel)
        givenMovie_similarity_DFlist.append({
            candidate: 1 / (metrics.simlarity_kullback_leibler(
                m1, candidate_topics[candidate]) + epsilon)
            for candidate in all_movie_butWatched_list
        })

    givenMovie_similarity = pd.DataFrame(givenMovie_similarity_DFlist,
                                         index=movieList,
                                         columns=all_movie_butWatched_list)
    # NOTE: a release-year weighting step and a call to
    # getWeightedSimilarityOrder1 were previously present here but disabled;
    # the raw similarity frame is returned.
    return givenMovie_similarity
def genre_spaceTags_LDA_tf(genre):
    """Print the four LDA latent semantics of ``genre`` in terms of tags.

    A 4-topic LDA model is trained on the matrix returned by
    ``DataHandler.load_genre_matrix_tf(genre)``. For every topic, the
    ``(tag, weight)`` pairs are printed in descending weight order, where
    the tag value is looked up in the ``tag_id_map`` from
    ``DataHandler.get_dicts()``.

    Args:
        genre: Genre name; if it is not a key of the map returned by
            ``DataHandler.getGenreMoviesMap()`` a message is printed and the
            function returns before any matrix is loaded.

    Returns:
        None. Results are written to stdout.
    """
    # Only the tag-id map is needed from the four returned dictionaries.
    _movie_tags, tag_lookup, _actor_ranks, _movie_ranks = DataHandler.get_dicts()
    genres = DataHandler.getGenreMoviesMap()
    if genre not in genres.keys():
        print("genre " + genre + " not present in data")
        return

    num_topics = 4
    genre_tag_df = DataHandler.load_genre_matrix_tf(genre)
    lda, _corpus, id_to_term = decompositions.LDADecomposition(
        genre_tag_df, num_topics, constants.genreTagsSpacePasses)

    # Ask for as many terms as there are tags so that no term is cut off by
    # get_topic_terms' default top-n limit.
    term_budget = len(tag_lookup)
    semantics = defaultdict(set)
    for topic in range(num_topics):
        for term_id, weight in lda.get_topic_terms(topic, topn=term_budget):
            tag = tag_lookup.get(id_to_term.get(term_id))
            semantics[topic].add((tag, weight))

    for topic in range(num_topics):
        ranked = sorted(semantics.get(topic), key=itemgetter(1), reverse=True)
        print("Semantic {} {}".format(topic + 1, ranked))
        print('\n')
def similarMovieActor_LDA(givenMovie):
    """Print the actors closest to ``givenMovie`` that did not act in it.

    A 5-topic LDA model is trained on the actor-tag frame. The movie (from
    the movie-tag frame) and every actor outside the movie's cast are
    represented in that topic space and compared with
    ``metrics.simlarity_kullback_leibler``. The lowest-scoring actors are
    printed as ``name score`` lines, names coming from
    ``DataHandler.actor_actorid_map``.

    Args:
        givenMovie: Movie id; must be in the index of the movie-tag frame,
            otherwise a message is printed and the function returns.

    Returns:
        None. Results are written to stdout.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    actor_tag_dff = DataHandler.actor_tag_df()
    movie_tag_dff = DataHandler.load_movie_tag_df()
    # The dense np.matrix copies of both frames were never used and relied
    # on DataFrame.as_matrix(), which no longer exists in current pandas,
    # so they are not built any more.

    if givenMovie not in movie_tag_dff.index:
        # Fall back to the raw id so that a movie without a known name
        # cannot crash the error message itself.
        movie_name = DataHandler.movieid_name_map.get(givenMovie, givenMovie)
        print("Movie " + str(movie_name) +
              " not present in mltags data. Quitting")
        return

    # A movie without cast information is treated as having no actors to
    # exclude, instead of failing on a membership test against None.
    actorsForMovie = DataHandler.movie_actor_map.get(givenMovie) or ()

    ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
        actor_tag_dff, 5, constants.actorTagsSpacePasses)

    # The movie's representation is independent of the candidate actor, so
    # it is computed once rather than on every loop iteration.
    movie_topics = DataHandler.representDocInLDATopics(movie_tag_dff,
                                                       givenMovie, ldaModel)

    givenActor_similarity = defaultdict(float)
    for otherActor in actor_tag_dff.index:
        if otherActor in actorsForMovie:
            continue
        actor_topics = DataHandler.representDocInLDATopics(actor_tag_dff,
                                                           otherActor, ldaModel)
        givenActor_similarity[otherActor] = (
            metrics.simlarity_kullback_leibler(movie_topics, actor_topics))

    # NOTE(review): the slice keeps up to 11 entries although the result is
    # called "top10"; left unchanged to keep the printed output - confirm.
    top10 = sorted(givenActor_similarity.items(), key=itemgetter(1),
                   reverse=False)[0:11]
    for actors in top10:
        print(DataHandler.actor_actorid_map.get(actors[0]), actors[1])
    return