def similarActors_LDA_tf(givenActor):
    """Rank the other actors by divergence from ``givenActor`` in LDA space.

    An actor-by-tag weight table is built from ``actor_tagVector_tf()``,
    decomposed into 4 LDA topics, and every other actor's topic
    representation is compared with the given actor's using
    ``metrics.simlarity_kullback_leibler``.

    Args:
        givenActor: Key of the actor in ``actor_movie_rank_map``.

    Returns:
        A list of ``(actor, score)`` tuples sorted by ascending score,
        truncated to the first 11 entries.

    Raises:
        ValueError: If ``givenActor`` is not a key of
            ``actor_movie_rank_map``.
    """
    createDictionaries1()
    vectors()
    actor_weight_vector_tf = actor_tagVector_tf()
    tagList = sorted(tag_movie_map.keys())
    actorList = sorted(actor_movie_rank_map.keys())

    # One row per actor (same order as actorList), one column per tag.
    rows = []
    for actor in actorList:
        actor_tag_dict = dict.fromkeys(tagList, 0.0)
        for tag, weight in actor_weight_vector_tf[actor]:
            actor_tag_dict[tag] = weight
        rows.append(actor_tag_dict)
    # Build the frame in one step: DataFrame.append was removed in pandas 2.0.
    # Assumes every tag in the actor vectors is a key of tag_movie_map, so
    # restricting the columns to tagList drops nothing - TODO confirm.
    df = pd.DataFrame(rows, columns=tagList)

    t = time.time()
    ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
        df, 4, constants.actorTagsSpacePasses)
    print('Query : ', time.time() - t)

    # The given actor's representation is the same for every comparison, so
    # compute it once instead of once per loop iteration.
    given_topics = representDocInLDATopics(df, actorList.index(givenActor),
                                           ldaModel)
    givenActor_similarity = {}
    # Row positions match actorList positions, so enumerate() replaces the
    # repeated linear actorList.index() lookups.
    for position, otherActor in enumerate(actorList):
        if otherActor == givenActor:
            continue
        other_topics = representDocInLDATopics(df, position, ldaModel)
        givenActor_similarity[otherActor] = (
            metrics.simlarity_kullback_leibler(given_topics, other_topics))

    # NOTE(review): the slice keeps 11 entries although the original local was
    # named "top10"; left unchanged so callers see the same result length.
    top10 = sorted(givenActor_similarity.items(), key=itemgetter(1))[0:11]
    return top10
def buildDF_LDASpace():
    """Project every movie of the movie-tag table into a 50-topic LDA space.

    The LDA model, document-term matrix and id-to-term map are read from
    pickle files under ``constants.DIRECTORY``. If any of them cannot be
    opened, all three are recomputed with ``decompositions.LDADecomposition``
    and written back to those files.

    Returns:
        A tuple ``(dfList, moviel)``: ``dfList`` holds one 50-element list of
        topic weights per movie (topics the model does not report stay 0),
        and ``moviel`` is the list of movie ids in the same order.
    """
    num_topics = 50
    df = load_movie_tag_tf_df()
    model_path = constants.DIRECTORY + "ldaModel.pickle"
    matrix_path = constants.DIRECTORY + "doc_term_matrix.pickle"
    terms_path = constants.DIRECTORY + "id_Term_map.pickle"

    ldaModel, doc_term_matrix, id_Term_map = None, None, None
    try:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # these cache files must only ever come from a trusted location.
        # "with" closes each handle, which the bare open() calls never did.
        with open(model_path, "rb") as handle:
            ldaModel = pickle.load(handle)
        with open(matrix_path, "rb") as handle:
            doc_term_matrix = pickle.load(handle)
        with open(terms_path, "rb") as handle:
            id_Term_map = pickle.load(handle)
    except (OSError, IOError):
        # Cache miss (or unreadable file): rebuild everything and refresh
        # all three cache files so they stay consistent with each other.
        ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
            df, num_topics, constants.genreTagsSpacePasses)
        with open(model_path, "wb") as handle:
            pickle.dump(ldaModel, handle)
        with open(matrix_path, "wb") as handle:
            pickle.dump(doc_term_matrix, handle)
        with open(terms_path, "wb") as handle:
            pickle.dump(id_Term_map, handle)

    moviel = list(df.index)
    dfList = list()
    for mid in moviel:
        latentSpace = [0] * num_topics
        # Each entry yielded by the model is used as (topic index, weight).
        for tup in ldaModel[docSpecificCorpus(df, mid)]:
            latentSpace[tup[0]] = tup[1]
        dfList.append(latentSpace)
    return dfList, moviel
def genre_spaceActors_LDA_tf(genre):
    """Print four LDA topics over the actors of ``genre``.

    The genre's actor matrix (tf variant) is decomposed into 4 topics. For
    each topic, every term is mapped through ``DataHandler.actor_actorid_map``
    and printed together with its weight, highest weight first.

    Args:
        genre: Genre name; a message is printed and nothing else happens
            when it is not a key of ``DataHandler.genre_movie_map``.

    Returns:
        None. The result is only printed.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()

    # The returned maps are not used below; the call itself is kept as is.
    (_movie_tag_map, _tag_id_map, _actor_movie_rank_map,
     _movie_actor_rank_map) = DataHandler.get_dicts()
    DataHandler.create_actor_actorid_map()
    actor_lookup = DataHandler.actor_actorid_map
    genre_actor_df = DataHandler.load_genre_actor_matrix_tf(genre)

    known_genres = DataHandler.genre_movie_map.keys()
    if genre not in known_genres:
        print("genre " + genre + " not in data")
        return

    num_topics = 4
    lda, _doc_term_matrix, id_to_term = decompositions.LDADecomposition(
        genre_actor_df, num_topics, constants.genreActorSpacePasses)

    # topn is raised from its default so that all terms are reported.
    terms_per_topic = defaultdict(set)
    for topic_no in range(num_topics):
        scored_terms = lda.get_topic_terms(topic_no, topn=len(actor_lookup))
        for term_id, weight in scored_terms:
            actor = actor_lookup.get(id_to_term.get(term_id))
            terms_per_topic[topic_no].add((actor, weight))

    for topic_no in range(num_topics):
        ranked = sorted(terms_per_topic.get(topic_no),
                        key=itemgetter(1),
                        reverse=True)
        print('Semantic {} {}'.format(topic_no + 1, ranked))
        print('\n')
# Example #4
# 0
def genre_spaceActors_LDA(genre):
    """Print five LDA topics over the actor matrix of ``genre``.

    For each topic, the terms reported by ``get_topic_terms`` (its default
    number of terms) are printed with their weights, highest weight first.

    Args:
        genre: Genre passed to ``DataHandler.load_genre_actor_matrix``.

    Returns:
        None. The result is only printed.
    """
    # The returned maps are not used below; the call itself is kept as is.
    (_movie_tag_map, _tag_id_map, _actor_movie_rank_map,
     _movie_actor_rank_map) = DataHandler.get_dicts()
    genre_actor_df = DataHandler.load_genre_actor_matrix(genre)

    num_topics = 5
    lda, _doc_term_matrix, id_to_term = decompositions.LDADecomposition(
        genre_actor_df, num_topics, constants.genreActorSpacePasses)

    terms_per_topic = defaultdict(set)
    for topic_no in range(num_topics):
        for term_id, weight in lda.get_topic_terms(topic_no):
            terms_per_topic[topic_no].add((id_to_term.get(term_id), weight))

    for topic_no in range(num_topics):
        ranked = sorted(terms_per_topic.get(topic_no),
                        key=itemgetter(1),
                        reverse=True)
        print(ranked)
        print('\n')
def LDA_SIM(userid):
    """Score every movie the user has not seen against each movie they have.

    The user's movies come from ``DataHandler.user_rated_or_tagged_date_map``.
    All movies are represented in a 50-topic LDA space (model loaded from the
    pickle cache under ``constants.DIRECTORY``, or rebuilt and re-cached when
    a cache file cannot be opened). The score of a pair is
    ``1 / (metrics.simlarity_kullback_leibler(m1, m2) + 1e-8)``.

    Args:
        userid: Key into ``DataHandler.user_rated_or_tagged_date_map``. A
            user without an entry is treated as having no movies.

    Returns:
        A ``pandas.DataFrame`` with one row per movie of the user and one
        column per remaining movie of the movie-tag table, holding the
        scores described above.
    """
    # .get() yields None for an unknown user; fall back to "no movies"
    # instead of failing with a TypeError while iterating None.
    movie_date_List = DataHandler.user_rated_or_tagged_date_map.get(userid) or []
    movieList = sorted([i[0] for i in movie_date_List])
    movie_tag_df = DataHandler.load_movie_tag_tf_df()

    model_path = constants.DIRECTORY + "ldaModel.pickle"
    matrix_path = constants.DIRECTORY + "doc_term_matrix.pickle"
    terms_path = constants.DIRECTORY + "id_Term_map.pickle"
    try:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # these cache files must only ever come from a trusted location.
        # "with" closes each handle, which the bare open() calls never did.
        with open(model_path, "rb") as handle:
            ldaModel = pickle.load(handle)
        with open(matrix_path, "rb") as handle:
            doc_term_matrix = pickle.load(handle)
        with open(terms_path, "rb") as handle:
            id_Term_map = pickle.load(handle)
    except (OSError, IOError):
        print("1")
        ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
            movie_tag_df, 50, constants.genreTagsSpacePasses)
        with open(model_path, "wb") as handle:
            pickle.dump(ldaModel, handle)
        with open(matrix_path, "wb") as handle:
            pickle.dump(doc_term_matrix, handle)
        with open(terms_path, "wb") as handle:
            pickle.dump(id_Term_map, handle)

    all_movie_list = sorted(movie_tag_df.index.values)
    all_movie_butWatched_list = sorted(set(all_movie_list) - set(movieList))

    # An unwatched movie's topic representation does not depend on which
    # watched movie it is compared with, so compute each one once up front
    # rather than once per watched movie. Skipped when there is nothing to
    # compare against.
    other_topics = {}
    if movieList:
        other_topics = {
            otherMovies: DataHandler.representDocInLDATopics(
                movie_tag_df, otherMovies, ldaModel)
            for otherMovies in all_movie_butWatched_list
        }

    givenMovie_similarity_DFlist = list()
    for movie in movieList:
        m1 = DataHandler.representDocInLDATopics(movie_tag_df, movie, ldaModel)
        # The small constant keeps the reciprocal finite when the
        # divergence is 0.
        givenMovie_similarity_DFlist.append({
            otherMovies: 1 / (metrics.simlarity_kullback_leibler(m1, m2) + 1e-8)
            for otherMovies, m2 in other_topics.items()
        })

    givenMovie_similarity = pd.DataFrame(givenMovie_similarity_DFlist,
                                         index=movieList,
                                         columns=all_movie_butWatched_list)

    # NOTE: the raw scores are returned; no per-user or release-year
    # weighting is applied here.
    return givenMovie_similarity
def genre_spaceTags_LDA_tf(genre):
    """Print four LDA topics over the tags of ``genre``.

    The genre's matrix (tf variant) is decomposed into 4 topics. For each
    topic, every term is mapped through the id-to-term map and then through
    ``tag_id_map``, and printed with its weight, highest weight first.

    Args:
        genre: Genre name; a message is printed and nothing else happens
            when it is not a key of ``DataHandler.getGenreMoviesMap()``.

    Returns:
        None. The result is only printed.
    """
    (_movie_tag_map, tag_lookup, _actor_movie_rank_map,
     _movie_actor_rank_map) = DataHandler.get_dicts()

    movies_by_genre = DataHandler.getGenreMoviesMap()
    if genre not in movies_by_genre.keys():
        print("genre " + genre + " not present in data")
        return

    genre_df = DataHandler.load_genre_matrix_tf(genre)
    num_topics = 4
    lda, _doc_term_matrix, id_to_term = decompositions.LDADecomposition(
        genre_df, num_topics, constants.genreTagsSpacePasses)

    # topn is raised from its default so that all terms are reported.
    terms_per_topic = defaultdict(set)
    for topic_no in range(num_topics):
        scored_terms = lda.get_topic_terms(topic_no, topn=len(tag_lookup))
        for term_id, weight in scored_terms:
            tag = tag_lookup.get(id_to_term.get(term_id))
            terms_per_topic[topic_no].add((tag, weight))

    for topic_no in range(num_topics):
        ranked = sorted(terms_per_topic.get(topic_no),
                        key=itemgetter(1),
                        reverse=True)
        print('Semantic {} {}'.format(topic_no + 1, ranked))
        print('\n')
# Example #7
# 0
def similarMovieActor_LDA(givenMovie):
    """Print the actors closest to ``givenMovie`` in a 5-topic LDA space.

    The LDA model is fitted on the actor-tag table. The movie and every
    actor that is not listed for the movie in ``DataHandler.movie_actor_map``
    are represented in that space and compared with
    ``metrics.simlarity_kullback_leibler``. The entries with the lowest
    scores are printed as ``name score``.

    Args:
        givenMovie: Movie id; a message is printed and nothing else happens
            when it is not in the index of the movie-tag table.

    Returns:
        None. The result is only printed.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    actor_tag_dff = DataHandler.actor_tag_df()
    movie_tag_dff = DataHandler.load_movie_tag_df()
    # The unused dense-matrix copies were dropped: they relied on
    # DataFrame.as_matrix(), which pandas 1.0 removed, and nothing read them.

    if givenMovie not in movie_tag_dff.index:
        # Fall back to the raw id when the movie has no name entry, so the
        # message itself cannot fail on concatenating None.
        movie_label = DataHandler.movieid_name_map.get(givenMovie) or givenMovie
        print("Movie " + str(movie_label) +
              " not present in mltags data. Quitting")
        return

    # A movie without a cast entry yields None; treat it as "no actors".
    actorsForMovie = DataHandler.movie_actor_map.get(givenMovie) or ()

    ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
        actor_tag_dff, 5, constants.actorTagsSpacePasses)

    # The movie's representation is the same for every comparison, so
    # compute it once instead of once per actor.
    mo1 = DataHandler.representDocInLDATopics(movie_tag_dff, givenMovie,
                                              ldaModel)
    givenActor_similarity = {}
    for otherActor in actor_tag_dff.index:
        if otherActor in actorsForMovie:
            continue
        ac2 = DataHandler.representDocInLDATopics(actor_tag_dff, otherActor,
                                                  ldaModel)
        givenActor_similarity[otherActor] = (
            metrics.simlarity_kullback_leibler(mo1, ac2))

    # NOTE(review): the slice keeps 11 entries although the local is named
    # "top10"; left unchanged so the printed output keeps its length.
    top10 = sorted(givenActor_similarity.items(), key=itemgetter(1))[0:11]
    for actors in top10:
        print(DataHandler.actor_actorid_map.get(actors[0]), actors[1])
    return