def genre_spaceActors_LDA_tf(genre):
    """Print four LDA latent semantics over the actor space of one genre.

    Loads the genre's actor term-frequency matrix through ``DataHandler``,
    runs ``decompositions.LDADecomposition`` with 4 topics and
    ``constants.genreActorSpacePasses``, then prints each topic as a list of
    ``(actor name, weight)`` pairs ordered by descending weight.

    Args:
        genre: Genre label. It is concatenated into the "not in data"
            message, so it is expected to be a string.

    Returns:
        None. When ``genre`` is not a key of ``DataHandler.genre_movie_map``
        a message is printed and the function returns early.
    """
    topic_count = 4

    DataHandler.vectors()
    DataHandler.createDictionaries1()

    # None of the four maps is consumed below; the call and its unpacking
    # are retained as-is.
    (movie_tag_map, tag_id_map, actor_movie_rank_map,
     movie_actor_rank_map) = DataHandler.get_dicts()
    DataHandler.create_actor_actorid_map()
    id_to_name = DataHandler.actor_actorid_map
    genre_actor_tf = DataHandler.load_genre_actor_matrix_tf(genre)

    if genre not in DataHandler.genre_movie_map.keys():
        print("genre " + genre + " not in data")
        return

    ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
        genre_actor_tf, topic_count, constants.genreActorSpacePasses)

    weighted_actors = defaultdict(set)
    for topic_no in range(topic_count):
        # get_topic_terms only returns the top n terms (default 10), so ask
        # for as many terms as there are known actors.
        terms = ldaModel.get_topic_terms(topic_no, topn=len(id_to_name))
        for term_id, weight in terms:
            actor_key = id_Term_map.get(term_id)
            weighted_actors[topic_no].add((id_to_name.get(actor_key), weight))

    for topic_no in range(topic_count):
        ranked = sorted(weighted_actors.get(topic_no),
                        key=itemgetter(1),
                        reverse=True)
        print('Semantic ' + str(topic_no + 1) + ' ' + str(ranked))
        print('\n')
def top10_Actors_LDA(givenActor):
    """Print the actors that ``DataHandler.similarActors_LDA`` ranks for an actor.

    Args:
        givenActor: Actor id; it must be a key of
            ``DataHandler.actor_actorid_map`` (a plain ``[]`` lookup is used).

    Returns:
        None. One header line is printed, followed by one
        ``"<name> <similarity>"`` line per returned pair.
    """
    DataHandler.create_actor_actorid_map()
    ranked_pairs = DataHandler.similarActors_LDA(givenActor)
    id_to_name = DataHandler.actor_actorid_map

    print('Actors similar to ' + str(id_to_name[givenActor]))
    for other_actor, similarity in ranked_pairs:
        print(' '.join((id_to_name[other_actor], str(similarity))))
def task1c_tfidf(actor_id):
    """Print the 10 actors closest to ``actor_id`` in the raw actor-tag space.

    Every other actor's tag vector is compared with the given actor's vector
    using ``metrics.l2Norm``; results are sorted ascending, so the smallest
    scores (presumably the nearest actors) are printed first.

    Args:
        actor_id: Index label of the actor in ``DataHandler.actor_tag_df()``.

    Returns:
        None. Prints a message and returns early when ``actor_id`` is not in
        the actor-tag frame's index.
    """
    DataHandler.vectors()
    actor_tag_frame = DataHandler.actor_tag_df()
    actor_ids = list(actor_tag_frame.index)

    if actor_id not in actor_ids:
        print("Invalid actor id or no tags present for actor. Returning")
        return

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # on every pandas version and yields the same ndarray.
    tag_vectors = np.matrix(actor_tag_frame.values).tolist()
    target_vector = tag_vectors[actor_ids.index(actor_id)]

    DataHandler.create_actor_actorid_map()
    id_to_name = DataHandler.actor_actorid_map

    scored = [
        (metrics.l2Norm(target_vector, vector), id_to_name.get(other_id))
        for other_id, vector in zip(actor_ids, tag_vectors)
        if other_id != actor_id
    ]
    scored.sort(key=operator.itemgetter(0))

    print("Top 10 actors similar to " + str(id_to_name.get(actor_id)) + " are: ")
    for score, name in scored[:10]:
        # str() keeps an actor without a known name from crashing the report.
        print(str(name) + " : " + str(score))
    return
def task1c_pca(actor_id):
    """Print the 10 actors closest to ``actor_id`` in a 5-component PCA space.

    The actor-tag matrix is projected onto the transposed components returned
    by ``decompositions.PCADecomposition`` and the projected rows are compared
    with ``metrics.l2Norm``; results are sorted ascending by that score.

    Args:
        actor_id: Index label of the actor in ``DataHandler.actor_tag_df()``.

    Returns:
        None. Prints a message and returns early when ``actor_id`` is not in
        the actor-tag frame's index.
    """
    DataHandler.vectors()
    # Fetch the frame once; it is reused for the index, the decomposition
    # and the projection.
    actor_tag_frame = DataHandler.actor_tag_df()
    actor_ids = list(actor_tag_frame.index)

    if actor_id not in actor_ids:
        print("Invalid actor id or no tags present for actor. Returning")
        return

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # version-independent replacement.
    actor_tag_matrix = np.matrix(actor_tag_frame.values)
    components = decompositions.PCADecomposition(actor_tag_frame, 5)

    # Transpose is used as the inverse of the component matrix (original
    # author's note: "page 158, p inverse = p transpose").
    projection = np.matrix(components).transpose()
    actors_in_semantics = (actor_tag_matrix * projection).tolist()
    target_row = actors_in_semantics[actor_ids.index(actor_id)]

    DataHandler.create_actor_actorid_map()
    id_to_name = DataHandler.actor_actorid_map

    scored = [
        (metrics.l2Norm(target_row, row), id_to_name.get(other_id))
        for other_id, row in zip(actor_ids, actors_in_semantics)
        if other_id != actor_id
    ]
    scored.sort(key=operator.itemgetter(0))

    print("Top 10 actors similar to " + str(id_to_name.get(actor_id)) + " are: ")
    for score, name in scored[:10]:
        # str() keeps an actor without a known name from crashing the report.
        print(str(name) + " : " + str(score))
    return
def actor_task1c_SVD(actor_id):
    """Print the 10 actors closest to ``actor_id`` in a 5-component SVD space.

    Rows of the ``U`` factor returned by ``decompositions.SVDDecomposition``
    are compared with ``metrics.l2Norm`` and ranked ascending by that score.

    Args:
        actor_id: Index label of the actor in ``DataHandler.actor_tag_df()``.

    Returns:
        None. Prints a message and returns early when ``actor_id`` is not in
        the actor-tag frame's index.
    """
    DataHandler.vectors()
    actor_tag_frame = DataHandler.actor_tag_df()
    actor_ids = list(actor_tag_frame.index)
    if actor_id not in actor_ids:
        print("Invalid actor id or no tags present for actor. Returning")
        return

    U, Sigma, VT = decompositions.SVDDecomposition(actor_tag_frame, 5)
    target_row = U[actor_ids.index(actor_id)]

    DataHandler.create_actor_actorid_map()
    scored = []
    for position, row in enumerate(U):
        other_id = actor_ids[position]
        if other_id == actor_id:
            continue
        other_name = DataHandler.actor_actorid_map.get(other_id)
        scored.append((metrics.l2Norm(target_row, row), other_name))

    # Ascending order: the smallest scores come first.
    ranked = sorted(scored, key=operator.itemgetter(0))
    print("Top 10 Actors similar to " + str(DataHandler.actor_actorid_map.get(actor_id)) + " are:")
    for score, name in ranked[:10]:
        print(name + " : " + str(score))
    return
def top10_Actors_LDA_tf(givenActor):
    """Print the actors that ``DataHandler.similarActors_LDA_tf`` ranks for an actor.

    Args:
        givenActor: Seed actor id; it must be a key of
            ``DataHandler.actor_movie_rank_map``.

    Returns:
        None. An unknown seed id only produces an "Invalid seed actor id"
        message.
    """
    DataHandler.createDictionaries1()
    if givenActor not in DataHandler.actor_movie_rank_map:
        print('Invalid seed actor id : '+str(givenActor))
        return

    DataHandler.create_actor_actorid_map()
    ranked_pairs = DataHandler.similarActors_LDA_tf(givenActor)
    id_to_name = DataHandler.actor_actorid_map

    print('Actors similar to '+str(id_to_name[givenActor]))
    for other_actor, similarity in ranked_pairs:
        print(' '.join((id_to_name[other_actor], str(similarity))))
    return
Ejemplo n.º 7
0
def task1dImplementation_SVD(movie_id):
    """Print 10 actors closest to a movie in the actors' 5-component SVD space.

    The movie's tag vector is mapped into the actor semantics with the
    transposed ``V`` factor of ``decompositions.SVDDecomposition`` and compared
    with each row of ``U`` via ``metrics.l2Norm``. Actors listed for the movie
    in ``DataHandler.movie_actor_map`` are left out. Results are sorted
    ascending by score.

    Args:
        movie_id: Index label of the movie in
            ``DataHandler.load_movie_tag_df()``.

    Returns:
        None. Prints a message and returns early when the movie is not in
        the movie-tag frame's index.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map

    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()

    movie_ids = list(movie_tag_df.index)
    actor_ids = list(actor_tag_df.index)

    if movie_id not in movie_ids:
        # str(): the name lookup may return None for an unknown id, which
        # previously raised TypeError instead of printing the message.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    actorU, actorSigma, actorV = decompositions.SVDDecomposition(
        actor_tag_df, 5)

    tags_to_actor_semantics = np.matrix(actorV).transpose()
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # version-independent replacement.
    movie_in_tags = np.matrix(movie_tag_df.values)[movie_ids.index(movie_id)]
    movie_in_semantics = (movie_in_tags * tags_to_actor_semantics).tolist()[0]
    actor_rows = np.matrix(actorU).tolist()

    DataHandler.create_actor_actorid_map()
    id_to_name = DataHandler.actor_actorid_map
    # A movie without a cast entry yields None; treat that as "no actors".
    cast = DataHandler.movie_actor_map.get(movie_id) or ()

    scored = [
        (metrics.l2Norm(row, movie_in_semantics), id_to_name.get(actor_id))
        for actor_id, row in zip(actor_ids, actor_rows)
        if actor_id not in cast
    ]
    scored.sort(key=operator.itemgetter(0))

    print("10 Actors similar to movie " + str(movieid_name_map.get(movie_id)) +
          " are: ")
    for score, name in scored[:10]:
        # str() keeps an actor without a known name from crashing the report.
        print(str(name) + " : " + str(score))
    return
Ejemplo n.º 8
0
def task1d_pca(movie_id):
    """Print 10 actors closest to a movie in the actors' 5-component PCA space.

    Both the movie's tag vector and every actor's tag vector are projected
    with the transposed components from ``decompositions.PCADecomposition``
    (fitted on the actor-tag frame) and compared via ``metrics.l2Norm``.
    Actors listed for the movie in ``DataHandler.movie_actor_map`` are left
    out. Results are sorted ascending by score.

    Args:
        movie_id: Index label of the movie in
            ``DataHandler.load_movie_tag_df()``.

    Returns:
        None. Prints a message and returns early when the movie is not in
        the movie-tag frame's index.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    movieid_name_map = DataHandler.movieid_name_map

    actor_tag_df = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()

    actor_ids = list(actor_tag_df.index)
    movie_ids = list(movie_tag_df.index)
    if movie_id not in movie_ids:
        # str(): the name lookup may return None for an unknown id, which
        # previously raised TypeError instead of printing the message.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # version-independent replacement.
    actor_tag_matrix = np.matrix(actor_tag_df.values)
    movie_tag_matrix = np.matrix(movie_tag_df.values)

    actor_semantics = decompositions.PCADecomposition(actor_tag_df, 5)
    projection = np.matrix(actor_semantics).transpose()

    movie_in_tags = movie_tag_matrix[movie_ids.index(movie_id)]
    movie_in_semantics = (movie_in_tags * projection).tolist()[0]
    actors_in_semantics = (actor_tag_matrix * projection).tolist()

    # One call is enough; the original invoked this twice in a row.
    DataHandler.create_actor_actorid_map()
    id_to_name = DataHandler.actor_actorid_map
    # A movie without a cast entry yields None; treat that as "no actors".
    cast = DataHandler.movie_actor_map.get(movie_id) or ()

    scored = [
        (metrics.l2Norm(row, movie_in_semantics), id_to_name.get(actor_id))
        for actor_id, row in zip(actor_ids, actors_in_semantics)
        if actor_id not in cast
    ]
    scored.sort(key=operator.itemgetter(0))

    print("Top 10 actors similar to movie: " +
          str(movieid_name_map.get(movie_id)) + " are: ")
    for score, name in scored[:10]:
        # str() keeps an actor without a known name from crashing the report.
        print(str(name) + " : " + str(score))
    return
Ejemplo n.º 9
0
def PersnalizedPageRank_top10_SimilarCoActors(seed):
    """Print co-actors ranked by personalized PageRank from the seed actors.

    ``ppr.personalizedPageRank`` is run on the co-actor matrix from
    ``DataHandler.coactor_siilarity_matrix()`` with ``constants.ALPHA``. The
    15 highest-scoring rows are taken and every row whose actor name is not
    one of the seed names is printed as ``"<name> <score>"``.

    Args:
        seed: Iterable of seed actor ids.

    Returns:
        None.
    """
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    coactor_matrix, _unused = DataHandler.coactor_siilarity_matrix()
    id_to_name = DataHandler.actor_actorid_map

    scores = ppr.personalizedPageRank(coactor_matrix, seed, constants.ALPHA)

    # Name column in the same order as the co-actor matrix index.
    name_frame = pd.DataFrame(pd.Series(list(coactor_matrix.index)),
                              columns=['Actor'])
    name_frame['Actor'] = name_frame['Actor'].map(lambda key: id_to_name.get(key))

    combined = pd.concat([scores, name_frame], axis=1)
    best_rows = combined.sort_values(by=0, ascending=False).head(15)

    seed_names = [id_to_name.get(actor) for actor in seed]
    print('Co Actors similar to the following seed actors: '+str(seed_names))
    for row_label in best_rows.index:
        actor_name = best_rows.loc[row_label, 'Actor']
        if actor_name in seed_names:
            continue
        print(actor_name+' '+ str(best_rows.loc[row_label, 0]))
Ejemplo n.º 10
0
def task1d_tfidf(movie_id):
    """Print the 10 actors closest to a movie in the raw tag space.

    The movie's row of the movie-tag frame is compared with every actor's row
    of the actor-tag frame via ``metrics.l2Norm``. Actors listed for the movie
    in ``DataHandler.movie_actor_map`` are left out. Results are sorted
    ascending by score.

    Args:
        movie_id: Index label of the movie in
            ``DataHandler.load_movie_tag_df()``.

    Returns:
        None. Prints a message and returns early when the movie is not in
        the movie-tag frame's index.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    actor_tag_frame = DataHandler.actor_tag_df()
    movie_tag_df = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map

    actor_ids = list(actor_tag_frame.index)
    movie_ids = list(movie_tag_df.index)

    if movie_id not in movie_ids:
        # str(): the name lookup may return None for an unknown id, which
        # previously raised TypeError instead of printing the message.
        print("Movie " + str(movieid_name_map.get(movie_id)) +
              " not present in mltags data. Quitting")
        return

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # version-independent replacement.
    actor_vectors = np.matrix(actor_tag_frame.values).tolist()
    movie_vector = np.matrix(
        movie_tag_df.values)[movie_ids.index(movie_id)].tolist()[0]

    # A movie without a cast entry yields None; treat that as "no actors".
    cast = DataHandler.movie_actor_map.get(movie_id) or ()
    DataHandler.create_actor_actorid_map()
    id_to_name = DataHandler.actor_actorid_map

    scored = [
        (metrics.l2Norm(movie_vector, vector), id_to_name.get(actor_id))
        for actor_id, vector in zip(actor_ids, actor_vectors)
        if actor_id not in cast
    ]
    scored.sort(key=operator.itemgetter(0))

    print("Top 10 actors similar to " + str(movieid_name_map.get(movie_id)) +
          " are: ")
    for score, name in scored[:10]:
        # str() keeps an actor without a known name from crashing the report.
        print(str(name) + " : " + str(score))
    return
Ejemplo n.º 11
0
def similarMovieActor_LDA(givenMovie):
    """Print the 10 actors closest to a movie in a 5-topic LDA space.

    An LDA model is fitted on the actor-tag frame with
    ``decompositions.LDADecomposition``. The movie and every actor not listed
    for it in ``DataHandler.movie_actor_map`` are expressed in the model's
    topics via ``DataHandler.representDocInLDATopics`` and compared with
    ``metrics.simlarity_kullback_leibler``; the smallest values are printed
    first.

    Args:
        givenMovie: Index label of the movie in
            ``DataHandler.load_movie_tag_df()``.

    Returns:
        None. Prints a message and returns early when the movie is not in
        the movie-tag frame's index.
    """
    DataHandler.vectors()
    DataHandler.createDictionaries1()
    DataHandler.create_actor_actorid_map()
    actor_tag_dff = DataHandler.actor_tag_df()
    movie_tag_dff = DataHandler.load_movie_tag_df()
    movieid_name_map = DataHandler.movieid_name_map

    actor_ids = list(actor_tag_dff.index)
    movie_ids = list(movie_tag_dff.index)

    if givenMovie not in movie_ids:
        # str(): the name lookup may return None for an unknown id, which
        # previously raised TypeError instead of printing the message.
        print("Movie " + str(movieid_name_map.get(givenMovie)) +
              " not present in mltags data. Quitting")
        return

    # A movie without a cast entry yields None; treat that as "no actors".
    cast = DataHandler.movie_actor_map.get(givenMovie) or ()

    ldaModel, doc_term_matrix, id_Term_map = decompositions.LDADecomposition(
        actor_tag_dff, 5, constants.actorTagsSpacePasses)

    # The movie's topic representation does not depend on the actor being
    # compared, so compute it once instead of once per actor.
    movie_topics = DataHandler.representDocInLDATopics(movie_tag_dff,
                                                       givenMovie, ldaModel)

    divergence_by_actor = {}
    for other_actor in actor_ids:
        if other_actor in cast:
            continue
        actor_topics = DataHandler.representDocInLDATopics(
            actor_tag_dff, other_actor, ldaModel)
        divergence_by_actor[other_actor] = metrics.simlarity_kullback_leibler(
            movie_topics, actor_topics)

    # Ascending order, capped at 10 rows (the original slice [0:11] printed 11).
    top10 = sorted(divergence_by_actor.items(), key=itemgetter(1))[:10]
    for actor, divergence in top10:
        print(DataHandler.actor_actorid_map.get(actor), divergence)
    return
Ejemplo n.º 12
0
# -*- coding: utf-8 -*-
from computations import decompositions
from data import DataHandler
from collections import defaultdict
from operator import itemgetter
from util import constants
import numpy as np
import operator
from computations import metrics

# Import-time side effects: both DataHandler calls below run whenever this
# module is imported. Note that task1dImplementation_SVD further down calls
# DataHandler.vectors() again on every invocation.
DataHandler.vectors()
DataHandler.create_actor_actorid_map()


def task1d(movie_id, method):
    """Run the task 1d implementation selected by ``method``.

    Args:
        movie_id: Movie identifier forwarded unchanged to the implementation.
        method: One of ``"SVD"``, ``"PCA"``, ``"LDA"`` or ``"TFIDF"``
            (compared with ``==``, case-sensitive).

    Returns:
        None. Any other ``method`` value only prints a usage message.
    """
    # Thunks keep the implementation names unresolved until one is chosen.
    handlers = (
        ("SVD", lambda: task1dImplementation_SVD(movie_id)),
        ("PCA", lambda: task1d_pca(movie_id)),
        ("LDA", lambda: similarMovieActor_LDA(movie_id)),
        ("TFIDF", lambda: task1d_tfidf(movie_id)),
    )
    for label, run in handlers:
        if method == label:
            run()
            return

    print("Invalid method. Please use SVD or PCA or LDA or TFIDF")
    return


def task1dImplementation_SVD(movie_id):
    DataHandler.vectors()
    DataHandler.createDictionaries1()