def buildDF():
    """Build TF-IDF tag vectors for every movie that has at least one tag.

    TF weights each tagging by a date-normalized factor
    (``formatter.normalizer`` over ``[min_date, max_date]``) divided by the
    movie's tag count; IDF is ``log2(total movies / movies carrying the tag)``.

    Returns:
        tuple: ``(dfList, tagList, movieList)`` where
            dfList    -- list of ``{tag: tf-idf}`` dicts, one per tagged movie
            tagList   -- sorted list of all tag ids (the vector dimensions)
            movieList -- sorted movie ids aligned with ``dfList``
    """
    # Populate the module-level maps first. The original computed movieCount
    # *before* this call and used `keys()._len_()` (single underscores), a
    # typo for `__len__` that raises AttributeError at runtime.
    createDictionaries1()
    movieCount = len(movie_tag_map)

    tagList = sorted(tag_movie_map)
    dfList = []
    movieList = []

    for movie in sorted(movie_tag_map):
        tagsInMovie = movie_tag_map[movie]
        if not tagsInMovie:
            continue  # untagged movies contribute no row
        movieList.append(movie)
        tf_idf_map = {}
        for tag in tagList:
            taggings = tag_movie_map[tag]
            # Sum the date-normalized weight of this tag's occurrences on
            # the current movie (0.0 when the tag never touches the movie).
            tf_numerator = sum(
                formatter.normalizer(min_date, max_date, tag_date)
                for tagged_movie, tag_date in taggings
                if tagged_movie == movie)
            tf = tf_numerator / len(tagsInMovie)
            tf_idf_map[tag] = tf * math.log2(movieCount / len(taggings))
        dfList.append(tf_idf_map)
    return dfList, tagList, movieList
def load_genre_matrix(given_genre):
    """Build a movie x tag TF-IDF DataFrame restricted to one genre.

    Args:
        given_genre: key into ``genre_movie_map`` selecting which movies
            contribute rows.

    Returns:
        pd.DataFrame: rows indexed by movie id (only tagged movies of the
        genre), columns the sorted global tag list, values tf-idf scores.
    """
    # Populate the maps before counting movies (the original counted first,
    # and used `keys().__len__()` where `len()` is the idiomatic spelling).
    createDictionaries1()
    movieCount = len(movie_tag_map)

    tagList = sorted(tag_movie_map)
    movieList = []
    rows = []
    for movie in genre_movie_map[given_genre]:
        tagsInMovie = movie_tag_map[movie]
        if not tagsInMovie:
            continue  # untagged movies contribute no row
        movieList.append(movie)
        tf_idf_map = {}
        for tag in tagList:
            taggings = tag_movie_map[tag]
            # Date-normalized weight of this tag's occurrences on the movie.
            tf_numerator = sum(
                formatter.normalizer(min_date, max_date, tag_date)
                for tagged_movie, tag_date in taggings
                if tagged_movie == movie)
            tf = tf_numerator / len(tagsInMovie)
            tf_idf_map[tag] = tf * math.log2(movieCount / len(taggings))
        rows.append(tf_idf_map)
    # DataFrame.append was removed in pandas 2.0; build all rows in one shot.
    df = pd.DataFrame(rows, columns=tagList, index=movieList)
    return df
def load_genre_actor_matrix(given_genre):
    """Build a movie x actor TF-IDF DataFrame for one genre.

    TF weights an actor by the inverse of the normalized billing rank
    (top-billed actors weigh more) divided by the movie's cast size; IDF is
    ``log2(total movies / movies the actor appeared in)``.

    Args:
        given_genre: key into ``genre_movie_map`` selecting which movies
            contribute rows.

    Returns:
        pd.DataFrame: one row per movie of the genre (index = movie id),
        columns the sorted global actor list; actors absent from a movie
        get 0.0.
    """
    global max_rank
    global min_rank
    global tag_count
    global max_date
    global min_date

    createDictionaries1()

    actorList = sorted(actor_movie_rank_map)
    movieCount = len(movie_tag_map)
    movieList = []
    rows = []

    for movieInGenre in genre_movie_map[given_genre]:
        movieList.append(movieInGenre)
        actorsInMovie = movie_actor_rank_map[movieInGenre]
        castSize = len(actorsInMovie)
        # Start every actor at 0.0 so each row spans the full actor list.
        tf_idf_map = dict.fromkeys(actorList, 0.0)
        for actor, rank in actorsInMovie:
            movieCountOfActor = len(actor_movie_rank_map[actor])
            # Inverse normalized rank: lower (better) rank => larger weight.
            tf = (1 / formatter.normalizer(min_rank, max_rank, rank)) / castSize
            tf_idf_map[actor] = tf * math.log2(movieCount / movieCountOfActor)
        rows.append(tf_idf_map)
    # DataFrame.append was removed in pandas 2.0; build all rows in one shot.
    df = pd.DataFrame(rows, columns=actorList, index=movieList)
    return df
def actor_tagVector():
    """Build per-actor TF-IDF tag vectors from the movie/actor rank table.

    Side effects: updates the module-level globals ``min_rank``/``max_rank``
    and fills ``actor_movie_rank_map``, ``movie_actor_rank_map`` and
    ``actor_weight_vector_tf_idf`` in place.

    Returns:
        The module-level ``actor_weight_vector_tf_idf`` dict mapping
        actor id -> list of ``(tag_id, tf-idf weight)`` tuples.
    """
    global max_rank
    global min_rank

    # Pass 1: scan every (actor, movie, rank) row, tracking the global rank
    # extremes and building the bidirectional actor<->movie lookup sets.
    for row in movie_actor_df.itertuples():
        if row.actor_movie_rank < min_rank:
            min_rank = row.actor_movie_rank
        if row.actor_movie_rank > max_rank:
            max_rank = row.actor_movie_rank
        actor_movie_rank_map[row.actorid].add(
            (row.movieid, row.actor_movie_rank))
        movie_actor_rank_map[row.movieid].add(
            (row.actorid, row.actor_movie_rank))

    total_actor_count = len(actor_movie_rank_map)
    for actorID, movies_list in actor_movie_rank_map.items():

        # Total number of taggings across all of this actor's movies; this
        # is the TF denominator shared by every tag below.
        tag_counter = 0
        tag_weight_tuple_tf = defaultdict(float)
        tag_weight_tuple_tf_idf = defaultdict(float)
        for movie in movies_list:
            tag_counter += len(movie_tag_map[movie[0]])

        for movieID, rank in movies_list:
            if movieID in movie_tag_map:
                for tag_id, timestamp in movie_tag_map[movieID]:
                    # IDF denominator: the set of distinct actors appearing
                    # in any movie that carries this tag.
                    actor_count = 0
                    aSetOfTags = set()
                    for mov in tag_movie_map[tag_id]:
                        aSetOfTags.update(
                            [k for (k, v) in movie_actor_rank_map[mov[0]]])
                    actor_count = aSetOfTags.__len__()
                    # TF: newer taggings (higher normalized date) and better
                    # billing (lower normalized rank) increase the weight.
                    tf = (formatter.normalizer(min_date, max_date, timestamp) /
                          formatter.normalizer(min_rank, max_rank,
                                               rank)) / tag_counter
                    tag_weight_tuple_tf[tag_id] += tf
                    tag_weight_tuple_tf_idf[tag_id] += tf * math.log2(
                        total_actor_count / actor_count)
        # Only the tf-idf variant is published; tag_weight_tuple_tf is
        # accumulated but unused beyond this loop.
        actor_weight_vector_tf_idf[actorID] = [
            (k, v) for k, v in tag_weight_tuple_tf_idf.items()
        ]

    return actor_weight_vector_tf_idf
# Example #5
# 0
def runAllMethods(userid):
    """Average five movie-similarity signals and rank candidate movies.

    Signals 1-3 come from PCA/SVD/CP latent-semantic matrices, signal 4
    from an LDA decomposition, and signal 5 from personalized PageRank over
    a movie-movie euclidean-similarity graph. Each signal is min-max
    normalized before the five are averaged.

    Reads module-level state: ``indx``, ``finalWeights``, ``moviesWatched``,
    ``sem_matrix_list``, ``q_vectorList`` (the last two are appended to as a
    side effect).

    Returns:
        tuple: (indices of candidates sorted by descending mean similarity,
        the mean similarity values in the same descending order).
    """
    global sem_matrix_list
    global q_vectorList

    functions = [loadPCASemantics, loadSVDSemantics, loadCPSemantics]
    allSimilarities = []
    for i in range(1, 6):
        similarity = list()
        if (i <= 3):
            # Column-wise min-max normalization; the 0.00001 epsilon guards
            # against a zero-range column.
            similarity_semantic_matrix = functions[i - 1]()
            similarity_semantic_matrix = ((similarity_semantic_matrix - similarity_semantic_matrix.min(axis=0) + 0.00001) \
                            / (similarity_semantic_matrix.max(axis=0) - similarity_semantic_matrix.min(axis=0) + 0.00001))

            # Query rows = the watched movies (rows at `indx`); candidate
            # rows = everything else.
            vector = np.take(similarity_semantic_matrix, indx, axis=0)
            q_vector = vector.astype(np.float32)

            aug_sim_matx = np.delete(similarity_semantic_matrix, indx,
                                     axis=0).astype(np.float32)
            sem_matrix_list.append(aug_sim_matx)
            q_vectorList.append(q_vector)

            distance = []
            for v in q_vector:
                distance.append(euclideanMatrixVector(aug_sim_matx, v))

            distance = np.array(distance)
            distance = (distance - distance.min() + 0.00001) / (
                (distance.max() - distance.min() + 0.00001))
            # NOTE(review): precedence makes this (1./distance) + 0.00001;
            # if the epsilon was meant to guard the division it should be
            # 1. / (distance + 0.00001) — confirm intent before changing.
            similarity = 1. / distance + 0.00001
            # Weight each watched movie's similarity row by finalWeights.
            similarity = list(
                similarity.T.dot(finalWeights).astype(np.float32))
        if i == 4:
            movie_movie_similarity_subset_new = runLDADecomposition(
                userid)  #update
            similarity = list(
                movie_movie_similarity_subset_new.T.dot(finalWeights).astype(
                    np.float32))
        if i == 5:
            # Personalized PageRank seeded with the watched movies, each
            # weighted by finalWeights; restart probability 0.9.
            movieRatedSeed = list(
                zip(moviesWatched,
                    finalWeights))  #DataHandler.userMovieOrders(userId)
            P = DataHandler.load_movie_tag_df(
            )  #DataHandler.load_movie_tag_df()
            moviesList = sorted(list(DataHandler.movie_actor_rank_map.keys()))
            euclidean_distance = pairwise.euclidean_distances(P)
            # Epsilon avoids division by zero on the diagonal (self-distance).
            epsilon = np.matrix(np.zeros(euclidean_distance.shape) + 0.000001)
            movie_movie_similarity = 1 / (epsilon + euclidean_distance)
            movie_movie_similarity = pd.DataFrame(movie_movie_similarity)
            prData = ppr.personalizedPageRankWeighted(movie_movie_similarity,
                                                      movieRatedSeed, 0.9)
            # Score only the movies the user has not watched yet.
            moviesNotWATCHED = list(set(moviesList) - set(moviesWatched))
            moviesNotWATCHED_indices = [
                moviesList.index(i) for i in moviesNotWATCHED
            ]
            similarity = list(prData.loc[moviesNotWATCHED_indices, ][0])
        # Min-max normalize this signal so the five are comparable.
        similarity = [
            formatter.normalizer(min(similarity), max(similarity), value)
            for value in similarity
        ]

        allSimilarities.append(similarity)

    # Mean across the five signals; assumes all five produced candidate
    # lists of equal length in the same order — TODO confirm.
    similarities = np.array(allSimilarities).mean(axis=0)

    return np.argsort(similarities)[::-1], np.sort(similarities)[::-1]